diff --git a/sn_node_manager/src/add_services/tests.rs b/sn_node_manager/src/add_services/tests.rs index b7fab7bdc5..b8497b68d2 100644 --- a/sn_node_manager/src/add_services/tests.rs +++ b/sn_node_manager/src/add_services/tests.rs @@ -31,7 +31,7 @@ use sn_service_management::{ use std::{ ffi::OsString, net::{IpAddr, Ipv4Addr, SocketAddr}, - path::PathBuf, + path::{Path, PathBuf}, str::FromStr, }; @@ -54,7 +54,7 @@ mock! { fn create_service_user(&self, username: &str) -> ServiceControlResult<()>; fn get_available_port(&self) -> ServiceControlResult; fn install(&self, install_ctx: ServiceInstallCtx) -> ServiceControlResult<()>; - fn get_process_pid(&self, name: &str) -> ServiceControlResult; + fn get_process_pid(&self, bin_path: &Path) -> ServiceControlResult; fn is_service_process_running(&self, pid: u32) -> bool; fn start(&self, service_name: &str) -> ServiceControlResult<()>; fn stop(&self, service_name: &str) -> ServiceControlResult<()>; diff --git a/sn_node_manager/src/cmd/mod.rs b/sn_node_manager/src/cmd/mod.rs index 4634d3b09d..07e38f8c83 100644 --- a/sn_node_manager/src/cmd/mod.rs +++ b/sn_node_manager/src/cmd/mod.rs @@ -75,6 +75,13 @@ pub fn print_upgrade_summary(upgrade_summary: Vec<(String, UpgradeResult)>) { service_name ); } + UpgradeResult::UpgradedButNotStarted(previous_version, new_version, _) => { + println!( + "{} {} was upgraded from {previous_version} to {new_version} but it did not start", + "✕".red(), + service_name + ); + } UpgradeResult::Forced(previous_version, target_version) => { println!( "{} Forced {} version change from {previous_version} to {target_version}.", diff --git a/sn_node_manager/src/cmd/node.rs b/sn_node_manager/src/cmd/node.rs index 16c8ed6eb6..a8352752cb 100644 --- a/sn_node_manager/src/cmd/node.rs +++ b/sn_node_manager/src/cmd/node.rs @@ -18,7 +18,7 @@ use crate::{ helpers::{download_and_extract_release, get_bin_version}, refresh_node_registry, status_report, ServiceManager, VerbosityLevel, }; -use color_eyre::{eyre::eyre, Result}; +use color_eyre::{eyre::eyre, Help, Result}; use colored::Colorize; use libp2p_identity::PeerId; use semver::Version; @@ -358,9 +358,18 @@ pub async fn upgrade( } } - print_upgrade_summary(upgrade_summary); - node_registry.save()?; + print_upgrade_summary(upgrade_summary.clone()); + + if upgrade_summary.iter().any(|(_, r)| { + matches!(r, UpgradeResult::Error(_)) + || matches!(r, UpgradeResult::UpgradedButNotStarted(_, _, _)) + }) { + return Err(eyre!("There was a problem upgrading one or more nodes").suggestion( + "For any services that were upgraded but did not start, you can attempt to start them \ + again using the 'start' command.")); + } + Ok(()) } diff --git a/sn_node_manager/src/lib.rs b/sn_node_manager/src/lib.rs index b60860d75e..c16c6016f7 100644 --- a/sn_node_manager/src/lib.rs +++ b/sn_node_manager/src/lib.rs @@ -44,6 +44,7 @@ use sn_service_management::{ NodeRegistry, NodeServiceData, ServiceStateActions, ServiceStatus, UpgradeOptions, UpgradeResult, }; +use tracing::debug; pub const DAEMON_DEFAULT_PORT: u16 = 12500; pub const DAEMON_SERVICE_NAME: &str = "safenodemand"; @@ -91,6 +92,31 @@ impl ServiceManager { self.service_control.start(&self.service.name())?; self.service_control.wait(RPC_START_UP_DELAY_MS); + // This is an attempt to see whether the service process has actually launched. You don't + // always get an error from the service infrastructure. + // + // There might be many different `safenode` processes running, but since each service has + // its own isolated binary, we use the binary path to uniquely identify it. + match self + .service_control + .get_process_pid(&self.service.bin_path()) + { + Ok(pid) => { + debug!( + "Service process started for {} with PID {}", + self.service.name(), + pid + ); + } + Err(sn_service_management::error::Error::ServiceProcessNotFound(_)) => { + return Err(eyre!( + "The '{}' service has failed to start", + self.service.name() + )); + } + Err(e) => return Err(e.into()), + } + self.service.on_start().await?; println!("{} Started {} service", "✓".green(), self.service.name()); @@ -236,7 +262,18 @@ impl ServiceManager { )?; if options.start_service { - self.start().await?; + match self.start().await { + Ok(()) => {} + Err(e) => { + self.service + .set_version(&options.target_version.to_string()); + return Ok(UpgradeResult::UpgradedButNotStarted( + current_version.to_string(), + options.target_version.to_string(), + e.to_string(), + )); + } + } } self.service .set_version(&options.target_version.to_string()); @@ -444,14 +481,14 @@ mod tests { use predicates::prelude::*; use service_manager::ServiceInstallCtx; use sn_service_management::{ - error::Result as ServiceControlResult, + error::{Error as ServiceControlError, Result as ServiceControlResult}, node::{NodeService, NodeServiceData}, rpc::{NetworkInfo, NodeInfo, RecordAddress, RpcActions}, UpgradeOptions, UpgradeResult, }; use std::{ net::{IpAddr, Ipv4Addr, SocketAddr}, - path::PathBuf, + path::{Path, PathBuf}, str::FromStr, }; @@ -475,7 +512,7 @@ mod tests { fn create_service_user(&self, username: &str) -> ServiceControlResult<()>; fn get_available_port(&self) -> ServiceControlResult; fn install(&self, install_ctx: ServiceInstallCtx) -> ServiceControlResult<()>; - fn get_process_pid(&self, name: &str) -> ServiceControlResult; + fn get_process_pid(&self, bin_path: &Path) -> ServiceControlResult; fn is_service_process_running(&self, pid: u32) -> bool; fn start(&self, service_name: &str) -> ServiceControlResult<()>; fn stop(&self, service_name: &str) -> ServiceControlResult<()>; @@ -499,6 +536,13 @@ mod tests { .with(eq(3000)) .times(1) .returning(|_| ()); + mock_service_control + .expect_get_process_pid() + .with(eq(PathBuf::from( + "/var/safenode-manager/services/safenode1/safenode", + ))) + .times(1) + .returning(|_| Ok(100)); mock_rpc_client.expect_node_info().times(1).returning(|| { Ok(NodeInfo { pid: 1000, @@ -575,6 +619,13 @@ mod tests { .with(eq(3000)) .times(1) .returning(|_| ()); + mock_service_control + .expect_get_process_pid() + .with(eq(PathBuf::from( + "/var/safenode-manager/services/safenode1/safenode", + ))) + .times(1) + .returning(|_| Ok(100)); mock_rpc_client.expect_node_info().times(1).returning(|| { Ok(NodeInfo { pid: 1000, @@ -697,6 +748,11 @@ mod tests { let mut mock_service_control = MockServiceControl::new(); let mut mock_rpc_client = MockRpcClient::new(); + mock_service_control + .expect_is_service_process_running() + .with(eq(1000)) + .times(1) + .returning(|_| false); mock_service_control .expect_start() .with(eq("safenode1")) @@ -708,10 +764,12 @@ mod tests { .times(1) .returning(|_| ()); mock_service_control - .expect_is_service_process_running() - .with(eq(1000)) + .expect_get_process_pid() + .with(eq(PathBuf::from( + "/var/safenode-manager/services/safenode1/safenode", + ))) .times(1) - .returning(|_| false); + .returning(|_| Ok(100)); mock_rpc_client.expect_node_info().times(1).returning(|| { Ok(NodeInfo { pid: 1000, @@ -775,6 +833,65 @@ mod tests { Ok(()) } + #[tokio::test] + async fn start_should_return_an_error_if_the_process_was_not_found() -> Result<()> { + let mut mock_service_control = MockServiceControl::new(); + + mock_service_control + .expect_start() + .with(eq("safenode1")) + .times(1) + .returning(|_| Ok(())); + mock_service_control + .expect_wait() + .with(eq(3000)) + .times(1) + .returning(|_| ()); + mock_service_control + .expect_get_process_pid() + .with(eq(PathBuf::from( + "/var/safenode-manager/services/safenode1/safenode", + ))) + .times(1) + .returning(|_| { + Err(ServiceControlError::ServiceProcessNotFound( + "/var/safenode-manager/services/safenode1/safenode".to_string(), + )) + }); + + let mut service_data = NodeServiceData { + connected_peers: None, + data_dir_path: PathBuf::from("/var/safenode-manager/services/safenode1"), + genesis: false, + listen_addr: None, + local: false, + log_dir_path: PathBuf::from("/var/log/safenode/safenode1"), + number: 1, + peer_id: None, + pid: None, + rpc_socket_addr: SocketAddr::new(IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)), 8081), + safenode_path: PathBuf::from("/var/safenode-manager/services/safenode1/safenode"), + service_name: "safenode1".to_string(), + status: ServiceStatus::Added, + user: "safe".to_string(), + version: "0.98.1".to_string(), + }; + let service = NodeService::new(&mut service_data, Box::new(MockRpcClient::new())); + let mut service_manager = ServiceManager::new( + service, + Box::new(mock_service_control), + VerbosityLevel::Normal, + ); + + let result = service_manager.start().await; + match result { + Ok(_) => panic!("This test should have resulted in an error"), + Err(e) => assert_eq!("The 'safenode1' service has failed to start", e.to_string()), + } + + Ok(()) + } + #[tokio::test] async fn stop_should_stop_a_running_service() -> Result<()> { let mut mock_service_control = MockServiceControl::new(); @@ -994,6 +1111,11 @@ mod tests { .with(eq(3000)) .times(1) .returning(|_| ()); + mock_service_control + .expect_get_process_pid() + .with(eq(current_node_bin.to_path_buf().clone())) + .times(1) + .returning(|_| Ok(100)); mock_rpc_client.expect_node_info().times(1).returning(|| { Ok(NodeInfo { pid: 2000, @@ -1185,6 +1307,11 @@ mod tests { .with(eq(3000)) .times(1) .returning(|_| ()); + mock_service_control + .expect_get_process_pid() + .with(eq(current_node_bin.to_path_buf().clone())) + .times(1) + .returning(|_| Ok(100)); mock_rpc_client.expect_node_info().times(1).returning(|| { Ok(NodeInfo { pid: 2000, @@ -1401,6 +1528,121 @@ mod tests { Ok(()) } + #[tokio::test] + async fn upgrade_should_return_upgraded_but_not_started_if_service_did_not_start() -> Result<()> + { + let current_version = "0.1.0"; + let target_version = "0.2.0"; + + let tmp_data_dir = assert_fs::TempDir::new()?; + let current_install_dir = tmp_data_dir.child("safenode_install"); + current_install_dir.create_dir_all()?; + + let current_node_bin = current_install_dir.child("safenode"); + current_node_bin.write_binary(b"fake safenode binary")?; + let target_node_bin = tmp_data_dir.child("safenode"); + target_node_bin.write_binary(b"fake safenode binary")?; + + let current_node_bin_str = current_node_bin.to_path_buf().to_string_lossy().to_string(); + + let mut mock_service_control = MockServiceControl::new(); + + // before binary upgrade + mock_service_control + .expect_is_service_process_running() + .with(eq(1000)) + .times(1) + .returning(|_| true); + mock_service_control + .expect_stop() + .with(eq("safenode1")) + .times(1) + .returning(|_| Ok(())); + + // after binary upgrade + mock_service_control + .expect_uninstall() + .with(eq("safenode1")) + .times(1) + .returning(|_| Ok(())); + mock_service_control + .expect_install() + .with(always()) + .times(1) + .returning(|_| Ok(())); + + // after service restart + mock_service_control + .expect_start() + .with(eq("safenode1")) + .times(1) + .returning(|_| Ok(())); + mock_service_control + .expect_wait() + .with(eq(3000)) + .times(1) + .returning(|_| ()); + mock_service_control + .expect_get_process_pid() + .with(eq(current_node_bin.to_path_buf().clone())) + .times(1) + .returning(move |_| { + Err(ServiceControlError::ServiceProcessNotFound( + current_node_bin_str.clone(), + )) + }); + + let mut service_data = NodeServiceData { + connected_peers: None, + data_dir_path: PathBuf::from("/var/safenode-manager/services/safenode1"), + genesis: false, + listen_addr: None, + local: false, + log_dir_path: PathBuf::from("/var/log/safenode/safenode1"), + number: 1, + peer_id: Some(PeerId::from_str( + "12D3KooWS2tpXGGTmg2AHFiDh57yPQnat49YHnyqoggzXZWpqkCR", + )?), + pid: Some(1000), + rpc_socket_addr: SocketAddr::new(IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)), 8081), + safenode_path: current_node_bin.to_path_buf(), + service_name: "safenode1".to_string(), + status: ServiceStatus::Running, + user: "safe".to_string(), + version: current_version.to_string(), + }; + let service = NodeService::new(&mut service_data, Box::new(MockRpcClient::new())); + let mut service_manager = ServiceManager::new( + service, + Box::new(mock_service_control), + VerbosityLevel::Normal, + ); + + let upgrade_result = service_manager + .upgrade(UpgradeOptions { + bootstrap_peers: Vec::new(), + env_variables: None, + force: false, + start_service: true, + target_bin_path: target_node_bin.to_path_buf(), + target_version: Version::parse(target_version).unwrap(), + }) + .await?; + + match upgrade_result { + UpgradeResult::UpgradedButNotStarted(old_version, new_version, _) => { + assert_eq!(old_version, current_version); + assert_eq!(new_version, target_version); + } + _ => panic!( + "Expected UpgradeResult::UpgradedButNotStarted but was {:#?}", + upgrade_result + ), + } + + Ok(()) + } + #[tokio::test] async fn remove_should_remove_an_added_node() -> Result<()> { let temp_dir = assert_fs::TempDir::new()?; diff --git a/sn_service_management/src/control.rs b/sn_service_management/src/control.rs index 11bc2e7218..8a8a6f5565 100644 --- a/sn_service_management/src/control.rs +++ b/sn_service_management/src/control.rs @@ -12,7 +12,10 @@ use service_manager::{ ServiceInstallCtx, ServiceLabel, ServiceManager, ServiceStartCtx, ServiceStopCtx, ServiceUninstallCtx, }; -use std::net::{SocketAddr, TcpListener}; +use std::{ + net::{SocketAddr, TcpListener}, + path::Path, +}; use sysinfo::{Pid, System}; /// A thin wrapper around the `service_manager::ServiceManager`, which makes our own testing @@ -26,7 +29,7 @@ pub trait ServiceControl: Sync { fn create_service_user(&self, username: &str) -> Result<()>; fn get_available_port(&self) -> Result; fn install(&self, install_ctx: ServiceInstallCtx) -> Result<()>; - fn get_process_pid(&self, name: &str) -> Result; + fn get_process_pid(&self, path: &Path) -> Result; fn is_service_process_running(&self, pid: u32) -> bool; fn start(&self, service_name: &str) -> Result<()>; fn stop(&self, service_name: &str) -> Result<()>; @@ -161,17 +164,21 @@ impl ServiceControl for ServiceController { Ok(port) } - fn get_process_pid(&self, name: &str) -> Result { + fn get_process_pid(&self, bin_path: &Path) -> Result { let mut system = System::new_all(); system.refresh_all(); for (pid, process) in system.processes() { - if process.name() == name { - // There does not seem to be any easy way to get the process ID from the `Pid` - // type. Probably something to do with representing it in a cross-platform way. - return Ok(pid.to_string().parse::()?); + if let Some(path) = process.exe() { + if bin_path == path { + // There does not seem to be any easy way to get the process ID from the `Pid` + // type. Probably something to do with representing it in a cross-platform way. + return Ok(pid.to_string().parse::()?); + } } } - Err(Error::ServiceProcessNotFound(name.to_string())) + Err(Error::ServiceProcessNotFound( + bin_path.to_string_lossy().to_string(), + )) } fn install(&self, install_ctx: ServiceInstallCtx) -> Result<()> { diff --git a/sn_service_management/src/daemon.rs b/sn_service_management/src/daemon.rs index 72c8da6135..4822303e60 100644 --- a/sn_service_management/src/daemon.rs +++ b/sn_service_management/src/daemon.rs @@ -96,7 +96,7 @@ impl<'a> ServiceStateActions for DaemonService<'a> { // get_process_pid causes errors for the daemon. Maybe because it is being run as root? if let Ok(pid) = self .service_control - .get_process_pid(&self.service_data.service_name) + .get_process_pid(&self.service_data.daemon_path) { self.service_data.pid = Some(pid); } diff --git a/sn_service_management/src/error.rs b/sn_service_management/src/error.rs index 354e5ea864..726775ee69 100644 --- a/sn_service_management/src/error.rs +++ b/sn_service_management/src/error.rs @@ -43,7 +43,7 @@ pub enum Error { RpcNodeUpdateError(String), #[error("Could not obtain record addresses through RPC: {0}")] RpcRecordAddressError(String), - #[error("Could not find process '{0}'")] + #[error("Could not find process at '{0}'")] ServiceProcessNotFound(String), #[error("The user may have removed the '{0}' service outwith the node manager")] ServiceRemovedManually(String), diff --git a/sn_service_management/src/faucet.rs b/sn_service_management/src/faucet.rs index 7607278bef..fc30f6ad91 100644 --- a/sn_service_management/src/faucet.rs +++ b/sn_service_management/src/faucet.rs @@ -102,7 +102,7 @@ impl<'a> ServiceStateActions for FaucetService<'a> { async fn on_start(&mut self) -> Result<()> { let pid = self .service_control - .get_process_pid(&self.service_data.service_name)?; + .get_process_pid(&self.service_data.faucet_path)?; self.service_data.pid = Some(pid); self.service_data.status = ServiceStatus::Running; Ok(()) diff --git a/sn_service_management/src/lib.rs b/sn_service_management/src/lib.rs index 457e0926bd..9a2dca1f5e 100644 --- a/sn_service_management/src/lib.rs +++ b/sn_service_management/src/lib.rs @@ -49,6 +49,7 @@ pub enum UpgradeResult { Forced(String, String), NotRequired, Upgraded(String, String), + UpgradedButNotStarted(String, String, String), Error(String), }