From 5b6a7035af9148c9ba77fb6d3277b728f09ee4df Mon Sep 17 00:00:00 2001 From: Josh Patterson Date: Wed, 19 Nov 2025 10:22:58 -0500 Subject: [PATCH 1/6] need python_shell for pipes --- salt/salt/engines/master/virtual_node_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/salt/salt/engines/master/virtual_node_manager.py b/salt/salt/engines/master/virtual_node_manager.py index 6d88bd688..ccc063d64 100644 --- a/salt/salt/engines/master/virtual_node_manager.py +++ b/salt/salt/engines/master/virtual_node_manager.py @@ -727,7 +727,8 @@ def check_hypervisor_disk_space(hypervisor: str, size_gb: int) -> Tuple[bool, Op result = local.cmd( hypervisor_minion, 'cmd.run', - ["df -BG /nsm/libvirt/volumes | tail -1 | awk '{print $4}' | sed 's/G//'"] + ["df -BG /nsm/libvirt/volumes | tail -1 | awk '{print $4}' | sed 's/G//'"], + kwarg={'python_shell': True} ) if not result or hypervisor_minion not in result: From dd0b4c38201dea4d050f50043fcdb93be87b98b2 Mon Sep 17 00:00:00 2001 From: Josh Patterson Date: Wed, 19 Nov 2025 15:48:53 -0500 Subject: [PATCH 2/6] fix failed or hung qcow2 image download --- salt/_runners/setup_hypervisor.py | 138 ++++++++++++++++++++++-------- 1 file changed, 102 insertions(+), 36 deletions(-) diff --git a/salt/_runners/setup_hypervisor.py b/salt/_runners/setup_hypervisor.py index 929801783..b23734654 100644 --- a/salt/_runners/setup_hypervisor.py +++ b/salt/_runners/setup_hypervisor.py @@ -172,7 +172,15 @@ MANAGER_HOSTNAME = socket.gethostname() def _download_image(): """ - Download and validate the Oracle Linux KVM image. + Download and validate the Oracle Linux KVM image with retry logic and progress monitoring. + + Features: + - Detects stalled downloads (no progress for 30 seconds) + - Retries up to 3 times on failure + - Connection timeout of 30 seconds + - Read timeout of 60 seconds + - Cleans up partial downloads on failure + Returns: bool: True if successful or file exists with valid checksum, False on error """ @@ -185,45 +193,103 @@ def _download_image(): os.unlink(IMAGE_PATH) log.info("Starting image download process") + + # Retry configuration + max_attempts = 3 + stall_timeout = 30 # seconds without progress before considering download stalled + connection_timeout = 30 # seconds to establish connection + read_timeout = 60 # seconds to wait for data chunks + + for attempt in range(1, max_attempts + 1): + log.info("Download attempt %d of %d", attempt, max_attempts) + + try: + # Download file with timeouts + log.info("Downloading Oracle Linux KVM image from %s to %s", IMAGE_URL, IMAGE_PATH) + response = requests.get( + IMAGE_URL, + stream=True, + timeout=(connection_timeout, read_timeout) + ) + response.raise_for_status() - try: - # Download file - log.info("Downloading Oracle Linux KVM image from %s to %s", IMAGE_URL, IMAGE_PATH) - response = requests.get(IMAGE_URL, stream=True) - response.raise_for_status() + # Get total file size for progress tracking + total_size = int(response.headers.get('content-length', 0)) + downloaded_size = 0 + last_log_time = 0 + last_progress_time = time.time() + last_downloaded_size = 0 - # Get total file size for progress tracking - total_size = int(response.headers.get('content-length', 0)) - downloaded_size = 0 - last_log_time = 0 + # Save file with progress logging and stall detection + with salt.utils.files.fopen(IMAGE_PATH, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: # filter out keep-alive new chunks + f.write(chunk) + downloaded_size += len(chunk) + current_time = time.time() + + # Check for stalled download + if downloaded_size > last_downloaded_size: + # Progress made, reset stall timer + last_progress_time = current_time + last_downloaded_size = downloaded_size + elif current_time - last_progress_time > stall_timeout: + # No progress for stall_timeout seconds + raise Exception( + f"Download stalled: no progress for {stall_timeout} seconds " + f"at {downloaded_size}/{total_size} bytes" + ) + + # Log progress every second + if current_time - last_log_time >= 1: + progress = (downloaded_size / total_size) * 100 if total_size > 0 else 0 + log.info("Progress - %.1f%% (%d/%d bytes)", + progress, downloaded_size, total_size) + last_log_time = current_time - # Save file with progress logging - with salt.utils.files.fopen(IMAGE_PATH, 'wb') as f: - for chunk in response.iter_content(chunk_size=8192): - f.write(chunk) - downloaded_size += len(chunk) + # Validate downloaded file + log.info("Download complete, validating checksum...") + if not _validate_image_checksum(IMAGE_PATH, IMAGE_SHA256): + log.error("Checksum validation failed on attempt %d", attempt) + os.unlink(IMAGE_PATH) + if attempt < max_attempts: + log.info("Will retry download...") + continue + else: + log.error("All download attempts failed due to checksum mismatch") + return False + + log.info("Successfully downloaded and validated Oracle Linux KVM image") + return True + + except requests.exceptions.Timeout as e: + log.error("Download attempt %d failed: Timeout - %s", attempt, str(e)) + if os.path.exists(IMAGE_PATH): + os.unlink(IMAGE_PATH) + if attempt < max_attempts: + log.info("Will retry download...") + else: + log.error("All download attempts failed due to timeout") - # Log progress every second - current_time = time.time() - if current_time - last_log_time >= 1: - progress = (downloaded_size / total_size) * 100 if total_size > 0 else 0 - log.info("Progress - %.1f%% (%d/%d bytes)", - progress, downloaded_size, total_size) - last_log_time = current_time - - # Validate downloaded file - if not _validate_image_checksum(IMAGE_PATH, IMAGE_SHA256): - os.unlink(IMAGE_PATH) - return False - - log.info("Successfully downloaded and validated Oracle Linux KVM image") - return True - - except Exception as e: - log.error("Error downloading hypervisor image: %s", str(e)) - if os.path.exists(IMAGE_PATH): - os.unlink(IMAGE_PATH) - return False + except requests.exceptions.RequestException as e: + log.error("Download attempt %d failed: Network error - %s", attempt, str(e)) + if os.path.exists(IMAGE_PATH): + os.unlink(IMAGE_PATH) + if attempt < max_attempts: + log.info("Will retry download...") + else: + log.error("All download attempts failed due to network errors") + + except Exception as e: + log.error("Download attempt %d failed: %s", attempt, str(e)) + if os.path.exists(IMAGE_PATH): + os.unlink(IMAGE_PATH) + if attempt < max_attempts: + log.info("Will retry download...") + else: + log.error("All download attempts failed") + + return False def _check_ssh_keys_exist(): """ From 841ce6b6ec1de5bc06fd91fdcce47ce5ac51db07 Mon Sep 17 00:00:00 2001 From: Josh Patterson Date: Thu, 20 Nov 2025 13:55:22 -0500 Subject: [PATCH 3/6] update hypervisor annotation for image download or ssh key creation failure --- salt/_runners/setup_hypervisor.py | 36 ++++++++++++------- .../hypervisor/soc_hypervisor.yaml.jinja | 8 +++++ 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/salt/_runners/setup_hypervisor.py b/salt/_runners/setup_hypervisor.py index b23734654..5efdca021 100644 --- a/salt/_runners/setup_hypervisor.py +++ b/salt/_runners/setup_hypervisor.py @@ -196,6 +196,7 @@ def _download_image(): # Retry configuration max_attempts = 3 + retry_delay = 5 # seconds to wait between retry attempts stall_timeout = 30 # seconds without progress before considering download stalled connection_timeout = 30 # seconds to establish connection read_timeout = 60 # seconds to wait for data chunks @@ -267,7 +268,8 @@ def _download_image(): if os.path.exists(IMAGE_PATH): os.unlink(IMAGE_PATH) if attempt < max_attempts: - log.info("Will retry download...") + log.info("Will retry download in %d seconds...", retry_delay) + time.sleep(retry_delay) else: log.error("All download attempts failed due to timeout") @@ -276,7 +278,8 @@ def _download_image(): if os.path.exists(IMAGE_PATH): os.unlink(IMAGE_PATH) if attempt < max_attempts: - log.info("Will retry download...") + log.info("Will retry download in %d seconds...", retry_delay) + time.sleep(retry_delay) else: log.error("All download attempts failed due to network errors") @@ -285,7 +288,8 @@ def _download_image(): if os.path.exists(IMAGE_PATH): os.unlink(IMAGE_PATH) if attempt < max_attempts: - log.info("Will retry download...") + log.info("Will retry download in %d seconds...", retry_delay) + time.sleep(retry_delay) else: log.error("All download attempts failed") @@ -485,25 +489,29 @@ def _ensure_hypervisor_host_dir(minion_id: str = None): log.error(f"Error creating hypervisor host directory: {str(e)}") return False -def _apply_dyanno_hypervisor_state(): +def _apply_dyanno_hypervisor_state(status='Initialized'): """ Apply the soc.dyanno.hypervisor state on the salt master. This function applies the soc.dyanno.hypervisor state on the salt master to update the hypervisor annotation and ensure all hypervisor host directories exist. + Args: + status: Status to set for the base domain (default: 'Initialized') + Valid values: 'PreInit', 'Initialized', 'ImageDownloadFailed', 'SSHKeySetupFailed' + Returns: bool: True if state was applied successfully, False otherwise """ try: - log.info("Applying soc.dyanno.hypervisor state on salt master") + log.info(f"Applying soc.dyanno.hypervisor state on salt master with status: {status}") # Initialize the LocalClient local = salt.client.LocalClient() # Target the salt master to apply the soc.dyanno.hypervisor state target = MANAGER_HOSTNAME + '_*' - state_result = local.cmd(target, 'state.apply', ['soc.dyanno.hypervisor', "pillar={'baseDomain': {'status': 'PreInit'}}", 'concurrent=True'], tgt_type='glob') + state_result = local.cmd(target, 'state.apply', ['soc.dyanno.hypervisor', f"pillar={{'baseDomain': {{'status': '{status}'}}}}", 'concurrent=True'], tgt_type='glob') log.debug(f"state_result: {state_result}") # Check if state was applied successfully if state_result: @@ -520,17 +528,17 @@ def _apply_dyanno_hypervisor_state(): success = False if success: - log.info("Successfully applied soc.dyanno.hypervisor state") + log.info(f"Successfully applied soc.dyanno.hypervisor state with status: {status}") return True else: - log.error("Failed to apply soc.dyanno.hypervisor state") + log.error(f"Failed to apply soc.dyanno.hypervisor state with status: {status}") return False else: - log.error("No response from salt master when applying soc.dyanno.hypervisor state") + log.error(f"No response from salt master when applying soc.dyanno.hypervisor state with status: {status}") return False except Exception as e: - log.error(f"Error applying soc.dyanno.hypervisor state: {str(e)}") + log.error(f"Error applying soc.dyanno.hypervisor state with status: {status}: {str(e)}") return False def _apply_cloud_config_state(): @@ -664,8 +672,8 @@ def setup_environment(vm_name: str = 'sool9', disk_size: str = '220G', minion_id log.warning("Failed to apply salt.cloud.config state, continuing with setup") # We don't return an error here as we want to continue with the setup process - # Apply the soc.dyanno.hypervisor state on the salt master - if not _apply_dyanno_hypervisor_state(): + # Apply the soc.dyanno.hypervisor state on the salt master with PreInit status + if not _apply_dyanno_hypervisor_state('PreInit'): log.warning("Failed to apply soc.dyanno.hypervisor state, continuing with setup") # We don't return an error here as we want to continue with the setup process @@ -685,6 +693,8 @@ def setup_environment(vm_name: str = 'sool9', disk_size: str = '220G', minion_id log.info("Starting image download/validation process") if not _download_image(): log.error("Image download failed") + # Update hypervisor annotation with failure status + _apply_dyanno_hypervisor_state('ImageDownloadFailed') return { 'success': False, 'error': 'Image download failed', @@ -697,6 +707,8 @@ def setup_environment(vm_name: str = 'sool9', disk_size: str = '220G', minion_id log.info("Setting up SSH keys") if not _setup_ssh_keys(): log.error("SSH key setup failed") + # Update hypervisor annotation with failure status + _apply_dyanno_hypervisor_state('SSHKeySetupFailed') return { 'success': False, 'error': 'SSH key setup failed', diff --git a/salt/soc/dyanno/hypervisor/soc_hypervisor.yaml.jinja b/salt/soc/dyanno/hypervisor/soc_hypervisor.yaml.jinja index ac2fd6fea..d4b88b091 100644 --- a/salt/soc/dyanno/hypervisor/soc_hypervisor.yaml.jinja +++ b/salt/soc/dyanno/hypervisor/soc_hypervisor.yaml.jinja @@ -43,6 +43,14 @@ No Virtual Machines Found {%- endif %} +{%- elif baseDomainStatus == 'ImageDownloadFailed' %} +#### ERROR + +Base domain image download failed. Please check the salt-master log for details and verify network connectivity. +{%- elif baseDomainStatus == 'SSHKeySetupFailed' %} +#### ERROR + +SSH key setup failed. Please check the salt-master log for details. {%- else %} #### WARNING From fbe97221bbc4919069ad8bbf2569dcd1adb2639a Mon Sep 17 00:00:00 2001 From: Josh Patterson Date: Thu, 20 Nov 2025 14:43:09 -0500 Subject: [PATCH 4/6] set initialized status --- salt/_runners/setup_hypervisor.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/salt/_runners/setup_hypervisor.py b/salt/_runners/setup_hypervisor.py index 5efdca021..e1bf0a45a 100644 --- a/salt/_runners/setup_hypervisor.py +++ b/salt/_runners/setup_hypervisor.py @@ -733,6 +733,10 @@ def setup_environment(vm_name: str = 'sool9', disk_size: str = '220G', minion_id success = vm_result.get('success', False) log.info("Setup environment completed with status: %s", "SUCCESS" if success else "FAILED") + # Update hypervisor annotation with success status + if success: + _apply_dyanno_hypervisor_state('Initialized') + # If setup was successful and we have a minion_id, run highstate if success and minion_id: log.info("Running highstate on hypervisor %s", minion_id) From 97c1a460133ddac1e5e6f4c7461e207680068009 Mon Sep 17 00:00:00 2001 From: Josh Patterson Date: Thu, 20 Nov 2025 15:08:04 -0500 Subject: [PATCH 5/6] update annotation for general failure --- salt/_runners/setup_hypervisor.py | 2 ++ salt/soc/dyanno/hypervisor/soc_hypervisor.yaml.jinja | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/salt/_runners/setup_hypervisor.py b/salt/_runners/setup_hypervisor.py index e1bf0a45a..b30f9c6d6 100644 --- a/salt/_runners/setup_hypervisor.py +++ b/salt/_runners/setup_hypervisor.py @@ -736,6 +736,8 @@ def setup_environment(vm_name: str = 'sool9', disk_size: str = '220G', minion_id # Update hypervisor annotation with success status if success: _apply_dyanno_hypervisor_state('Initialized') + else: + _apply_dyanno_hypervisor_state('SetupFailed') # If setup was successful and we have a minion_id, run highstate if success and minion_id: diff --git a/salt/soc/dyanno/hypervisor/soc_hypervisor.yaml.jinja b/salt/soc/dyanno/hypervisor/soc_hypervisor.yaml.jinja index d4b88b091..97170a55f 100644 --- a/salt/soc/dyanno/hypervisor/soc_hypervisor.yaml.jinja +++ b/salt/soc/dyanno/hypervisor/soc_hypervisor.yaml.jinja @@ -51,6 +51,10 @@ Base domain image download failed. Please check the salt-master log for details #### ERROR SSH key setup failed. Please check the salt-master log for details. +{%- elif baseDomainStatus == 'SetupFailed' %} +#### WARNING + +Setup failed. Please check the salt-master log for details. {%- else %} #### WARNING From 2d716b44a8665514f3720ef73d564986062ee102 Mon Sep 17 00:00:00 2001 From: Josh Patterson Date: Thu, 20 Nov 2025 15:52:21 -0500 Subject: [PATCH 6/6] update comment --- salt/_runners/setup_hypervisor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/salt/_runners/setup_hypervisor.py b/salt/_runners/setup_hypervisor.py index b30f9c6d6..7d521bd62 100644 --- a/salt/_runners/setup_hypervisor.py +++ b/salt/_runners/setup_hypervisor.py @@ -497,8 +497,7 @@ def _apply_dyanno_hypervisor_state(status='Initialized'): to update the hypervisor annotation and ensure all hypervisor host directories exist. Args: - status: Status to set for the base domain (default: 'Initialized') - Valid values: 'PreInit', 'Initialized', 'ImageDownloadFailed', 'SSHKeySetupFailed' + status: Status passed to the hypervisor annotation state Returns: bool: True if state was applied successfully, False otherwise