Merge pull request #15247 from Security-Onion-Solutions/bravo

Notify user of hypervisor environment setup failures
This commit is contained in:
Josh Patterson
2025-11-20 16:04:49 -05:00
committed by GitHub
3 changed files with 142 additions and 46 deletions

View File

@@ -172,7 +172,15 @@ MANAGER_HOSTNAME = socket.gethostname()
def _download_image(): def _download_image():
""" """
Download and validate the Oracle Linux KVM image. Download and validate the Oracle Linux KVM image with retry logic and progress monitoring.
Features:
- Detects stalled downloads (no progress for 30 seconds)
- Retries up to 3 times on failure
- Connection timeout of 30 seconds
- Read timeout of 60 seconds
- Cleans up partial downloads on failure
Returns: Returns:
bool: True if successful or file exists with valid checksum, False on error bool: True if successful or file exists with valid checksum, False on error
""" """
@@ -186,44 +194,106 @@ def _download_image():
log.info("Starting image download process") log.info("Starting image download process")
try: # Retry configuration
# Download file max_attempts = 3
log.info("Downloading Oracle Linux KVM image from %s to %s", IMAGE_URL, IMAGE_PATH) retry_delay = 5 # seconds to wait between retry attempts
response = requests.get(IMAGE_URL, stream=True) stall_timeout = 30 # seconds without progress before considering download stalled
response.raise_for_status() connection_timeout = 30 # seconds to establish connection
read_timeout = 60 # seconds to wait for data chunks
# Get total file size for progress tracking for attempt in range(1, max_attempts + 1):
total_size = int(response.headers.get('content-length', 0)) log.info("Download attempt %d of %d", attempt, max_attempts)
downloaded_size = 0
last_log_time = 0
# Save file with progress logging try:
with salt.utils.files.fopen(IMAGE_PATH, 'wb') as f: # Download file with timeouts
for chunk in response.iter_content(chunk_size=8192): log.info("Downloading Oracle Linux KVM image from %s to %s", IMAGE_URL, IMAGE_PATH)
f.write(chunk) response = requests.get(
downloaded_size += len(chunk) IMAGE_URL,
stream=True,
timeout=(connection_timeout, read_timeout)
)
response.raise_for_status()
# Log progress every second # Get total file size for progress tracking
current_time = time.time() total_size = int(response.headers.get('content-length', 0))
if current_time - last_log_time >= 1: downloaded_size = 0
progress = (downloaded_size / total_size) * 100 if total_size > 0 else 0 last_log_time = 0
log.info("Progress - %.1f%% (%d/%d bytes)", last_progress_time = time.time()
progress, downloaded_size, total_size) last_downloaded_size = 0
last_log_time = current_time
# Validate downloaded file # Save file with progress logging and stall detection
if not _validate_image_checksum(IMAGE_PATH, IMAGE_SHA256): with salt.utils.files.fopen(IMAGE_PATH, 'wb') as f:
os.unlink(IMAGE_PATH) for chunk in response.iter_content(chunk_size=8192):
return False if chunk: # filter out keep-alive new chunks
f.write(chunk)
downloaded_size += len(chunk)
current_time = time.time()
log.info("Successfully downloaded and validated Oracle Linux KVM image") # Check for stalled download
return True if downloaded_size > last_downloaded_size:
# Progress made, reset stall timer
last_progress_time = current_time
last_downloaded_size = downloaded_size
elif current_time - last_progress_time > stall_timeout:
# No progress for stall_timeout seconds
raise Exception(
f"Download stalled: no progress for {stall_timeout} seconds "
f"at {downloaded_size}/{total_size} bytes"
)
except Exception as e: # Log progress every second
log.error("Error downloading hypervisor image: %s", str(e)) if current_time - last_log_time >= 1:
if os.path.exists(IMAGE_PATH): progress = (downloaded_size / total_size) * 100 if total_size > 0 else 0
os.unlink(IMAGE_PATH) log.info("Progress - %.1f%% (%d/%d bytes)",
return False progress, downloaded_size, total_size)
last_log_time = current_time
# Validate downloaded file
log.info("Download complete, validating checksum...")
if not _validate_image_checksum(IMAGE_PATH, IMAGE_SHA256):
log.error("Checksum validation failed on attempt %d", attempt)
os.unlink(IMAGE_PATH)
if attempt < max_attempts:
log.info("Will retry download...")
continue
else:
log.error("All download attempts failed due to checksum mismatch")
return False
log.info("Successfully downloaded and validated Oracle Linux KVM image")
return True
except requests.exceptions.Timeout as e:
log.error("Download attempt %d failed: Timeout - %s", attempt, str(e))
if os.path.exists(IMAGE_PATH):
os.unlink(IMAGE_PATH)
if attempt < max_attempts:
log.info("Will retry download in %d seconds...", retry_delay)
time.sleep(retry_delay)
else:
log.error("All download attempts failed due to timeout")
except requests.exceptions.RequestException as e:
log.error("Download attempt %d failed: Network error - %s", attempt, str(e))
if os.path.exists(IMAGE_PATH):
os.unlink(IMAGE_PATH)
if attempt < max_attempts:
log.info("Will retry download in %d seconds...", retry_delay)
time.sleep(retry_delay)
else:
log.error("All download attempts failed due to network errors")
except Exception as e:
log.error("Download attempt %d failed: %s", attempt, str(e))
if os.path.exists(IMAGE_PATH):
os.unlink(IMAGE_PATH)
if attempt < max_attempts:
log.info("Will retry download in %d seconds...", retry_delay)
time.sleep(retry_delay)
else:
log.error("All download attempts failed")
return False
def _check_ssh_keys_exist(): def _check_ssh_keys_exist():
""" """
@@ -419,25 +489,28 @@ def _ensure_hypervisor_host_dir(minion_id: str = None):
log.error(f"Error creating hypervisor host directory: {str(e)}") log.error(f"Error creating hypervisor host directory: {str(e)}")
return False return False
def _apply_dyanno_hypervisor_state(): def _apply_dyanno_hypervisor_state(status='Initialized'):
""" """
Apply the soc.dyanno.hypervisor state on the salt master. Apply the soc.dyanno.hypervisor state on the salt master.
This function applies the soc.dyanno.hypervisor state on the salt master This function applies the soc.dyanno.hypervisor state on the salt master
to update the hypervisor annotation and ensure all hypervisor host directories exist. to update the hypervisor annotation and ensure all hypervisor host directories exist.
Args:
status: Status passed to the hypervisor annotation state
Returns: Returns:
bool: True if state was applied successfully, False otherwise bool: True if state was applied successfully, False otherwise
""" """
try: try:
log.info("Applying soc.dyanno.hypervisor state on salt master") log.info(f"Applying soc.dyanno.hypervisor state on salt master with status: {status}")
# Initialize the LocalClient # Initialize the LocalClient
local = salt.client.LocalClient() local = salt.client.LocalClient()
# Target the salt master to apply the soc.dyanno.hypervisor state # Target the salt master to apply the soc.dyanno.hypervisor state
target = MANAGER_HOSTNAME + '_*' target = MANAGER_HOSTNAME + '_*'
state_result = local.cmd(target, 'state.apply', ['soc.dyanno.hypervisor', "pillar={'baseDomain': {'status': 'PreInit'}}", 'concurrent=True'], tgt_type='glob') state_result = local.cmd(target, 'state.apply', ['soc.dyanno.hypervisor', f"pillar={{'baseDomain': {{'status': '{status}'}}}}", 'concurrent=True'], tgt_type='glob')
log.debug(f"state_result: {state_result}") log.debug(f"state_result: {state_result}")
# Check if state was applied successfully # Check if state was applied successfully
if state_result: if state_result:
@@ -454,17 +527,17 @@ def _apply_dyanno_hypervisor_state():
success = False success = False
if success: if success:
log.info("Successfully applied soc.dyanno.hypervisor state") log.info(f"Successfully applied soc.dyanno.hypervisor state with status: {status}")
return True return True
else: else:
log.error("Failed to apply soc.dyanno.hypervisor state") log.error(f"Failed to apply soc.dyanno.hypervisor state with status: {status}")
return False return False
else: else:
log.error("No response from salt master when applying soc.dyanno.hypervisor state") log.error(f"No response from salt master when applying soc.dyanno.hypervisor state with status: {status}")
return False return False
except Exception as e: except Exception as e:
log.error(f"Error applying soc.dyanno.hypervisor state: {str(e)}") log.error(f"Error applying soc.dyanno.hypervisor state with status: {status}: {str(e)}")
return False return False
def _apply_cloud_config_state(): def _apply_cloud_config_state():
@@ -598,8 +671,8 @@ def setup_environment(vm_name: str = 'sool9', disk_size: str = '220G', minion_id
log.warning("Failed to apply salt.cloud.config state, continuing with setup") log.warning("Failed to apply salt.cloud.config state, continuing with setup")
# We don't return an error here as we want to continue with the setup process # We don't return an error here as we want to continue with the setup process
# Apply the soc.dyanno.hypervisor state on the salt master # Apply the soc.dyanno.hypervisor state on the salt master with PreInit status
if not _apply_dyanno_hypervisor_state(): if not _apply_dyanno_hypervisor_state('PreInit'):
log.warning("Failed to apply soc.dyanno.hypervisor state, continuing with setup") log.warning("Failed to apply soc.dyanno.hypervisor state, continuing with setup")
# We don't return an error here as we want to continue with the setup process # We don't return an error here as we want to continue with the setup process
@@ -619,6 +692,8 @@ def setup_environment(vm_name: str = 'sool9', disk_size: str = '220G', minion_id
log.info("Starting image download/validation process") log.info("Starting image download/validation process")
if not _download_image(): if not _download_image():
log.error("Image download failed") log.error("Image download failed")
# Update hypervisor annotation with failure status
_apply_dyanno_hypervisor_state('ImageDownloadFailed')
return { return {
'success': False, 'success': False,
'error': 'Image download failed', 'error': 'Image download failed',
@@ -631,6 +706,8 @@ def setup_environment(vm_name: str = 'sool9', disk_size: str = '220G', minion_id
log.info("Setting up SSH keys") log.info("Setting up SSH keys")
if not _setup_ssh_keys(): if not _setup_ssh_keys():
log.error("SSH key setup failed") log.error("SSH key setup failed")
# Update hypervisor annotation with failure status
_apply_dyanno_hypervisor_state('SSHKeySetupFailed')
return { return {
'success': False, 'success': False,
'error': 'SSH key setup failed', 'error': 'SSH key setup failed',
@@ -655,6 +732,12 @@ def setup_environment(vm_name: str = 'sool9', disk_size: str = '220G', minion_id
success = vm_result.get('success', False) success = vm_result.get('success', False)
log.info("Setup environment completed with status: %s", "SUCCESS" if success else "FAILED") log.info("Setup environment completed with status: %s", "SUCCESS" if success else "FAILED")
# Update hypervisor annotation with success status
if success:
_apply_dyanno_hypervisor_state('Initialized')
else:
_apply_dyanno_hypervisor_state('SetupFailed')
# If setup was successful and we have a minion_id, run highstate # If setup was successful and we have a minion_id, run highstate
if success and minion_id: if success and minion_id:
log.info("Running highstate on hypervisor %s", minion_id) log.info("Running highstate on hypervisor %s", minion_id)

View File

@@ -727,7 +727,8 @@ def check_hypervisor_disk_space(hypervisor: str, size_gb: int) -> Tuple[bool, Op
result = local.cmd( result = local.cmd(
hypervisor_minion, hypervisor_minion,
'cmd.run', 'cmd.run',
["df -BG /nsm/libvirt/volumes | tail -1 | awk '{print $4}' | sed 's/G//'"] ["df -BG /nsm/libvirt/volumes | tail -1 | awk '{print $4}' | sed 's/G//'"],
kwarg={'python_shell': True}
) )
if not result or hypervisor_minion not in result: if not result or hypervisor_minion not in result:

View File

@@ -43,6 +43,18 @@
No Virtual Machines Found No Virtual Machines Found
{%- endif %} {%- endif %}
{%- elif baseDomainStatus == 'ImageDownloadFailed' %}
#### ERROR
Base domain image download failed. Please check the salt-master log for details and verify network connectivity.
{%- elif baseDomainStatus == 'SSHKeySetupFailed' %}
#### ERROR
SSH key setup failed. Please check the salt-master log for details.
{%- elif baseDomainStatus == 'SetupFailed' %}
#### WARNING
Setup failed. Please check the salt-master log for details.
{%- else %} {%- else %}
#### WARNING #### WARNING