diff --git a/salt/_runners/setup_hypervisor.py b/salt/_runners/setup_hypervisor.py index 929801783..b23734654 100644 --- a/salt/_runners/setup_hypervisor.py +++ b/salt/_runners/setup_hypervisor.py @@ -172,7 +172,15 @@ MANAGER_HOSTNAME = socket.gethostname() def _download_image(): """ - Download and validate the Oracle Linux KVM image. + Download and validate the Oracle Linux KVM image with retry logic and progress monitoring. + + Features: + - Detects stalled downloads (no progress for 30 seconds) + - Retries up to 3 times on failure + - Connection timeout of 30 seconds + - Read timeout of 60 seconds + - Cleans up partial downloads on failure + Returns: bool: True if successful or file exists with valid checksum, False on error """ @@ -185,45 +193,103 @@ def _download_image(): os.unlink(IMAGE_PATH) log.info("Starting image download process") + + # Retry configuration + max_attempts = 3 + stall_timeout = 30 # seconds without progress before considering download stalled + connection_timeout = 30 # seconds to establish connection + read_timeout = 60 # seconds to wait for data chunks + + for attempt in range(1, max_attempts + 1): + log.info("Download attempt %d of %d", attempt, max_attempts) + + try: + # Download file with timeouts + log.info("Downloading Oracle Linux KVM image from %s to %s", IMAGE_URL, IMAGE_PATH) + response = requests.get( + IMAGE_URL, + stream=True, + timeout=(connection_timeout, read_timeout) + ) + response.raise_for_status() - try: - # Download file - log.info("Downloading Oracle Linux KVM image from %s to %s", IMAGE_URL, IMAGE_PATH) - response = requests.get(IMAGE_URL, stream=True) - response.raise_for_status() + # Get total file size for progress tracking + total_size = int(response.headers.get('content-length', 0)) + downloaded_size = 0 + last_log_time = 0 + last_progress_time = time.time() + last_downloaded_size = 0 - # Get total file size for progress tracking - total_size = int(response.headers.get('content-length', 0)) - downloaded_size = 0 - last_log_time = 0 + # Save file with progress logging and stall detection + with salt.utils.files.fopen(IMAGE_PATH, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: # filter out keep-alive new chunks + f.write(chunk) + downloaded_size += len(chunk) + current_time = time.time() + + # Check for stalled download + if downloaded_size > last_downloaded_size: + # Progress made, reset stall timer + last_progress_time = current_time + last_downloaded_size = downloaded_size + elif current_time - last_progress_time > stall_timeout: + # No progress for stall_timeout seconds + raise Exception( + f"Download stalled: no progress for {stall_timeout} seconds " + f"at {downloaded_size}/{total_size} bytes" + ) + + # Log progress every second + if current_time - last_log_time >= 1: + progress = (downloaded_size / total_size) * 100 if total_size > 0 else 0 + log.info("Progress - %.1f%% (%d/%d bytes)", + progress, downloaded_size, total_size) + last_log_time = current_time - # Save file with progress logging - with salt.utils.files.fopen(IMAGE_PATH, 'wb') as f: - for chunk in response.iter_content(chunk_size=8192): - f.write(chunk) - downloaded_size += len(chunk) + # Validate downloaded file + log.info("Download complete, validating checksum...") + if not _validate_image_checksum(IMAGE_PATH, IMAGE_SHA256): + log.error("Checksum validation failed on attempt %d", attempt) + os.unlink(IMAGE_PATH) + if attempt < max_attempts: + log.info("Will retry download...") + continue + else: + log.error("All download attempts failed due to checksum mismatch") + return False + + log.info("Successfully downloaded and validated Oracle Linux KVM image") + return True + + except requests.exceptions.Timeout as e: + log.error("Download attempt %d failed: Timeout - %s", attempt, str(e)) + if os.path.exists(IMAGE_PATH): + os.unlink(IMAGE_PATH) + if attempt < max_attempts: + log.info("Will retry download...") + else: + log.error("All download attempts failed due to timeout") - # Log progress every second - current_time = time.time() - if current_time - last_log_time >= 1: - progress = (downloaded_size / total_size) * 100 if total_size > 0 else 0 - log.info("Progress - %.1f%% (%d/%d bytes)", - progress, downloaded_size, total_size) - last_log_time = current_time - - # Validate downloaded file - if not _validate_image_checksum(IMAGE_PATH, IMAGE_SHA256): - os.unlink(IMAGE_PATH) - return False - - log.info("Successfully downloaded and validated Oracle Linux KVM image") - return True - - except Exception as e: - log.error("Error downloading hypervisor image: %s", str(e)) - if os.path.exists(IMAGE_PATH): - os.unlink(IMAGE_PATH) - return False + except requests.exceptions.RequestException as e: + log.error("Download attempt %d failed: Network error - %s", attempt, str(e)) + if os.path.exists(IMAGE_PATH): + os.unlink(IMAGE_PATH) + if attempt < max_attempts: + log.info("Will retry download...") + else: + log.error("All download attempts failed due to network errors") + + except Exception as e: + log.error("Download attempt %d failed: %s", attempt, str(e)) + if os.path.exists(IMAGE_PATH): + os.unlink(IMAGE_PATH) + if attempt < max_attempts: + log.info("Will retry download...") + else: + log.error("All download attempts failed") + + return False def _check_ssh_keys_exist(): """