fix failed or hung qcow2 image download

This commit is contained in:
Josh Patterson
2025-11-19 15:48:53 -05:00
parent b407c68d88
commit dd0b4c3820

View File

@@ -172,7 +172,15 @@ MANAGER_HOSTNAME = socket.gethostname()
def _download_image(): def _download_image():
""" """
Download and validate the Oracle Linux KVM image. Download and validate the Oracle Linux KVM image with retry logic and progress monitoring.
Features:
- Detects stalled downloads (no progress for 30 seconds)
- Retries up to 3 times on failure
- Connection timeout of 30 seconds
- Read timeout of 60 seconds
- Cleans up partial downloads on failure
Returns: Returns:
bool: True if successful or file exists with valid checksum, False on error bool: True if successful or file exists with valid checksum, False on error
""" """
@@ -186,25 +194,53 @@ def _download_image():
log.info("Starting image download process") log.info("Starting image download process")
# Retry configuration
max_attempts = 3
stall_timeout = 30 # seconds without progress before considering download stalled
connection_timeout = 30 # seconds to establish connection
read_timeout = 60 # seconds to wait for data chunks
for attempt in range(1, max_attempts + 1):
log.info("Download attempt %d of %d", attempt, max_attempts)
try: try:
# Download file # Download file with timeouts
log.info("Downloading Oracle Linux KVM image from %s to %s", IMAGE_URL, IMAGE_PATH) log.info("Downloading Oracle Linux KVM image from %s to %s", IMAGE_URL, IMAGE_PATH)
response = requests.get(IMAGE_URL, stream=True) response = requests.get(
IMAGE_URL,
stream=True,
timeout=(connection_timeout, read_timeout)
)
response.raise_for_status() response.raise_for_status()
# Get total file size for progress tracking # Get total file size for progress tracking
total_size = int(response.headers.get('content-length', 0)) total_size = int(response.headers.get('content-length', 0))
downloaded_size = 0 downloaded_size = 0
last_log_time = 0 last_log_time = 0
last_progress_time = time.time()
last_downloaded_size = 0
# Save file with progress logging # Save file with progress logging and stall detection
with salt.utils.files.fopen(IMAGE_PATH, 'wb') as f: with salt.utils.files.fopen(IMAGE_PATH, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192): for chunk in response.iter_content(chunk_size=8192):
if chunk: # filter out keep-alive new chunks
f.write(chunk) f.write(chunk)
downloaded_size += len(chunk) downloaded_size += len(chunk)
current_time = time.time()
# Check for stalled download
if downloaded_size > last_downloaded_size:
# Progress made, reset stall timer
last_progress_time = current_time
last_downloaded_size = downloaded_size
elif current_time - last_progress_time > stall_timeout:
# No progress for stall_timeout seconds
raise Exception(
f"Download stalled: no progress for {stall_timeout} seconds "
f"at {downloaded_size}/{total_size} bytes"
)
# Log progress every second # Log progress every second
current_time = time.time()
if current_time - last_log_time >= 1: if current_time - last_log_time >= 1:
progress = (downloaded_size / total_size) * 100 if total_size > 0 else 0 progress = (downloaded_size / total_size) * 100 if total_size > 0 else 0
log.info("Progress - %.1f%% (%d/%d bytes)", log.info("Progress - %.1f%% (%d/%d bytes)",
@@ -212,17 +248,47 @@ def _download_image():
last_log_time = current_time last_log_time = current_time
# Validate downloaded file # Validate downloaded file
log.info("Download complete, validating checksum...")
if not _validate_image_checksum(IMAGE_PATH, IMAGE_SHA256): if not _validate_image_checksum(IMAGE_PATH, IMAGE_SHA256):
log.error("Checksum validation failed on attempt %d", attempt)
os.unlink(IMAGE_PATH) os.unlink(IMAGE_PATH)
if attempt < max_attempts:
log.info("Will retry download...")
continue
else:
log.error("All download attempts failed due to checksum mismatch")
return False return False
log.info("Successfully downloaded and validated Oracle Linux KVM image") log.info("Successfully downloaded and validated Oracle Linux KVM image")
return True return True
except Exception as e: except requests.exceptions.Timeout as e:
log.error("Error downloading hypervisor image: %s", str(e)) log.error("Download attempt %d failed: Timeout - %s", attempt, str(e))
if os.path.exists(IMAGE_PATH): if os.path.exists(IMAGE_PATH):
os.unlink(IMAGE_PATH) os.unlink(IMAGE_PATH)
if attempt < max_attempts:
log.info("Will retry download...")
else:
log.error("All download attempts failed due to timeout")
except requests.exceptions.RequestException as e:
log.error("Download attempt %d failed: Network error - %s", attempt, str(e))
if os.path.exists(IMAGE_PATH):
os.unlink(IMAGE_PATH)
if attempt < max_attempts:
log.info("Will retry download...")
else:
log.error("All download attempts failed due to network errors")
except Exception as e:
log.error("Download attempt %d failed: %s", attempt, str(e))
if os.path.exists(IMAGE_PATH):
os.unlink(IMAGE_PATH)
if attempt < max_attempts:
log.info("Will retry download...")
else:
log.error("All download attempts failed")
return False return False
def _check_ssh_keys_exist(): def _check_ssh_keys_exist():