fix failed or hung qcow2 image download

This commit is contained in:
Josh Patterson
2025-11-19 15:48:53 -05:00
parent b407c68d88
commit dd0b4c3820

View File

@@ -172,7 +172,15 @@ MANAGER_HOSTNAME = socket.gethostname()
def _download_image(): def _download_image():
""" """
Download and validate the Oracle Linux KVM image. Download and validate the Oracle Linux KVM image with retry logic and progress monitoring.
Features:
- Detects stalled downloads (no progress for 30 seconds)
- Retries up to 3 times on failure
- Connection timeout of 30 seconds
- Read timeout of 60 seconds
- Cleans up partial downloads on failure
Returns: Returns:
bool: True if successful or file exists with valid checksum, False on error bool: True if successful or file exists with valid checksum, False on error
""" """
@@ -186,44 +194,102 @@ def _download_image():
log.info("Starting image download process") log.info("Starting image download process")
try: # Retry configuration
# Download file max_attempts = 3
log.info("Downloading Oracle Linux KVM image from %s to %s", IMAGE_URL, IMAGE_PATH) stall_timeout = 30 # seconds without progress before considering download stalled
response = requests.get(IMAGE_URL, stream=True) connection_timeout = 30 # seconds to establish connection
response.raise_for_status() read_timeout = 60 # seconds to wait for data chunks
# Get total file size for progress tracking for attempt in range(1, max_attempts + 1):
total_size = int(response.headers.get('content-length', 0)) log.info("Download attempt %d of %d", attempt, max_attempts)
downloaded_size = 0
last_log_time = 0
# Save file with progress logging try:
with salt.utils.files.fopen(IMAGE_PATH, 'wb') as f: # Download file with timeouts
for chunk in response.iter_content(chunk_size=8192): log.info("Downloading Oracle Linux KVM image from %s to %s", IMAGE_URL, IMAGE_PATH)
f.write(chunk) response = requests.get(
downloaded_size += len(chunk) IMAGE_URL,
stream=True,
timeout=(connection_timeout, read_timeout)
)
response.raise_for_status()
# Log progress every second # Get total file size for progress tracking
current_time = time.time() total_size = int(response.headers.get('content-length', 0))
if current_time - last_log_time >= 1: downloaded_size = 0
progress = (downloaded_size / total_size) * 100 if total_size > 0 else 0 last_log_time = 0
log.info("Progress - %.1f%% (%d/%d bytes)", last_progress_time = time.time()
progress, downloaded_size, total_size) last_downloaded_size = 0
last_log_time = current_time
# Validate downloaded file # Save file with progress logging and stall detection
if not _validate_image_checksum(IMAGE_PATH, IMAGE_SHA256): with salt.utils.files.fopen(IMAGE_PATH, 'wb') as f:
os.unlink(IMAGE_PATH) for chunk in response.iter_content(chunk_size=8192):
return False if chunk: # skip empty keep-alive chunks
f.write(chunk)
downloaded_size += len(chunk)
current_time = time.time()
log.info("Successfully downloaded and validated Oracle Linux KVM image") # Check for stalled download
return True if downloaded_size > last_downloaded_size:
# Progress made, reset stall timer
last_progress_time = current_time
last_downloaded_size = downloaded_size
elif current_time - last_progress_time > stall_timeout:
# No progress for stall_timeout seconds
raise Exception(
f"Download stalled: no progress for {stall_timeout} seconds "
f"at {downloaded_size}/{total_size} bytes"
)
except Exception as e: # Log progress every second
log.error("Error downloading hypervisor image: %s", str(e)) if current_time - last_log_time >= 1:
if os.path.exists(IMAGE_PATH): progress = (downloaded_size / total_size) * 100 if total_size > 0 else 0
os.unlink(IMAGE_PATH) log.info("Progress - %.1f%% (%d/%d bytes)",
return False progress, downloaded_size, total_size)
last_log_time = current_time
# Validate downloaded file
log.info("Download complete, validating checksum...")
if not _validate_image_checksum(IMAGE_PATH, IMAGE_SHA256):
log.error("Checksum validation failed on attempt %d", attempt)
os.unlink(IMAGE_PATH)
if attempt < max_attempts:
log.info("Will retry download...")
continue
else:
log.error("All download attempts failed due to checksum mismatch")
return False
log.info("Successfully downloaded and validated Oracle Linux KVM image")
return True
except requests.exceptions.Timeout as e:
log.error("Download attempt %d failed: Timeout - %s", attempt, str(e))
if os.path.exists(IMAGE_PATH):
os.unlink(IMAGE_PATH)
if attempt < max_attempts:
log.info("Will retry download...")
else:
log.error("All download attempts failed due to timeout")
except requests.exceptions.RequestException as e:
log.error("Download attempt %d failed: Network error - %s", attempt, str(e))
if os.path.exists(IMAGE_PATH):
os.unlink(IMAGE_PATH)
if attempt < max_attempts:
log.info("Will retry download...")
else:
log.error("All download attempts failed due to network errors")
except Exception as e:
log.error("Download attempt %d failed: %s", attempt, str(e))
if os.path.exists(IMAGE_PATH):
os.unlink(IMAGE_PATH)
if attempt < max_attempts:
log.info("Will retry download...")
else:
log.error("All download attempts failed")
return False
def _check_ssh_keys_exist(): def _check_ssh_keys_exist():
""" """