fix failed or hung qcow2 image download

This commit is contained in:
Josh Patterson
2025-11-19 15:48:53 -05:00
parent b407c68d88
commit dd0b4c3820

View File

@@ -172,7 +172,15 @@ MANAGER_HOSTNAME = socket.gethostname()
def _download_image():
"""
Download and validate the Oracle Linux KVM image.
Download and validate the Oracle Linux KVM image with retry logic and progress monitoring.
Features:
- Detects stalled downloads (no progress for 30 seconds)
- Retries up to 3 times on failure
- Connection timeout of 30 seconds
- Read timeout of 60 seconds
- Cleans up partial downloads on failure
Returns:
bool: True if successful or file exists with valid checksum, False on error
"""
@@ -185,45 +193,103 @@ def _download_image():
os.unlink(IMAGE_PATH)
log.info("Starting image download process")
# Retry configuration
max_attempts = 3
stall_timeout = 30 # seconds without progress before considering download stalled
connection_timeout = 30 # seconds to establish connection
read_timeout = 60 # seconds to wait for data chunks
for attempt in range(1, max_attempts + 1):
log.info("Download attempt %d of %d", attempt, max_attempts)
try:
# Download file with timeouts
log.info("Downloading Oracle Linux KVM image from %s to %s", IMAGE_URL, IMAGE_PATH)
response = requests.get(
IMAGE_URL,
stream=True,
timeout=(connection_timeout, read_timeout)
)
response.raise_for_status()
try:
# Download file
log.info("Downloading Oracle Linux KVM image from %s to %s", IMAGE_URL, IMAGE_PATH)
response = requests.get(IMAGE_URL, stream=True)
response.raise_for_status()
# Get total file size for progress tracking
total_size = int(response.headers.get('content-length', 0))
downloaded_size = 0
last_log_time = 0
last_progress_time = time.time()
last_downloaded_size = 0
# Get total file size for progress tracking
total_size = int(response.headers.get('content-length', 0))
downloaded_size = 0
last_log_time = 0
# Save file with progress logging and stall detection
with salt.utils.files.fopen(IMAGE_PATH, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
downloaded_size += len(chunk)
current_time = time.time()
# Check for stalled download
if downloaded_size > last_downloaded_size:
# Progress made, reset stall timer
last_progress_time = current_time
last_downloaded_size = downloaded_size
elif current_time - last_progress_time > stall_timeout:
# No progress for stall_timeout seconds
raise Exception(
f"Download stalled: no progress for {stall_timeout} seconds "
f"at {downloaded_size}/{total_size} bytes"
)
# Log progress every second
if current_time - last_log_time >= 1:
progress = (downloaded_size / total_size) * 100 if total_size > 0 else 0
log.info("Progress - %.1f%% (%d/%d bytes)",
progress, downloaded_size, total_size)
last_log_time = current_time
# Save file with progress logging
with salt.utils.files.fopen(IMAGE_PATH, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
downloaded_size += len(chunk)
# Validate downloaded file
log.info("Download complete, validating checksum...")
if not _validate_image_checksum(IMAGE_PATH, IMAGE_SHA256):
log.error("Checksum validation failed on attempt %d", attempt)
os.unlink(IMAGE_PATH)
if attempt < max_attempts:
log.info("Will retry download...")
continue
else:
log.error("All download attempts failed due to checksum mismatch")
return False
log.info("Successfully downloaded and validated Oracle Linux KVM image")
return True
except requests.exceptions.Timeout as e:
log.error("Download attempt %d failed: Timeout - %s", attempt, str(e))
if os.path.exists(IMAGE_PATH):
os.unlink(IMAGE_PATH)
if attempt < max_attempts:
log.info("Will retry download...")
else:
log.error("All download attempts failed due to timeout")
# Log progress every second
current_time = time.time()
if current_time - last_log_time >= 1:
progress = (downloaded_size / total_size) * 100 if total_size > 0 else 0
log.info("Progress - %.1f%% (%d/%d bytes)",
progress, downloaded_size, total_size)
last_log_time = current_time
# Validate downloaded file
if not _validate_image_checksum(IMAGE_PATH, IMAGE_SHA256):
os.unlink(IMAGE_PATH)
return False
log.info("Successfully downloaded and validated Oracle Linux KVM image")
return True
except Exception as e:
log.error("Error downloading hypervisor image: %s", str(e))
if os.path.exists(IMAGE_PATH):
os.unlink(IMAGE_PATH)
return False
except requests.exceptions.RequestException as e:
log.error("Download attempt %d failed: Network error - %s", attempt, str(e))
if os.path.exists(IMAGE_PATH):
os.unlink(IMAGE_PATH)
if attempt < max_attempts:
log.info("Will retry download...")
else:
log.error("All download attempts failed due to network errors")
except Exception as e:
log.error("Download attempt %d failed: %s", attempt, str(e))
if os.path.exists(IMAGE_PATH):
os.unlink(IMAGE_PATH)
if attempt < max_attempts:
log.info("Will retry download...")
else:
log.error("All download attempts failed")
return False
def _check_ssh_keys_exist():
"""