Hardware validation logging and VM state file logging

2025-12-08 10:12:53 +01:00 · 2025-02-15 21:41:01 -05:00
parent ec2fc0a5f2
commit c34be5313d
1 changed file with 164 additions and 67 deletions
--- a/salt/salt/engines/master/virtual_node_manager.py
+++ b/salt/salt/engines/master/virtual_node_manager.py
@@ -52,9 +52,8 @@ Examples:
 State Files:
    VM Tracking Files:
-        - <vm_name>: Active VM configuration and status
+        - <vm_name>: Active VM with status 'creating' or 'running'
-        - <vm_name>_failed: Failed VM creation details
+        - <vm_name>.error: Error state with detailed message
        - <vm_name>_invalidHW: Invalid hardware request details
 Notes:
    - Requires 'hvn' feature license
    - Uses hypervisor's sosmodel grain for hardware capabilities
@@ -106,9 +105,8 @@ Exit Codes:
    0: Success
    1: Invalid license
    2: Configuration error
-    3: Hardware allocation failure
+    3: Hardware validation failure (hardware doesn't exist in model or is already in use by another VM)
-    4: VM provisioning failure
+    4: VM provisioning failure (so-salt-cloud execution failed)
    5: Invalid hardware request
 Logging:
    Log files are written to /opt/so/log/salt/engines/virtual_node_manager.log
@@ -281,14 +279,15 @@ def validate_hardware_request(model_config: dict, requested_hw: dict) -> Tuple[b
        Tuple of (is_valid, error_details)
    """
    errors = {}
-    log.debug("Validating hardware request: %s", requested_hw)
+    log.debug("Validating if requested hardware exists in model configuration")
-    log.debug("Against model config: %s", model_config['hardware'])
+    log.debug("Requested hardware: %s", requested_hw)
    log.debug("Model hardware configuration: %s", model_config['hardware'])
    # Validate CPU
    if 'cpu' in requested_hw:
        try:
            cpu_count = int(requested_hw['cpu'])
-            log.debug("Validating CPU request: %d against maximum: %d",
+            log.debug("Checking if %d CPU cores exist in model (maximum: %d)",
                     cpu_count, model_config['hardware']['cpu'])
            if cpu_count > model_config['hardware']['cpu']:
                errors['cpu'] = f"Requested {cpu_count} CPU cores exceeds maximum {model_config['hardware']['cpu']}"
@@ -299,7 +298,7 @@ def validate_hardware_request(model_config: dict, requested_hw: dict) -> Tuple[b
    if 'memory' in requested_hw:
        try:
            memory = int(requested_hw['memory'])
-            log.debug("Validating memory request: %dGB against maximum: %dGB",
+            log.debug("Checking if %dGB memory exists in model (maximum: %dGB)",
                     memory, model_config['hardware']['memory'])
            if memory > model_config['hardware']['memory']:
                errors['memory'] = f"Requested {memory}GB memory exceeds maximum {model_config['hardware']['memory']}GB"
@@ -311,19 +310,19 @@ def validate_hardware_request(model_config: dict, requested_hw: dict) -> Tuple[b
        if hw_type in requested_hw and requested_hw[hw_type]:
            try:
                indices = [int(x) for x in str(requested_hw[hw_type]).split(',')]
-                log.debug("Validating %s indices: %s", hw_type, indices)
+                log.debug("Checking if %s indices %s exist in model", hw_type, indices)
                if hw_type not in model_config['hardware']:
                    log.error("Hardware type %s not found in model config", hw_type)
                    errors[hw_type] = f"No {hw_type} configuration found in model"
                    continue
-                available_indices = set(int(k) for k in model_config['hardware'][hw_type].keys())
+                model_indices = set(int(k) for k in model_config['hardware'][hw_type].keys())
-                log.debug("Available %s indices: %s", hw_type, available_indices)
+                log.debug("Model has %s indices: %s", hw_type, model_indices)
-                invalid_indices = [idx for idx in indices if idx not in available_indices]
+                invalid_indices = [idx for idx in indices if idx not in model_indices]
                if invalid_indices:
-                    log.error("Invalid %s indices found: %s", hw_type, invalid_indices)
+                    log.error("%s indices %s do not exist in model", hw_type, invalid_indices)
                    errors[hw_type] = f"Invalid {hw_type} indices: {invalid_indices}"
            except ValueError:
                log.error("Invalid %s indices format: %s", hw_type, requested_hw[hw_type])
@@ -339,26 +338,117 @@ def validate_hardware_request(model_config: dict, requested_hw: dict) -> Tuple[b
    return (len(errors) == 0, errors if errors else None)
-def check_hardware_availability(hypervisor_path: str, vm_name: str) -> bool:
+def check_hardware_availability(hypervisor_path: str, vm_name: str, requested_hw: dict, model_config: dict) -> Tuple[bool, Optional[dict]]:
-    """Check if requested hardware is already claimed by another VM."""
+    """
-    try:
+    Check if requested hardware is available.
-        # List all VM tracking files
+    
-        files = glob.glob(os.path.join(hypervisor_path, '*_*'))
+    Args:
-        for file_path in files:
+        hypervisor_path: Path to hypervisor directory
-            # Skip the VM we're checking and any failed/invalid VMs
+        vm_name: Name of requesting VM
-            basename = os.path.basename(file_path)
+        requested_hw: Hardware being requested
-            if basename.startswith(vm_name) or '_failed' in basename or '_invalidHW' in basename:
+        model_config: Model hardware configuration
    Returns:
        Tuple of (is_available, error_details)
    """
    log.debug("Checking if requested hardware is currently in use by other VMs")
    log.debug("VM requesting hardware: %s", vm_name)
    log.debug("Hardware being requested: %s", requested_hw)
    errors = {}
    # Track total CPU/memory usage
    total_cpu = 0
    total_memory = 0
    # Track used unique resources and which VM is using them
    used_resources = {
        'disk': {},    # {index: vm_name}
        'copper': {},  # {index: vm_name}
        'sfp': {}      # {index: vm_name}
    }
    # Calculate current usage from existing VMs
    log.debug("Scanning existing VMs to check hardware usage")
    for vm_file in glob.glob(os.path.join(hypervisor_path, '*_*')):
        basename = os.path.basename(vm_file)
        # Skip if it's the same VM requesting hardware or in error state
        if basename.startswith(vm_name):
            log.debug("Skipping file %s (same VM requesting hardware)", basename)
            continue
        if basename.endswith('.error'):
            log.debug("Skipping file %s (error state)", basename)
            continue
-            # Check if any hardware overlaps
+        vm_config = read_json_file(vm_file)
-            vm_config = read_json_file(file_path)
+        if 'config' not in vm_config or vm_config.get('status') != 'running':
-            if 'hardware' in vm_config and 'allocated' in vm_config['hardware']:
+            log.debug("Skipping VM %s (not running)", basename)
-                # TODO: Implement hardware conflict checking
+            continue
-                pass
+            
-        return True
+        config = vm_config['config']
-    except Exception as e:
+        log.debug("Processing running VM %s", basename)
-        log.error("Failed to check hardware availability: %s", str(e))
+        
-        return False
+        # Add to CPU/memory totals
        vm_cpu = int(config.get('cpu', 0))
        vm_memory = int(config.get('memory', 0))
        total_cpu += vm_cpu
        total_memory += vm_memory
        log.debug("Found running VM %s using CPU: %d, Memory: %dGB", basename, vm_cpu, vm_memory)
        # Track unique resources
        for hw_type in ['disk', 'copper', 'sfp']:
            if hw_type in config and config[hw_type]:
                indices = [int(x) for x in str(config[hw_type]).split(',')]
                for idx in indices:
                    used_resources[hw_type][idx] = basename.replace('_sensor', '')  # Store VM name without role
                log.debug("VM %s is using %s indices: %s", basename, hw_type, indices)
    log.debug("Total hardware currently in use - CPU: %d, Memory: %dGB", total_cpu, total_memory)
    log.debug("Hardware indices currently in use: %s", used_resources)
    # Check CPU capacity
    requested_cpu = int(requested_hw.get('cpu', 0))
    total_cpu_needed = total_cpu + requested_cpu
    log.debug("Checking CPU capacity - Currently in use: %d + Requested: %d = %d (Max: %d)",
             total_cpu, requested_cpu, total_cpu_needed, model_config['hardware']['cpu'])
    if total_cpu_needed > model_config['hardware']['cpu']:
        errors['cpu'] = f"Total CPU usage ({total_cpu_needed}) would exceed capacity ({model_config['hardware']['cpu']})"
    # Check memory capacity
    requested_memory = int(requested_hw.get('memory', 0))
    total_memory_needed = total_memory + requested_memory
    log.debug("Checking memory capacity - Currently in use: %d + Requested: %d = %d (Max: %d)",
             total_memory, requested_memory, total_memory_needed, model_config['hardware']['memory'])
    if total_memory_needed > model_config['hardware']['memory']:
        errors['memory'] = f"Total memory usage ({total_memory_needed}GB) would exceed capacity ({model_config['hardware']['memory']}GB)"
    # Check for hardware conflicts
    for hw_type in ['disk', 'copper', 'sfp']:
        if hw_type in requested_hw and requested_hw[hw_type]:
            requested_indices = [int(x) for x in str(requested_hw[hw_type]).split(',')]
            log.debug("Checking for %s conflicts - Requesting indices: %s, Currently in use: %s",
                     hw_type, requested_indices, used_resources[hw_type])
            conflicts = {}  # {index: vm_name}
            for idx in requested_indices:
                if idx in used_resources[hw_type]:
                    conflicts[idx] = used_resources[hw_type][idx]
            if conflicts:
                # Create one sentence per conflict
                conflict_details = []
                hw_name = hw_type.upper() if hw_type == 'sfp' else hw_type.capitalize()
                for idx, vm in conflicts.items():
                    conflict_details.append(f"{hw_name} index {idx} in use by {vm}")
                log.debug("Found conflicting %s indices: %s", hw_type, conflict_details)
                errors[hw_type] = ". ".join(conflict_details) + "."
    if errors:
        log.debug("Hardware validation failed with errors: %s", errors)
    else:
        log.debug("Hardware validation successful")
    return (len(errors) == 0, errors if errors else None)
 def create_vm_tracking_file(hypervisor_path: str, vm_name: str, config: dict) -> None:
    """Create VM tracking file with initial state."""
@@ -371,52 +461,58 @@ def create_vm_tracking_file(hypervisor_path: str, vm_name: str, config: dict) ->
        data = {
            'config': config,
-            'status': 'creating',
+            'status': 'creating'
            'hardware': {
                'allocated': {}
            }
        }
        # Write file and set ownership
-        with open(file_path, 'w') as f:
+        write_json_file(file_path, data)
            json.dump(data, f, indent=2)
        set_socore_ownership(file_path)
        log.debug("Successfully created VM tracking file with socore ownership")
    except Exception as e:
        log.error("Failed to create VM tracking file: %s", str(e))
        raise
 def mark_vm_failed(vm_file: str, error_code: int, message: str) -> None:
-    """Mark VM as failed with error details."""
+    """Create error file with VM failure details."""
    try:
-        # Rename file to add _failed suffix if not already present
+        # Get original config if it exists
-        if not vm_file.endswith('_failed'):
+        config = {}
-            new_file = f"{vm_file}_failed"
+        if os.path.exists(vm_file):
            os.rename(vm_file, new_file)
            vm_file = new_file
        # Update file contents
            data = read_json_file(vm_file)
-        data['status'] = 'failed'
+            config = data.get('config', {})
-        data['error'] = {
+            # Remove the original file since we'll create an error file
-            'code': error_code,
+            os.remove(vm_file)
        # Create error file
        error_file = f"{vm_file}.error"
        data = {
            'config': config,
            'status': 'error',
            'error_details': {
                'message': message,
                'timestamp': datetime.now().isoformat()
            }
-        write_json_file(vm_file, data)
+        }
        write_json_file(error_file, data)
    except Exception as e:
-        log.error("Failed to mark VM as failed: %s", str(e))
+        log.error("Failed to create error file: %s", str(e))
        raise
 def mark_invalid_hardware(hypervisor_path: str, vm_name: str, config: dict, error_details: dict) -> None:
-    """Create invalid hardware tracking file with error details."""
+    """Create error file with hardware validation failure details."""
-    file_path = os.path.join(hypervisor_path, f"{vm_name}_invalidHW")
+    file_path = os.path.join(hypervisor_path, f"{vm_name}.error")
    try:
        # Build error message from error details
        error_messages = []
        for hw_type, message in error_details.items():
            error_messages.append(message)
        # Join all messages with proper sentence structure
        full_message = "Hardware validation failure: " + " ".join(error_messages)
        data = {
            'config': config,
-            'error': {
+            'status': 'error',
-                'code': 5,
+            'error_details': {
-                'message': "Invalid hardware configuration",
+                'message': full_message,
                'invalid_hardware': error_details,
                'timestamp': datetime.now().isoformat()
            }
        }
@@ -480,16 +576,17 @@ def process_vm_creation(hypervisor_path: str, vm_config: dict) -> None:
        model = get_hypervisor_model(hypervisor)
        model_config = load_hardware_defaults(model)
-        # Initial hardware validation
+        # Initial hardware validation against model
        is_valid, errors = validate_hardware_request(model_config, vm_config)
        if not is_valid:
            mark_invalid_hardware(hypervisor_path, vm_name, vm_config, errors)
            return
        # Check hardware availability
-        if not check_hardware_availability(hypervisor_path, vm_name):
+        is_available, availability_errors = check_hardware_availability(
-            mark_vm_failed(os.path.join(hypervisor_path, vm_name), 3,
+            hypervisor_path, vm_name, vm_config, model_config)
-                        "Requested hardware is already in use")
+        if not is_available:
            mark_invalid_hardware(hypervisor_path, vm_name, vm_config, availability_errors)
            return
        # Create tracking file
@@ -618,7 +715,7 @@ def process_hypervisor(hypervisor_path: str) -> None:
        existing_vms = set()
        for file_path in glob.glob(os.path.join(hypervisor_path, '*_*')):
            basename = os.path.basename(file_path)
-            if not any(x in basename for x in ['_failed', '_invalidHW']):
+            if not basename.endswith('.error'):
                existing_vms.add(basename)
        # Process new VMs