change locking method

Josh Patterson
2025-02-15 18:56:04 -05:00
parent ad54afe39a
commit ec2fc0a5f2


@@ -19,7 +19,7 @@ Usage:
base_path: /opt/so/saltstack/local/salt/hypervisor/hosts
Options:
- interval: Time in seconds between processing cycles (default: 30)
+ interval: Time in seconds between engine runs (managed by salt-master, default: 30)
base_path: Base directory containing hypervisor configurations (default: /opt/so/saltstack/local/salt/hypervisor/hosts)
Memory values in VM configuration should be specified in GB. These values
@@ -55,47 +55,52 @@ State Files:
- <vm_name>: Active VM configuration and status
- <vm_name>_failed: Failed VM creation details
- <vm_name>_invalidHW: Invalid hardware request details
- Lock Files:
-    - .lock: Prevents concurrent processing of VMs
-    - Contains VM name and timestamp
-    - Automatically removed after processing
Notes:
- Requires 'hvn' feature license
- Uses hypervisor's sosmodel grain for hardware capabilities
- Hardware allocation based on model-specific configurations
- All created files maintain socore ownership
- Comprehensive logging for troubleshooting
-    - Lock files prevent concurrent processing
+    - Single engine-wide lock prevents concurrent instances
+    - Lock remains if error occurs (requires admin intervention)
Description:
The engine operates in the following phases:
- 1. License Validation
-    - Verifies 'hvn' feature is licensed
-    - Prevents operation if license is invalid
+ 1. Engine Lock Acquisition
+    - Acquires single engine-wide lock
+    - Prevents multiple instances from running
+    - Lock remains until clean shutdown or error
- 2. Configuration Processing
-    - Reads nodes file from each hypervisor directory
-    - Validates configuration parameters
-    - Compares against existing VM tracking files
+ 2. License Validation
+    - Verifies 'hvn' feature is licensed
+    - Prevents operation if license is invalid
- 3. Hardware Allocation
-    - Retrieves hypervisor model from grains cache
-    - Loads model-specific hardware capabilities
-    - Validates hardware requests against model limits
-    - Converts hardware indices to PCI IDs
-    - Ensures proper type handling for hardware indices
-    - Creates state tracking files with socore ownership
+ 3. Configuration Processing
+    - Reads nodes file from each hypervisor directory
+    - Validates configuration parameters
+    - Compares against existing VM tracking files
- 4. VM Provisioning
-    - Creates lock file to prevent concurrent operations
-    - Executes so-salt-cloud with validated configuration
-    - Handles network setup (static/DHCP)
-    - Configures hardware passthrough with converted PCI IDs
-    - Updates VM state tracking
-    - Removes lock file after completion
+ 4. Hardware Allocation
+    - Retrieves hypervisor model from grains cache
+    - Loads model-specific hardware capabilities
+    - Validates hardware requests against model limits
+    - Converts hardware indices to PCI IDs
+    - Ensures proper type handling for hardware indices
+    - Creates state tracking files with socore ownership
+ 5. VM Provisioning
+    - Executes so-salt-cloud with validated configuration
+    - Handles network setup (static/DHCP)
+    - Configures hardware passthrough with converted PCI IDs
+    - Updates VM state tracking
+ Lock Management:
+    - Lock acquired at engine start
+    - Released only on clean shutdown
+    - Remains if error occurs
+    - Admin must restart service to clear lock
+    - Error-level logging for lock issues
Exit Codes:
0: Success
@@ -129,6 +134,7 @@ import salt.config
import salt.runner
from typing import Dict, List, Optional, Tuple, Any
from datetime import datetime
+ from threading import Lock
# Get socore uid/gid
SOCORE_UID = pwd.getpwnam('socore').pw_uid
@@ -150,6 +156,9 @@ VALID_ROLES = ['sensor', 'searchnode', 'idh', 'receiver', 'heavynode', 'fleet']
LICENSE_PATH = '/opt/so/saltstack/local/pillar/soc/license.sls'
DEFAULTS_PATH = '/opt/so/saltstack/default/salt/hypervisor/defaults.yaml'
+ # Single engine-wide lock for virtual node manager
+ engine_lock = Lock()
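Worth noting: threading.Lock is scoped to the salt-master process, so this guards against overlapping engine runs within that one process, not against a second independent process. Its non-blocking acquire returns a bool, which is exactly what the reworked start() checks below. A two-line demonstration:

    from threading import Lock

    lock = Lock()
    print(lock.acquire(blocking=False))  # True: first acquire succeeds
    print(lock.acquire(blocking=False))  # False: already held, returns immediately
    lock.release()                       # lock is free again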
def read_json_file(file_path: str) -> Any:
"""
Read and parse a JSON file.
@@ -225,53 +234,20 @@ def convert_pci_id(pci_id: str) -> str:
log.error("Failed to convert PCI ID %s: %s", pci_id, str(e))
raise
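Only convert_pci_id's error path is visible in this hunk, so its exact output format is not shown. As an illustration only (the engine's real conversion may differ), a converter that rewrites a full PCI address such as '0000:3b:00.0' into libvirt's underscore form would look like:

    import re

    def convert_pci_id_sketch(pci_id: str) -> str:
        # Hypothetical: '0000:3b:00.0' -> 'pci_0000_3b_00_0'. Shown only to
        # illustrate the kind of string rewrite involved, not the actual format.
        if not re.fullmatch(r'[0-9a-fA-F]{4}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}\.[0-9a-fA-F]', pci_id):
            raise ValueError(f'unexpected PCI address: {pci_id}')
        return 'pci_' + pci_id.replace(':', '_').replace('.', '_')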
- def create_lock_file(hypervisor_path: str, vm_name: str) -> bool:
-     """Create .lock file for VM processing."""
-     lock_file = os.path.join(hypervisor_path, '.lock')
-     try:
-         if os.path.exists(lock_file):
-             log.warning("Lock file already exists at %s", lock_file)
-             return False
-         write_json_file(lock_file, {
-             'vm': vm_name,
-             'timestamp': datetime.now().isoformat()
-         })
-         return True
-     except Exception as e:
-         log.error("Failed to create lock file: %s", str(e))
-         return False
- def remove_lock_file(hypervisor_path: str) -> None:
-     """Remove .lock file after processing."""
-     lock_file = os.path.join(hypervisor_path, '.lock')
-     try:
-         if os.path.exists(lock_file):
-             os.remove(lock_file)
-     except Exception as e:
-         log.error("Failed to remove lock file: %s", str(e))
- def is_locked(hypervisor_path: str) -> bool:
-     """Check if hypervisor directory is locked."""
-     return os.path.exists(os.path.join(hypervisor_path, '.lock'))
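A side note on the removed helpers: create_lock_file checked os.path.exists() and then wrote the lock file as two separate steps, leaving a race window between check and create. An atomic variant (a sketch, not part of this commit) would let the filesystem arbitrate via O_CREAT | O_EXCL:

    import json
    import os
    from datetime import datetime

    def try_create_lock(lock_file: str, vm_name: str) -> bool:
        """Atomically create a lock file; return False if it already exists."""
        try:
            # O_EXCL makes creation fail atomically if the file exists,
            # removing the separate exists() check that could race.
            fd = os.open(lock_file, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644)
        except FileExistsError:
            return False
        with os.fdopen(fd, 'w') as f:
            json.dump({'vm': vm_name, 'timestamp': datetime.now().isoformat()}, f)
        return True

The engine-wide threading.Lock introduced by this commit sidesteps the file race entirely, at the cost of the manual-recovery behavior documented above.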
def get_hypervisor_model(hypervisor: str) -> str:
"""Get sosmodel from hypervisor grains."""
log.info("Fetching sosmodel for hypervisor %s", hypervisor)
try:
# Get cached grains using Salt runner
grains = runner.cmd(
'cache.grains',
[f'{hypervisor}_*', 'glob']
)
log.info("Cached grains result: %s", grains)
if not grains:
raise ValueError(f"No grains found for hypervisor {hypervisor}")
# Get the first minion ID that matches our hypervisor
minion_id = next(iter(grains.keys()))
log.info("Using minion ID %s", minion_id)
model = grains[minion_id].get('sosmodel')
log.info("Found sosmodel %s", model)
if not model:
raise ValueError(f"No sosmodel grain found for hypervisor {hypervisor}")
@@ -452,7 +428,7 @@ def mark_invalid_hardware(hypervisor_path: str, vm_name: str, config: dict, erro
def validate_hvn_license() -> bool:
"""Check if the license file exists and contains required values."""
if not os.path.exists(LICENSE_PATH):
log.error("LICENSE: License file not found at %s", LICENSE_PATH)
log.error("License file not found at %s", LICENSE_PATH)
return False
try:
@@ -460,14 +436,14 @@ def validate_hvn_license() -> bool:
license_data = yaml.safe_load(f)
if not license_data:
log.error("LICENSE: Empty or invalid license file")
log.error("Empty or invalid license file")
return False
license_id = license_data.get('license_id')
features = license_data.get('features', [])
if not license_id:
log.error("LICENSE: No license_id found in license file")
log.error("No license_id found in license file")
return False
if 'hvn' not in features:
@@ -476,29 +452,35 @@ def validate_hvn_license() -> bool:
"for more information about purchasing a license to enable this feature.")
return False
log.info("LICENSE: License validation successful")
log.info("License validation successful")
return True
except Exception as e:
log.error("LICENSE: Error reading license file: %s", str(e))
log.error("Error reading license file: %s", str(e))
return False
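The checks above imply a minimal shape for license.sls: a YAML mapping with a license_id and a features list that includes 'hvn'. A self-contained sketch of that contract (field values invented):

    import yaml

    # Invented example content; only the license_id and features keys are
    # implied by the validation logic above.
    example = yaml.safe_load('''
    license_id: EXAMPLE-0000
    features:
      - hvn
    ''')

    assert example.get('license_id')
    assert 'hvn' in example.get('features', [])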
def process_vm_creation(hypervisor_path: str, vm_config: dict) -> None:
"""Process a single VM creation request."""
"""
Process a single VM creation request.
This function handles the creation of a new VM, including hardware validation,
resource allocation, and provisioning. All operations are protected by the
engine-wide lock that is acquired at engine start.
Args:
hypervisor_path: Path to the hypervisor directory
vm_config: Dictionary containing VM configuration
"""
# Get the actual hypervisor name (last directory in path)
hypervisor = os.path.basename(hypervisor_path)
vm_name = f"{vm_config['hostname']}_{vm_config['role']}"
try:
- # Create lock file
- if not create_lock_file(hypervisor_path, vm_name):
-     return
# Get hypervisor model and capabilities
model = get_hypervisor_model(hypervisor)
model_config = load_hardware_defaults(model)
- # Validate hardware request
+ # Initial hardware validation
is_valid, errors = validate_hardware_request(model_config, vm_config)
if not is_valid:
mark_invalid_hardware(hypervisor_path, vm_name, vm_config, errors)
@@ -507,96 +489,54 @@ def process_vm_creation(hypervisor_path: str, vm_config: dict) -> None:
# Check hardware availability
if not check_hardware_availability(hypervisor_path, vm_name):
mark_vm_failed(os.path.join(hypervisor_path, vm_name), 3,
"Requested hardware is already in use")
"Requested hardware is already in use")
return
# Create tracking file
create_vm_tracking_file(hypervisor_path, vm_name, vm_config)
- # Build so-salt-cloud command
- log.debug("Building so-salt-cloud command for VM %s", vm_name)
+ # Build and execute so-salt-cloud command
cmd = ['so-salt-cloud', '-p', f'sool9-{hypervisor}', vm_name]
- log.debug("Base command: %s", ' '.join(cmd))
# Add network configuration
if vm_config['network_mode'] == 'static4':
log.debug("Adding static network configuration")
cmd.extend(['--static4', '--ip4', vm_config['ip4'], '--gw4', vm_config['gw4']])
if 'dns4' in vm_config:
cmd.extend(['--dns4', vm_config['dns4']])
if 'search4' in vm_config:
cmd.extend(['--search4', vm_config['search4']])
else:
log.debug("Using DHCP network configuration")
cmd.append('--dhcp4')
# Add hardware configuration
if 'cpu' in vm_config:
log.debug("Adding CPU configuration: %s cores", vm_config['cpu'])
cmd.extend(['-c', str(vm_config['cpu'])])
if 'memory' in vm_config:
memory_mib = int(vm_config['memory']) * 1024
log.debug("Adding memory configuration: %sGB (%sMiB)", vm_config['memory'], memory_mib)
cmd.extend(['-m', str(memory_mib)])
- # Add PCI devices with proper conversion
+ # Add PCI devices
for hw_type in ['disk', 'copper', 'sfp']:
if hw_type in vm_config and vm_config[hw_type]:
log.debug("Processing %s hardware configuration", hw_type)
indices = [int(x) for x in str(vm_config[hw_type]).split(',')]
log.debug("Requested %s indices: %s", hw_type, indices)
for idx in indices:
try:
log.debug("Looking up PCI ID for %s index %d in model config", hw_type, idx)
log.debug("Model config for %s: %s", hw_type, model_config['hardware'][hw_type])
hw_config = {int(k): v for k, v in model_config['hardware'][hw_type].items()}
pci_id = hw_config[idx]
converted_pci_id = convert_pci_id(pci_id)
cmd.extend(['-P', converted_pci_id])
# Convert all keys to integers for comparison
hw_config = {int(k): v for k, v in model_config['hardware'][hw_type].items()}
log.debug("Converted hardware config: %s", hw_config)
pci_id = hw_config[idx]
log.debug("Found PCI ID for %s index %d: %s", hw_type, idx, pci_id)
converted_pci_id = convert_pci_id(pci_id)
log.debug("Converted PCI ID from %s to %s", pci_id, converted_pci_id)
cmd.extend(['-P', converted_pci_id])
except Exception as e:
log.error("Failed to process PCI ID for %s index %d: %s", hw_type, idx, str(e))
log.error("Hardware config keys: %s, looking for index: %s",
list(model_config['hardware'][hw_type].keys()), idx)
raise
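The int(k) normalization above implies that model_config['hardware'] maps each device type to an index-to-PCI-address table whose keys may arrive as strings from YAML. A hedged illustration of that structure and lookup (addresses and indices invented):

    # Invented fragment mirroring the structure the loop expects.
    model_config = {
        'hardware': {
            'disk':   {'0': '0000:3b:00.0', '1': '0000:3c:00.0'},
            'copper': {'0': '0000:19:00.0'},
        },
    }

    # Normalize YAML string keys to ints before indexing, as the engine does.
    hw_config = {int(k): v for k, v in model_config['hardware']['disk'].items()}
    assert hw_config[1] == '0000:3c:00.0'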
- # Execute so-salt-cloud
- log.info("Executing command: %s", ' '.join(cmd))
- try:
-     result = subprocess.run(cmd, capture_output=True, text=True, check=True)
-     log.debug("Command completed successfully")
-     log.debug("Command stdout: %s", result.stdout)
-     if result.stderr:
-         log.warning("Command stderr (non-fatal): %s", result.stderr)
- except subprocess.CalledProcessError as e:
-     error_msg = f"so-salt-cloud execution failed (code {e.returncode})"
-     if e.stdout:
-         log.error("Command stdout: %s", e.stdout)
-     if e.stderr:
-         log.error("Command stderr: %s", e.stderr)
-         error_msg = f"{error_msg}: {e.stderr}"
-     log.error(error_msg)
-     mark_vm_failed(os.path.join(hypervisor_path, vm_name), 4, error_msg)
-     raise
+ # Execute command
+ result = subprocess.run(cmd, capture_output=True, text=True, check=True)
# Update tracking file status
tracking_file = os.path.join(hypervisor_path, vm_name)
data = read_json_file(tracking_file)
data['status'] = 'running'
write_json_file(tracking_file, data)
log.info("Successfully updated VM status to running")
+ except subprocess.CalledProcessError as e:
+     error_msg = f"so-salt-cloud execution failed (code {e.returncode})"
+     if e.stdout:
+         log.error("Command stdout: %s", e.stdout)
+     if e.stderr:
+         log.error("Command stderr: %s", e.stderr)
+         error_msg = f"{error_msg}: {e.stderr}"
+     log.error(error_msg)
+     mark_vm_failed(os.path.join(hypervisor_path, vm_name), 4, error_msg)
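The Exit Codes section of the docstring is truncated in this view, but the calls visible in the diff use 3 for hardware-in-use and 4 for provisioning failures. A hypothetical consolidation, covering only the codes visible here:

    from enum import IntEnum

    class VmFailureCode(IntEnum):
        # Hypothetical names; only the numeric values appear in this diff.
        SUCCESS = 0            # from the docstring's Exit Codes list
        HARDWARE_IN_USE = 3    # mark_vm_failed(..., 3, 'Requested hardware is already in use')
        PROVISION_FAILED = 4   # mark_vm_failed(..., 4, error_msg)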
@@ -604,24 +544,28 @@ def process_vm_creation(hypervisor_path: str, vm_config: dict) -> None:
except Exception as e:
error_msg = f"VM creation failed: {str(e)}"
log.error(error_msg)
# If we haven't created the tracking file yet, create a failed one
if not os.path.exists(os.path.join(hypervisor_path, vm_name)):
mark_vm_failed(os.path.join(hypervisor_path, f"{vm_name}_failed"), 4, error_msg)
- finally:
-     remove_lock_file(hypervisor_path)
+ raise
def process_vm_deletion(hypervisor_path: str, vm_name: str) -> None:
"""Process a single VM deletion request."""
try:
if not create_lock_file(hypervisor_path, vm_name):
return
"""
Process a single VM deletion request.
This function handles the deletion of an existing VM. All operations are protected
by the engine-wide lock that is acquired at engine start.
Args:
hypervisor_path: Path to the hypervisor directory
vm_name: Name of the VM to delete
"""
try:
# Get the actual hypervisor name (last directory in path)
hypervisor = os.path.basename(hypervisor_path)
cmd = ['so-salt-cloud', '-p', f'sool9-{hypervisor}', vm_name, '-yd']
log.info("Executing: %s", ' '.join(cmd))
- result = subprocess.run(cmd, capture_output=True, text=True)
+ result = subprocess.run(cmd, capture_output=True, text=True, check=True)
# Log command output
if result.stdout:
@@ -629,35 +573,39 @@ def process_vm_deletion(hypervisor_path: str, vm_name: str) -> None:
if result.stderr:
log.warning("Command stderr: %s", result.stderr)
- # Check return code
- if result.returncode != 0:
-     error_msg = f"so-salt-cloud deletion failed (code {result.returncode}): {result.stderr}"
-     log.error(error_msg)
-     raise subprocess.CalledProcessError(
-         result.returncode, cmd,
-         output=result.stdout,
-         stderr=result.stderr
-     )
# Remove VM tracking file
vm_file = os.path.join(hypervisor_path, vm_name)
if os.path.exists(vm_file):
os.remove(vm_file)
log.info("Successfully removed VM tracking file")
+ except subprocess.CalledProcessError as e:
+     error_msg = f"so-salt-cloud deletion failed (code {e.returncode}): {e.stderr}"
+     log.error(error_msg)
+     raise
except Exception as e:
log.error("Error processing VM deletion: %s", str(e))
raise
- finally:
-     remove_lock_file(hypervisor_path)
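The deletion path gets the same treatment as creation: subprocess.run(..., check=True) raises CalledProcessError on a nonzero exit, replacing the hand-rolled returncode check deleted above. The two forms behave equivalently:

    import subprocess

    cmd = ['false']  # stand-in for any command that exits nonzero

    # Old style: inspect returncode and raise manually.
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print(f'would raise CalledProcessError({result.returncode})')

    # New style: check=True raises the equivalent exception automatically.
    try:
        subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f'raised automatically: code {e.returncode}')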
def process_hypervisor(hypervisor_path: str) -> None:
"""Process VM configurations for a single hypervisor."""
try:
if is_locked(hypervisor_path):
return
"""
Process VM configurations for a single hypervisor.
# Read nodes file
This function handles the processing of VM configurations for a hypervisor,
including creation of new VMs and deletion of removed VMs. All operations
are protected by the engine-wide lock that is acquired at engine start.
The function performs the following steps:
1. Reads and validates nodes configuration
2. Identifies existing VMs
3. Processes new VM creation requests
4. Handles VM deletions for removed configurations
Args:
hypervisor_path: Path to the hypervisor directory
"""
try:
# Detection phase - no lock needed
nodes_file = os.path.join(hypervisor_path, 'nodes')
if not os.path.exists(nodes_file):
return
@@ -666,7 +614,7 @@ def process_hypervisor(hypervisor_path: str) -> None:
if not nodes_config:
return
- # Get existing VMs
+ # Get existing VMs - no lock needed
existing_vms = set()
for file_path in glob.glob(os.path.join(hypervisor_path, '*_*')):
basename = os.path.basename(file_path)
@@ -684,6 +632,7 @@ def process_hypervisor(hypervisor_path: str) -> None:
configured_vms.add(vm_name)
if vm_name not in existing_vms:
+ # process_vm_creation handles its own locking
process_vm_creation(hypervisor_path, vm_config)
# Process VM deletions
@@ -692,29 +641,52 @@ def process_hypervisor(hypervisor_path: str) -> None:
except Exception as e:
log.error("Failed to process hypervisor %s: %s", hypervisor_path, str(e))
raise
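process_hypervisor's create/delete decisions reduce to set arithmetic between the VM names in the nodes file and the tracking files already on disk. A condensed sketch of that reconciliation (names invented):

    # Invented example: desired VMs from the nodes file vs. tracking files on disk.
    configured_vms = {'web01_sensor', 'db01_searchnode'}
    existing_vms = {'db01_searchnode', 'old01_receiver'}

    to_create = configured_vms - existing_vms   # {'web01_sensor'}
    to_delete = existing_vms - configured_vms   # {'old01_receiver'}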
def start(interval: int = DEFAULT_INTERVAL,
base_path: str = DEFAULT_BASE_PATH) -> None:
"""
- Main engine loop.
+ Process virtual node configurations.
+ This function implements a single engine-wide lock to ensure only one
+ instance of the virtual node manager runs at a time. The lock is:
+ - Acquired at start
+ - Released after processing completes
+ - Maintained if error occurs
Args:
- interval: Time in seconds between processing cycles
+ interval: Time in seconds between engine runs (managed by salt-master)
base_path: Base path containing hypervisor configurations
+ Notes:
+ - Lock remains if engine encounters an error
+ - Admin must restart service to clear lock
+ - Error-level logging used for lock issues
"""
log.info("Starting virtual node manager engine")
if not validate_hvn_license():
return
- while True:
-     try:
-         # Process each hypervisor directory
-         for hypervisor_path in glob.glob(os.path.join(base_path, '*')):
-             if os.path.isdir(hypervisor_path):
-                 process_hypervisor(hypervisor_path)
-     except Exception as e:
-         log.error("Error in main engine loop: %s", str(e))
-     time.sleep(interval)
+ # Attempt to acquire engine lock
+ if not engine_lock.acquire(blocking=False):
+     log.error("Another virtual node manager is already running")
+     return
+ log.debug("Virtual node manager acquired engine lock")
+ try:
+     # Process each hypervisor directory
+     for hypervisor_path in glob.glob(os.path.join(base_path, '*')):
+         if os.path.isdir(hypervisor_path):
+             process_hypervisor(hypervisor_path)
+     # Clean shutdown - release lock
+     log.debug("Virtual node manager releasing engine lock")
+     engine_lock.release()
+     log.info("Virtual node manager completed successfully")
+ except Exception as e:
+     log.error("Error in virtual node manager - lock will remain until cleared: %s", str(e))
+     # Don't release lock on error - requires admin intervention
+     return
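The structural change in this final hunk: the old start() owned its own schedule, sleeping in-process between passes, while the new start() makes one pass and returns, leaving the interval to the salt-master (matching the reworded Options text at the top). Roughly:

    import time

    def old_style(interval: int) -> None:
        while True:          # engine owned the schedule
            run_pass()
            time.sleep(interval)

    def new_style() -> None:
        run_pass()           # salt-master re-invokes on its own interval

    def run_pass() -> None:
        # Stand-in for the lock-guarded hypervisor processing above.
        pass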