From 09d699432a29e66163781703fb8a6b63a7fae733 Mon Sep 17 00:00:00 2001
From: Josh Patterson
Date: Fri, 10 Oct 2025 17:07:02 -0400
Subject: [PATCH] ui notification of nsm volume creation failure and cleanup
 of vm inventory in soc grid config for hypervisor

---
 salt/_modules/hypervisor.py                     |  91 ++++++++
 salt/hypervisor/map.jinja                       |  22 +-
 .../engines/master/virtual_node_manager.py      | 204 +++++++++++++++++-
 salt/soc/dyanno/hypervisor/map.jinja            |   1 +
 .../dyanno/hypervisor/remove_failed_vm.sls      |  51 +++++
 5 files changed, 355 insertions(+), 14 deletions(-)
 create mode 100644 salt/_modules/hypervisor.py
 create mode 100644 salt/soc/dyanno/hypervisor/remove_failed_vm.sls

diff --git a/salt/_modules/hypervisor.py b/salt/_modules/hypervisor.py
new file mode 100644
index 000000000..7119c8507
--- /dev/null
+++ b/salt/_modules/hypervisor.py
@@ -0,0 +1,91 @@
+#!/opt/saltstack/salt/bin/python3
+
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+#
+# Note: Per the Elastic License 2.0, the second limitation states:
+#
+# "You may not move, change, disable, or circumvent the license key functionality
+# in the software, and you may not remove or obscure any functionality in the
+# software that is protected by the license key."
+
+"""
+Salt execution module for hypervisor operations.
+
+This module provides functions for managing hypervisor configurations,
+including VM file management.
+"""
+
+import json
+import logging
+import os
+
+log = logging.getLogger(__name__)
+
+__virtualname__ = 'hypervisor'
+
+
+def __virtual__():
+    """
+    Only load this module if we're on a system that can manage hypervisors.
+    """
+    return __virtualname__
+
+
+def remove_vm_from_vms_file(vms_file_path, vm_hostname, vm_role):
+    """
+    Remove a VM entry from the hypervisorVMs file.
+
+    Args:
+        vms_file_path (str): Path to the hypervisorVMs file
+        vm_hostname (str): Hostname of the VM to remove (without role suffix)
+        vm_role (str): Role of the VM
+
+    Returns:
+        dict: Result dictionary with success status and message
+
+    CLI Example:
+        salt '*' hypervisor.remove_vm_from_vms_file /opt/so/saltstack/local/salt/hypervisor/hosts/hypervisor1VMs node1 nsm
+    """
+    try:
+        # Check if file exists
+        if not os.path.exists(vms_file_path):
+            msg = f"VMs file not found: {vms_file_path}"
+            log.error(msg)
+            return {'result': False, 'comment': msg}
+
+        # Read current VMs
+        with open(vms_file_path, 'r') as f:
+            content = f.read().strip()
+            vms = json.loads(content) if content else []
+
+        # Find and remove the VM entry
+        original_count = len(vms)
+        vms = [vm for vm in vms if not (vm.get('hostname') == vm_hostname and vm.get('role') == vm_role)]
+
+        if len(vms) < original_count:
+            # VM was found and removed, write back to file
+            with open(vms_file_path, 'w') as f:
+                json.dump(vms, f, indent=2)
+
+            # Set socore:socore ownership (939:939)
+            os.chown(vms_file_path, 939, 939)
+
+            msg = f"Removed VM {vm_hostname}_{vm_role} from {vms_file_path}"
+            log.info(msg)
+            return {'result': True, 'comment': msg}
+        else:
+            msg = f"VM {vm_hostname}_{vm_role} not found in {vms_file_path}"
+            log.warning(msg)
+            return {'result': False, 'comment': msg}
+
+    except json.JSONDecodeError as e:
+        msg = f"Failed to parse JSON in {vms_file_path}: {str(e)}"
+        log.error(msg)
+        return {'result': False, 'comment': msg}
+    except Exception as e:
+        msg = f"Failed to remove VM {vm_hostname}_{vm_role} from {vms_file_path}: {str(e)}"
+        log.error(msg)
+        return {'result': False, 'comment': msg}
diff --git a/salt/hypervisor/map.jinja b/salt/hypervisor/map.jinja
index 3519f6078..087fd7bf7 100644
--- a/salt/hypervisor/map.jinja
+++ b/salt/hypervisor/map.jinja
@@ -58,10 +58,26 @@
   {% set role = vm.get('role', '') %}
   {% do salt.log.debug('salt/hypervisor/map.jinja: Processing VM - hostname: ' ~ hostname ~ ', role: ' ~ role) %}
 
-  {# Load VM configuration from config file #}
+  {# Try to load VM configuration from config file first, then .error file if config doesn't exist #}
   {% set vm_file = 'hypervisor/hosts/' ~ hypervisor ~ '/' ~ hostname ~ '_' ~ role %}
+  {% set vm_error_file = vm_file ~ '.error' %}
   {% do salt.log.debug('salt/hypervisor/map.jinja: VM config file: ' ~ vm_file) %}
-  {% import_json vm_file as vm_state %}
+
+  {# Check if base config file exists #}
+  {% set config_exists = salt['file.file_exists']('/opt/so/saltstack/local/salt/' ~ vm_file) %}
+  {% set error_exists = salt['file.file_exists']('/opt/so/saltstack/local/salt/' ~ vm_error_file) %}
+
+  {% set vm_state = none %}
+  {% if config_exists %}
+    {% import_json vm_file as vm_state %}
+    {% do salt.log.debug('salt/hypervisor/map.jinja: Loaded VM config from base file') %}
+  {% elif error_exists %}
+    {% import_json vm_error_file as vm_state %}
+    {% do salt.log.debug('salt/hypervisor/map.jinja: Loaded VM config from .error file') %}
+  {% else %}
+    {% do salt.log.warning('salt/hypervisor/map.jinja: No config or error file found for VM ' ~ hostname ~ '_' ~ role) %}
+  {% endif %}
+
   {% if vm_state %}
     {% do salt.log.debug('salt/hypervisor/map.jinja: VM config content: ' ~ vm_state | tojson) %}
     {% set vm_data = {'config': vm_state.config} %}
@@ -85,7 +101,7 @@
     {% endif %}
     {% do vms.update({hostname ~ '_' ~ role: vm_data}) %}
   {% else %}
-    {% do salt.log.debug('salt/hypervisor/map.jinja: Config file empty: ' ~ vm_file) %}
+    {% do salt.log.debug('salt/hypervisor/map.jinja: Skipping VM ' ~ hostname ~ '_' ~ role ~ ' - no config available') %}
  {% endif %}
{% endfor %}
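The removal filter in the new execution module keys on the exact (hostname, role) pair rather than the combined `hostname_role` name. A minimal standalone sketch of that same predicate against a throwaway VMs file (the entries and the temporary path are illustrative, not taken from a live grid):

    import json
    import tempfile

    # Hypothetical inventory in the same shape the module expects:
    # a JSON array of VM entries carrying 'hostname' and 'role' keys.
    vms = [
        {"hostname": "node1", "role": "nsm"},
        {"hostname": "node2", "role": "sensor"},
    ]

    with tempfile.NamedTemporaryFile("w", suffix="VMs", delete=False) as f:
        json.dump(vms, f, indent=2)
        vms_file_path = f.name

    # Same predicate the module applies: drop only an exact hostname AND role
    # match, so "node1"/"sensor" or "node10"/"nsm" would survive the filter.
    with open(vms_file_path) as f:
        entries = json.load(f)
    entries = [vm for vm in entries
               if not (vm.get("hostname") == "node1" and vm.get("role") == "nsm")]
    print(entries)  # [{'hostname': 'node2', 'role': 'sensor'}]
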
diff --git a/salt/salt/engines/master/virtual_node_manager.py b/salt/salt/engines/master/virtual_node_manager.py
index 1c4eae7ea..270a93c11 100644
--- a/salt/salt/engines/master/virtual_node_manager.py
+++ b/salt/salt/engines/master/virtual_node_manager.py
@@ -117,7 +117,7 @@ Exit Codes:
     4: VM provisioning failure (so-salt-cloud execution failed)
 
 Logging:
-    Log files are written to /opt/so/log/salt/engines/virtual_node_manager.log
+    Log files are written to /opt/so/log/salt/engines/virtual_node_manager
     Comprehensive logging includes:
     - Hardware validation details
     - PCI ID conversion process
@@ -138,23 +138,49 @@
 import pwd
 import grp
 import salt.config
 import salt.runner
+import salt.client
 from typing import Dict, List, Optional, Tuple, Any
 from datetime import datetime, timedelta
 from threading import Lock
 
-# Get socore uid/gid
-SOCORE_UID = pwd.getpwnam('socore').pw_uid
-SOCORE_GID = grp.getgrnam('socore').gr_gid
-
-# Initialize Salt runner once
+# Initialize Salt runner and local client once
 opts = salt.config.master_config('/etc/salt/master')
 opts['output'] = 'json'
 runner = salt.runner.RunnerClient(opts)
+local = salt.client.LocalClient()
+
+# Get socore uid/gid for file ownership
+SOCORE_UID = pwd.getpwnam('socore').pw_uid
+SOCORE_GID = grp.getgrnam('socore').gr_gid
 
 # Configure logging
 log = logging.getLogger(__name__)
 log.setLevel(logging.DEBUG)
+# Prevent propagation to parent loggers to avoid duplicate log entries
+log.propagate = False
+
+# Add file handler for dedicated log file
+log_dir = '/opt/so/log/salt/engines'
+log_file = os.path.join(log_dir, 'virtual_node_manager')
+
+# Create log directory if it doesn't exist
+os.makedirs(log_dir, exist_ok=True)
+
+# Create file handler
+file_handler = logging.FileHandler(log_file)
+file_handler.setLevel(logging.DEBUG)
+
+# Create formatter
+formatter = logging.Formatter(
+    '%(asctime)s [%(name)s:%(lineno)d][%(levelname)-8s][%(process)d] %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S'
+)
+file_handler.setFormatter(formatter)
+
+# Add handler to logger
+log.addHandler(file_handler)
+
 # Constants
 DEFAULT_INTERVAL = 30
 DEFAULT_BASE_PATH = '/opt/so/saltstack/local/salt/hypervisor/hosts'
@@ -203,6 +229,39 @@ def write_json_file(file_path: str, data: Any) -> None:
     except Exception as e:
         log.error("Failed to write JSON file %s: %s", file_path, str(e))
         raise
 
+def remove_vm_from_vms_file(vms_file_path: str, vm_hostname: str, vm_role: str) -> bool:
+    """
+    Remove a VM entry from the hypervisorVMs file.
+
+    Args:
+        vms_file_path: Path to the hypervisorVMs file
+        vm_hostname: Hostname of the VM to remove (without role suffix)
+        vm_role: Role of the VM
+
+    Returns:
+        bool: True if VM was removed, False otherwise
+    """
+    try:
+        # Read current VMs
+        vms = read_json_file(vms_file_path)
+
+        # Find and remove the VM entry
+        original_count = len(vms)
+        vms = [vm for vm in vms if not (vm.get('hostname') == vm_hostname and vm.get('role') == vm_role)]
+
+        if len(vms) < original_count:
+            # VM was found and removed, write back to file
+            write_json_file(vms_file_path, vms)
+            log.info("Removed VM %s_%s from %s", vm_hostname, vm_role, vms_file_path)
+            return True
+        else:
+            log.warning("VM %s_%s not found in %s", vm_hostname, vm_role, vms_file_path)
+            return False
+
+    except Exception as e:
+        log.error("Failed to remove VM %s_%s from %s: %s", vm_hostname, vm_role, vms_file_path, str(e))
+        return False
+
 def read_yaml_file(file_path: str) -> dict:
     """Read and parse a YAML file."""
@@ -558,6 +617,13 @@ def mark_vm_failed(vm_file: str, error_code: int, message: str) -> None:
     # Remove the original file since we'll create an error file
     os.remove(vm_file)
 
+    # Clear hardware resource claims so failed VMs don't consume resources
+    # Keep nsm_size for reference but clear cpu, memory, sfp, copper
+    config.pop('cpu', None)
+    config.pop('memory', None)
+    config.pop('sfp', None)
+    config.pop('copper', None)
+
     # Create error file
     error_file = f"{vm_file}.error"
     data = {
@@ -586,8 +652,16 @@ def mark_invalid_hardware(hypervisor_path: str, vm_name: str, config: dict, erro
     # Join all messages with proper sentence structure
     full_message = "Hardware validation failure: " + " ".join(error_messages)
 
+    # Clear hardware resource claims so failed VMs don't consume resources
+    # Keep nsm_size for reference but clear cpu, memory, sfp, copper
+    config_copy = config.copy()
+    config_copy.pop('cpu', None)
+    config_copy.pop('memory', None)
+    config_copy.pop('sfp', None)
+    config_copy.pop('copper', None)
+
     data = {
-        'config': config,
+        'config': config_copy,
         'status': 'error',
         'timestamp': datetime.now().isoformat(),
         'error_details': {
@@ -634,6 +708,61 @@ def validate_vrt_license() -> bool:
         log.error("Error reading license file: %s", str(e))
         return False
 
+def check_hypervisor_disk_space(hypervisor: str, size_gb: int) -> Tuple[bool, Optional[str]]:
+    """
+    Check if hypervisor has sufficient disk space for volume creation.
+
+    Args:
+        hypervisor: Hypervisor hostname
+        size_gb: Required size in GB
+
+    Returns:
+        Tuple of (has_space, error_message)
+    """
+    try:
+        # Get hypervisor minion ID
+        hypervisor_minion = f"{hypervisor}_hypervisor"
+
+        # Check disk space on /nsm/libvirt/volumes using LocalClient
+        result = local.cmd(
+            hypervisor_minion,
+            'cmd.run',
+            ["df -BG /nsm/libvirt/volumes | tail -1 | awk '{print $4}' | sed 's/G//'"]
+        )
+
+        if not result or hypervisor_minion not in result:
+            log.error("Failed to check disk space on hypervisor %s", hypervisor)
+            return False, "Failed to check disk space on hypervisor"
+
+        available_gb_str = result[hypervisor_minion].strip()
+        if not available_gb_str:
+            log.error("Empty disk space response from hypervisor %s", hypervisor)
+            return False, "Failed to get disk space information"
+
+        try:
+            available_gb = float(available_gb_str)
+        except ValueError:
+            log.error("Invalid disk space value from hypervisor %s: %s", hypervisor, available_gb_str)
+            return False, f"Invalid disk space value: {available_gb_str}"
+
+        # Add 10% buffer for filesystem overhead
+        required_gb = size_gb * 1.1
+
+        log.debug("Hypervisor %s disk space check: Available=%.2fGB, Required=%.2fGB",
+                  hypervisor, available_gb, required_gb)
+
+        if available_gb < required_gb:
+            error_msg = f"Insufficient disk space on hypervisor {hypervisor}. Available: {available_gb:.2f}GB, Required: {required_gb:.2f}GB (including 10% overhead)"
+            log.error(error_msg)
+            return False, error_msg
+
+        log.info("Hypervisor %s has sufficient disk space for %dGB volume", hypervisor, size_gb)
+        return True, None
+
+    except Exception as e:
+        log.error("Error checking disk space on hypervisor %s: %s", hypervisor, str(e))
+        return False, f"Error checking disk space: {str(e)}"
+
 def process_vm_creation(hypervisor_path: str, vm_config: dict) -> None:
     """
     Process a single VM creation request.
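The pre-creation space check above reduces to one comparison. A worked sketch of the same 10% headroom rule against a fabricated `df -BG` line (device name and sizes are made up for illustration):

    # Fabricated `df -BG` output for /nsm/libvirt/volumes; column 4 is GB available.
    sample_df_line = "/dev/mapper/nsm 1000G 400G 600G 40% /nsm/libvirt/volumes"

    available_gb = float(sample_df_line.split()[3].rstrip("G"))  # 600.0
    nsm_size_gb = 500
    required_gb = nsm_size_gb * 1.1  # 10% buffer for filesystem overhead -> 550.0

    # 600 >= 550, so a 500GB nsm volume would pass the check; a 600GB request
    # would need 660GB free and would be rejected before any VM is created.
    print(available_gb >= required_gb)
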
@@ -695,6 +824,33 @@
         log.warning("VM: %s - Both disk and nsm_size specified. disk takes precedence, nsm_size will be ignored.", vm_name)
 
+    # Check disk space BEFORE creating VM if nsm_size is specified
+    if has_nsm_size and not has_disk:
+        size_gb = int(vm_config['nsm_size'])
+        has_space, space_error = check_hypervisor_disk_space(hypervisor, size_gb)
+        if not has_space:
+            log.error("VM: %s - %s", vm_name, space_error)
+
+            # Send Volume nsm Create Failed status event
+            try:
+                subprocess.run([
+                    'so-salt-emit-vm-deployment-status-event',
+                    '-v', vm_name,
+                    '-H', hypervisor,
+                    '-s', 'Volume nsm Create Failed'
+                ], check=True)
+            except subprocess.CalledProcessError as e:
+                log.error("Failed to emit volume create failed event for %s: %s", vm_name, str(e))
+
+            mark_invalid_hardware(
+                hypervisor_path,
+                vm_name,
+                vm_config,
+                {'disk_space': f"Insufficient disk space for {size_gb}GB volume: {space_error}"}
+            )
+            return
+        log.debug("VM: %s - Hypervisor has sufficient space for %dGB volume", vm_name, size_gb)
+
     # Initial hardware validation against model
     is_valid, errors = validate_hardware_request(model_config, vm_config)
     if not is_valid:
@@ -967,12 +1123,21 @@ def process_hypervisor(hypervisor_path: str) -> None:
     if not nodes_config:
         log.debug("Empty VMs configuration in %s", vms_file)
 
-    # Get existing VMs
+    # Get existing VMs and track failed VMs separately
     existing_vms = set()
+    failed_vms = set()  # VMs with .error files
     for file_path in glob.glob(os.path.join(hypervisor_path, '*_*')):
         basename = os.path.basename(file_path)
-        # Skip error and status files
-        if not basename.endswith('.error') and not basename.endswith('.status'):
+        # Skip status files
+        if basename.endswith('.status'):
+            continue
+        # Track VMs with .error files separately
+        if basename.endswith('.error'):
+            vm_name = basename[:-6]  # Remove '.error' suffix
+            failed_vms.add(vm_name)
+            existing_vms.add(vm_name)  # Also add to existing to prevent recreation
+            log.debug(f"Found failed VM with .error file: {vm_name}")
+        else:
             existing_vms.add(basename)
 
     # Process new VMs
@@ -989,12 +1154,29 @@ def process_hypervisor(hypervisor_path: str) -> None:
         # process_vm_creation handles its own locking
         process_vm_creation(hypervisor_path, vm_config)
 
-    # Process VM deletions
+    # Process VM deletions (but skip failed VMs that only have .error files)
     vms_to_delete = existing_vms - configured_vms
     log.debug(f"Existing VMs: {existing_vms}")
     log.debug(f"Configured VMs: {configured_vms}")
+    log.debug(f"Failed VMs: {failed_vms}")
     log.debug(f"VMs to delete: {vms_to_delete}")
     for vm_name in vms_to_delete:
+        # Skip deletion if VM only has .error file (no actual VM to delete)
+        if vm_name in failed_vms:
+            error_file = os.path.join(hypervisor_path, f"{vm_name}.error")
+            base_file = os.path.join(hypervisor_path, vm_name)
+            # Only skip if there's no base file (VM never successfully created)
+            if not os.path.exists(base_file):
+                log.info(f"Skipping deletion of failed VM {vm_name} (VM never successfully created)")
+                # Clean up the .error and .status files since VM is no longer configured
+                if os.path.exists(error_file):
+                    os.remove(error_file)
+                    log.info(f"Removed .error file for unconfigured VM: {vm_name}")
+                status_file = os.path.join(hypervisor_path, f"{vm_name}.status")
+                if os.path.exists(status_file):
+                    os.remove(status_file)
+                    log.info(f"Removed .status file for unconfigured VM: {vm_name}")
+                continue
         log.info(f"Initiating deletion process for VM: {vm_name}")
         process_vm_deletion(hypervisor_path, vm_name)
diff --git a/salt/soc/dyanno/hypervisor/map.jinja b/salt/soc/dyanno/hypervisor/map.jinja
index 4a5107371..139003f17 100644
--- a/salt/soc/dyanno/hypervisor/map.jinja
+++ b/salt/soc/dyanno/hypervisor/map.jinja
@@ -3,6 +3,7 @@
 {# Define the list of process steps in order (case-sensitive) #}
 {% set PROCESS_STEPS = [
     'Processing',
+    'Volume nsm Create Failed',
     'IP Configuration',
     'Starting Create',
     'Executing Deploy Script',
diff --git a/salt/soc/dyanno/hypervisor/remove_failed_vm.sls b/salt/soc/dyanno/hypervisor/remove_failed_vm.sls
new file mode 100644
index 000000000..a47eff595
--- /dev/null
+++ b/salt/soc/dyanno/hypervisor/remove_failed_vm.sls
@@ -0,0 +1,51 @@
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+#
+# Note: Per the Elastic License 2.0, the second limitation states:
+#
+# "You may not move, change, disable, or circumvent the license key functionality
+# in the software, and you may not remove or obscure any functionality in the
+# software that is protected by the license key."
+
+{% if 'vrt' in salt['pillar.get']('features', []) %}
+
+{% do salt.log.info('soc/dyanno/hypervisor/remove_failed_vm: Running') %}
+{% set vm_name = pillar.get('vm_name') %}
+{% set hypervisor = pillar.get('hypervisor') %}
+
+{% if vm_name and hypervisor %}
+{% set vm_parts = vm_name.split('_') %}
+{% if vm_parts | length >= 2 %}
+{% set vm_role = vm_parts[-1] %}
+{% set vm_hostname = '_'.join(vm_parts[:-1]) %}
+{% set vms_file = '/opt/so/saltstack/local/salt/hypervisor/hosts/' ~ hypervisor ~ 'VMs' %}
+
+{% do salt.log.info('soc/dyanno/hypervisor/remove_failed_vm: Removing VM ' ~ vm_name ~ ' from ' ~ vms_file) %}
+
+remove_vm_{{ vm_name }}_from_vms_file:
+  module.run:
+    - name: hypervisor.remove_vm_from_vms_file
+    - vms_file_path: {{ vms_file }}
+    - vm_hostname: {{ vm_hostname }}
+    - vm_role: {{ vm_role }}
+
+{% else %}
+{% do salt.log.error('soc/dyanno/hypervisor/remove_failed_vm: Invalid vm_name format: ' ~ vm_name) %}
+{% endif %}
+{% else %}
+{% do salt.log.error('soc/dyanno/hypervisor/remove_failed_vm: Missing required pillar data (vm_name or hypervisor)') %}
+{% endif %}
+
+{% do salt.log.info('soc/dyanno/hypervisor/remove_failed_vm: Completed') %}
+
+{% else %}
+
+{% do salt.log.error(
+    'Hypervisor nodes are a feature supported only for customers with a valid license. '
+    'Contact Security Onion Solutions, LLC via our website at https://securityonionsolutions.com '
+    'for more information about purchasing a license to enable this feature.'
+) %}
+
+{% endif %}
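SOC supplies the vm_name and hypervisor pillar values when it renders remove_failed_vm.sls. Assuming you wanted to exercise the state by hand on the manager, an invocation along these lines should work (the VM and hypervisor names are placeholders):

    salt-call state.apply soc.dyanno.hypervisor.remove_failed_vm \
        pillar='{"vm_name": "node1_nsm", "hypervisor": "hypervisor1"}'

Splitting vm_name on the final underscore is what lets underscored hostnames round-trip: the role is always the last segment, so a name like sensor_east_nsm resolves to hostname sensor_east and role nsm.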