UI notification of NSM volume creation failure and cleanup of VM inventory in SOC grid config for hypervisor

Josh Patterson
2025-10-10 17:07:02 -04:00
parent e551c6e037
commit 09d699432a
5 changed files with 355 additions and 14 deletions

View File

@@ -0,0 +1,91 @@
#!/opt/saltstack/salt/bin/python3
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
#
# Note: Per the Elastic License 2.0, the second limitation states:
#
# "You may not move, change, disable, or circumvent the license key functionality
# in the software, and you may not remove or obscure any functionality in the
# software that is protected by the license key."
"""
Salt execution module for hypervisor operations.
This module provides functions for managing hypervisor configurations,
including VM file management.
"""
import json
import logging
import os
log = logging.getLogger(__name__)
__virtualname__ = 'hypervisor'
def __virtual__():
"""
Only load this module if we're on a system that can manage hypervisors.
"""
return __virtualname__
def remove_vm_from_vms_file(vms_file_path, vm_hostname, vm_role):
"""
Remove a VM entry from the hypervisorVMs file.
Args:
vms_file_path (str): Path to the hypervisorVMs file
vm_hostname (str): Hostname of the VM to remove (without role suffix)
vm_role (str): Role of the VM
Returns:
dict: Result dictionary with success status and message
CLI Example:
salt '*' hypervisor.remove_vm_from_vms_file /opt/so/saltstack/local/salt/hypervisor/hosts/hypervisor1VMs node1 nsm
"""
try:
# Check if file exists
if not os.path.exists(vms_file_path):
msg = f"VMs file not found: {vms_file_path}"
log.error(msg)
return {'result': False, 'comment': msg}
# Read current VMs
with open(vms_file_path, 'r') as f:
content = f.read().strip()
vms = json.loads(content) if content else []
# Find and remove the VM entry
original_count = len(vms)
vms = [vm for vm in vms if not (vm.get('hostname') == vm_hostname and vm.get('role') == vm_role)]
if len(vms) < original_count:
# VM was found and removed, write back to file
with open(vms_file_path, 'w') as f:
json.dump(vms, f, indent=2)
# Set socore:socore ownership (939:939)
os.chown(vms_file_path, 939, 939)
msg = f"Removed VM {vm_hostname}_{vm_role} from {vms_file_path}"
log.info(msg)
return {'result': True, 'comment': msg}
else:
msg = f"VM {vm_hostname}_{vm_role} not found in {vms_file_path}"
log.warning(msg)
return {'result': False, 'comment': msg}
except json.JSONDecodeError as e:
msg = f"Failed to parse JSON in {vms_file_path}: {str(e)}"
log.error(msg)
return {'result': False, 'comment': msg}
except Exception as e:
msg = f"Failed to remove VM {vm_hostname}_{vm_role} from {vms_file_path}: {str(e)}"
log.error(msg)
return {'result': False, 'comment': msg}

View File

@@ -58,10 +58,26 @@
{% set role = vm.get('role', '') %}
{% do salt.log.debug('salt/hypervisor/map.jinja: Processing VM - hostname: ' ~ hostname ~ ', role: ' ~ role) %}
{# Try to load VM configuration from config file first, then .error file if config doesn't exist #}
{% set vm_file = 'hypervisor/hosts/' ~ hypervisor ~ '/' ~ hostname ~ '_' ~ role %}
{% set vm_error_file = vm_file ~ '.error' %}
{% do salt.log.debug('salt/hypervisor/map.jinja: VM config file: ' ~ vm_file) %}
{# Check if base config file exists #}
{% set config_exists = salt['file.file_exists']('/opt/so/saltstack/local/salt/' ~ vm_file) %}
{% set error_exists = salt['file.file_exists']('/opt/so/saltstack/local/salt/' ~ vm_error_file) %}
{% set vm_state = none %}
{% if config_exists %}
{% import_json vm_file as vm_state %}
{% do salt.log.debug('salt/hypervisor/map.jinja: Loaded VM config from base file') %}
{% elif error_exists %}
{% import_json vm_error_file as vm_state %}
{% do salt.log.debug('salt/hypervisor/map.jinja: Loaded VM config from .error file') %}
{% else %}
{% do salt.log.warning('salt/hypervisor/map.jinja: No config or error file found for VM ' ~ hostname ~ '_' ~ role) %}
{% endif %}
{% if vm_state %}
{% do salt.log.debug('salt/hypervisor/map.jinja: VM config content: ' ~ vm_state | tojson) %}
{% set vm_data = {'config': vm_state.config} %}
@@ -85,7 +101,7 @@
{% endif %}
{% do vms.update({hostname ~ '_' ~ role: vm_data}) %}
{% else %}
{% do salt.log.debug('salt/hypervisor/map.jinja: Skipping VM ' ~ hostname ~ '_' ~ role ~ ' - no config available') %}
{% endif %}
{% endfor %}

View File

@@ -117,7 +117,7 @@ Exit Codes:
4: VM provisioning failure (so-salt-cloud execution failed)
Logging:
Log files are written to /opt/so/log/salt/engines/virtual_node_manager
Comprehensive logging includes:
- Hardware validation details
- PCI ID conversion process
@@ -138,23 +138,49 @@ import pwd
import grp
import salt.config
import salt.runner
import salt.client
from typing import Dict, List, Optional, Tuple, Any
from datetime import datetime, timedelta
from threading import Lock
# Initialize Salt runner and local client once
opts = salt.config.master_config('/etc/salt/master')
opts['output'] = 'json'
runner = salt.runner.RunnerClient(opts)
local = salt.client.LocalClient()
# Get socore uid/gid for file ownership
SOCORE_UID = pwd.getpwnam('socore').pw_uid
SOCORE_GID = grp.getgrnam('socore').gr_gid
# Configure logging
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
# Prevent propagation to parent loggers to avoid duplicate log entries
log.propagate = False
# Add file handler for dedicated log file
log_dir = '/opt/so/log/salt'
log_file = os.path.join(log_dir, 'virtual_node_manager')
# Create log directory if it doesn't exist
os.makedirs(log_dir, exist_ok=True)
# Create file handler
file_handler = logging.FileHandler(log_file)
file_handler.setLevel(logging.DEBUG)
# Create formatter
formatter = logging.Formatter(
'%(asctime)s [%(name)s:%(lineno)d][%(levelname)-8s][%(process)d] %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
file_handler.setFormatter(formatter)
# Add handler to logger
log.addHandler(file_handler)
# Constants
DEFAULT_INTERVAL = 30
DEFAULT_BASE_PATH = '/opt/so/saltstack/local/salt/hypervisor/hosts'
@@ -203,6 +229,39 @@ def write_json_file(file_path: str, data: Any) -> None:
except Exception as e:
log.error("Failed to write JSON file %s: %s", file_path, str(e))
raise
def remove_vm_from_vms_file(vms_file_path: str, vm_hostname: str, vm_role: str) -> bool:
"""
Remove a VM entry from the hypervisorVMs file.
Args:
vms_file_path: Path to the hypervisorVMs file
vm_hostname: Hostname of the VM to remove (without role suffix)
vm_role: Role of the VM
Returns:
bool: True if VM was removed, False otherwise
"""
try:
# Read current VMs
vms = read_json_file(vms_file_path)
# Find and remove the VM entry
original_count = len(vms)
vms = [vm for vm in vms if not (vm.get('hostname') == vm_hostname and vm.get('role') == vm_role)]
if len(vms) < original_count:
# VM was found and removed, write back to file
write_json_file(vms_file_path, vms)
log.info("Removed VM %s_%s from %s", vm_hostname, vm_role, vms_file_path)
return True
else:
log.warning("VM %s_%s not found in %s", vm_hostname, vm_role, vms_file_path)
return False
except Exception as e:
log.error("Failed to remove VM %s_%s from %s: %s", vm_hostname, vm_role, vms_file_path, str(e))
return False
def read_yaml_file(file_path: str) -> dict:
"""Read and parse a YAML file."""
@@ -558,6 +617,13 @@ def mark_vm_failed(vm_file: str, error_code: int, message: str) -> None:
# Remove the original file since we'll create an error file
os.remove(vm_file)
# Clear hardware resource claims so failed VMs don't consume resources
# Keep nsm_size for reference but clear cpu, memory, sfp, copper
config.pop('cpu', None)
config.pop('memory', None)
config.pop('sfp', None)
config.pop('copper', None)
# Create error file
error_file = f"{vm_file}.error"
data = {
@@ -586,8 +652,16 @@ def mark_invalid_hardware(hypervisor_path: str, vm_name: str, config: dict, erro
# Join all messages with proper sentence structure
full_message = "Hardware validation failure: " + " ".join(error_messages)
# Clear hardware resource claims so failed VMs don't consume resources
# Keep nsm_size for reference but clear cpu, memory, sfp, copper
config_copy = config.copy()
config_copy.pop('cpu', None)
config_copy.pop('memory', None)
config_copy.pop('sfp', None)
config_copy.pop('copper', None)
data = {
'config': config_copy,
'status': 'error',
'timestamp': datetime.now().isoformat(),
'error_details': {
@@ -634,6 +708,61 @@ def validate_vrt_license() -> bool:
log.error("Error reading license file: %s", str(e)) log.error("Error reading license file: %s", str(e))
return False return False
def check_hypervisor_disk_space(hypervisor: str, size_gb: int) -> Tuple[bool, Optional[str]]:
"""
Check if hypervisor has sufficient disk space for volume creation.
Args:
hypervisor: Hypervisor hostname
size_gb: Required size in GB
Returns:
Tuple of (has_space, error_message)
"""
try:
# Get hypervisor minion ID
hypervisor_minion = f"{hypervisor}_hypervisor"
# Check disk space on /nsm/libvirt/volumes using LocalClient
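# df -BG reports sizes in whole GB with a trailing 'G'; awk pulls the Available column and sed strips the suffix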
result = local.cmd(
hypervisor_minion,
'cmd.run',
["df -BG /nsm/libvirt/volumes | tail -1 | awk '{print $4}' | sed 's/G//'"]
)
if not result or hypervisor_minion not in result:
log.error("Failed to check disk space on hypervisor %s", hypervisor)
return False, "Failed to check disk space on hypervisor"
available_gb_str = result[hypervisor_minion].strip()
if not available_gb_str:
log.error("Empty disk space response from hypervisor %s", hypervisor)
return False, "Failed to get disk space information"
try:
available_gb = float(available_gb_str)
except ValueError:
log.error("Invalid disk space value from hypervisor %s: %s", hypervisor, available_gb_str)
return False, f"Invalid disk space value: {available_gb_str}"
# Add 10% buffer for filesystem overhead
required_gb = size_gb * 1.1
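# e.g. a 500 GB nsm_size request is only approved if df reports at least 550 GB available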
log.debug("Hypervisor %s disk space check: Available=%.2fGB, Required=%.2fGB",
hypervisor, available_gb, required_gb)
if available_gb < required_gb:
error_msg = f"Insufficient disk space on hypervisor {hypervisor}. Available: {available_gb:.2f}GB, Required: {required_gb:.2f}GB (including 10% overhead)"
log.error(error_msg)
return False, error_msg
log.info("Hypervisor %s has sufficient disk space for %dGB volume", hypervisor, size_gb)
return True, None
except Exception as e:
log.error("Error checking disk space on hypervisor %s: %s", hypervisor, str(e))
return False, f"Error checking disk space: {str(e)}"
def process_vm_creation(hypervisor_path: str, vm_config: dict) -> None:
"""
Process a single VM creation request.
@@ -695,6 +824,33 @@ def process_vm_creation(hypervisor_path: str, vm_config: dict) -> None:
log.warning("VM: %s - Both disk and nsm_size specified. disk takes precedence, nsm_size will be ignored.", log.warning("VM: %s - Both disk and nsm_size specified. disk takes precedence, nsm_size will be ignored.",
vm_name) vm_name)
# Check disk space BEFORE creating VM if nsm_size is specified
if has_nsm_size and not has_disk:
size_gb = int(vm_config['nsm_size'])
has_space, space_error = check_hypervisor_disk_space(hypervisor, size_gb)
if not has_space:
log.error("VM: %s - %s", vm_name, space_error)
# Send Volume nsm Create Failed status event
try:
subprocess.run([
'so-salt-emit-vm-deployment-status-event',
'-v', vm_name,
'-H', hypervisor,
'-s', 'Volume nsm Create Failed'
], check=True)
except subprocess.CalledProcessError as e:
log.error("Failed to emit volume create failed event for %s: %s", vm_name, str(e))
mark_invalid_hardware(
hypervisor_path,
vm_name,
vm_config,
{'disk_space': f"Insufficient disk space for {size_gb}GB volume: {space_error}"}
)
return
log.debug("VM: %s - Hypervisor has sufficient space for %dGB volume", vm_name, size_gb)
# Initial hardware validation against model
is_valid, errors = validate_hardware_request(model_config, vm_config)
if not is_valid:
@@ -967,12 +1123,21 @@ def process_hypervisor(hypervisor_path: str) -> None:
if not nodes_config:
log.debug("Empty VMs configuration in %s", vms_file)
# Get existing VMs and track failed VMs separately
existing_vms = set()
failed_vms = set() # VMs with .error files
for file_path in glob.glob(os.path.join(hypervisor_path, '*_*')):
basename = os.path.basename(file_path)
# Skip status files
if basename.endswith('.status'):
continue
# Track VMs with .error files separately
if basename.endswith('.error'):
vm_name = basename[:-6] # Remove '.error' suffix
failed_vms.add(vm_name)
existing_vms.add(vm_name) # Also add to existing to prevent recreation
log.debug(f"Found failed VM with .error file: {vm_name}")
else:
existing_vms.add(basename)
# Process new VMs # Process new VMs
@@ -989,12 +1154,29 @@ def process_hypervisor(hypervisor_path: str) -> None:
# process_vm_creation handles its own locking
process_vm_creation(hypervisor_path, vm_config)
# Process VM deletions (but skip failed VMs that only have .error files)
vms_to_delete = existing_vms - configured_vms
log.debug(f"Existing VMs: {existing_vms}")
log.debug(f"Configured VMs: {configured_vms}")
log.debug(f"Failed VMs: {failed_vms}")
log.debug(f"VMs to delete: {vms_to_delete}") log.debug(f"VMs to delete: {vms_to_delete}")
for vm_name in vms_to_delete: for vm_name in vms_to_delete:
# Skip deletion if VM only has .error file (no actual VM to delete)
if vm_name in failed_vms:
error_file = os.path.join(hypervisor_path, f"{vm_name}.error")
base_file = os.path.join(hypervisor_path, vm_name)
# Only skip if there's no base file (VM never successfully created)
if not os.path.exists(base_file):
log.info(f"Skipping deletion of failed VM {vm_name} (VM never successfully created)")
# Clean up the .error and .status files since VM is no longer configured
if os.path.exists(error_file):
os.remove(error_file)
log.info(f"Removed .error file for unconfigured VM: {vm_name}")
status_file = os.path.join(hypervisor_path, f"{vm_name}.status")
if os.path.exists(status_file):
os.remove(status_file)
log.info(f"Removed .status file for unconfigured VM: {vm_name}")
continue
log.info(f"Initiating deletion process for VM: {vm_name}") log.info(f"Initiating deletion process for VM: {vm_name}")
process_vm_deletion(hypervisor_path, vm_name) process_vm_deletion(hypervisor_path, vm_name)

View File

@@ -3,6 +3,7 @@
{# Define the list of process steps in order (case-sensitive) #}
{% set PROCESS_STEPS = [
'Processing',
'Volume nsm Create Failed',
'IP Configuration',
'Starting Create',
'Executing Deploy Script',

View File

@@ -0,0 +1,51 @@
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
#
# Note: Per the Elastic License 2.0, the second limitation states:
#
# "You may not move, change, disable, or circumvent the license key functionality
# in the software, and you may not remove or obscure any functionality in the
# software that is protected by the license key."
{% if 'vrt' in salt['pillar.get']('features', []) %}
{% do salt.log.info('soc/dyanno/hypervisor/remove_failed_vm: Running') %}
{% set vm_name = pillar.get('vm_name') %}
{% set hypervisor = pillar.get('hypervisor') %}
{% if vm_name and hypervisor %}
{% set vm_parts = vm_name.split('_') %}
{% if vm_parts | length >= 2 %}
{% set vm_role = vm_parts[-1] %}
{% set vm_hostname = '_'.join(vm_parts[:-1]) %}
{% set vms_file = '/opt/so/saltstack/local/salt/hypervisor/hosts/' ~ hypervisor ~ 'VMs' %}
{% do salt.log.info('soc/dyanno/hypervisor/remove_failed_vm: Removing VM ' ~ vm_name ~ ' from ' ~ vms_file) %}
remove_vm_{{ vm_name }}_from_vms_file:
module.run:
- name: hypervisor.remove_vm_from_vms_file
- vms_file_path: {{ vms_file }}
- vm_hostname: {{ vm_hostname }}
- vm_role: {{ vm_role }}
{% else %}
{% do salt.log.error('soc/dyanno/hypervisor/remove_failed_vm: Invalid vm_name format: ' ~ vm_name) %}
{% endif %}
{% else %}
{% do salt.log.error('soc/dyanno/hypervisor/remove_failed_vm: Missing required pillar data (vm_name or hypervisor)') %}
{% endif %}
{% do salt.log.info('soc/dyanno/hypervisor/remove_failed_vm: Completed') %}
{% else %}
{% do salt.log.error(
'Hypervisor nodes are a feature supported only for customers with a valid license. '
'Contact Security Onion Solutions, LLC via our website at https://securityonionsolutions.com '
'for more information about purchasing a license to enable this feature.'
) %}
{% endif %}
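{# Illustrative only: this state reads vm_name and hypervisor from pillar, so an ad hoc test
   run from the manager could look like the following (placeholder hostnames, and assuming the
   file renders as soc.dyanno.hypervisor.remove_failed_vm, as the log prefixes above suggest):
   salt-call state.apply soc.dyanno.hypervisor.remove_failed_vm pillar='{"vm_name": "node1_nsm", "hypervisor": "hypervisor1"}'
   In normal operation SOC supplies this pillar data when it triggers cleanup of a failed VM. #}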