Merge remote-tracking branch 'origin/2.4/dev' into idstools-refactor

DefensiveDepth
2025-11-06 10:38:37 -05:00
84 changed files with 3913 additions and 312 deletions
@@ -14,7 +14,7 @@ sool9_{{host}}:
private_key: /etc/ssh/auth_keys/soqemussh/id_ecdsa
sudo: True
deploy_command: sh /tmp/.saltcloud-*/deploy.sh
script_args: -r -F -x python3 stable 3006.9
script_args: -r -F -x python3 stable {{ SALTVERSION }}
minion:
master: {{ grains.host }}
master_port: 4506
+9 -2
@@ -13,6 +13,7 @@
{% if '.'.join(sls.split('.')[:2]) in allowed_states %}
{% if 'vrt' in salt['pillar.get']('features', []) %}
{% set HYPERVISORS = salt['pillar.get']('hypervisor:nodes', {} ) %}
{% from 'salt/map.jinja' import SALTVERSION %}
{% if HYPERVISORS %}
cloud_providers:
@@ -20,7 +21,7 @@ cloud_providers:
- name: /etc/salt/cloud.providers.d/libvirt.conf
- source: salt://salt/cloud/cloud.providers.d/libvirt.conf.jinja
- defaults:
HYPERVISORS: {{HYPERVISORS}}
HYPERVISORS: {{ HYPERVISORS }}
- template: jinja
- makedirs: True
@@ -29,11 +30,17 @@ cloud_profiles:
- name: /etc/salt/cloud.profiles.d/socloud.conf
- source: salt://salt/cloud/cloud.profiles.d/socloud.conf.jinja
- defaults:
HYPERVISORS: {{HYPERVISORS}}
HYPERVISORS: {{ HYPERVISORS }}
MANAGERHOSTNAME: {{ grains.host }}
MANAGERIP: {{ pillar.host.mainip }}
SALTVERSION: {{ SALTVERSION }}
- template: jinja
- makedirs: True
{% else %}
no_hypervisors_configured:
test.succeed_without_changes:
- name: no_hypervisors_configured
- comment: No hypervisors are configured
{% endif %}
{% else %}
+275 -18
@@ -117,7 +117,7 @@ Exit Codes:
4: VM provisioning failure (so-salt-cloud execution failed)
Logging:
Log files are written to /opt/so/log/salt/engines/virtual_node_manager.log
Log files are written to /opt/so/log/salt/engines/virtual_node_manager
Comprehensive logging includes:
- Hardware validation details
- PCI ID conversion process
@@ -138,29 +138,56 @@ import pwd
import grp
import salt.config
import salt.runner
import salt.client
from typing import Dict, List, Optional, Tuple, Any
from datetime import datetime, timedelta
from threading import Lock
# Get socore uid/gid
SOCORE_UID = pwd.getpwnam('socore').pw_uid
SOCORE_GID = grp.getgrnam('socore').gr_gid
# Initialize Salt runner once
# Initialize Salt runner and local client once
opts = salt.config.master_config('/etc/salt/master')
opts['output'] = 'json'
runner = salt.runner.RunnerClient(opts)
local = salt.client.LocalClient()
# Get socore uid/gid for file ownership
SOCORE_UID = pwd.getpwnam('socore').pw_uid
SOCORE_GID = grp.getgrnam('socore').gr_gid
# Configure logging
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
# Prevent propagation to parent loggers to avoid duplicate log entries
log.propagate = False
# Add file handler for dedicated log file
log_dir = '/opt/so/log/salt'
log_file = os.path.join(log_dir, 'virtual_node_manager')
# Create log directory if it doesn't exist
os.makedirs(log_dir, exist_ok=True)
# Create file handler
file_handler = logging.FileHandler(log_file)
file_handler.setLevel(logging.DEBUG)
# Create formatter
formatter = logging.Formatter(
'%(asctime)s [%(name)s:%(lineno)d][%(levelname)-8s][%(process)d] %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
file_handler.setFormatter(formatter)
# Add handler to logger
log.addHandler(file_handler)
# Constants
DEFAULT_INTERVAL = 30
DEFAULT_BASE_PATH = '/opt/so/saltstack/local/salt/hypervisor/hosts'
VALID_ROLES = ['sensor', 'searchnode', 'idh', 'receiver', 'heavynode', 'fleet']
LICENSE_PATH = '/opt/so/saltstack/local/pillar/soc/license.sls'
DEFAULTS_PATH = '/opt/so/saltstack/default/salt/hypervisor/defaults.yaml'
HYPERVISOR_PILLAR_PATH = '/opt/so/saltstack/local/pillar/hypervisor/soc_hypervisor.sls'
# Define the retention period for destroyed VMs (in hours)
DESTROYED_VM_RETENTION_HOURS = 48
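As an illustration of how the retention window could be applied (a minimal sketch; the cleanup routine that consumes this constant is not part of this hunk, and the 'destroyed_at' field name is hypothetical):
def is_retention_expired(record: dict) -> bool:
    # Hypothetical record shape: {'destroyed_at': '2025-11-06T10:38:37'} (ISO-8601 string).
    destroyed_at = datetime.fromisoformat(record['destroyed_at'])
    return datetime.now() - destroyed_at > timedelta(hours=DESTROYED_VM_RETENTION_HOURS)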
@@ -202,6 +229,39 @@ def write_json_file(file_path: str, data: Any) -> None:
except Exception as e:
log.error("Failed to write JSON file %s: %s", file_path, str(e))
raise
def remove_vm_from_vms_file(vms_file_path: str, vm_hostname: str, vm_role: str) -> bool:
"""
Remove a VM entry from the hypervisorVMs file.
Args:
vms_file_path: Path to the hypervisorVMs file
vm_hostname: Hostname of the VM to remove (without role suffix)
vm_role: Role of the VM
Returns:
bool: True if VM was removed, False otherwise
"""
try:
# Read current VMs
vms = read_json_file(vms_file_path)
# Find and remove the VM entry
original_count = len(vms)
vms = [vm for vm in vms if not (vm.get('hostname') == vm_hostname and vm.get('role') == vm_role)]
if len(vms) < original_count:
# VM was found and removed, write back to file
write_json_file(vms_file_path, vms)
log.info("Removed VM %s_%s from %s", vm_hostname, vm_role, vms_file_path)
return True
else:
log.warning("VM %s_%s not found in %s", vm_hostname, vm_role, vms_file_path)
return False
except Exception as e:
log.error("Failed to remove VM %s_%s from %s: %s", vm_hostname, vm_role, vms_file_path, str(e))
return False
def read_yaml_file(file_path: str) -> dict:
"""Read and parse a YAML file."""
@@ -271,7 +331,7 @@ def parse_hardware_indices(hw_value: Any) -> List[int]:
return indices
def get_hypervisor_model(hypervisor: str) -> str:
"""Get sosmodel from hypervisor grains."""
"""Get sosmodel or byodmodel from hypervisor grains."""
try:
# Get cached grains using Salt runner
grains = runner.cmd(
@@ -283,9 +343,9 @@ def get_hypervisor_model(hypervisor: str) -> str:
# Get the first minion ID that matches our hypervisor
minion_id = next(iter(grains.keys()))
model = grains[minion_id].get('sosmodel')
model = grains[minion_id].get('sosmodel', grains[minion_id].get('byodmodel', ''))
if not model:
raise ValueError(f"No sosmodel grain found for hypervisor {hypervisor}")
raise ValueError(f"No sosmodel or byodmodel grain found for hypervisor {hypervisor}")
log.debug("Found model %s for hypervisor %s", model, hypervisor)
return model
@@ -295,16 +355,48 @@ def get_hypervisor_model(hypervisor: str) -> str:
raise
def load_hardware_defaults(model: str) -> dict:
"""Load hardware configuration from defaults.yaml."""
"""Load hardware configuration from defaults.yaml and optionally override with pillar configuration."""
config = None
config_source = None
try:
# First, try to load from defaults.yaml
log.debug("Checking for model %s in %s", model, DEFAULTS_PATH)
defaults = read_yaml_file(DEFAULTS_PATH)
if not defaults or 'hypervisor' not in defaults:
raise ValueError("Invalid defaults.yaml structure")
if 'model' not in defaults['hypervisor']:
raise ValueError("No model configurations found in defaults.yaml")
if model not in defaults['hypervisor']['model']:
raise ValueError(f"Model {model} not found in defaults.yaml")
return defaults['hypervisor']['model'][model]
# Check if model exists in defaults
if model in defaults['hypervisor']['model']:
config = defaults['hypervisor']['model'][model]
config_source = DEFAULTS_PATH
log.debug("Found model %s in %s", model, DEFAULTS_PATH)
# Then, try to load from pillar file (if it exists)
try:
log.debug("Checking for model %s in %s", model, HYPERVISOR_PILLAR_PATH)
pillar_config = read_yaml_file(HYPERVISOR_PILLAR_PATH)
if pillar_config and 'hypervisor' in pillar_config:
if 'model' in pillar_config['hypervisor']:
if model in pillar_config['hypervisor']['model']:
# Override with pillar configuration
config = pillar_config['hypervisor']['model'][model]
config_source = HYPERVISOR_PILLAR_PATH
log.debug("Found model %s in %s (overriding defaults)", model, HYPERVISOR_PILLAR_PATH)
except FileNotFoundError:
log.debug("Pillar file %s not found, using defaults only", HYPERVISOR_PILLAR_PATH)
except Exception as e:
log.warning("Failed to read pillar file %s: %s (using defaults)", HYPERVISOR_PILLAR_PATH, str(e))
# If model was not found in either file, raise an error
if config is None:
raise ValueError(f"Model {model} not found in {DEFAULTS_PATH} or {HYPERVISOR_PILLAR_PATH}")
log.debug("Using hardware configuration for model %s from %s", model, config_source)
return config
except Exception as e:
log.error("Failed to load hardware defaults: %s", str(e))
raise
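Both defaults.yaml and soc_hypervisor.sls are read with the same hypervisor -> model -> <model> lookup, so each is expected to parse into a structure like the sketch below (the model name and hardware values are hypothetical; the keys mirror the cpu/memory/sfp/copper fields used later in this engine):
# What read_yaml_file() is assumed to return for either file (illustrative values only).
parsed = {
    'hypervisor': {
        'model': {
            'EXAMPLEMODEL': {
                'cpu': 16,
                'memory': 64,
                'sfp': [0, 1],
                'copper': [0],
            }
        }
    }
}
config = parsed['hypervisor']['model']['EXAMPLEMODEL']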
@@ -525,6 +617,13 @@ def mark_vm_failed(vm_file: str, error_code: int, message: str) -> None:
# Remove the original file since we'll create an error file
os.remove(vm_file)
# Clear hardware resource claims so failed VMs don't consume resources
# Keep nsm_size for reference but clear cpu, memory, sfp, copper
config.pop('cpu', None)
config.pop('memory', None)
config.pop('sfp', None)
config.pop('copper', None)
# Create error file
error_file = f"{vm_file}.error"
data = {
@@ -553,8 +652,16 @@ def mark_invalid_hardware(hypervisor_path: str, vm_name: str, config: dict, erro
# Join all messages with proper sentence structure
full_message = "Hardware validation failure: " + " ".join(error_messages)
# Clear hardware resource claims so failed VMs don't consume resources
# Keep nsm_size for reference but clear cpu, memory, sfp, copper
config_copy = config.copy()
config_copy.pop('cpu', None)
config_copy.pop('memory', None)
config_copy.pop('sfp', None)
config_copy.pop('copper', None)
data = {
'config': config,
'config': config_copy,
'status': 'error',
'timestamp': datetime.now().isoformat(),
'error_details': {
@@ -601,6 +708,61 @@ def validate_vrt_license() -> bool:
log.error("Error reading license file: %s", str(e))
return False
def check_hypervisor_disk_space(hypervisor: str, size_gb: int) -> Tuple[bool, Optional[str]]:
"""
Check if hypervisor has sufficient disk space for volume creation.
Args:
hypervisor: Hypervisor hostname
size_gb: Required size in GB
Returns:
Tuple of (has_space, error_message)
"""
try:
# Get hypervisor minion ID
hypervisor_minion = f"{hypervisor}_hypervisor"
# Check disk space on /nsm/libvirt/volumes using LocalClient
result = local.cmd(
hypervisor_minion,
'cmd.run',
["df -BG /nsm/libvirt/volumes | tail -1 | awk '{print $4}' | sed 's/G//'"]
)
if not result or hypervisor_minion not in result:
log.error("Failed to check disk space on hypervisor %s", hypervisor)
return False, "Failed to check disk space on hypervisor"
available_gb_str = result[hypervisor_minion].strip()
if not available_gb_str:
log.error("Empty disk space response from hypervisor %s", hypervisor)
return False, "Failed to get disk space information"
try:
available_gb = float(available_gb_str)
except ValueError:
log.error("Invalid disk space value from hypervisor %s: %s", hypervisor, available_gb_str)
return False, f"Invalid disk space value: {available_gb_str}"
# Add 10% buffer for filesystem overhead
required_gb = size_gb * 1.1
log.debug("Hypervisor %s disk space check: Available=%.2fGB, Required=%.2fGB",
hypervisor, available_gb, required_gb)
if available_gb < required_gb:
error_msg = f"Insufficient disk space on hypervisor {hypervisor}. Available: {available_gb:.2f}GB, Required: {required_gb:.2f}GB (including 10% overhead)"
log.error(error_msg)
return False, error_msg
log.info("Hypervisor %s has sufficient disk space for %dGB volume", hypervisor, size_gb)
return True, None
except Exception as e:
log.error("Error checking disk space on hypervisor %s: %s", hypervisor, str(e))
return False, f"Error checking disk space: {str(e)}"
def process_vm_creation(hypervisor_path: str, vm_config: dict) -> None:
"""
Process a single VM creation request.
@@ -633,6 +795,62 @@ def process_vm_creation(hypervisor_path: str, vm_config: dict) -> None:
except subprocess.CalledProcessError as e:
logger.error(f"Failed to emit success status event: {e}")
# Validate nsm_size if present
if 'nsm_size' in vm_config:
try:
size = int(vm_config['nsm_size'])
if size <= 0:
log.error("VM: %s - nsm_size must be a positive integer, got: %d", vm_name, size)
mark_invalid_hardware(hypervisor_path, vm_name, vm_config,
{'nsm_size': 'Invalid nsm_size: must be positive integer'})
return
if size > 10000: # 10TB reasonable maximum
log.error("VM: %s - nsm_size %dGB exceeds reasonable maximum (10000GB)", vm_name, size)
mark_invalid_hardware(hypervisor_path, vm_name, vm_config,
{'nsm_size': f'Invalid nsm_size: {size}GB exceeds maximum (10000GB)'})
return
log.debug("VM: %s - nsm_size validated: %dGB", vm_name, size)
except (ValueError, TypeError) as e:
log.error("VM: %s - nsm_size must be a valid integer, got: %s", vm_name, vm_config.get('nsm_size'))
mark_invalid_hardware(hypervisor_path, vm_name, vm_config,
{'nsm_size': 'Invalid nsm_size: must be valid integer'})
return
# Check for conflicting storage configurations
has_disk = 'disk' in vm_config and vm_config['disk']
has_nsm_size = 'nsm_size' in vm_config and vm_config['nsm_size']
if has_disk and has_nsm_size:
log.warning("VM: %s - Both disk and nsm_size specified. disk takes precedence, nsm_size will be ignored.",
vm_name)
# Check disk space BEFORE creating VM if nsm_size is specified
if has_nsm_size and not has_disk:
size_gb = int(vm_config['nsm_size'])
has_space, space_error = check_hypervisor_disk_space(hypervisor, size_gb)
if not has_space:
log.error("VM: %s - %s", vm_name, space_error)
# Send Hypervisor NSM Disk Full status event
try:
subprocess.run([
'so-salt-emit-vm-deployment-status-event',
'-v', vm_name,
'-H', hypervisor,
'-s', 'Hypervisor NSM Disk Full'
], check=True)
except subprocess.CalledProcessError as e:
log.error("Failed to emit volume create failed event for %s: %s", vm_name, str(e))
mark_invalid_hardware(
hypervisor_path,
vm_name,
vm_config,
{'disk_space': f"Insufficient disk space for {size_gb}GB volume: {space_error}"}
)
return
log.debug("VM: %s - Hypervisor has sufficient space for %dGB volume", vm_name, size_gb)
# Initial hardware validation against model
is_valid, errors = validate_hardware_request(model_config, vm_config)
if not is_valid:
@@ -668,6 +886,11 @@ def process_vm_creation(hypervisor_path: str, vm_config: dict) -> None:
if 'memory' in vm_config:
memory_mib = int(vm_config['memory']) * 1024
cmd.extend(['-m', str(memory_mib)])
# Add nsm_size if specified and disk is not specified
if 'nsm_size' in vm_config and vm_config['nsm_size'] and not ('disk' in vm_config and vm_config['disk']):
cmd.extend(['--nsm-size', str(vm_config['nsm_size'])])
log.debug("VM: %s - Adding nsm_size parameter: %s", vm_name, vm_config['nsm_size'])
# Add PCI devices
for hw_type in ['disk', 'copper', 'sfp']:
@@ -900,12 +1123,21 @@ def process_hypervisor(hypervisor_path: str) -> None:
if not nodes_config:
log.debug("Empty VMs configuration in %s", vms_file)
# Get existing VMs
# Get existing VMs and track failed VMs separately
existing_vms = set()
failed_vms = set() # VMs with .error files
for file_path in glob.glob(os.path.join(hypervisor_path, '*_*')):
basename = os.path.basename(file_path)
# Skip error and status files
if not basename.endswith('.error') and not basename.endswith('.status'):
# Skip status files
if basename.endswith('.status'):
continue
# Track VMs with .error files separately
if basename.endswith('.error'):
vm_name = basename[:-6] # Remove '.error' suffix
failed_vms.add(vm_name)
existing_vms.add(vm_name) # Also add to existing to prevent recreation
log.debug(f"Found failed VM with .error file: {vm_name}")
else:
existing_vms.add(basename)
# Process new VMs
@@ -922,12 +1154,37 @@ def process_hypervisor(hypervisor_path: str) -> None:
# process_vm_creation handles its own locking
process_vm_creation(hypervisor_path, vm_config)
# Process VM deletions
# Process VM deletions (but skip failed VMs that only have .error files)
vms_to_delete = existing_vms - configured_vms
log.debug(f"Existing VMs: {existing_vms}")
log.debug(f"Configured VMs: {configured_vms}")
log.debug(f"Failed VMs: {failed_vms}")
log.debug(f"VMs to delete: {vms_to_delete}")
for vm_name in vms_to_delete:
# Skip deletion if VM only has .error file (no actual VM to delete)
if vm_name in failed_vms:
error_file = os.path.join(hypervisor_path, f"{vm_name}.error")
base_file = os.path.join(hypervisor_path, vm_name)
# Only skip if there's no base file (VM never successfully created)
if not os.path.exists(base_file):
log.info(f"Skipping deletion of failed VM {vm_name} (VM never successfully created)")
# Clean up the .error and .status files since VM is no longer configured
if os.path.exists(error_file):
os.remove(error_file)
log.info(f"Removed .error file for unconfigured VM: {vm_name}")
status_file = os.path.join(hypervisor_path, f"{vm_name}.status")
if os.path.exists(status_file):
os.remove(status_file)
log.info(f"Removed .status file for unconfigured VM: {vm_name}")
# Trigger hypervisor annotation update to reflect the removal
try:
log.info(f"Triggering hypervisor annotation update after removing failed VM: {vm_name}")
runner.cmd('state.orch', ['orch.dyanno_hypervisor'])
except Exception as e:
log.error(f"Failed to trigger hypervisor annotation update for {vm_name}: {str(e)}")
continue
log.info(f"Initiating deletion process for VM: {vm_name}")
process_vm_deletion(hypervisor_path, vm_name)
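A compact sketch of the reconciliation above, using hypothetical VM names to show why a failed VM with only an .error file is cleaned up rather than deleted:
# Hypothetical state: one healthy VM, one VM that failed creation (only an .error file on disk).
existing_vms = {'sensor01_sensor', 'search01_searchnode'}
failed_vms = {'search01_searchnode'}
configured_vms = {'sensor01_sensor'}
vms_to_delete = existing_vms - configured_vms      # {'search01_searchnode'}
# search01_searchnode is in failed_vms and has no base file, so only its stale
# .error/.status files are removed and process_vm_deletion() is never called.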
+4 -1
@@ -4,7 +4,10 @@
Elastic License 2.0. #}
{% set role = salt['grains.get']('role', '') %}
{% if role in ['so-hypervisor','so-managerhype'] and salt['network.ip_addrs']('br0')|length > 0 %}
{# usebr0 is used mostly during setup of the so-managerhype node, to control whether br0 or the physical interface is used #}
{% set usebr0 = salt['pillar.get']('usebr0', True) %}
{% if role in ['so-hypervisor','so-managerhype'] and usebr0 %}
{% set interface = 'br0' %}
{% else %}
{% set interface = pillar.host.mainint %}
+1 -1
@@ -1,4 +1,4 @@
# version cannot be used elsewhere in this pillar as soup is grepping for it to determine if Salt needs to be patched
salt:
master:
version: '3006.9'
version: '3006.16'
+1 -1
@@ -3,7 +3,7 @@
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
# this state was seperated from salt.minion state since it is called during setup
# this state was separated from salt.minion state since it is called during setup
# GLOBALS are imported in the salt.minion state and that is not available at that point in setup
# this state is included in the salt.minion state
+1 -1
@@ -1,5 +1,5 @@
# version cannot be used elsewhere in this pillar as soup is grepping for it to determine if Salt needs to be patched
salt:
minion:
version: '3006.9'
version: '3006.16'
check_threshold: 3600 # in seconds, threshold used for so-salt-minion-check. any value less than 600 seconds may cause a lot of salt-minion restarts since the job to touch the file occurs every 5-8 minutes by default
@@ -1,18 +1,22 @@
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
{% from 'vars/globals.map.jinja' import GLOBALS %}
{% from 'salt/map.jinja' import UPGRADECOMMAND with context %}
{% from 'salt/map.jinja' import SALTVERSION %}
{% from 'salt/map.jinja' import INSTALLEDSALTVERSION %}
{% from 'salt/map.jinja' import SALTPACKAGES %}
{% from 'salt/map.jinja' import SYSTEMD_UNIT_FILE %}
{% import_yaml 'salt/minion.defaults.yaml' as SALTMINION %}
include:
- salt.python_modules
- salt.patch.x509_v2
- salt
- systemd.reload
- repo.client
- salt.mine_functions
- salt.minion.service_file
{% if GLOBALS.role in GLOBALS.manager_roles %}
- ca
{% endif %}
@@ -94,17 +98,6 @@ enable_startup_states:
- regex: '^startup_states: highstate$'
- unless: pgrep so-setup
# prior to 2.4.30 this managed file would restart the salt-minion service when updated
# since this file currently only adds a delay to the service start
# it is not required to restart the service
salt_minion_service_unit_file:
file.managed:
- name: {{ SYSTEMD_UNIT_FILE }}
- source: salt://salt/service/salt-minion.service.jinja
- template: jinja
- onchanges_in:
- module: systemd_reload
{% endif %}
# this has to be outside the if statement above since there are <requisite>_in calls to this state
+26
@@ -0,0 +1,26 @@
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
{% from 'salt/map.jinja' import SALTVERSION %}
{% from 'salt/map.jinja' import INSTALLEDSALTVERSION %}
{% from 'salt/map.jinja' import SYSTEMD_UNIT_FILE %}
include:
- systemd.reload
{% if INSTALLEDSALTVERSION|string == SALTVERSION|string %}
# prior to 2.4.30 this managed file would restart the salt-minion service when updated
# since this file currently only adds a delay to the service start
# it is not required to restart the service
salt_minion_service_unit_file:
file.managed:
- name: {{ SYSTEMD_UNIT_FILE }}
- source: salt://salt/service/salt-minion.service.jinja
- template: jinja
- onchanges_in:
- module: systemd_reload
{% endif %}