From 86eca53d4b39b9370aa3790be366844be0d3c253 Mon Sep 17 00:00:00 2001
From: Josh Patterson
Date: Wed, 1 Oct 2025 14:57:25 -0400
Subject: [PATCH 1/3] support for byodmodel

---
 salt/hypervisor/map.jinja                        | 6 ++++--
 salt/salt/engines/master/virtual_node_manager.py | 6 +++---
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/salt/hypervisor/map.jinja b/salt/hypervisor/map.jinja
index dae3985d4..3519f6078 100644
--- a/salt/hypervisor/map.jinja
+++ b/salt/hypervisor/map.jinja
@@ -13,6 +13,7 @@
 
 {# Import defaults.yaml for model hardware capabilities #}
 {% import_yaml 'hypervisor/defaults.yaml' as DEFAULTS %}
+{% set HYPERVISORMERGED = salt['pillar.get']('hypervisor', default=DEFAULTS.hypervisor, merge=True) %}
 
 {# Get hypervisor nodes from pillar #}
 {% set NODES = salt['pillar.get']('hypervisor:nodes', {}) %}
@@ -30,9 +31,10 @@
   {% set model = '' %}
   {% if grains %}
     {% set minion_id = grains.keys() | first %}
-    {% set model = grains[minion_id].get('sosmodel', '') %}
+    {% set model = grains[minion_id].get('sosmodel', grains[minion_id].get('byodmodel', '')) %}
   {% endif %}
-  {% set model_config = DEFAULTS.hypervisor.model.get(model, {}) %}
+
+  {% set model_config = HYPERVISORMERGED.model.get(model, {}) %}
 
   {# Get VM list from VMs file #}
   {% set vms = {} %}
diff --git a/salt/salt/engines/master/virtual_node_manager.py b/salt/salt/engines/master/virtual_node_manager.py
index 88ccede9c..f09aca751 100644
--- a/salt/salt/engines/master/virtual_node_manager.py
+++ b/salt/salt/engines/master/virtual_node_manager.py
@@ -271,7 +271,7 @@ def parse_hardware_indices(hw_value: Any) -> List[int]:
     return indices
 
 def get_hypervisor_model(hypervisor: str) -> str:
-    """Get sosmodel from hypervisor grains."""
+    """Get sosmodel or byodmodel from hypervisor grains."""
     try:
         # Get cached grains using Salt runner
         grains = runner.cmd(
@@ -283,9 +283,9 @@
 
         # Get the first minion ID that matches our hypervisor
         minion_id = next(iter(grains.keys()))
-        model = grains[minion_id].get('sosmodel')
+        model = grains[minion_id].get('sosmodel', grains[minion_id].get('byodmodel', ''))
         if not model:
-            raise ValueError(f"No sosmodel grain found for hypervisor {hypervisor}")
+            raise ValueError(f"No sosmodel or byodmodel grain found for hypervisor {hypervisor}")
 
         log.debug("Found model %s for hypervisor %s", model, hypervisor)
         return model

From 7deef44ff61603d1bce8dad2a667681b2d3e020f Mon Sep 17 00:00:00 2001
From: Josh Patterson
Date: Thu, 2 Oct 2025 11:55:50 -0400
Subject: [PATCH 2/3] check defaults or pillar file

---
 .../engines/master/virtual_node_manager.py | 41 +++++++++++++++++--
 1 file changed, 37 insertions(+), 4 deletions(-)

diff --git a/salt/salt/engines/master/virtual_node_manager.py b/salt/salt/engines/master/virtual_node_manager.py
index f09aca751..7783e7c35 100644
--- a/salt/salt/engines/master/virtual_node_manager.py
+++ b/salt/salt/engines/master/virtual_node_manager.py
@@ -161,6 +161,7 @@ DEFAULT_BASE_PATH = '/opt/so/saltstack/local/salt/hypervisor/hosts'
 VALID_ROLES = ['sensor', 'searchnode', 'idh', 'receiver', 'heavynode', 'fleet']
 LICENSE_PATH = '/opt/so/saltstack/local/pillar/soc/license.sls'
 DEFAULTS_PATH = '/opt/so/saltstack/default/salt/hypervisor/defaults.yaml'
+HYPERVISOR_PILLAR_PATH = '/opt/so/saltstack/local/pillar/hypervisor/soc_hypervisor.sls'
 
 # Define the retention period for destroyed VMs (in hours)
 DESTROYED_VM_RETENTION_HOURS = 48
@@ -295,16 +296,48 @@ def get_hypervisor_model(hypervisor: str) -> str:
         raise
 
 def load_hardware_defaults(model: str) -> dict:
-    """Load hardware configuration from defaults.yaml."""
+    """Load hardware configuration from defaults.yaml and optionally override with pillar configuration."""
+    config = None
+    config_source = None
+
     try:
+        # First, try to load from defaults.yaml
+        log.debug("Checking for model %s in %s", model, DEFAULTS_PATH)
         defaults = read_yaml_file(DEFAULTS_PATH)
         if not defaults or 'hypervisor' not in defaults:
             raise ValueError("Invalid defaults.yaml structure")
         if 'model' not in defaults['hypervisor']:
             raise ValueError("No model configurations found in defaults.yaml")
-        if model not in defaults['hypervisor']['model']:
-            raise ValueError(f"Model {model} not found in defaults.yaml")
-        return defaults['hypervisor']['model'][model]
+
+        # Check if model exists in defaults
+        if model in defaults['hypervisor']['model']:
+            config = defaults['hypervisor']['model'][model]
+            config_source = DEFAULTS_PATH
+            log.debug("Found model %s in %s", model, DEFAULTS_PATH)
+
+        # Then, try to load from pillar file (if it exists)
+        try:
+            log.debug("Checking for model %s in %s", model, HYPERVISOR_PILLAR_PATH)
+            pillar_config = read_yaml_file(HYPERVISOR_PILLAR_PATH)
+            if pillar_config and 'hypervisor' in pillar_config:
+                if 'model' in pillar_config['hypervisor']:
+                    if model in pillar_config['hypervisor']['model']:
+                        # Override with pillar configuration
+                        config = pillar_config['hypervisor']['model'][model]
+                        config_source = HYPERVISOR_PILLAR_PATH
+                        log.debug("Found model %s in %s (overriding defaults)", model, HYPERVISOR_PILLAR_PATH)
+        except FileNotFoundError:
+            log.debug("Pillar file %s not found, using defaults only", HYPERVISOR_PILLAR_PATH)
+        except Exception as e:
+            log.warning("Failed to read pillar file %s: %s (using defaults)", HYPERVISOR_PILLAR_PATH, str(e))
+
+        # If model was not found in either file, raise an error
+        if config is None:
+            raise ValueError(f"Model {model} not found in {DEFAULTS_PATH} or {HYPERVISOR_PILLAR_PATH}")
+
+        log.debug("Using hardware configuration for model %s from %s", model, config_source)
+        return config
+
     except Exception as e:
         log.error("Failed to load hardware defaults: %s", str(e))
         raise

From 05321cf1edf2becd89ada079e94ef136b68bdd52 Mon Sep 17 00:00:00 2001
From: Josh Patterson
Date: Thu, 2 Oct 2025 15:03:11 -0400
Subject: [PATCH 3/3] add --force-cleanup to nvme raid script

---
 salt/hypervisor/tools/sbin/so-nvme-raid1.sh | 122 +++++++++++++++++++-
 1 file changed, 116 insertions(+), 6 deletions(-)

diff --git a/salt/hypervisor/tools/sbin/so-nvme-raid1.sh b/salt/hypervisor/tools/sbin/so-nvme-raid1.sh
index cc9916a4c..fe96c063b 100644
--- a/salt/hypervisor/tools/sbin/so-nvme-raid1.sh
+++ b/salt/hypervisor/tools/sbin/so-nvme-raid1.sh
@@ -30,7 +30,9 @@
 #
 # WARNING: This script will DESTROY all data on the target drives!
 #
-# USAGE: sudo ./so-nvme-raid1.sh
+# USAGE:
+#   sudo ./so-nvme-raid1.sh                  # Normal operation
+#   sudo ./so-nvme-raid1.sh --force-cleanup  # Force cleanup of existing RAID
 #
 #################################################################
 
@@ -41,6 +43,19 @@ set -e
 RAID_ARRAY_NAME="md0"
 RAID_DEVICE="/dev/${RAID_ARRAY_NAME}"
 MOUNT_POINT="/nsm"
+FORCE_CLEANUP=false
+
+# Parse command line arguments
+for arg in "$@"; do
+    case $arg in
+        --force-cleanup)
+            FORCE_CLEANUP=true
+            shift
+            ;;
+        *)
+            ;;
+    esac
+done
 
 # Function to log messages
 log() {
@@ -55,6 +70,91 @@ check_root() {
     fi
 }
 
+# Function to force cleanup all RAID components
+force_cleanup_raid() {
+    log "=== FORCE CLEANUP MODE ==="
+    log "This will destroy all RAID configurations and data on target drives!"
+
+    # Stop all MD arrays
+    log "Stopping all MD arrays"
+    mdadm --stop --scan 2>/dev/null || true
+
+    # Wait for arrays to stop
+    sleep 2
+
+    # Remove any running md devices
+    for md in /dev/md*; do
+        if [ -b "$md" ]; then
+            log "Stopping $md"
+            mdadm --stop "$md" 2>/dev/null || true
+        fi
+    done
+
+    # Force cleanup both NVMe drives
+    for device in "/dev/nvme0n1" "/dev/nvme1n1"; do
+        log "Force cleaning $device"
+
+        # Kill any processes using the device
+        fuser -k "${device}"* 2>/dev/null || true
+
+        # Unmount any mounted partitions
+        for part in "${device}"*; do
+            if [ -b "$part" ]; then
+                umount -f "$part" 2>/dev/null || true
+            fi
+        done
+
+        # Force zero RAID superblocks on partitions
+        for part in "${device}"p*; do
+            if [ -b "$part" ]; then
+                log "Zeroing RAID superblock on $part"
+                mdadm --zero-superblock --force "$part" 2>/dev/null || true
+            fi
+        done
+
+        # Zero superblock on the device itself
+        log "Zeroing RAID superblock on $device"
+        mdadm --zero-superblock --force "$device" 2>/dev/null || true
+
+        # Remove LVM physical volumes
+        pvremove -ff -y "$device" 2>/dev/null || true
+
+        # Wipe all filesystem and partition signatures
+        log "Wiping all signatures from $device"
+        wipefs -af "$device" 2>/dev/null || true
+
+        # Overwrite the beginning of the drive (partition table area)
+        log "Clearing partition table on $device"
+        dd if=/dev/zero of="$device" bs=1M count=10 2>/dev/null || true
+
+        # Clear the end of the drive (backup partition table area)
+        local device_size=$(blockdev --getsz "$device" 2>/dev/null || echo "0")
+        if [ "$device_size" -gt 0 ]; then
+            dd if=/dev/zero of="$device" bs=512 seek=$(( device_size - 2048 )) count=2048 2>/dev/null || true
+        fi
+
+        # Force kernel to re-read partition table
+        blockdev --rereadpt "$device" 2>/dev/null || true
+        partprobe -s "$device" 2>/dev/null || true
+    done
+
+    # Clear mdadm configuration
+    log "Clearing mdadm configuration"
+    echo "DEVICE partitions" > /etc/mdadm.conf
+
+    # Remove any fstab entries for the RAID device or mount point
+    log "Cleaning fstab entries"
+    sed -i "\|${RAID_DEVICE}|d" /etc/fstab
+    sed -i "\|${MOUNT_POINT}|d" /etc/fstab
+
+    # Wait for system to settle
+    udevadm settle
+    sleep 5
+
+    log "Force cleanup complete!"
+    log "Proceeding with RAID setup..."
+}
+
 # Function to find MD arrays using specific devices
 find_md_arrays_using_devices() {
     local target_devices=("$@")
@@ -205,10 +305,15 @@ check_existing_raid() {
             fi
 
            log "Error: $device appears to be part of an existing RAID array"
-            log "To reuse this device, you must first:"
-            log "1. Unmount any filesystems"
-            log "2. Stop the RAID array: mdadm --stop $array_name"
-            log "3. Zero the superblock: mdadm --zero-superblock ${device}p1"
+            log "Old RAID metadata detected but array is not running."
+            log ""
+            log "To fix this, run the script with --force-cleanup:"
+            log "  sudo $0 --force-cleanup"
+            log ""
+            log "Or manually clean up with:"
+            log "1. Stop any arrays: mdadm --stop --scan"
+            log "2. Zero superblocks: mdadm --zero-superblock --force ${device}p1"
+            log "3. Wipe signatures: wipefs -af $device"
             exit 1
         fi
     done
@@ -238,7 +343,7 @@ ensure_devices_free() {
         done
 
         # Clear MD superblock
-        mdadm --zero-superblock "${device}"* 2>/dev/null || true
+        mdadm --zero-superblock --force "${device}"* 2>/dev/null || true
 
         # Remove LVM PV if exists
         pvremove -ff -y "$device" 2>/dev/null || true
@@ -263,6 +368,11 @@ main() {
 
     # Check if running as root
     check_root
 
+    # If force cleanup flag is set, do aggressive cleanup first
+    if [ "$FORCE_CLEANUP" = true ]; then
+        force_cleanup_raid
+    fi
+
     # Check for existing RAID setup
     check_existing_raid