diff --git a/salt/elasticfleet/tools/sbin_jinja/so-elastic-fleet-outputs-update b/salt/elasticfleet/tools/sbin_jinja/so-elastic-fleet-outputs-update
index 43eef6ee9..9efe8a19d 100644
--- a/salt/elasticfleet/tools/sbin_jinja/so-elastic-fleet-outputs-update
+++ b/salt/elasticfleet/tools/sbin_jinja/so-elastic-fleet-outputs-update
@@ -15,8 +15,21 @@ if ! is_manager_node; then
 fi
 
 function update_logstash_outputs() {
-  # Generate updated JSON payload
-  JSON_STRING=$(jq -n --arg UPDATEDLIST $NEW_LIST_JSON '{"name":"grid-logstash","type":"logstash","hosts": $UPDATEDLIST,"is_default":true,"is_default_monitoring":true,"config_yaml":""}')
+  if logstash_policy=$(curl -K /opt/so/conf/elasticsearch/curl.config -L "http://localhost:5601/api/fleet/outputs/so-manager_logstash" --retry 3 --retry-delay 10 --fail 2>/dev/null); then
+    SSL_CONFIG=$(echo "$logstash_policy" | jq -r '.item.ssl')
+    if SECRETS=$(echo "$logstash_policy" | jq -er '.item.secrets' 2>/dev/null); then
+      JSON_STRING=$(jq -n \
+        --arg UPDATEDLIST "$NEW_LIST_JSON" \
+        --argjson SECRETS "$SECRETS" \
+        --argjson SSL_CONFIG "$SSL_CONFIG" \
+        '{"name":"grid-logstash","type":"logstash","hosts": $UPDATEDLIST,"is_default":true,"is_default_monitoring":true,"config_yaml":"","ssl": $SSL_CONFIG,"secrets": $SECRETS}')
+    else
+      JSON_STRING=$(jq -n \
+        --arg UPDATEDLIST "$NEW_LIST_JSON" \
+        --argjson SSL_CONFIG "$SSL_CONFIG" \
+        '{"name":"grid-logstash","type":"logstash","hosts": $UPDATEDLIST,"is_default":true,"is_default_monitoring":true,"config_yaml":"","ssl": $SSL_CONFIG}')
+    fi
+  fi
 
   # Update Logstash Outputs
   curl -K /opt/so/conf/elasticsearch/curl.config -L -X PUT "localhost:5601/api/fleet/outputs/so-manager_logstash" -H 'kbn-xsrf: true' -H 'Content-Type: application/json' -d "$JSON_STRING" | jq
diff --git a/salt/elasticfleet/tools/sbin_jinja/so-elastic-fleet-setup b/salt/elasticfleet/tools/sbin_jinja/so-elastic-fleet-setup
index ab6757893..446fc6c9a 100755
--- a/salt/elasticfleet/tools/sbin_jinja/so-elastic-fleet-setup
+++ b/salt/elasticfleet/tools/sbin_jinja/so-elastic-fleet-setup
@@ -127,7 +127,7 @@ JSON_STRING=$( jq -n \
   --arg LOGSTASHCRT "$LOGSTASHCRT" \
   --arg LOGSTASHKEY "$LOGSTASHKEY" \
   --arg LOGSTASHCA "$LOGSTASHCA" \
-  '{"name":"grid-logstash","is_default":true,"is_default_monitoring":true,"id":"so-manager_logstash","type":"logstash","hosts":["{{ GLOBALS.manager_ip }}:5055", "{{ GLOBALS.manager }}:5055"],"config_yaml":"","ssl":{"certificate": $LOGSTASHCRT,"key": $LOGSTASHKEY,"certificate_authorities":[ $LOGSTASHCA ]},"proxy_id":null}'
+  '{"name":"grid-logstash","is_default":true,"is_default_monitoring":true,"id":"so-manager_logstash","type":"logstash","hosts":["{{ GLOBALS.manager_ip }}:5055", "{{ GLOBALS.manager }}:5055"],"config_yaml":"","ssl":{"certificate": $LOGSTASHCRT,"certificate_authorities":[ $LOGSTASHCA ]},"secrets":{"ssl":{"key": $LOGSTASHKEY }},"proxy_id":null}'
 )
 if ! fleet_api "outputs" -XPOST -H 'kbn-xsrf: true' -H 'Content-Type: application/json' -d "$JSON_STRING"; then
   echo -e "\nFailed to create logstash fleet output"
diff --git a/salt/hypervisor/map.jinja b/salt/hypervisor/map.jinja
index dae3985d4..3519f6078 100644
--- a/salt/hypervisor/map.jinja
+++ b/salt/hypervisor/map.jinja
@@ -13,6 +13,7 @@
 
 {# Import defaults.yaml for model hardware capabilities #}
 {% import_yaml 'hypervisor/defaults.yaml' as DEFAULTS %}
+{% set HYPERVISORMERGED = salt['pillar.get']('hypervisor', default=DEFAULTS.hypervisor, merge=True) %}
 
 {# Get hypervisor nodes from pillar #}
 {% set NODES = salt['pillar.get']('hypervisor:nodes', {}) %}
@@ -30,9 +31,10 @@
   {% set model = '' %}
   {% if grains %}
     {% set minion_id = grains.keys() | first %}
-    {% set model = grains[minion_id].get('sosmodel', '') %}
+    {% set model = grains[minion_id].get('sosmodel', grains[minion_id].get('byodmodel', '')) %}
   {% endif %}
-  {% set model_config = DEFAULTS.hypervisor.model.get(model, {}) %}
+
+  {% set model_config = HYPERVISORMERGED.model.get(model, {}) %}
 
   {# Get VM list from VMs file #}
   {% set vms = {} %}
diff --git a/salt/hypervisor/tools/sbin/so-nvme-raid1.sh b/salt/hypervisor/tools/sbin/so-nvme-raid1.sh
index cc9916a4c..fe96c063b 100644
--- a/salt/hypervisor/tools/sbin/so-nvme-raid1.sh
+++ b/salt/hypervisor/tools/sbin/so-nvme-raid1.sh
@@ -30,7 +30,9 @@
 #
 # WARNING: This script will DESTROY all data on the target drives!
 #
-# USAGE: sudo ./so-nvme-raid1.sh
+# USAGE:
+#   sudo ./so-nvme-raid1.sh                  # Normal operation
+#   sudo ./so-nvme-raid1.sh --force-cleanup  # Force cleanup of existing RAID
 #
 #################################################################
 
@@ -41,6 +43,19 @@ set -e
 RAID_ARRAY_NAME="md0"
 RAID_DEVICE="/dev/${RAID_ARRAY_NAME}"
 MOUNT_POINT="/nsm"
+FORCE_CLEANUP=false
+
+# Parse command line arguments
+for arg in "$@"; do
+  case $arg in
+    --force-cleanup)
+      FORCE_CLEANUP=true
+      shift
+      ;;
+    *)
+      ;;
+  esac
+done
 
 # Function to log messages
 log() {
@@ -55,6 +70,91 @@ check_root() {
   fi
 }
 
+# Function to force cleanup all RAID components
+force_cleanup_raid() {
+  log "=== FORCE CLEANUP MODE ==="
+  log "This will destroy all RAID configurations and data on target drives!"
+
+  # Stop all MD arrays
+  log "Stopping all MD arrays"
+  mdadm --stop --scan 2>/dev/null || true
+
+  # Wait for arrays to stop
+  sleep 2
+
+  # Remove any running md devices
+  for md in /dev/md*; do
+    if [ -b "$md" ]; then
+      log "Stopping $md"
+      mdadm --stop "$md" 2>/dev/null || true
+    fi
+  done
+
+  # Force cleanup both NVMe drives
+  for device in "/dev/nvme0n1" "/dev/nvme1n1"; do
+    log "Force cleaning $device"
+
+    # Kill any processes using the device
+    fuser -k "${device}"* 2>/dev/null || true
+
+    # Unmount any mounted partitions
+    for part in "${device}"*; do
+      if [ -b "$part" ]; then
+        umount -f "$part" 2>/dev/null || true
+      fi
+    done
+
+    # Force zero RAID superblocks on partitions
+    for part in "${device}"p*; do
+      if [ -b "$part" ]; then
+        log "Zeroing RAID superblock on $part"
+        mdadm --zero-superblock --force "$part" 2>/dev/null || true
+      fi
+    done
+
+    # Zero superblock on the device itself
+    log "Zeroing RAID superblock on $device"
+    mdadm --zero-superblock --force "$device" 2>/dev/null || true
+
+    # Remove LVM physical volumes
+    pvremove -ff -y "$device" 2>/dev/null || true
+
+    # Wipe all filesystem and partition signatures
+    log "Wiping all signatures from $device"
+    wipefs -af "$device" 2>/dev/null || true
+
+    # Overwrite the beginning of the drive (partition table area)
+    log "Clearing partition table on $device"
+    dd if=/dev/zero of="$device" bs=1M count=10 2>/dev/null || true
+
+    # Clear the end of the drive (backup partition table area)
+    local device_size=$(blockdev --getsz "$device" 2>/dev/null || echo "0")
+    if [ "$device_size" -gt 0 ]; then
+      dd if=/dev/zero of="$device" bs=512 seek=$(( device_size - 2048 )) count=2048 2>/dev/null || true
+    fi
+
+    # Force kernel to re-read partition table
+    blockdev --rereadpt "$device" 2>/dev/null || true
+    partprobe -s "$device" 2>/dev/null || true
+  done
+
+  # Clear mdadm configuration
+  log "Clearing mdadm configuration"
+  echo "DEVICE partitions" > /etc/mdadm.conf
+
+  # Remove any fstab entries for the RAID device or mount point
+  log "Cleaning fstab entries"
+  sed -i "\|${RAID_DEVICE}|d" /etc/fstab
+  sed -i "\|${MOUNT_POINT}|d" /etc/fstab
+
+  # Wait for system to settle
+  udevadm settle
+  sleep 5
+
+  log "Force cleanup complete!"
+  log "Proceeding with RAID setup..."
+}
+
 # Function to find MD arrays using specific devices
 find_md_arrays_using_devices() {
   local target_devices=("$@")
@@ -205,10 +305,15 @@ check_existing_raid() {
      fi
 
      log "Error: $device appears to be part of an existing RAID array"
-     log "To reuse this device, you must first:"
-     log "1. Unmount any filesystems"
-     log "2. Stop the RAID array: mdadm --stop $array_name"
-     log "3. Zero the superblock: mdadm --zero-superblock ${device}p1"
+     log "Old RAID metadata detected but array is not running."
+     log ""
+     log "To fix this, run the script with --force-cleanup:"
+     log "  sudo $0 --force-cleanup"
+     log ""
+     log "Or manually clean up with:"
+     log "1. Stop any arrays: mdadm --stop --scan"
+     log "2. Zero superblocks: mdadm --zero-superblock --force ${device}p1"
+     log "3. Wipe signatures: wipefs -af $device"
      exit 1
     fi
   done
@@ -238,7 +343,7 @@ ensure_devices_free() {
    done
 
    # Clear MD superblock
-   mdadm --zero-superblock "${device}"* 2>/dev/null || true
+   mdadm --zero-superblock --force "${device}"* 2>/dev/null || true
 
    # Remove LVM PV if exists
    pvremove -ff -y "$device" 2>/dev/null || true
@@ -263,6 +368,11 @@ main() {
   # Check if running as root
   check_root
 
+  # If force cleanup flag is set, do aggressive cleanup first
+  if [ "$FORCE_CLEANUP" = true ]; then
+    force_cleanup_raid
+  fi
+
   # Check for existing RAID setup
   check_existing_raid
 
diff --git a/salt/manager/tools/sbin/soup b/salt/manager/tools/sbin/soup
index 52d6e92e9..ff9414b2d 100755
--- a/salt/manager/tools/sbin/soup
+++ b/salt/manager/tools/sbin/soup
@@ -422,6 +422,7 @@ preupgrade_changes() {
   [[ "$INSTALLEDVERSION" == 2.4.150 ]] && up_to_2.4.160
   [[ "$INSTALLEDVERSION" == 2.4.160 ]] && up_to_2.4.170
   [[ "$INSTALLEDVERSION" == 2.4.170 ]] && up_to_2.4.180
+  [[ "$INSTALLEDVERSION" == 2.4.180 ]] && up_to_2.4.190
   true
 }
 
@@ -617,6 +618,16 @@ post_to_2.4.190() {
     update_import_fleet_output
   fi
 
+  # Check if expected default policy is logstash (global.pipeline is REDIS or "")
+  pipeline=$(lookup_pillar "pipeline" "global")
+  if [[ -z "$pipeline" ]] || [[ "$pipeline" == "REDIS" ]]; then
+    # Check if this grid is currently affected by corrupt fleet output policy
+    if elastic-agent status | grep "config: key file not configured" > /dev/null 2>&1; then
+      echo "Elastic Agent shows an ssl error connecting to logstash output. Updating output policy..."
+      update_default_logstash_output
+    fi
+  fi
+
   POSTVERSION=2.4.190
 }
 
@@ -1173,6 +1184,31 @@ update_import_fleet_output() {
   fi
 }
 
+update_default_logstash_output() {
+  echo "Updating fleet logstash output policy grid-logstash"
+  if logstash_policy=$(curl -K /opt/so/conf/elasticsearch/curl.config -L "http://localhost:5601/api/fleet/outputs/so-manager_logstash" --retry 3 --retry-delay 10 --fail 2>/dev/null); then
+    # Keep already configured hosts for this update, subsequent host updates come from so-elastic-fleet-outputs-update
+    HOSTS=$(echo "$logstash_policy" | jq -r '.item.hosts')
+    DEFAULT_ENABLED=$(echo "$logstash_policy" | jq -r '.item.is_default')
+    DEFAULT_MONITORING_ENABLED=$(echo "$logstash_policy" | jq -r '.item.is_default_monitoring')
+    LOGSTASHKEY=$(openssl rsa -in /etc/pki/elasticfleet-logstash.key)
+    LOGSTASHCRT=$(openssl x509 -in /etc/pki/elasticfleet-logstash.crt)
+    LOGSTASHCA=$(openssl x509 -in /etc/pki/tls/certs/intca.crt)
+    JSON_STRING=$(jq -n \
+      --argjson HOSTS "$HOSTS" \
+      --arg DEFAULT_ENABLED "$DEFAULT_ENABLED" \
+      --arg DEFAULT_MONITORING_ENABLED "$DEFAULT_MONITORING_ENABLED" \
+      --arg LOGSTASHKEY "$LOGSTASHKEY" \
+      --arg LOGSTASHCRT "$LOGSTASHCRT" \
+      --arg LOGSTASHCA "$LOGSTASHCA" \
+      '{"name":"grid-logstash","type":"logstash","hosts": $HOSTS,"is_default": $DEFAULT_ENABLED,"is_default_monitoring": $DEFAULT_MONITORING_ENABLED,"config_yaml":"","ssl":{"certificate": $LOGSTASHCRT,"certificate_authorities":[ $LOGSTASHCA ]},"secrets":{"ssl":{"key": $LOGSTASHKEY }}}')
+  fi
+
+  if curl -K /opt/so/conf/elasticsearch/curl.config -L -X PUT "localhost:5601/api/fleet/outputs/so-manager_logstash" -H 'kbn-xsrf: true' -H 'Content-Type: application/json' -d "$JSON_STRING" --retry 3 --retry-delay 10 --fail; then
+    echo "Successfully updated grid-logstash fleet output policy"
+  fi
+}
+
 update_salt_mine() {
   echo "Populating the mine with mine_functions for each host."
   set +e
diff --git a/salt/salt/engines/master/virtual_node_manager.py b/salt/salt/engines/master/virtual_node_manager.py
index 88ccede9c..7783e7c35 100644
--- a/salt/salt/engines/master/virtual_node_manager.py
+++ b/salt/salt/engines/master/virtual_node_manager.py
@@ -161,6 +161,7 @@ DEFAULT_BASE_PATH = '/opt/so/saltstack/local/salt/hypervisor/hosts'
 VALID_ROLES = ['sensor', 'searchnode', 'idh', 'receiver', 'heavynode', 'fleet']
 LICENSE_PATH = '/opt/so/saltstack/local/pillar/soc/license.sls'
 DEFAULTS_PATH = '/opt/so/saltstack/default/salt/hypervisor/defaults.yaml'
+HYPERVISOR_PILLAR_PATH = '/opt/so/saltstack/local/pillar/hypervisor/soc_hypervisor.sls'
 
 # Define the retention period for destroyed VMs (in hours)
 DESTROYED_VM_RETENTION_HOURS = 48
@@ -271,7 +272,7 @@ def parse_hardware_indices(hw_value: Any) -> List[int]:
     return indices
 
 def get_hypervisor_model(hypervisor: str) -> str:
-    """Get sosmodel from hypervisor grains."""
+    """Get sosmodel or byodmodel from hypervisor grains."""
     try:
         # Get cached grains using Salt runner
         grains = runner.cmd(
@@ -283,9 +284,9 @@
 
         # Get the first minion ID that matches our hypervisor
         minion_id = next(iter(grains.keys()))
-        model = grains[minion_id].get('sosmodel')
+        model = grains[minion_id].get('sosmodel', grains[minion_id].get('byodmodel', ''))
         if not model:
-            raise ValueError(f"No sosmodel grain found for hypervisor {hypervisor}")
+            raise ValueError(f"No sosmodel or byodmodel grain found for hypervisor {hypervisor}")
 
         log.debug("Found model %s for hypervisor %s", model, hypervisor)
         return model
@@ -295,16 +296,48 @@
 
 
 def load_hardware_defaults(model: str) -> dict:
-    """Load hardware configuration from defaults.yaml."""
+    """Load hardware configuration from defaults.yaml and optionally override with pillar configuration."""
+    config = None
+    config_source = None
+
     try:
+        # First, try to load from defaults.yaml
+        log.debug("Checking for model %s in %s", model, DEFAULTS_PATH)
         defaults = read_yaml_file(DEFAULTS_PATH)
         if not defaults or 'hypervisor' not in defaults:
             raise ValueError("Invalid defaults.yaml structure")
         if 'model' not in defaults['hypervisor']:
             raise ValueError("No model configurations found in defaults.yaml")
-        if model not in defaults['hypervisor']['model']:
-            raise ValueError(f"Model {model} not found in defaults.yaml")
-        return defaults['hypervisor']['model'][model]
+
+        # Check if model exists in defaults
+        if model in defaults['hypervisor']['model']:
+            config = defaults['hypervisor']['model'][model]
+            config_source = DEFAULTS_PATH
+            log.debug("Found model %s in %s", model, DEFAULTS_PATH)
+
+        # Then, try to load from pillar file (if it exists)
+        try:
+            log.debug("Checking for model %s in %s", model, HYPERVISOR_PILLAR_PATH)
+            pillar_config = read_yaml_file(HYPERVISOR_PILLAR_PATH)
+            if pillar_config and 'hypervisor' in pillar_config:
+                if 'model' in pillar_config['hypervisor']:
+                    if model in pillar_config['hypervisor']['model']:
+                        # Override with pillar configuration
+                        config = pillar_config['hypervisor']['model'][model]
+                        config_source = HYPERVISOR_PILLAR_PATH
+                        log.debug("Found model %s in %s (overriding defaults)", model, HYPERVISOR_PILLAR_PATH)
+        except FileNotFoundError:
+            log.debug("Pillar file %s not found, using defaults only", HYPERVISOR_PILLAR_PATH)
+        except Exception as e:
+            log.warning("Failed to read pillar file %s: %s (using defaults)", HYPERVISOR_PILLAR_PATH, str(e))
+
+        # If model was not found in either file, raise an error
+        if config is None:
+            raise ValueError(f"Model {model} not found in {DEFAULTS_PATH} or {HYPERVISOR_PILLAR_PATH}")
+
+        log.debug("Using hardware configuration for model %s from %s", model, config_source)
+        return config
+
     except Exception as e:
         log.error("Failed to load hardware defaults: %s", str(e))
         raise