Allow destroyed VMs to be displayed in the UI. VNM cleans up destroyed-status files after 48 hours

This commit is contained in:
Josh Patterson
2025-02-26 09:06:45 -05:00
parent 5811b184be
commit d6f527881a
6 changed files with 186 additions and 42 deletions

View File

@@ -85,6 +85,60 @@
{% endif %}
{% endfor %}
{# Find and add destroyed VMs from status files #}
{% set processed_vms = [] %}
{% for vm_name, vm_data in vms.items() %}
{% do processed_vms.append(vm_name) %}
{% endfor %}
{# Find all status files for this hypervisor #}
{% set relative_path = 'hypervisor/hosts/' ~ hypervisor %}
{% set absolute_path = '/opt/so/saltstack/local/salt/' ~ relative_path %}
{% do salt.log.info('salt/hypervisor/map.jinja: Scanning for status files in: ' ~ absolute_path) %}
{# Try to find status files using file.find with absolute path #}
{% set status_files = salt['file.find'](absolute_path, name='*_*.status', type='f') %}
{% do salt.log.info('salt/hypervisor/map.jinja: Found status files: ' ~ status_files | tojson) %}
{# Convert absolute paths back to relative paths for processing #}
{% set relative_status_files = [] %}
{% for status_file in status_files %}
{% set relative_file = status_file | replace('/opt/so/saltstack/local/salt/', '') %}
{% do relative_status_files.append(relative_file) %}
{% endfor %}
{% set status_files = relative_status_files %}
{% do salt.log.info('salt/hypervisor/map.jinja: Converted to relative paths: ' ~ status_files | tojson) %}
{% for status_file in status_files %}
{# Extract the VM name from the filename #}
{% set basename = status_file.split('/')[-1] %}
{% set vm_name = basename.replace('.status', '') %}
{% set hostname = vm_name.split('_')[0] %}
{# Skip already processed VMs #}
{% if hostname in processed_vms %}
{% continue %}
{% endif %}
{# Read the status file #}
{% do salt.log.info('salt/hypervisor/map.jinja: Processing potential destroyed VM status file: ' ~ status_file) %}
{% import_json status_file as status_data %}
{# Only process files with "Destroyed Instance" status #}
{% if status_data and status_data.status == 'Destroyed Instance' %}
{% do salt.log.info('salt/hypervisor/map.jinja: Found VM with Destroyed Instance status: ' ~ hostname) %}
{# Add to vms with minimal config #}
{% do vms.update({
hostname: {
'status': status_data,
'config': {}
}
}) %}
{% endif %}
{% endfor %}
{# Merge node config with model capabilities and VM states #}
{% do HYPERVISORS[role].update({
hypervisor: {

View File

@@ -73,7 +73,7 @@ Notes:
Description:
The engine operates in the following phases:
1. Engine Lock Acquisition
1. Lock Acquisition
- Acquires single engine-wide lock
- Prevents multiple instances from running
- Lock remains until clean shutdown or error
@@ -138,7 +138,7 @@ import grp
import salt.config
import salt.runner
from typing import Dict, List, Optional, Tuple, Any
from datetime import datetime
from datetime import datetime, timedelta
from threading import Lock
# Get socore uid/gid
@@ -160,6 +160,8 @@ DEFAULT_BASE_PATH = '/opt/so/saltstack/local/salt/hypervisor/hosts'
VALID_ROLES = ['sensor', 'searchnode', 'idh', 'receiver', 'heavynode', 'fleet']
LICENSE_PATH = '/opt/so/saltstack/local/pillar/soc/license.sls'
DEFAULTS_PATH = '/opt/so/saltstack/default/salt/hypervisor/defaults.yaml'
# Define the retention period for destroyed VMs (in hours)
DESTROYED_VM_RETENTION_HOURS = 48
# Single engine-wide lock for virtual node manager
engine_lock = Lock()
@@ -667,6 +669,50 @@ def process_vm_creation(hypervisor_path: str, vm_config: dict) -> None:
mark_vm_failed(os.path.join(hypervisor_path, f"{vm_name}_failed"), 4, error_msg)
raise
def cleanup_destroyed_vm_status_files(hypervisor_path: str) -> None:
    """
    Clean up status files for destroyed VMs that are older than the retention period.

    Scans hypervisor_path for '*_*.status' files. For each file whose status is
    'Destroyed Instance' and whose timestamp is older than
    DESTROYED_VM_RETENTION_HOURS, the file is removed. Files with any other
    status, or with no timestamp, are left untouched.

    Errors reading or removing an individual file are logged and skipped so a
    single bad file cannot abort the whole cleanup pass.

    Args:
        hypervisor_path: Path to the hypervisor directory containing status files
    """
    try:
        log.debug(f"Using destroyed VM retention period of {DESTROYED_VM_RETENTION_HOURS} hours")
        # Capture a single reference time so the cutoff comparison and the
        # logged age are consistent for every file in this pass.
        now = datetime.now()
        cutoff_time = now - timedelta(hours=DESTROYED_VM_RETENTION_HOURS)
        # Find all status files for VMs on this hypervisor
        status_files = glob.glob(os.path.join(hypervisor_path, '*_*.status'))
        log.debug(f"Found {len(status_files)} status files to check for expired destroyed VMs")
        for status_file in status_files:
            try:
                status_data = read_json_file(status_file)
                # Only destroyed VMs are subject to retention-based cleanup
                if status_data.get('status') != 'Destroyed Instance':
                    continue
                timestamp_str = status_data.get('timestamp', '')
                if not timestamp_str:
                    # No timestamp means we cannot age the file; leave it alone.
                    continue
                timestamp = datetime.fromisoformat(timestamp_str)
                vm_name = os.path.basename(status_file).replace('.status', '')
                age_hours = (now - timestamp).total_seconds() / 3600
                if timestamp < cutoff_time:
                    # Older than the retention period - delete the file
                    log.info(f"Removing expired status file for VM {vm_name} (age: {age_hours:.1f} hours > retention: {DESTROYED_VM_RETENTION_HOURS} hours)")
                    os.remove(status_file)
                else:
                    log.debug(f"Keeping status file for VM {vm_name} (age: {age_hours:.1f} hours <= retention: {DESTROYED_VM_RETENTION_HOURS} hours)")
            except Exception as e:
                # Best-effort per-file: log and continue with the next file
                log.error(f"Error processing status file {status_file}: {e}")
    except Exception as e:
        log.error(f"Failed to clean up destroyed VM status files: {e}")
def process_vm_deletion(hypervisor_path: str, vm_name: str) -> None:
"""
Process a single VM deletion request.
@@ -731,6 +777,9 @@ def process_hypervisor(hypervisor_path: str) -> None:
vms_file = os.path.join(os.path.dirname(hypervisor_path), f"{hypervisor}VMs")
if not os.path.exists(vms_file):
log.debug("No VMs file found at %s", vms_file)
# Even if no VMs file exists, we should still clean up any expired status files
cleanup_destroyed_vm_status_files(hypervisor_path)
return
nodes_config = read_json_file(vms_file)
@@ -768,6 +817,9 @@ def process_hypervisor(hypervisor_path: str) -> None:
log.info(f"Initiating deletion process for VM: {vm_name}")
process_vm_deletion(hypervisor_path, vm_name)
# Clean up expired status files for destroyed VMs
cleanup_destroyed_vm_status_files(hypervisor_path)
except Exception as e:
log.error("Failed to process hypervisor %s: %s", hypervisor_path, str(e))
raise
@@ -797,12 +849,12 @@ def start(interval: int = DEFAULT_INTERVAL,
if not validate_hvn_license():
return
# Attempt to acquire engine lock
# Attempt to acquire lock
if not engine_lock.acquire(blocking=False):
log.error("Another virtual node manager is already running")
return
log.debug("Virtual node manager acquired engine lock")
log.debug("Virtual node manager acquired lock")
try:
# Process each hypervisor directory
@@ -811,7 +863,7 @@ def start(interval: int = DEFAULT_INTERVAL,
process_hypervisor(hypervisor_path)
# Clean shutdown - release lock
log.debug("Virtual node manager releasing engine lock")
log.debug("Virtual node manager releasing lock")
engine_lock.release()
log.info("Virtual node manager completed successfully")

View File

@@ -4,6 +4,8 @@ hypervisor:
title: defaultHost
description: "Hypervisor Configuration"
syntax: json
file: true
global: true
uiElements:
- field: hostname
label: "Hostname"
@@ -62,21 +64,16 @@ hypervisor:
forcedType: int
- field: disk
label: "Disk(s) for passthrough. Line-delimited list. Free: FREE | Total: TOTAL"
required: true
readonly: true
forcedType: '[]int'
multiline: true
- field: copper
label: "Copper port(s) for passthrough. Line-delimited list. Free: FREE | Total: TOTAL"
required: true
readonly: true
forcedType: '[]int'
multiline: true
- field: sfp
label: "SFP port(s) for passthrough. Line-delimited list. Free: FREE | Total: TOTAL"
required: true
readonly: true
forcedType: '[]int'
multiline: true
file: true
global: true

View File

@@ -1 +1,14 @@
{% set HYPERVISORS = salt['pillar.get']('hypervisor:nodes', {}) %}
{# Define the list of process steps in order (case-sensitive) #}
{# NOTE: the ordering is load-bearing - consumers compare list indices to
   decide whether a later status may overwrite an earlier one, and
   'Destroyed Instance' must remain the final entry so that 'Processing'
   (index 0) is allowed to restart the cycle for a re-created VM. #}
{% set PROCESS_STEPS = [
  'Processing',
  'IP Configuration',
  'Starting Create',
  'Executing Deploy Script',
  'Initialize Minion Pillars',
  'Created Instance',
  'Hardware Configuration',
  'Highstate Triggered',
  'Destroyed Instance'
] %}

View File

@@ -13,6 +13,7 @@
{%- import_yaml 'soc/dyanno/hypervisor/hypervisor.yaml' as ANNOTATION -%}
{%- from 'hypervisor/map.jinja' import HYPERVISORS -%}
{%- from 'soc/dyanno/hypervisor/map.jinja' import PROCESS_STEPS -%}
{%- set TEMPLATE = ANNOTATION.hypervisor.hosts.pop('defaultHost') -%}
@@ -20,23 +21,53 @@
# Hypervisor Configuration: {{ description }}
## Resource Summary
| Resource | Available | Total |
|-------------|-----------|-----------|
| CPU Cores | {{ cpu_free }} | {{ cpu_total }} |
| Memory (GB) | {{ mem_free }} | {{ mem_total }} |
| Disk | {{ disk_free | replace('\n', ',') if disk_free else 'None' }} | {{ disk_total | replace('\n', ',') }} |
| Copper | {{ copper_free | replace('\n', ',') if copper_free else 'None' }} | {{ copper_total | replace('\n', ',') }} |
| SFP | {{ sfp_free | replace('\n', ',') if sfp_free else 'None' }} | {{ sfp_total | replace('\n', ',') }} |
| | CPU Cores | Memory (GB) | Disk | Copper | SFP |
|-----------|-----------|-------------|-------------|-------------|-------------|
| Available | {{ cpu_free }} | {{ mem_free }} | {{ disk_free | replace('\n', ',') if disk_free else 'None' }} | {{ copper_free | replace('\n', ',') if copper_free else 'None' }} | {{ sfp_free | replace('\n', ',') if sfp_free else 'None' }} |
| Total | {{ cpu_total }} | {{ mem_total }} | {{ disk_total | replace('\n', ',') }} | {{ copper_total | replace('\n', ',') }} | {{ sfp_total | replace('\n', ',') }} |
{%- if vm_list %}
## Virtual Machines
VMs can have the following status values: {% for step in PROCESS_STEPS %}{{ step }}{% if not loop.last %}, {% endif %}{% endfor %}. The "Last Updated" timestamp shows when the VM status was last changed. After reaching "Highstate Triggered" status, additional highstate runs will not update the timestamp. Only changing to "Destroyed Instance" status will update the timestamp again.
| Name | Status | CPU Cores | Memory (GB)| Disk | Copper | SFP | Last Updated |
|--------------------|--------------------|-----------|------------|------|--------|------|---------------------|
{%- for hostname, vm_data in vm_list.items() %}
| {{ hostname }}_{{ vm_data.get('config', {}).get('role', 'unknown') }} | {{ vm_data.get('status', {}).get('status', 'Unknown') }} | {{ vm_data.get('config', {}).get('cpu', 'N/A') }} | {{ vm_data.get('config', {}).get('memory', 'N/A') }} | {{ vm_data.get('config', {}).get('disk', '-') | replace('\n', ',') if vm_data.get('config', {}).get('disk') else '-' }} | {{ vm_data.get('config', {}).get('copper', '-') | replace('\n', ',') if vm_data.get('config', {}).get('copper') else '-' }} | {{ vm_data.get('config', {}).get('sfp', '-') | replace('\n', ',') if vm_data.get('config', {}).get('sfp') else '-' }} | {{ vm_data.get('status', {}).get('timestamp', 'Never') | replace('T', ' ') | regex_replace('\\.[0-9]+', '') }} |
{%- set vm_status = vm_data.get('status', {}).get('status', 'Unknown') %}
{%- set is_destroyed = vm_status == 'Destroyed Instance' %}
{%- set vm_role = vm_data.get('config', {}).get('role', 'unknown') %}
{%- set name = hostname ~ (('_' ~ vm_role) if not is_destroyed and vm_role != 'unknown' else '') %}
| {{ name }} | {{ vm_status }} |
{%- if is_destroyed -%}
-
{%- else -%}
{{ vm_data.get('config', {}).get('cpu', 'N/A') }}
{%- endif %} |
{%- if is_destroyed -%}
-
{%- else -%}
{{ vm_data.get('config', {}).get('memory', 'N/A') }}
{%- endif %} |
{%- if is_destroyed -%}
-
{%- else -%}
{{ vm_data.get('config', {}).get('disk', '-') | replace('\n', ',') if vm_data.get('config', {}).get('disk') else '-' }}
{%- endif %} |
{%- if is_destroyed -%}
-
{%- else -%}
{{ vm_data.get('config', {}).get('copper', '-') | replace('\n', ',') if vm_data.get('config', {}).get('copper') else '-' }}
{%- endif %} |
{%- if is_destroyed -%}
-
{%- else -%}
{{ vm_data.get('config', {}).get('sfp', '-') | replace('\n', ',') if vm_data.get('config', {}).get('sfp') else '-' }}
{%- endif %} | {{ vm_data.get('status', {}).get('timestamp', 'Never') | replace('T', ' ') | regex_replace('\\.[0-9]+', '') }} |
{%- endfor %}
{%- else %}
## Virtual Machines
VMs can have the following status values: {% for step in PROCESS_STEPS %}{{ step }}{% if not loop.last %}, {% endif %}{% endfor %}. The "Last Updated" timestamp shows when the VM status was last changed. After reaching "Highstate Triggered" status, additional highstate runs will not update the timestamp. Only changing to "Destroyed Instance" status will update the timestamp again.
No Virtual Machines Found
{%- endif %}
{%- endmacro -%}
@@ -66,9 +97,12 @@ No Virtual Machines Found
{%- set used_memory = 0 -%}
{%- set ns = namespace(used_cpu=0, used_memory=0) -%}
{%- for hostname, vm_data in vms.items() -%}
{%- set vm_config = vm_data.config -%}
{%- set ns.used_cpu = ns.used_cpu + vm_config.cpu | int -%}
{%- set ns.used_memory = ns.used_memory + vm_config.memory | int -%}
{%- set vm_status = vm_data.get('status', {}).get('status', '') -%}
{%- if vm_status != 'Destroyed Instance' -%}
{%- set vm_config = vm_data.config -%}
{%- set ns.used_cpu = ns.used_cpu + vm_config.get('cpu', 0) | int -%}
{%- set ns.used_memory = ns.used_memory + vm_config.get('memory', 0) | int -%}
{%- endif -%}
{%- endfor -%}
{# Calculate available resources #}
@@ -80,10 +114,13 @@ No Virtual Machines Found
{%- set used_copper = [] -%}
{%- set used_sfp = [] -%}
{%- for hostname, vm in vms.items() -%}
{%- set config = vm.get('config', {}) -%}
{%- do used_disk.extend((config.get('disk', '') | string).split('\n') | map('trim') | list) -%}
{%- do used_copper.extend((config.get('copper', '') | string).split('\n') | map('trim') | list) -%}
{%- do used_sfp.extend((config.get('sfp', '') | string).split('\n') | map('trim') | list) -%}
{%- set vm_status = vm.get('status', {}).get('status', '') -%}
{%- if vm_status != 'Destroyed Instance' -%}
{%- set config = vm.get('config', {}) -%}
{%- do used_disk.extend((config.get('disk', '') | string).split('\n') | map('trim') | list) -%}
{%- do used_copper.extend((config.get('copper', '') | string).split('\n') | map('trim') | list) -%}
{%- do used_sfp.extend((config.get('sfp', '') | string).split('\n') | map('trim') | list) -%}
{%- endif -%}
{%- endfor -%}
{# Get available PCI indices #}

View File

@@ -11,6 +11,9 @@
{% if 'hvn' in salt['pillar.get']('features', []) %}
{# Import the process steps from map.jinja #}
{% from 'soc/dyanno/hypervisor/map.jinja' import PROCESS_STEPS %}
{% do salt.log.info('soc/dyanno/hypervisor/write_status: Running') %}
{% set vm_name = pillar.get('vm_name') %}
{% set hypervisor = pillar.get('hypervisor') %}
@@ -21,19 +24,7 @@
{% set status_dir = base_path ~ '/' ~ hypervisor %}
{% set status_file = status_dir ~ '/' ~ vm_name ~ '.status' %}
# Define the list of process steps in order (case-sensitive)
{% set process_steps = [
'Processing',
'IP Configuration',
'Starting Create',
'Executing Deploy Script',
'Initialize Minion Pillars',
'Created Instance',
'Hardware Configuration',
'Highstate Triggered',
'Destroyed Instance'
] %}
{% set new_index = process_steps.index(status_data.get('status')) %}
{% set new_index = PROCESS_STEPS.index(status_data.get('status')) %}
{% do salt.log.debug('soc/dyanno/hypervisor/write_status: new_index: ' ~ new_index|string) %}
# Function to read and parse current JSON status file
@@ -46,8 +37,8 @@
{% import_json rel_path_status_file as current_status %}
{% do salt.log.debug('soc/dyanno/hypervisor/write_status: current status: ' ~ current_status) %}
{% do salt.log.debug('soc/dyanno/hypervisor/write_status: current status: ' ~ current_status.get('status')) %}
{% if current_status.get('status') in process_steps %}
{% set current_index = process_steps.index(current_status.get('status')) %}
{% if current_status.get('status') in PROCESS_STEPS %}
{% set current_index = PROCESS_STEPS.index(current_status.get('status')) %}
{% do salt.log.debug('soc/dyanno/hypervisor/write_status: current_index: ' ~ current_index|string) %}
{%- set return_value = current_index -%}
{% else %}
@@ -74,7 +65,7 @@ ensure_status_dir:
{# Some of the status updates trigger within a second of each other and can cause, for example, the IP Configuration orchestration to process before the Processing #}
{# This check has been put in place to ensure a status sooner in the process can't overwrite this file if a status later in the process wrote to it first. #}
{# The final step is Destroyed Instance, so we allow Processing to overwrite it in case someone creates a new VM with the same name as one that was previously destroyed. #}
{% if new_index > current_index or (current_index == process_steps | length - 1 and new_index == 0) %}
{% if new_index > current_index or (current_index == PROCESS_STEPS | length - 1 and new_index == 0) %}
write_status_file:
file.serialize:
- name: {{ status_file }}
@@ -88,7 +79,7 @@ write_status_file:
- file: ensure_status_dir
{% else %}
{% do salt.log.debug('soc/dyanno/hypervisor/write_status: File not written. ' ~ process_steps[new_index] ~ ' cannot overwrite ' ~ process_steps[current_index] ~ '.' ) %}
{% do salt.log.debug('soc/dyanno/hypervisor/write_status: File not written. ' ~ PROCESS_STEPS[new_index] ~ ' cannot overwrite ' ~ PROCESS_STEPS[current_index] ~ '.' ) %}
{% endif %}