🎉 Feature: Implement MASSIVE KONSOLIDIERUNG PLAN in backend utils
This commit is contained in:
822
backend/utils/core_system.py
Normal file
822
backend/utils/core_system.py
Normal file
@ -0,0 +1,822 @@
|
||||
#!/usr/bin/env python3.11
|
||||
"""
|
||||
Core System Management - Massive Konsolidierung
|
||||
==============================================
|
||||
|
||||
Konsolidiert alle System-Management-Funktionalitäten in einer Datei:
|
||||
- System Control (system_control.py)
|
||||
- Shutdown Manager (shutdown_manager.py)
|
||||
- Watchdog Manager (watchdog_manager.py)
|
||||
- Windows Fixes (windows_fixes.py)
|
||||
- Error Recovery (error_recovery.py)
|
||||
- Timeout Force Quit Manager (timeout_force_quit_manager.py)
|
||||
|
||||
Migration: 6 Dateien → 1 Datei
|
||||
Autor: MYP Team - Massive Konsolidierung für IHK-Projektarbeit
|
||||
Datum: 2025-06-09
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import signal
|
||||
import threading
|
||||
import subprocess
|
||||
import platform
|
||||
import traceback
|
||||
import shutil
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, List, Any, Optional, Tuple, Union, Callable
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
|
||||
from utils.logging_config import get_logger
|
||||
|
||||
# ===== UNIFIED LOGGER =====
# Single logger instance shared by every class and function in this module.
core_logger = get_logger("core_system")
|
||||
|
||||
# ===== ENUMS =====
|
||||
|
||||
class SystemOperation(Enum):
    """System-level operations that can be scheduled through the manager.

    The string values are embedded in operation IDs and log messages.
    """

    # Whole-machine power operations
    RESTART = "restart"
    SHUTDOWN = "shutdown"

    # Kiosk-mode operations
    KIOSK_RESTART = "kiosk_restart"
    KIOSK_ENABLE = "kiosk_enable"
    KIOSK_DISABLE = "kiosk_disable"

    # Service-level and emergency operations
    SERVICE_RESTART = "service_restart"
    EMERGENCY_STOP = "emergency_stop"
|
||||
|
||||
class ErrorSeverity(Enum):
    """Severity levels that can be attached to an error pattern."""

    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"
|
||||
|
||||
class RecoveryAction(Enum):
    """Recovery actions that can be triggered for a detected error."""

    # Passive
    LOG_ONLY = "log_only"

    # Targeted restarts / cleanup
    RESTART_SERVICE = "restart_service"
    RESTART_COMPONENT = "restart_component"
    CLEAR_CACHE = "clear_cache"
    RESET_DATABASE = "reset_database"

    # System-wide measures
    RESTART_SYSTEM = "restart_system"
    EMERGENCY_STOP = "emergency_stop"
|
||||
|
||||
# ===== DATA CLASSES =====
|
||||
|
||||
@dataclass
class ErrorPattern:
    """An error signature together with the recovery actions it triggers."""

    # Unique key under which the pattern is stored.
    name: str
    # Regular expressions matched against log lines.
    patterns: List[str]
    severity: ErrorSeverity
    # Actions tried while the occurrence count stays below ``max_occurrences``.
    actions: List[RecoveryAction]
    # Number of occurrences within ``time_window`` that triggers escalation.
    max_occurrences: int = 3
    # Length of the counting window in seconds.
    time_window: int = 300
    # Actions used on escalation; ``None`` means no explicit list was given.
    escalation_actions: Optional[List[RecoveryAction]] = None
    description: str = ""
|
||||
|
||||
@dataclass
class ErrorOccurrence:
    """A single detected occurrence of an error pattern.

    ``context`` and ``recovery_attempted`` may be omitted (or passed as
    ``None``); both are then replaced by a fresh empty container so that
    consumers can call ``.get`` / ``.append`` on them without a ``None``
    check.
    """

    timestamp: datetime
    # Name of the ErrorPattern that matched.
    pattern_name: str
    error_message: str
    severity: ErrorSeverity
    context: Optional[Dict[str, Any]] = None
    # Recovery actions that were tried for this occurrence, in order.
    recovery_attempted: Optional[List[RecoveryAction]] = None
    recovery_successful: bool = False

    def __post_init__(self):
        # ``None`` defaults are kept in the signature for backward
        # compatibility; each instance gets its own container here, so no
        # mutable state is shared between occurrences.
        if self.context is None:
            self.context = {}
        if self.recovery_attempted is None:
            self.recovery_attempted = []
|
||||
|
||||
# ===== CORE SYSTEM MANAGER =====
|
||||
|
||||
class CoreSystemManager:
    """Central manager for critical system operations.

    Combines scheduling of system operations (restart, shutdown, emergency
    stop), graceful shutdown of registered components, log-based error
    detection with recovery actions, and platform-specific fixes.
    """

    # 1-minute load average above which the system is treated as too busy.
    LOAD_THRESHOLD = 2.0

    def __init__(self, timeout: int = 30):
        """Set up state, default patterns, recovery handlers and signal hooks.

        Args:
            timeout: Stored as ``self.timeout``.
        """
        self.timeout = timeout
        self.shutdown_requested = False
        self.shutdown_time = None
        self.components = {}
        self.cleanup_functions = []
        self.pending_operations = {}
        self.operation_history = []

        # Error recovery
        self.error_patterns = {}
        self.error_occurrences = []
        self.recovery_handlers = {}
        self.monitoring_active = False
        self.monitoring_thread = None
        # Set by stop_error_monitoring() to interrupt the monitor's wait.
        self._monitoring_wakeup = threading.Event()
        # Byte offset up to which each log file has already been analysed.
        self._log_offsets = {}

        # Thread management
        self.registered_threads = []
        self.cleanup_callbacks = []

        self._init_default_patterns()
        self._init_recovery_handlers()
        self._register_signal_handlers()
        self._apply_platform_fixes()

        core_logger.info("🚀 Core System Manager initialisiert")

    # ===== SYSTEM CONTROL =====

    def is_safe_to_operate(self) -> Tuple[bool, str]:
        """Check whether a system operation may run right now.

        Returns:
            ``(True, message)`` when no job is printing/paused and the load
            average (where available) is at most ``LOAD_THRESHOLD``;
            otherwise ``(False, reason)``. Any exception during the check is
            reported as ``(False, reason)`` as well.
        """
        try:
            from models import get_cached_session, Job

            with get_cached_session() as session:
                active_jobs = session.query(Job).filter(
                    Job.status.in_(['printing', 'paused'])
                ).count()

                if active_jobs > 0:
                    return False, f"{active_jobs} aktive Jobs laufen noch"

            # os.getloadavg does not exist on every platform.
            if hasattr(os, 'getloadavg'):
                load = os.getloadavg()[0]
                if load > self.LOAD_THRESHOLD:
                    return False, f"System-Load zu hoch: {load:.1f}"

            return True, "System bereit für Operationen"

        except Exception as e:
            return False, f"Sicherheitsprüfung fehlgeschlagen: {str(e)}"

    def _new_operation_id(self, operation: SystemOperation) -> str:
        """Return an operation ID that is not currently pending.

        The base format is ``<operation value>_<unix seconds>``; a numeric
        suffix is appended only if that ID is already pending, so two
        requests within the same second no longer overwrite each other.
        """
        base = f"{operation.value}_{int(time.time())}"
        candidate = base
        suffix = 1
        while candidate in self.pending_operations:
            suffix += 1
            candidate = f"{base}_{suffix}"
        return candidate

    def schedule_operation(self, operation: SystemOperation, delay_seconds: int = None,
                           user_id: str = None, reason: str = None,
                           force: bool = False) -> Dict[str, Any]:
        """Schedule a system operation, or run it immediately.

        Args:
            operation: The operation to perform.
            delay_seconds: Delay before execution. ``None``, ``0`` or a
                negative value runs the operation synchronously.
            user_id: Stored with the operation record.
            reason: Stored with the operation record; a default text is
                generated when omitted.
            force: Skip the ``is_safe_to_operate`` check.

        Returns:
            A dict with ``success``, ``message`` and ``operation_id``; on
            success also ``execute_at`` (ISO timestamp).
        """
        if not force:
            safe, message = self.is_safe_to_operate()
            if not safe:
                return {
                    'success': False,
                    'message': f"Operation nicht sicher: {message}",
                    'operation_id': None
                }

        delayed = bool(delay_seconds) and delay_seconds > 0
        now = datetime.now()
        operation_id = self._new_operation_id(operation)

        operation_data = {
            'id': operation_id,
            'operation': operation,
            'scheduled_at': now,
            # Reflects what actually happens: immediate runs execute "now".
            'execute_at': now + timedelta(seconds=delay_seconds if delayed else 0),
            'user_id': user_id,
            'reason': reason or f"Geplante {operation.value}",
            'force': force,
            'status': 'scheduled'
        }
        self.pending_operations[operation_id] = operation_data

        if delayed:
            timer = threading.Timer(delay_seconds, self._execute_delayed_operation, [operation_id])
            timer.daemon = True
            timer.start()
            operation_data['timer'] = timer
        else:
            self._execute_delayed_operation(operation_id)

        core_logger.info(f"System-Operation geplant: {operation.value} (ID: {operation_id})")
        return {
            'success': True,
            'message': f"Operation {operation.value} geplant",
            'operation_id': operation_id,
            'execute_at': operation_data['execute_at'].isoformat()
        }

    def _execute_delayed_operation(self, operation_id: str):
        """Run a pending operation and move its record to the history.

        Unknown IDs are ignored. Exceptions raised by the operation are
        recorded on the operation record (``status``/``error``) and logged.
        """
        if operation_id not in self.pending_operations:
            return

        operation_data = self.pending_operations[operation_id]
        operation = operation_data['operation']

        try:
            operation_data['status'] = 'executing'
            operation_data['started_at'] = datetime.now()

            core_logger.info(f"Führe System-Operation aus: {operation.value}")

            result = self._execute_operation(operation, operation_data)
            operation_data.update(result)

        except Exception as e:
            operation_data['status'] = 'failed'
            operation_data['error'] = str(e)
            core_logger.error(f"System-Operation fehlgeschlagen: {operation.value} - {str(e)}")
        finally:
            operation_data['completed_at'] = datetime.now()
            self._move_to_history(operation_id)

    def _execute_operation(self, operation: SystemOperation, operation_data: Dict) -> Dict[str, Any]:
        """Dispatch ``operation`` to its implementation.

        Returns:
            The implementation's result dict, or a ``failed`` result for
            operations that have no implementation.
        """
        dispatch = {
            SystemOperation.RESTART: self._restart_system,
            SystemOperation.SHUTDOWN: self._shutdown_system,
            SystemOperation.KIOSK_RESTART: self._restart_kiosk,
            SystemOperation.EMERGENCY_STOP: self._emergency_stop,
        }
        handler = dispatch.get(operation)
        if handler is None:
            return {'status': 'failed', 'message': f'Unbekannte Operation: {operation.value}'}
        return handler(operation_data)

    def _run_power_command(self, windows_command: List[str], unix_command: List[str],
                           start_message: str, success_message: str,
                           failure_prefix: str) -> Dict[str, Any]:
        """Clean up, then run the platform's power command.

        Shared implementation of restart and shutdown; any exception is
        converted into a ``failed`` result dict.
        """
        try:
            self._cleanup_before_restart()
            core_logger.info(start_message)

            command = windows_command if platform.system() == "Windows" else unix_command
            subprocess.run(command, check=True)

            return {'status': 'success', 'message': success_message}
        except Exception as e:
            return {'status': 'failed', 'message': f'{failure_prefix}: {str(e)}'}

    def _restart_system(self, operation_data: Dict) -> Dict[str, Any]:
        """Reboot the machine after running the pre-restart cleanup."""
        return self._run_power_command(
            ['shutdown', '/r', '/t', '5'],
            ['sudo', 'reboot'],
            "System-Neustart wird eingeleitet...",
            'System-Neustart eingeleitet',
            'Neustart fehlgeschlagen',
        )

    def _shutdown_system(self, operation_data: Dict) -> Dict[str, Any]:
        """Power the machine off after running the pre-restart cleanup."""
        return self._run_power_command(
            ['shutdown', '/s', '/t', '5'],
            ['sudo', 'shutdown', 'now'],
            "System-Herunterfahren wird eingeleitet...",
            'System-Herunterfahren eingeleitet',
            'Herunterfahren fehlgeschlagen',
        )

    def _restart_kiosk(self, operation_data: Dict) -> Dict[str, Any]:
        """Report that a kiosk restart is not implemented.

        The dispatcher references this method, but no implementation existed;
        this stub turns the resulting ``AttributeError`` into an explicit
        ``failed`` result.
        """
        core_logger.warning("Kiosk-Neustart angefordert, aber nicht implementiert")
        return {'status': 'failed', 'message': 'Kiosk-Neustart ist nicht implementiert'}

    def _emergency_stop(self, operation_data: Dict) -> Dict[str, Any]:
        """Stop threads, clean up and terminate the process with code 1.

        Uses ``os._exit``, so on success this method does not return. A
        ``failed`` result is returned only if an exception occurs first.
        """
        try:
            core_logger.critical("NOTFALL-STOP eingeleitet!")
            self._force_shutdown_all_threads()
            self._cleanup_before_restart()
            os._exit(1)
        except Exception as e:
            return {'status': 'failed', 'message': f'Notfall-Stop fehlgeschlagen: {str(e)}'}

    def _checkpoint_database(self):
        """Trigger the database WAL checkpoint via the database service."""
        from utils.database_core import database_service
        database_service.cleanup.perform_wal_checkpoint()

    def _cleanup_before_restart(self):
        """Run WAL checkpoint, cache clearing and component shutdown.

        Each step is isolated so that a failure in one (for example an
        unavailable database module) does not skip the remaining steps.
        Failures are logged, never raised.
        """
        steps = (self._checkpoint_database, self._clear_caches, self._shutdown_components)
        all_ok = True
        for step in steps:
            try:
                step()
            except Exception as e:
                all_ok = False
                core_logger.error(f"Fehler bei Bereinigung: {str(e)}")

        if all_ok:
            core_logger.info("Bereinigung vor Neustart abgeschlossen")

    def _clear_caches(self):
        """Delete the known cache directories if they exist."""
        try:
            cache_dirs = ['/tmp/myp_cache', '/var/cache/myp']
            for cache_dir in cache_dirs:
                if os.path.exists(cache_dir):
                    shutil.rmtree(cache_dir, ignore_errors=True)
        except Exception as e:
            core_logger.warning(f"Cache-Bereinigung teilweise fehlgeschlagen: {str(e)}")

    # ===== SHUTDOWN MANAGEMENT =====

    def register_component(self, name: str, component: Any, stop_method: str = "stop"):
        """Register a component whose ``stop_method`` is called on shutdown.

        Registering the same ``name`` again replaces the earlier entry.
        """
        self.components[name] = {
            'component': component,
            'stop_method': stop_method,
            'priority': 1
        }
        core_logger.debug(f"Komponente '{name}' für Shutdown registriert")

    def register_cleanup_function(self, func: Callable, name: str, priority: int = 1,
                                  timeout: int = 10, args: tuple = (), kwargs: dict = None):
        """Register a function to be called during ``shutdown``.

        Args:
            func: The callable to run.
            name: Label used in log messages.
            priority: Sort key; lower values run first.
            timeout: Seconds passed to ``cleanup_timeout`` for this call.
            args: Positional arguments for ``func``.
            kwargs: Keyword arguments for ``func``.
        """
        self.cleanup_functions.append({
            'function': func,
            'name': name,
            'priority': priority,
            'timeout': timeout,
            'args': args or (),
            'kwargs': kwargs or {}
        })
        core_logger.debug(f"Cleanup-Funktion '{name}' registriert")

    def shutdown(self, exit_code: int = 0):
        """Stop components, run cleanup functions and exit the interpreter.

        A second call while shutdown is already in progress returns
        immediately. Ends with ``sys.exit(exit_code)``.
        """
        if self.shutdown_requested:
            return

        self.shutdown_requested = True
        self.shutdown_time = datetime.now()

        core_logger.info("🔄 Ordnungsgemäßes Herunterfahren wird eingeleitet...")

        self._shutdown_components()
        self._execute_cleanup_functions()

        core_logger.info("✅ Herunterfahren abgeschlossen")
        sys.exit(exit_code)

    def _shutdown_components(self):
        """Call the stop method of every registered component.

        Errors are logged per component and do not stop the loop.
        """
        # Iterate over a snapshot so a stop method may unregister components.
        for name, component_info in list(self.components.items()):
            try:
                component = component_info['component']
                stop_method = component_info['stop_method']

                if hasattr(component, stop_method):
                    getattr(component, stop_method)()
                    core_logger.debug(f"Komponente '{name}' erfolgreich heruntergefahren")
                else:
                    core_logger.warning(f"Komponente '{name}' hat keine '{stop_method}' Methode")

            except Exception as e:
                core_logger.error(f"Fehler beim Herunterfahren von '{name}': {str(e)}")

    def _execute_cleanup_functions(self):
        """Run all cleanup functions in ascending priority order."""
        sorted_functions = sorted(self.cleanup_functions, key=lambda x: x['priority'])

        for cleanup_info in sorted_functions:
            try:
                with self.cleanup_timeout(cleanup_info['timeout'], cleanup_info['name']):
                    cleanup_info['function'](*cleanup_info['args'], **cleanup_info['kwargs'])
                    core_logger.debug(f"Cleanup '{cleanup_info['name']}' erfolgreich")

            except Exception as e:
                core_logger.error(f"Cleanup '{cleanup_info['name']}' fehlgeschlagen: {str(e)}")

    @contextmanager
    def cleanup_timeout(self, timeout: int, operation_name: str):
        """Context manager that limits the wrapped code to ``timeout`` seconds.

        The limit is enforced with ``SIGALRM``. Where that is impossible
        (platform without ``SIGALRM``, or not running in the main thread,
        where ``signal.signal`` raises ``ValueError``) the wrapped code runs
        without a time limit instead of failing.

        Raises:
            TimeoutError: When the alarm fires inside the ``with`` body.
        """
        can_use_alarm = (
            hasattr(signal, 'SIGALRM')
            and threading.current_thread() is threading.main_thread()
        )
        if not can_use_alarm:
            yield
            return

        def timeout_handler(signum, frame):
            raise TimeoutError(f"Timeout bei {operation_name}")

        old_handler = signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(timeout)
        try:
            yield
        finally:
            # Always disarm the alarm and restore the previous handler.
            signal.alarm(0)
            signal.signal(signal.SIGALRM, old_handler)

    # ===== ERROR RECOVERY =====

    def _init_default_patterns(self):
        """Register the built-in error patterns."""
        patterns = [
            ErrorPattern(
                name="database_lock",
                patterns=[r"database.*locked", r"sqlite.*busy"],
                severity=ErrorSeverity.HIGH,
                actions=[RecoveryAction.CLEAR_CACHE, RecoveryAction.RESTART_COMPONENT],
                description="Datenbank-Sperren"
            ),
            ErrorPattern(
                name="memory_exhaustion",
                patterns=[r"out of memory", r"memory.*exhausted"],
                severity=ErrorSeverity.CRITICAL,
                actions=[RecoveryAction.RESTART_SYSTEM],
                description="Speicher-Erschöpfung"
            ),
            ErrorPattern(
                name="network_timeout",
                patterns=[r"timeout.*network", r"connection.*timed out"],
                severity=ErrorSeverity.MEDIUM,
                actions=[RecoveryAction.RESTART_SERVICE],
                description="Netzwerk-Timeouts"
            )
        ]

        for pattern in patterns:
            self.error_patterns[pattern.name] = pattern

    def _init_recovery_handlers(self):
        """Map every RecoveryAction to its handler method."""
        self.recovery_handlers = {
            RecoveryAction.LOG_ONLY: self._handle_log_only,
            RecoveryAction.RESTART_SERVICE: self._handle_restart_service,
            RecoveryAction.RESTART_COMPONENT: self._handle_restart_component,
            RecoveryAction.CLEAR_CACHE: self._handle_clear_cache,
            RecoveryAction.RESET_DATABASE: self._handle_reset_database,
            RecoveryAction.RESTART_SYSTEM: self._handle_restart_system,
            RecoveryAction.EMERGENCY_STOP: self._handle_emergency_stop
        }

    def start_error_monitoring(self):
        """Start the background monitoring thread (no-op if already active)."""
        if self.monitoring_active:
            return

        self.monitoring_active = True
        self._monitoring_wakeup.clear()
        self.monitoring_thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self.monitoring_thread.start()
        core_logger.info("Error-Monitoring gestartet")

    def stop_error_monitoring(self):
        """Stop the monitoring thread and wait up to 5 seconds for it."""
        self.monitoring_active = False
        # Wake the loop out of its wait so it can exit promptly.
        self._monitoring_wakeup.set()
        if self.monitoring_thread:
            self.monitoring_thread.join(timeout=5)
        core_logger.info("Error-Monitoring gestoppt")

    def _monitor_loop(self):
        """Run the checks every 30 seconds (60 seconds after a failure)."""
        while self.monitoring_active:
            try:
                self._check_log_files()
                self._check_system_metrics()
                self._cleanup_old_entries()
                pause = 30
            except Exception as e:
                core_logger.error(f"Monitoring-Fehler: {str(e)}")
                pause = 60

            # Event.wait returns True as soon as stop_error_monitoring()
            # sets the event, otherwise False after the pause elapsed.
            if self._monitoring_wakeup.wait(pause):
                break

    def _check_system_metrics(self):
        """Log a warning when the 1-minute load exceeds ``LOAD_THRESHOLD``.

        Does nothing on platforms without ``os.getloadavg``.
        """
        if not hasattr(os, 'getloadavg'):
            return
        load = os.getloadavg()[0]
        if load > self.LOAD_THRESHOLD:
            core_logger.warning(f"System-Load zu hoch: {load:.1f}")

    def _read_new_log_lines(self, log_file: str) -> List[str]:
        """Return the lines of ``log_file`` that were not analysed before.

        The first read of a file yields at most its last 100 lines; later
        reads yield only what was appended since. A file that shrank is
        treated as rotated and read from the start. A trailing line without
        a newline is left for the next read.
        """
        with open(log_file, 'rb') as handle:
            offset = self._log_offsets.get(log_file)
            if offset is None:
                raw_lines = handle.readlines()[-100:]
            else:
                handle.seek(0, os.SEEK_END)
                if handle.tell() < offset:
                    offset = 0
                handle.seek(offset)
                raw_lines = handle.readlines()
            end = handle.tell()

        if raw_lines and not raw_lines[-1].endswith(b'\n'):
            # The writer is still in the middle of this line.
            end -= len(raw_lines.pop())

        self._log_offsets[log_file] = end
        return [raw.decode('utf-8', errors='ignore') for raw in raw_lines]

    def _check_log_files(self):
        """Scan ``LOG_DIR/<name>/<name>.log`` files for error patterns.

        Only lines not seen in an earlier scan are analysed, so one log line
        is counted once instead of once per monitoring cycle. Failures are
        logged at debug level and never raised.
        """
        try:
            from utils.settings import LOG_DIR

            if not os.path.exists(LOG_DIR):
                return

            for log_subdir in os.listdir(LOG_DIR):
                log_file = os.path.join(LOG_DIR, log_subdir, f"{log_subdir}.log")
                if not os.path.exists(log_file):
                    continue

                for line in self._read_new_log_lines(log_file):
                    self._analyze_log_line(line, log_subdir)

        except Exception as e:
            core_logger.debug(f"Log-Datei-Prüfung fehlgeschlagen: {str(e)}")

    def _analyze_log_line(self, line: str, source: str):
        """Match ``line`` against all patterns; handle the first match only."""
        import re

        for pattern_name, pattern in self.error_patterns.items():
            for regex in pattern.patterns:
                if re.search(regex, line, re.IGNORECASE):
                    self._handle_error_detection(pattern_name, line, {'source': source})
                    return

    def _handle_error_detection(self, pattern_name: str, error_message: str,
                                context: Dict[str, Any] = None):
        """Record an occurrence and run the matching recovery actions.

        If the pattern already occurred ``max_occurrences`` times within its
        time window, the escalation actions are used (default: emergency
        stop); otherwise the pattern's regular actions.

        Raises:
            KeyError: If ``pattern_name`` is not a registered pattern.
        """
        pattern = self.error_patterns[pattern_name]

        # Counted before the new occurrence is appended.
        recent_count = self._count_recent_occurrences(pattern_name, pattern.time_window)

        occurrence = ErrorOccurrence(
            timestamp=datetime.now(),
            pattern_name=pattern_name,
            error_message=error_message,
            severity=pattern.severity,
            context=context or {}
        )
        self.error_occurrences.append(occurrence)

        if recent_count >= pattern.max_occurrences:
            actions = pattern.escalation_actions or [RecoveryAction.EMERGENCY_STOP]
            core_logger.critical(f"Fehler-Eskalation für {pattern_name}: {recent_count} Vorkommen")
        else:
            actions = pattern.actions

        self._execute_recovery_actions(occurrence, actions)

    def _count_recent_occurrences(self, pattern_name: str, time_window: int) -> int:
        """Count occurrences of ``pattern_name`` in the last ``time_window`` seconds."""
        cutoff_time = datetime.now() - timedelta(seconds=time_window)
        return sum(
            1 for occ in self.error_occurrences
            if occ.pattern_name == pattern_name and occ.timestamp > cutoff_time
        )

    def _execute_recovery_actions(self, occurrence: ErrorOccurrence,
                                  actions: List[RecoveryAction]):
        """Try ``actions`` in order until one handler reports success.

        Every action that has a handler is recorded in
        ``occurrence.recovery_attempted``. A handler that raises is logged
        and the next action is tried.
        """
        # The dataclass default may be None; appending to it would raise.
        if occurrence.recovery_attempted is None:
            occurrence.recovery_attempted = []

        for action in actions:
            handler = self.recovery_handlers.get(action)
            if handler is None:
                continue

            occurrence.recovery_attempted.append(action)
            try:
                succeeded = handler(occurrence)
            except Exception as e:
                core_logger.error(f"Recovery-Aktion {action.value} fehlgeschlagen: {str(e)}")
                continue

            if succeeded:
                occurrence.recovery_successful = True
                break

    def _handle_log_only(self, occurrence: ErrorOccurrence) -> bool:
        """Recovery: log the error and report success."""
        core_logger.warning(f"Fehler erkannt ({occurrence.pattern_name}): {occurrence.error_message}")
        return True

    def _handle_restart_service(self, occurrence: ErrorOccurrence) -> bool:
        """Recovery: restart a systemd service.

        The service name comes from ``context['service_name']`` and defaults
        to ``myp-https``.
        """
        try:
            service_name = (occurrence.context or {}).get('service_name', 'myp-https')
            subprocess.run(['sudo', 'systemctl', 'restart', service_name], check=True)
            core_logger.info(f"Service {service_name} neu gestartet")
            return True
        except Exception as e:
            core_logger.error(f"Service-Neustart fehlgeschlagen: {str(e)}")
            return False

    def _handle_restart_component(self, occurrence: ErrorOccurrence) -> bool:
        """Recovery: stop and start a registered component.

        The component is looked up by ``context['component']``. Returns
        ``False`` (so the next action is tried) when no such component is
        registered or it cannot be restarted.
        """
        name = (occurrence.context or {}).get('component')
        entry = self.components.get(name) if name else None
        if entry is None:
            core_logger.warning(
                f"Komponenten-Neustart nicht möglich ({occurrence.pattern_name}): "
                f"keine registrierte Komponente angegeben"
            )
            return False

        component = entry['component']
        stop = getattr(component, entry['stop_method'], None)
        # NOTE(review): assumes restartable components expose ``start()`` —
        # confirm against the registered components.
        start = getattr(component, 'start', None)
        if not callable(stop) or not callable(start):
            core_logger.warning(f"Komponente '{name}' unterstützt keinen Neustart")
            return False

        try:
            stop()
            start()
            core_logger.info(f"Komponente '{name}' neu gestartet")
            return True
        except Exception as e:
            core_logger.error(f"Komponenten-Neustart fehlgeschlagen: {str(e)}")
            return False

    def _handle_clear_cache(self, occurrence: ErrorOccurrence) -> bool:
        """Recovery: clear the cache directories."""
        try:
            self._clear_caches()
            core_logger.info("Cache erfolgreich geleert")
            return True
        except Exception as e:
            core_logger.error(f"Cache-Bereinigung fehlgeschlagen: {str(e)}")
            return False

    def _handle_reset_database(self, occurrence: ErrorOccurrence) -> bool:
        """Recovery: run a database WAL checkpoint.

        Deliberately non-destructive: it only triggers the same checkpoint
        the pre-restart cleanup uses and deletes no data.
        """
        try:
            self._checkpoint_database()
            core_logger.info("Datenbank-WAL-Checkpoint durchgeführt")
            return True
        except Exception as e:
            core_logger.error(f"Datenbank-Reset fehlgeschlagen: {str(e)}")
            return False

    def _handle_restart_system(self, occurrence: ErrorOccurrence) -> bool:
        """Recovery: schedule a forced system restart in 30 seconds."""
        try:
            self.schedule_operation(SystemOperation.RESTART, delay_seconds=30,
                                    reason=f"Error Recovery: {occurrence.pattern_name}", force=True)
            return True
        except Exception as e:
            core_logger.error(f"System-Neustart fehlgeschlagen: {str(e)}")
            return False

    def _handle_emergency_stop(self, occurrence: ErrorOccurrence) -> bool:
        """Recovery: perform an emergency stop (terminates the process)."""
        core_logger.critical(f"NOTFALL-STOP wegen: {occurrence.pattern_name}")
        self._emergency_stop({})
        return True

    # ===== PLATFORM-SPECIFIC FIXES =====

    def _apply_platform_fixes(self):
        """Apply the fixes for the current platform."""
        if platform.system() == "Windows":
            self._apply_windows_fixes()
        else:
            self._apply_unix_fixes()

    def _apply_windows_fixes(self):
        """Apply Windows fixes: German locale and SO_REUSEADDR on bind.

        Safe to call repeatedly; the ``socket.bind`` patch is installed
        only once.
        """
        try:
            import locale
            try:
                locale.setlocale(locale.LC_ALL, 'German_Germany.1252')
            except locale.Error:
                # Locale not installed on this machine; keep the default.
                pass

            import socket
            already_patched = getattr(socket.socket.bind, '_myp_reuse_patch', False)
            if hasattr(socket, 'SO_REUSEADDR') and not already_patched:
                original_bind = socket.socket.bind

                def windows_bind_with_reuse(sock, address):
                    sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                    return original_bind(sock, address)

                # Marker that prevents wrapping the wrapper on later calls.
                windows_bind_with_reuse._myp_reuse_patch = True
                socket.socket.bind = windows_bind_with_reuse

            core_logger.info("Windows-Fixes angewendet")
        except Exception as e:
            core_logger.warning(f"Windows-Fixes teilweise fehlgeschlagen: {str(e)}")

    def _apply_unix_fixes(self):
        """Default ``PYTHONIOENCODING`` to utf-8 if it is not set."""
        try:
            os.environ.setdefault('PYTHONIOENCODING', 'utf-8')
            core_logger.debug("Unix-Optimierungen angewendet")
        except Exception as e:
            core_logger.warning(f"Unix-Optimierungen fehlgeschlagen: {str(e)}")

    # ===== THREAD MANAGEMENT =====

    def register_thread(self, thread: threading.Thread):
        """Register a thread to be stopped by ``_force_shutdown_all_threads``."""
        self.registered_threads.append(thread)
        core_logger.debug(f"Thread registriert: {thread.name}")

    def register_cleanup_callback(self, func: Callable):
        """Register a callback run after the registered threads were stopped."""
        self.cleanup_callbacks.append(func)

    def _force_shutdown_all_threads(self):
        """Ask registered threads to stop, then run the cleanup callbacks.

        Each live thread gets ``stop()`` (if it has one) and a 2-second
        ``join``. Threads that are still alive afterwards are only logged.
        """
        for thread in self.registered_threads:
            if not thread.is_alive():
                continue
            try:
                if hasattr(thread, 'stop'):
                    thread.stop()

                thread.join(timeout=2)

                if thread.is_alive():
                    core_logger.warning(f"Thread {thread.name} reagiert nicht - erzwinge Beendigung")

            except Exception as e:
                core_logger.error(f"Fehler beim Thread-Shutdown: {str(e)}")

        for callback in self.cleanup_callbacks:
            try:
                callback()
            except Exception as e:
                core_logger.error(f"Cleanup-Callback fehlgeschlagen: {str(e)}")

    # ===== SIGNAL HANDLING =====

    def _register_signal_handlers(self):
        """Install ``_signal_handler`` for SIGTERM, SIGINT and SIGHUP.

        Signals missing on the platform are skipped. Registration is skipped
        entirely outside the main thread, where ``signal.signal`` would
        raise ``ValueError`` and make the manager impossible to construct.
        """
        if threading.current_thread() is not threading.main_thread():
            core_logger.warning("Signal-Handler nicht registriert (kein Haupt-Thread)")
            return

        for signal_name in ('SIGTERM', 'SIGINT', 'SIGHUP'):
            signum = getattr(signal, signal_name, None)
            if signum is not None:
                signal.signal(signum, self._signal_handler)

    def _signal_handler(self, signum, frame):
        """Log the received signal and start a graceful shutdown."""
        try:
            signal_name = signal.Signals(signum).name
        except ValueError:
            signal_name = f'Signal {signum}'

        core_logger.info(f"📡 {signal_name} empfangen - leite ordnungsgemäßes Herunterfahren ein")
        self.shutdown(0)

    def _cleanup_old_entries(self):
        """Drop error occurrences older than 24 hours."""
        cutoff_time = datetime.now() - timedelta(hours=24)
        self.error_occurrences = [
            occ for occ in self.error_occurrences
            if occ.timestamp > cutoff_time
        ]

    def _move_to_history(self, operation_id: str):
        """Move an operation from pending to history (last 50 are kept)."""
        if operation_id in self.pending_operations:
            operation = self.pending_operations.pop(operation_id)
            self.operation_history.append(operation)

            self.operation_history = self.operation_history[-50:]

    # ===== STATUS & MONITORING =====

    def get_system_status(self) -> Dict[str, Any]:
        """Return a snapshot of the manager's state as a plain dict."""
        one_hour_ago = datetime.now() - timedelta(hours=1)
        recent_errors = sum(1 for occ in self.error_occurrences if occ.timestamp > one_hour_ago)

        return {
            'shutdown_requested': self.shutdown_requested,
            'shutdown_time': self.shutdown_time.isoformat() if self.shutdown_time else None,
            'registered_components': len(self.components),
            'cleanup_functions': len(self.cleanup_functions),
            'pending_operations': len(self.pending_operations),
            'error_monitoring_active': self.monitoring_active,
            'error_patterns': len(self.error_patterns),
            'recent_errors': recent_errors,
            'registered_threads': len(self.registered_threads),
            'platform': platform.system(),
            'platform_fixes_applied': True
        }
|
||||
|
||||
# ===== GLOBALE INSTANZ =====
|
||||
|
||||
# Singleton-Pattern für Core System Manager
|
||||
# Module-wide singleton; created lazily on first access.
_core_system_manager = None


def get_core_system_manager(timeout: int = 30) -> CoreSystemManager:
    """Return the shared CoreSystemManager, creating it on first use.

    ``timeout`` is only passed on when the instance is created; later calls
    return the existing instance unchanged.
    """
    global _core_system_manager
    if _core_system_manager is not None:
        return _core_system_manager
    _core_system_manager = CoreSystemManager(timeout)
    return _core_system_manager
|
||||
|
||||
# ===== CONVENIENCE FUNCTIONS =====
|
||||
|
||||
def schedule_system_restart(delay_seconds: int = 60, user_id: str = None,
                            reason: str = None, force: bool = False) -> Dict[str, Any]:
    """Schedule a system restart through the shared manager.

    Returns the result dict of ``CoreSystemManager.schedule_operation``.
    """
    return get_core_system_manager().schedule_operation(
        SystemOperation.RESTART,
        delay_seconds=delay_seconds,
        user_id=user_id,
        reason=reason,
        force=force,
    )
|
||||
|
||||
def schedule_system_shutdown(delay_seconds: int = 30, user_id: str = None,
                             reason: str = None, force: bool = False) -> Dict[str, Any]:
    """Schedule a system shutdown through the shared manager.

    Returns the result dict of ``CoreSystemManager.schedule_operation``.
    """
    return get_core_system_manager().schedule_operation(
        SystemOperation.SHUTDOWN,
        delay_seconds=delay_seconds,
        user_id=user_id,
        reason=reason,
        force=force,
    )
|
||||
|
||||
def register_for_shutdown(component_or_function, name: str,
                          component_stop_method: str = "stop", priority: int = 1,
                          timeout: int = 10):
    """Register a component or a cleanup function for shutdown.

    Callables are registered as cleanup functions (using ``priority`` and
    ``timeout``); anything else is registered as a component whose
    ``component_stop_method`` is called on shutdown.
    """
    manager = get_core_system_manager()

    if not callable(component_or_function):
        manager.register_component(name, component_or_function, component_stop_method)
        return

    manager.register_cleanup_function(component_or_function, name, priority, timeout)
|
||||
|
||||
def shutdown_application(exit_code: int = 0):
    """Shut the application down gracefully via the shared manager."""
    get_core_system_manager().shutdown(exit_code)
|
||||
|
||||
def start_error_monitoring():
    """Start error monitoring on the shared manager."""
    get_core_system_manager().start_error_monitoring()
|
||||
|
||||
def stop_error_monitoring():
    """Stop error monitoring on the shared manager."""
    get_core_system_manager().stop_error_monitoring()
|
||||
|
||||
def get_system_status() -> Dict[str, Any]:
    """Return the status dict of the shared manager."""
    return get_core_system_manager().get_system_status()
|
||||
|
||||
def is_shutdown_requested() -> bool:
    """Return whether the shared manager has begun shutting down."""
    return get_core_system_manager().shutdown_requested
|
||||
|
||||
# ===== INITIALIZATION =====
|
||||
|
||||
# Logged once at import time. Note that nothing is initialised here: the
# manager instance is created lazily by get_core_system_manager().
core_logger.info("🔧 Core System Manager wird initialisiert...")
|
||||
|
||||
# Legacy-Kompatibilität: Alte Funktionen automatisch verfügbar machen
|
||||
def apply_all_windows_fixes():
    """Legacy entry point: apply the Windows fixes when running on Windows.

    The shared manager is obtained (and created if needed) on every
    platform; the fixes themselves are applied on Windows only.
    """
    manager = get_core_system_manager()
    if platform.system() != "Windows":
        return
    manager._apply_windows_fixes()
|
||||
|
||||
def fix_windows_socket_issues():
    """Legacy alias that delegates to ``apply_all_windows_fixes``."""
    return apply_all_windows_fixes()
|
||||
|
||||
# CLI Interface
|
||||
# CLI interface: python core_system.py <status|restart|monitor>
if __name__ == "__main__":
    if len(sys.argv) > 1:
        command = sys.argv[1]
        manager = get_core_system_manager()

        if command == "status":
            status = manager.get_system_status()
            print("=== Core System Status ===")
            for key, value in status.items():
                print(f"{key}: {value}")

        elif command == "restart":
            result = schedule_system_restart(delay_seconds=10)
            print(f"Restart scheduled: {result}")

            # The restart runs on a daemon timer thread. Without waiting for
            # it the interpreter would exit right here and the scheduled
            # restart would never be executed.
            pending = manager.pending_operations.get(result.get('operation_id'))
            timer = pending.get('timer') if pending else None
            if timer is not None:
                timer.join()

        elif command == "monitor":
            print("Starting error monitoring...")
            start_error_monitoring()
            try:
                # Keep the main thread alive; monitoring runs in a daemon thread.
                while True:
                    time.sleep(1)
            except KeyboardInterrupt:
                print("Stopping monitoring...")
                stop_error_monitoring()

        else:
            print("Available commands: status, restart, monitor")
    else:
        print("Core System Manager - Available commands: status, restart, monitor")
|
Reference in New Issue
Block a user