#!/usr/bin/env python3
"""
Robust error-recovery system for maintenance-free production operation.

Provides automatic error detection, recovery and prevention: a background
monitor scans log files, system metrics and service states, matches them
against known error patterns and runs the configured recovery actions.
"""

import os
import sys
import time
import threading
import traceback
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Callable, Any
from dataclasses import dataclass, field
from enum import Enum
import logging
import json
import re
import subprocess
from collections import Counter, deque
from contextlib import contextmanager
import signal

# psutil is only needed for the system-metrics check. Importing it lazily
# guarded keeps the rest of the recovery manager usable when it is missing.
try:
    import psutil
except ImportError:
    psutil = None

# Logging setup: prefer the project logger, fall back to plain stdlib logging.
try:
    from utils.logging_config import get_logger
    recovery_logger = get_logger("error_recovery")
except ImportError:
    logging.basicConfig(level=logging.INFO)
    recovery_logger = logging.getLogger("error_recovery")


class ErrorSeverity(Enum):
    """Severity levels of detected errors."""
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"


class RecoveryAction(Enum):
    """Recovery actions the manager can execute."""
    LOG_ONLY = "log_only"
    RESTART_SERVICE = "restart_service"
    RESTART_COMPONENT = "restart_component"
    CLEAR_CACHE = "clear_cache"
    RESET_DATABASE = "reset_database"
    RESTART_SYSTEM = "restart_system"
    EMERGENCY_STOP = "emergency_stop"


@dataclass
class ErrorPattern:
    """Describes an error pattern and the recovery actions attached to it."""
    name: str
    patterns: List[str]  # Regex patterns used to detect the error in log lines
    severity: ErrorSeverity
    actions: List[RecoveryAction]
    max_occurrences: int = 3  # Occurrences tolerated inside time_window before escalating
    time_window: int = 300  # Time window in seconds
    escalation_actions: List[RecoveryAction] = field(default_factory=list)
    description: str = ""


@dataclass
class ErrorOccurrence:
    """A single occurrence of a detected error."""
    timestamp: datetime
    pattern_name: str
    error_message: str
    severity: ErrorSeverity
    context: Dict[str, Any] = field(default_factory=dict)
    recovery_attempted: List[RecoveryAction] = field(default_factory=list)
    recovery_successful: bool = False


class ErrorRecoveryManager:
    """
    Central manager for automatic error detection and recovery.

    While monitoring is active, a daemon thread periodically checks the
    configured log files, system metrics and service states and runs the
    recovery actions of every matching error pattern.
    """

    # Upper bound on the number of log lines analysed per file and cycle.
    _MAX_LINES_PER_CHECK = 1000

    def __init__(self):
        self.is_active = False
        self.error_patterns: Dict[str, ErrorPattern] = {}
        self.error_history: List[ErrorOccurrence] = []
        self.recovery_handlers: Dict[RecoveryAction, Callable] = {}
        self.monitoring_thread: Optional[threading.Thread] = None
        self.lock = threading.Lock()

        # Stop signal of the current monitor thread (replaced on every start).
        self._stop_event = threading.Event()
        # Byte offset up to which each log file has already been analysed.
        self._log_offsets: Dict[str, int] = {}

        # Configuration
        self.config = {
            "check_interval": 30,  # seconds
            "max_history_size": 1000,
            "auto_recovery_enabled": True,
            "critical_error_threshold": 5,
            "system_restart_threshold": 10,
            "log_file_paths": [
                "logs/app/app.log",
                "logs/errors/errors.log",
                "logs/database/database.log"
            ]
        }

        # Register the built-in error patterns
        self._init_default_patterns()

        # Register the recovery handlers
        self._init_recovery_handlers()

        recovery_logger.info("🛡️ Error-Recovery-Manager initialisiert")

    def _init_default_patterns(self):
        """Register the built-in error patterns for common problems."""
        patterns = [
            # Database errors
            ErrorPattern(
                name="database_lock",
                patterns=[
                    r"database is locked",
                    r"SQLite.*locked",
                    r"OperationalError.*locked"
                ],
                severity=ErrorSeverity.HIGH,
                actions=[RecoveryAction.RESET_DATABASE],
                max_occurrences=3,
                escalation_actions=[RecoveryAction.RESTART_SERVICE],
                description="Datenbank-Sperrung"
            ),

            # Memory errors
            ErrorPattern(
                name="memory_exhausted",
                patterns=[
                    r"MemoryError",
                    r"Out of memory",
                    r"Cannot allocate memory"
                ],
                severity=ErrorSeverity.CRITICAL,
                actions=[RecoveryAction.CLEAR_CACHE, RecoveryAction.RESTART_SERVICE],
                max_occurrences=2,
                escalation_actions=[RecoveryAction.RESTART_SYSTEM],
                description="Speicher erschöpft"
            ),

            # Network errors
            ErrorPattern(
                name="connection_error",
                patterns=[
                    r"ConnectionError",
                    r"Network is unreachable",
                    r"Connection refused"
                ],
                severity=ErrorSeverity.MEDIUM,
                actions=[RecoveryAction.RESTART_COMPONENT],
                max_occurrences=5,
                escalation_actions=[RecoveryAction.RESTART_SERVICE],
                description="Netzwerk-Verbindungsfehler"
            ),

            # Kiosk errors
            ErrorPattern(
                name="kiosk_crash",
                patterns=[
                    r"chromium.*crashed",
                    r"firefox.*crashed",
                    r"X11.*error",
                    r"Display.*not found"
                ],
                severity=ErrorSeverity.HIGH,
                actions=[RecoveryAction.RESTART_COMPONENT],
                max_occurrences=3,
                escalation_actions=[RecoveryAction.RESTART_SYSTEM],
                description="Kiosk-Display Fehler"
            ),

            # Service errors
            ErrorPattern(
                name="service_failure",
                patterns=[
                    r"systemctl.*failed",
                    r"Service.*not found",
                    r"Failed to start"
                ],
                severity=ErrorSeverity.HIGH,
                actions=[RecoveryAction.RESTART_SERVICE],
                max_occurrences=3,
                escalation_actions=[RecoveryAction.RESTART_SYSTEM],
                description="System-Service Fehler"
            ),

            # Disk errors
            ErrorPattern(
                name="disk_full",
                patterns=[
                    r"No space left on device",
                    r"Disk full",
                    r"OSError.*28"
                ],
                severity=ErrorSeverity.CRITICAL,
                actions=[RecoveryAction.CLEAR_CACHE],
                max_occurrences=1,
                escalation_actions=[RecoveryAction.EMERGENCY_STOP],
                description="Festplatte voll"
            ),

            # Flask errors
            ErrorPattern(
                name="flask_error",
                patterns=[
                    r"Internal Server Error",
                    r"500 Internal Server Error",
                    r"Application failed to start"
                ],
                severity=ErrorSeverity.HIGH,
                actions=[RecoveryAction.RESTART_SERVICE],
                max_occurrences=3,
                escalation_actions=[RecoveryAction.RESTART_SYSTEM],
                description="Flask-Anwendungsfehler"
            ),

            # System load: reported by _check_system_metrics only, so it has
            # no log regexes. Without this entry the load check was dropped
            # as an unknown pattern. Deliberately log-only: a high load
            # average alone does not justify restarting anything.
            ErrorPattern(
                name="system_overload",
                patterns=[],
                severity=ErrorSeverity.MEDIUM,
                actions=[RecoveryAction.LOG_ONLY],
                max_occurrences=3,
                escalation_actions=[RecoveryAction.LOG_ONLY],
                description="System-Überlastung"
            )
        ]

        for pattern in patterns:
            self.error_patterns[pattern.name] = pattern

    def _init_recovery_handlers(self):
        """Map every RecoveryAction to the method that implements it."""
        self.recovery_handlers = {
            RecoveryAction.LOG_ONLY: self._handle_log_only,
            RecoveryAction.RESTART_SERVICE: self._handle_restart_service,
            RecoveryAction.RESTART_COMPONENT: self._handle_restart_component,
            RecoveryAction.CLEAR_CACHE: self._handle_clear_cache,
            RecoveryAction.RESET_DATABASE: self._handle_reset_database,
            RecoveryAction.RESTART_SYSTEM: self._handle_restart_system,
            RecoveryAction.EMERGENCY_STOP: self._handle_emergency_stop
        }

    def start_monitoring(self):
        """Start the background monitor thread (no-op if already active)."""
        if self.is_active:
            recovery_logger.warning("Monitoring bereits aktiv")
            return

        self.is_active = True
        # Every thread gets its own stop event, so a thread that is still
        # winding down after stop_monitoring() can never be revived by a
        # quick restart.
        stop_event = threading.Event()
        self._stop_event = stop_event
        self.monitoring_thread = threading.Thread(
            target=self._monitor_loop,
            args=(stop_event,),
            daemon=True,
            name="ErrorRecoveryMonitor"
        )
        self.monitoring_thread.start()
        recovery_logger.info("🔍 Error-Monitoring gestartet")

    def stop_monitoring(self):
        """Stop the monitor thread and wait up to 5 seconds for it to end."""
        self.is_active = False
        # Wake the loop immediately instead of waiting out check_interval.
        self._stop_event.set()

        if self.monitoring_thread and self.monitoring_thread.is_alive():
            self.monitoring_thread.join(timeout=5)

        recovery_logger.info("🛑 Error-Monitoring gestoppt")

    def _monitor_loop(self, stop_event: Optional[threading.Event] = None):
        """
        Main loop of the monitor thread.

        Args:
            stop_event: Event that ends the loop when set. Defaults to the
                manager's current stop event.
        """
        if stop_event is None:
            stop_event = self._stop_event

        while self.is_active and not stop_event.is_set():
            try:
                # Scan log files
                self._check_log_files()

                # Check system metrics
                self._check_system_metrics()

                # Check service states
                self._check_service_status()

                # Trim old history entries
                self._cleanup_old_entries()

                pause = self.config["check_interval"]

            except Exception as e:
                recovery_logger.error(f"Fehler in Monitor-Loop: {e}")
                pause = 5  # Short pause after a failed cycle

            # Interruptible sleep: returns as soon as stop_event is set.
            stop_event.wait(pause)

    def _check_log_files(self):
        """Scan the configured log files for error patterns.

        Only lines appended since the previous scan are analysed, so one
        logged error is counted once instead of on every cycle.
        """
        for log_path in self.config["log_file_paths"]:
            try:
                if not os.path.exists(log_path):
                    # Forget the offset so a re-created file is read from its start.
                    self._log_offsets.pop(log_path, None)
                    continue

                for line in self._read_new_log_lines(log_path):
                    self._analyze_log_line(line, log_path)

            except Exception as e:
                recovery_logger.debug(f"Fehler beim Lesen von {log_path}: {e}")

    def _read_new_log_lines(self, log_path: str) -> List[str]:
        """Return the complete lines appended to log_path since the last call.

        At most the last ``_MAX_LINES_PER_CHECK`` new lines are returned. On
        the first call for a file this yields the tail of the existing file.
        A trailing line without a newline is held back until it is complete.
        """
        offset = self._log_offsets.get(log_path, 0)

        with open(log_path, "rb") as f:
            # A file shorter than the stored offset was truncated or rotated.
            if os.fstat(f.fileno()).st_size < offset:
                offset = 0
            f.seek(offset)
            raw_lines = deque(f, maxlen=self._MAX_LINES_PER_CHECK)
            end = f.tell()

        if raw_lines and not raw_lines[-1].endswith(b"\n"):
            # The writer is mid-line; re-read this fragment next time.
            end -= len(raw_lines.pop())

        self._log_offsets[log_path] = end
        # Undecodable bytes must not hide the rest of the file.
        return [raw.decode("utf-8", errors="replace") for raw in raw_lines]

    def _analyze_log_line(self, line: str, source: str):
        """Match one log line against all patterns and handle every hit.

        Each error pattern is reported at most once per line (first matching
        regex wins); different patterns may all match the same line.

        NOTE(review): if recovery_logger writes into one of the monitored log
        files, this manager's own messages (which quote the error text) would
        be detected again on the next scan — confirm the logger target.
        """
        for pattern_name, pattern in self.error_patterns.items():
            for regex in pattern.patterns:
                try:
                    matched = re.search(regex, line, re.IGNORECASE) is not None
                except (re.error, TypeError) as e:
                    recovery_logger.debug(f"Regex-Fehler für {regex}: {e}")
                    continue

                if not matched:
                    continue

                try:
                    self._handle_error_detection(
                        pattern_name=pattern_name,
                        error_message=line.strip(),
                        context={"source": source, "pattern": regex}
                    )
                except Exception as e:
                    # Keep scanning, but do not hide a handling failure as a
                    # debug-level "regex error".
                    recovery_logger.error(
                        f"Fehlerbehandlung für {pattern_name} fehlgeschlagen: {e}"
                    )
                break

    def _check_system_metrics(self):
        """Check memory, disk and load and report critical values."""
        if psutil is None:
            recovery_logger.debug("System-Metrics-Check übersprungen: psutil nicht verfügbar")
            return

        try:
            # Memory check
            memory = psutil.virtual_memory()
            if memory.percent > 95:
                self._handle_error_detection(
                    pattern_name="memory_exhausted",
                    error_message=f"Speicherverbrauch kritisch: {memory.percent:.1f}%",
                    context={"memory_percent": memory.percent}
                )

            # Disk check
            disk = psutil.disk_usage('/')
            if disk.percent > 98:
                self._handle_error_detection(
                    pattern_name="disk_full",
                    error_message=f"Festplatte fast voll: {disk.percent:.1f}%",
                    context={"disk_percent": disk.percent}
                )

            # Load check (getloadavg is not available on every psutil version)
            if hasattr(psutil, 'getloadavg'):
                load_avg = psutil.getloadavg()[0]
                if load_avg > 5.0:  # Very high load
                    self._handle_error_detection(
                        pattern_name="system_overload",
                        error_message=f"System-Last kritisch: {load_avg:.2f}",
                        context={"load_average": load_avg}
                    )

        except Exception as e:
            recovery_logger.debug(f"System-Metrics-Check fehlgeschlagen: {e}")

    def _check_service_status(self):
        """Report every monitored systemd service that is not active."""
        services = ["myp-https.service", "myp-kiosk.service"]

        for service in services:
            try:
                result = subprocess.run(
                    ["sudo", "systemctl", "is-active", service],
                    capture_output=True, text=True, timeout=10
                )

                # "systemctl is-active" exits non-zero for inactive units.
                if result.returncode != 0:
                    self._handle_error_detection(
                        pattern_name="service_failure",
                        error_message=f"Service {service} nicht aktiv: {result.stdout.strip()}",
                        context={"service": service, "status": result.stdout.strip()}
                    )

            except Exception as e:
                recovery_logger.debug(f"Service-Check für {service} fehlgeschlagen: {e}")

    def _handle_error_detection(self, pattern_name: str, error_message: str,
                                context: Optional[Dict[str, Any]] = None):
        """Record a detected error and run the matching recovery actions.

        Args:
            pattern_name: Key of the matching entry in ``error_patterns``.
                Unknown names are logged and ignored.
            error_message: Text describing the concrete occurrence.
            context: Optional extra data stored with the occurrence.

        The manager lock is held for the whole call, including the recovery
        actions, so detections are processed one at a time.
        """
        with self.lock:
            if pattern_name not in self.error_patterns:
                recovery_logger.warning(f"Unbekanntes Fehlermuster: {pattern_name}")
                return

            pattern = self.error_patterns[pattern_name]

            # Count earlier occurrences inside the window (current one excluded)
            recent_occurrences = self._count_recent_occurrences(pattern_name, pattern.time_window)

            # Record the occurrence
            occurrence = ErrorOccurrence(
                timestamp=datetime.now(),
                pattern_name=pattern_name,
                error_message=error_message,
                severity=pattern.severity,
                context=context or {}
            )

            self.error_history.append(occurrence)

            # Bound the history here as well: the periodic cleanup only runs
            # while the monitor thread is active.
            max_size = self.config["max_history_size"]
            if max_size > 0 and len(self.error_history) > max_size:
                del self.error_history[:-max_size]

            recovery_logger.warning(f"🚨 Fehler erkannt: {pattern_name} - {error_message}")

            # Choose between normal recovery and escalation
            if recent_occurrences >= pattern.max_occurrences:
                # Escalation
                actions = pattern.escalation_actions
                recovery_logger.error(f"🔥 Eskalation für {pattern_name}: {recent_occurrences} Vorkommen in {pattern.time_window}s")
            else:
                # Normal recovery
                actions = pattern.actions

            # Run the recovery actions
            if self.config["auto_recovery_enabled"]:
                self._execute_recovery_actions(occurrence, actions)

    def _count_recent_occurrences(self, pattern_name: str, time_window: int) -> int:
        """Count history entries of pattern_name newer than time_window seconds."""
        cutoff_time = datetime.now() - timedelta(seconds=time_window)
        return sum(1 for err in self.error_history
                   if err.pattern_name == pattern_name and err.timestamp > cutoff_time)

    def _execute_recovery_actions(self, occurrence: ErrorOccurrence, actions: List[RecoveryAction]):
        """Run actions in order and stop after the first one that succeeds.

        Every action that has a handler is recorded in
        ``occurrence.recovery_attempted``, including handlers that raise.
        """
        for action in actions:
            try:
                recovery_logger.info(f"🔧 Führe Recovery-Aktion aus: {action.value}")

                handler = self.recovery_handlers.get(action)
                if handler:
                    # Record first so a raising handler still counts as attempted.
                    occurrence.recovery_attempted.append(action)
                    success = handler(occurrence)

                    if success:
                        occurrence.recovery_successful = True
                        recovery_logger.info(f"✅ Recovery erfolgreich: {action.value}")
                        break  # Stop after a successful recovery
                    else:
                        recovery_logger.warning(f"❌ Recovery fehlgeschlagen: {action.value}")
                else:
                    recovery_logger.error(f"Kein Handler für Recovery-Aktion: {action.value}")

            except Exception as e:
                recovery_logger.error(f"Fehler bei Recovery-Aktion {action.value}: {e}")

    def _handle_log_only(self, occurrence: ErrorOccurrence) -> bool:
        """Handler: log the occurrence and take no further action."""
        recovery_logger.info(f"📝 Log-Only für: {occurrence.error_message}")
        return True

    def _handle_restart_service(self, occurrence: ErrorOccurrence) -> bool:
        """Handler: schedule a service restart via the system control manager."""
        try:
            from utils.system_control import get_system_control_manager, SystemOperation

            manager = get_system_control_manager()
            result = manager.schedule_operation(
                SystemOperation.SERVICE_RESTART,
                delay_seconds=5,
                reason=f"Automatische Recovery für: {occurrence.pattern_name}"
            )

            return result.get("success", False)

        except Exception as e:
            recovery_logger.error(f"Service-Neustart fehlgeschlagen: {e}")
            return False

    def _handle_restart_component(self, occurrence: ErrorOccurrence) -> bool:
        """Handler: schedule a component (kiosk) restart."""
        try:
            from utils.system_control import get_system_control_manager, SystemOperation

            manager = get_system_control_manager()
            result = manager.schedule_operation(
                SystemOperation.KIOSK_RESTART,
                delay_seconds=5,
                reason=f"Automatische Recovery für: {occurrence.pattern_name}"
            )

            return result.get("success", False)

        except Exception as e:
            recovery_logger.error(f"Komponenten-Neustart fehlgeschlagen: {e}")
            return False

    def _handle_clear_cache(self, occurrence: ErrorOccurrence) -> bool:
        """Handler: clear the application caches and run ``sync``."""
        try:
            # Clear the app caches
            from app import clear_user_cache, clear_printer_status_cache
            clear_user_cache()
            clear_printer_status_cache()

            # Flush filesystem buffers (not available on Windows)
            if os.name != 'nt':
                subprocess.run(["sudo", "sync"], timeout=10)

            return True

        except Exception as e:
            recovery_logger.error(f"Cache-Clearing fehlgeschlagen: {e}")
            return False

    def _handle_reset_database(self, occurrence: ErrorOccurrence) -> bool:
        """Handler: run the project's safe database cleanup."""
        try:
            from utils.database_cleanup import safe_database_cleanup

            result = safe_database_cleanup(force_mode_switch=True)
            return result.get("success", False)

        except Exception as e:
            recovery_logger.error(f"Database-Reset fehlgeschlagen: {e}")
            return False

    def _handle_restart_system(self, occurrence: ErrorOccurrence) -> bool:
        """Handler: schedule a forced system restart with a 60 second delay."""
        try:
            from utils.system_control import schedule_system_restart

            result = schedule_system_restart(
                delay_seconds=60,
                reason=f"Automatische Recovery für kritischen Fehler: {occurrence.pattern_name}",
                force=True
            )

            return result.get("success", False)

        except Exception as e:
            recovery_logger.error(f"System-Neustart fehlgeschlagen: {e}")
            return False

    def _handle_emergency_stop(self, occurrence: ErrorOccurrence) -> bool:
        """Handler: trigger an immediate shutdown via the shutdown manager."""
        try:
            recovery_logger.critical(f"🚨 NOTFALL-STOPP: {occurrence.error_message}")

            # Shut down immediately
            from utils.shutdown_manager import get_shutdown_manager
            shutdown_manager = get_shutdown_manager()
            shutdown_manager.force_shutdown(1)

            return True

        except Exception as e:
            recovery_logger.error(f"Notfall-Stopp fehlgeschlagen: {e}")
            return False

    def _cleanup_old_entries(self):
        """Trim the error history to the newest ``max_history_size`` entries."""
        with self.lock:
            if len(self.error_history) > self.config["max_history_size"]:
                self.error_history = self.error_history[-self.config["max_history_size"]:]

    def get_error_statistics(self) -> Dict[str, Any]:
        """Return aggregate statistics over the current error history.

        Returns:
            Dict with total and last-24h counts, counts per severity and per
            registered pattern, the recovery success rate in percent, and
            the monitoring / auto-recovery flags.
        """
        with self.lock:
            history = self.error_history
            total_errors = len(history)

            # Errors by severity (every severity is listed, also with 0)
            severity_counts = Counter(err.severity for err in history)
            by_severity = {severity.value: severity_counts[severity]
                           for severity in ErrorSeverity}

            # Errors by registered pattern
            pattern_counts = Counter(err.pattern_name for err in history)
            by_pattern = {name: pattern_counts[name] for name in self.error_patterns}

            # Last 24 hours
            last_24h = datetime.now() - timedelta(hours=24)
            recent_errors = sum(1 for err in history if err.timestamp > last_24h)

            # Recovery success rate
            attempted_recoveries = sum(1 for err in history if err.recovery_attempted)
            successful_recoveries = sum(1 for err in history if err.recovery_successful)
            success_rate = (successful_recoveries / attempted_recoveries * 100) if attempted_recoveries > 0 else 0

            return {
                "total_errors": total_errors,
                "errors_last_24h": recent_errors,
                "by_severity": by_severity,
                "by_pattern": by_pattern,
                "recovery_success_rate": round(success_rate, 1),
                "monitoring_active": self.is_active,
                "auto_recovery_enabled": self.config["auto_recovery_enabled"]
            }

    def get_recent_errors(self, limit: int = 50) -> List[Dict[str, Any]]:
        """Return the newest errors as JSON-friendly dicts, oldest first.

        Args:
            limit: Maximum number of entries. ``0``, ``None`` or a negative
                value returns the complete history.
        """
        with self.lock:
            if limit and limit > 0:
                recent = self.error_history[-limit:]
            else:
                recent = self.error_history

            return [{
                "timestamp": err.timestamp.isoformat(),
                "pattern_name": err.pattern_name,
                "error_message": err.error_message,
                "severity": err.severity.value,
                "context": err.context,
                "recovery_attempted": [action.value for action in err.recovery_attempted],
                "recovery_successful": err.recovery_successful
            } for err in recent]


# Global error-recovery manager
_error_recovery_manager: Optional[ErrorRecoveryManager] = None
_recovery_lock = threading.Lock()


def get_error_recovery_manager() -> ErrorRecoveryManager:
    """
    Return the process-wide ErrorRecoveryManager, creating it on first use.

    Returns:
        ErrorRecoveryManager: The global error-recovery manager
    """
    global _error_recovery_manager

    with _recovery_lock:
        if _error_recovery_manager is None:
            _error_recovery_manager = ErrorRecoveryManager()
        return _error_recovery_manager


def start_error_monitoring():
    """Start monitoring on the global manager."""
    manager = get_error_recovery_manager()
    manager.start_monitoring()


def stop_error_monitoring():
    """Stop monitoring on the global manager."""
    manager = get_error_recovery_manager()
    manager.stop_monitoring()


def force_error_check(log_message: Optional[str] = None):
    """Run a manual error check on log_message; does nothing if it is empty."""
    if log_message:
        manager = get_error_recovery_manager()
        manager._analyze_log_line(log_message, "manual_check")