#!/usr/bin/env python3
"""
Robust error-recovery system for maintenance-free production operation.
Automatic error detection, remediation, and prevention.
"""

import os
import re
import time
import threading
import logging
import json
import subprocess
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Callable, Any
from dataclasses import dataclass, field
from enum import Enum

import psutil

# Logging setup
try:
    from utils.logging_config import get_logger
    recovery_logger = get_logger("error_recovery")
except ImportError:
    logging.basicConfig(level=logging.INFO)
    recovery_logger = logging.getLogger("error_recovery")


class ErrorSeverity(Enum):
    """Error severity levels"""
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"


class RecoveryAction(Enum):
    """Available recovery actions"""
    LOG_ONLY = "log_only"
    RESTART_SERVICE = "restart_service"
    RESTART_COMPONENT = "restart_component"
    CLEAR_CACHE = "clear_cache"
    RESET_DATABASE = "reset_database"
    RESTART_SYSTEM = "restart_system"
    EMERGENCY_STOP = "emergency_stop"


@dataclass
class ErrorPattern:
    """Defines an error pattern and the recovery actions associated with it"""
    name: str
    patterns: List[str]  # Regex patterns for error detection
    severity: ErrorSeverity
    actions: List[RecoveryAction]
    max_occurrences: int = 3  # Maximum number of occurrences before escalation
    time_window: int = 300  # Time window in seconds
    escalation_actions: List[RecoveryAction] = field(default_factory=list)
    description: str = ""


@dataclass
class ErrorOccurrence:
    """A single occurrence of an error"""
    timestamp: datetime
    pattern_name: str
    error_message: str
    severity: ErrorSeverity
    context: Dict[str, Any] = field(default_factory=dict)
    recovery_attempted: List[RecoveryAction] = field(default_factory=list)
    recovery_successful: bool = False
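

# Illustrative example (not part of the shipped defaults): further patterns can
# be registered at runtime on the singleton defined at the bottom of this
# module, e.g. a hypothetical "login_failures" pattern:
#
#   get_error_recovery_manager().error_patterns["login_failures"] = ErrorPattern(
#       name="login_failures",
#       patterns=[r"Failed login for user"],
#       severity=ErrorSeverity.LOW,
#       actions=[RecoveryAction.LOG_ONLY],
#   )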


class ErrorRecoveryManager:
    """
    Central manager for automatic error detection and remediation.
    Continuously monitors the system and performs automatic recovery.
    """

    def __init__(self):
        self.is_active = False
        self.error_patterns: Dict[str, ErrorPattern] = {}
        self.error_history: List[ErrorOccurrence] = []
        self.recovery_handlers: Dict[RecoveryAction, Callable] = {}
        self.monitoring_thread: Optional[threading.Thread] = None
        self.lock = threading.Lock()

        # Configuration
        self.config = {
            "check_interval": 30,  # seconds
            "max_history_size": 1000,
            "auto_recovery_enabled": True,
            "critical_error_threshold": 5,
            "system_restart_threshold": 10,
            "log_file_paths": [
                "logs/app/app.log",
                "logs/errors/errors.log",
                "logs/database/database.log"
            ]
        }

        # Initialize default error patterns
        self._init_default_patterns()

        # Initialize recovery handlers
        self._init_recovery_handlers()

        recovery_logger.info("🛡️ Error-recovery manager initialized")

    def _init_default_patterns(self):
        """Initializes default error patterns for common problems"""
        patterns = [
            # Database errors
            ErrorPattern(
                name="database_lock",
                patterns=[
                    r"database is locked",
                    r"SQLite.*locked",
                    r"OperationalError.*locked"
                ],
                severity=ErrorSeverity.HIGH,
                actions=[RecoveryAction.RESET_DATABASE],
                max_occurrences=3,
                escalation_actions=[RecoveryAction.RESTART_SERVICE],
                description="Database locked"
            ),

            # Memory errors
            ErrorPattern(
                name="memory_exhausted",
                patterns=[
                    r"MemoryError",
                    r"Out of memory",
                    r"Cannot allocate memory"
                ],
                severity=ErrorSeverity.CRITICAL,
                actions=[RecoveryAction.CLEAR_CACHE, RecoveryAction.RESTART_SERVICE],
                max_occurrences=2,
                escalation_actions=[RecoveryAction.RESTART_SYSTEM],
                description="Memory exhausted"
            ),

            # Network errors
            ErrorPattern(
                name="connection_error",
                patterns=[
                    r"ConnectionError",
                    r"Network is unreachable",
                    r"Connection refused"
                ],
                severity=ErrorSeverity.MEDIUM,
                actions=[RecoveryAction.RESTART_COMPONENT],
                max_occurrences=5,
                escalation_actions=[RecoveryAction.RESTART_SERVICE],
                description="Network connection error"
            ),

            # Kiosk errors
            ErrorPattern(
                name="kiosk_crash",
                patterns=[
                    r"chromium.*crashed",
                    r"firefox.*crashed",
                    r"X11.*error",
                    r"Display.*not found"
                ],
                severity=ErrorSeverity.HIGH,
                actions=[RecoveryAction.RESTART_COMPONENT],
                max_occurrences=3,
                escalation_actions=[RecoveryAction.RESTART_SYSTEM],
                description="Kiosk display error"
            ),

            # Service errors
            ErrorPattern(
                name="service_failure",
                patterns=[
                    r"systemctl.*failed",
                    r"Service.*not found",
                    r"Failed to start"
                ],
                severity=ErrorSeverity.HIGH,
                actions=[RecoveryAction.RESTART_SERVICE],
                max_occurrences=3,
                escalation_actions=[RecoveryAction.RESTART_SYSTEM],
                description="System service error"
            ),

            # Disk errors
            ErrorPattern(
                name="disk_full",
                patterns=[
                    r"No space left on device",
                    r"Disk full",
                    r"OSError.*28"
                ],
                severity=ErrorSeverity.CRITICAL,
                actions=[RecoveryAction.CLEAR_CACHE],
                max_occurrences=1,
                escalation_actions=[RecoveryAction.EMERGENCY_STOP],
                description="Disk full"
            ),

            # Flask errors
            ErrorPattern(
                name="flask_error",
                patterns=[
                    r"Internal Server Error",
                    r"500 Internal Server Error",
                    r"Application failed to start"
                ],
                severity=ErrorSeverity.HIGH,
                actions=[RecoveryAction.RESTART_SERVICE],
                max_occurrences=3,
                escalation_actions=[RecoveryAction.RESTART_SYSTEM],
                description="Flask application error"
            ),

            # Load spikes: registered so the "system_overload" detection in
            # _check_system_metrics resolves to a known pattern instead of
            # being dropped as unknown (log-only, no escalation actions)
            ErrorPattern(
                name="system_overload",
                patterns=[
                    r"Critical system load"
                ],
                severity=ErrorSeverity.MEDIUM,
                actions=[RecoveryAction.LOG_ONLY],
                description="Sustained high system load"
            )
        ]

        for pattern in patterns:
            self.error_patterns[pattern.name] = pattern

    def _init_recovery_handlers(self):
        """Initializes the handlers for recovery actions"""
        self.recovery_handlers = {
            RecoveryAction.LOG_ONLY: self._handle_log_only,
            RecoveryAction.RESTART_SERVICE: self._handle_restart_service,
            RecoveryAction.RESTART_COMPONENT: self._handle_restart_component,
            RecoveryAction.CLEAR_CACHE: self._handle_clear_cache,
            RecoveryAction.RESET_DATABASE: self._handle_reset_database,
            RecoveryAction.RESTART_SYSTEM: self._handle_restart_system,
            RecoveryAction.EMERGENCY_STOP: self._handle_emergency_stop
        }

    def start_monitoring(self):
        """Starts continuous monitoring"""
        if self.is_active:
            recovery_logger.warning("Monitoring already active")
            return

        self.is_active = True
        self.monitoring_thread = threading.Thread(
            target=self._monitor_loop,
            daemon=True,
            name="ErrorRecoveryMonitor"
        )
        self.monitoring_thread.start()
        recovery_logger.info("🔍 Error monitoring started")

    def stop_monitoring(self):
        """Stops monitoring"""
        self.is_active = False
        if self.monitoring_thread and self.monitoring_thread.is_alive():
            self.monitoring_thread.join(timeout=5)
        recovery_logger.info("🛑 Error monitoring stopped")

    def _monitor_loop(self):
        """Main loop for continuous monitoring"""
        while self.is_active:
            try:
                # Check log files
                self._check_log_files()

                # Check system metrics
                self._check_system_metrics()

                # Check service status
                self._check_service_status()

                # Prune old entries
                self._cleanup_old_entries()

                time.sleep(self.config["check_interval"])

            except Exception as e:
                recovery_logger.error(f"Error in monitor loop: {e}")
                time.sleep(5)  # Short pause after errors

    def _check_log_files(self):
        """Scans log files for error patterns"""
        for log_path in self.config["log_file_paths"]:
            try:
                if not os.path.exists(log_path):
                    continue

                # Read only recent lines (simplified: the whole file is re-read
                # each cycle, so a matching line may be reported repeatedly
                # until it rotates out of the last 1000 lines)
                with open(log_path, 'r', encoding='utf-8') as f:
                    lines = f.readlines()
                    recent_lines = lines[-1000:] if len(lines) > 1000 else lines

                for line in recent_lines:
                    self._analyze_log_line(line, log_path)

            except Exception as e:
                recovery_logger.debug(f"Error reading {log_path}: {e}")

    def _analyze_log_line(self, line: str, source: str):
        """Analyzes a single log line for error patterns"""
        for pattern_name, pattern in self.error_patterns.items():
            for regex in pattern.patterns:
                try:
                    if re.search(regex, line, re.IGNORECASE):
                        self._handle_error_detection(
                            pattern_name=pattern_name,
                            error_message=line.strip(),
                            context={"source": source, "pattern": regex}
                        )
                        break
                except Exception as e:
                    recovery_logger.debug(f"Regex error for {regex}: {e}")

    def _check_system_metrics(self):
        """Checks system metrics for critical values"""
        try:
            # Memory check
            memory = psutil.virtual_memory()
            if memory.percent > 95:
                self._handle_error_detection(
                    pattern_name="memory_exhausted",
                    error_message=f"Critical memory usage: {memory.percent:.1f}%",
                    context={"memory_percent": memory.percent}
                )

            # Disk check
            disk = psutil.disk_usage('/')
            if disk.percent > 98:
                self._handle_error_detection(
                    pattern_name="disk_full",
                    error_message=f"Disk almost full: {disk.percent:.1f}%",
                    context={"disk_percent": disk.percent}
                )

            # Load check (getloadavg is not available on all platforms)
            if hasattr(psutil, 'getloadavg'):
                load_avg = psutil.getloadavg()[0]
                if load_avg > 5.0:  # Very high load
                    self._handle_error_detection(
                        pattern_name="system_overload",
                        error_message=f"Critical system load: {load_avg:.2f}",
                        context={"load_average": load_avg}
                    )

        except Exception as e:
            recovery_logger.debug(f"System metrics check failed: {e}")

    def _check_service_status(self):
        """Checks the status of important services"""
        services = ["myp-https.service", "myp-kiosk.service"]

        for service in services:
            try:
                result = subprocess.run(
                    ["sudo", "systemctl", "is-active", service],
                    capture_output=True, text=True, timeout=10
                )

                if result.returncode != 0:
                    self._handle_error_detection(
                        pattern_name="service_failure",
                        error_message=f"Service {service} not active: {result.stdout.strip()}",
                        context={"service": service, "status": result.stdout.strip()}
                    )

            except Exception as e:
                recovery_logger.debug(f"Service check for {service} failed: {e}")

    def _handle_error_detection(self, pattern_name: str, error_message: str, context: Optional[Dict[str, Any]] = None):
        """Handles a detected error and starts recovery"""
        with self.lock:
            if pattern_name not in self.error_patterns:
                recovery_logger.warning(f"Unknown error pattern: {pattern_name}")
                return

            pattern = self.error_patterns[pattern_name]

            # Count how often this pattern has already occurred recently
            recent_occurrences = self._count_recent_occurrences(pattern_name, pattern.time_window)

            # Record the occurrence
            occurrence = ErrorOccurrence(
                timestamp=datetime.now(),
                pattern_name=pattern_name,
                error_message=error_message,
                severity=pattern.severity,
                context=context or {}
            )

            self.error_history.append(occurrence)

            recovery_logger.warning(f"🚨 Error detected: {pattern_name} - {error_message}")

            # Decide on recovery actions
            if recent_occurrences >= pattern.max_occurrences:
                # Escalation
                actions = pattern.escalation_actions
                recovery_logger.error(f"🔥 Escalation for {pattern_name}: {recent_occurrences} occurrences within {pattern.time_window}s")
            else:
                # Normal recovery
                actions = pattern.actions

            # Execute recovery actions
            if self.config["auto_recovery_enabled"]:
                self._execute_recovery_actions(occurrence, actions)

    def _count_recent_occurrences(self, pattern_name: str, time_window: int) -> int:
        """Counts recent occurrences of an error pattern"""
        cutoff_time = datetime.now() - timedelta(seconds=time_window)
        return sum(1 for err in self.error_history
                   if err.pattern_name == pattern_name and err.timestamp > cutoff_time)
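
    # Worked example of the escalation rule above: the default "database_lock"
    # pattern has max_occurrences=3 and time_window=300, so once three lock
    # errors are already on record within five minutes, the next detection runs
    # escalation_actions ([RESTART_SERVICE]) instead of actions ([RESET_DATABASE]).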

    def _execute_recovery_actions(self, occurrence: ErrorOccurrence, actions: List[RecoveryAction]):
        """Executes recovery actions"""
        for action in actions:
            try:
                recovery_logger.info(f"🔧 Executing recovery action: {action.value}")

                handler = self.recovery_handlers.get(action)
                if handler:
                    success = handler(occurrence)
                    occurrence.recovery_attempted.append(action)

                    if success:
                        occurrence.recovery_successful = True
                        recovery_logger.info(f"✅ Recovery successful: {action.value}")
                        break  # Stop after a successful recovery
                    else:
                        recovery_logger.warning(f"❌ Recovery failed: {action.value}")
                else:
                    recovery_logger.error(f"No handler for recovery action: {action.value}")

            except Exception as e:
                recovery_logger.error(f"Error during recovery action {action.value}: {e}")

    def _handle_log_only(self, occurrence: ErrorOccurrence) -> bool:
        """Handler: logging only, no further action"""
        recovery_logger.info(f"📝 Log-only for: {occurrence.error_message}")
        return True

    def _handle_restart_service(self, occurrence: ErrorOccurrence) -> bool:
        """Handler: service restart"""
        try:
            from utils.system_control import get_system_control_manager, SystemOperation

            manager = get_system_control_manager()
            result = manager.schedule_operation(
                SystemOperation.SERVICE_RESTART,
                delay_seconds=5,
                reason=f"Automatic recovery for: {occurrence.pattern_name}"
            )

            return result.get("success", False)

        except Exception as e:
            recovery_logger.error(f"Service restart failed: {e}")
            return False

    def _handle_restart_component(self, occurrence: ErrorOccurrence) -> bool:
        """Handler: component restart (e.g. kiosk)"""
        try:
            from utils.system_control import get_system_control_manager, SystemOperation

            manager = get_system_control_manager()
            result = manager.schedule_operation(
                SystemOperation.KIOSK_RESTART,
                delay_seconds=5,
                reason=f"Automatic recovery for: {occurrence.pattern_name}"
            )

            return result.get("success", False)

        except Exception as e:
            recovery_logger.error(f"Component restart failed: {e}")
            return False

    def _handle_clear_cache(self, occurrence: ErrorOccurrence) -> bool:
        """Handler: clear caches"""
        try:
            # Clear application caches
            from app import clear_user_cache, clear_printer_status_cache
            clear_user_cache()
            clear_printer_status_cache()

            # Flush system buffers (non-Windows only)
            if os.name != 'nt':
                subprocess.run(["sudo", "sync"], timeout=10)

            return True

        except Exception as e:
            recovery_logger.error(f"Cache clearing failed: {e}")
            return False

    def _handle_reset_database(self, occurrence: ErrorOccurrence) -> bool:
        """Handler: database reset"""
        try:
            from utils.database_cleanup import safe_database_cleanup

            result = safe_database_cleanup(force_mode_switch=True)
            return result.get("success", False)

        except Exception as e:
            recovery_logger.error(f"Database reset failed: {e}")
            return False

    def _handle_restart_system(self, occurrence: ErrorOccurrence) -> bool:
        """Handler: system restart"""
        try:
            from utils.system_control import schedule_system_restart

            result = schedule_system_restart(
                delay_seconds=60,
                reason=f"Automatic recovery for critical error: {occurrence.pattern_name}",
                force=True
            )

            return result.get("success", False)

        except Exception as e:
            recovery_logger.error(f"System restart failed: {e}")
            return False

    def _handle_emergency_stop(self, occurrence: ErrorOccurrence) -> bool:
        """Handler: emergency stop"""
        try:
            recovery_logger.critical(f"🚨 EMERGENCY STOP: {occurrence.error_message}")

            # Perform an immediate shutdown
            from utils.shutdown_manager import get_shutdown_manager
            shutdown_manager = get_shutdown_manager()
            shutdown_manager.force_shutdown(1)

            return True

        except Exception as e:
            recovery_logger.error(f"Emergency stop failed: {e}")
            return False

    def _cleanup_old_entries(self):
        """Prunes old entries from the error history"""
        with self.lock:
            if len(self.error_history) > self.config["max_history_size"]:
                self.error_history = self.error_history[-self.config["max_history_size"]:]

    def get_error_statistics(self) -> Dict[str, Any]:
        """Returns error statistics"""
        with self.lock:
            total_errors = len(self.error_history)

            # Errors by severity
            by_severity = {}
            for severity in ErrorSeverity:
                by_severity[severity.value] = sum(1 for err in self.error_history
                                                  if err.severity == severity)

            # Errors by pattern
            by_pattern = {}
            for pattern_name in self.error_patterns.keys():
                by_pattern[pattern_name] = sum(1 for err in self.error_history
                                               if err.pattern_name == pattern_name)

            # Last 24 hours
            last_24h = datetime.now() - timedelta(hours=24)
            recent_errors = sum(1 for err in self.error_history
                                if err.timestamp > last_24h)

            # Recovery success rate
            attempted_recoveries = sum(1 for err in self.error_history
                                       if err.recovery_attempted)
            successful_recoveries = sum(1 for err in self.error_history
                                        if err.recovery_successful)

            success_rate = (successful_recoveries / attempted_recoveries * 100) if attempted_recoveries > 0 else 0

            return {
                "total_errors": total_errors,
                "errors_last_24h": recent_errors,
                "by_severity": by_severity,
                "by_pattern": by_pattern,
                "recovery_success_rate": round(success_rate, 1),
                "monitoring_active": self.is_active,
                "auto_recovery_enabled": self.config["auto_recovery_enabled"]
            }

    def get_recent_errors(self, limit: int = 50) -> List[Dict[str, Any]]:
        """Returns recent errors"""
        with self.lock:
            recent = self.error_history[-limit:] if limit else self.error_history

            return [{
                "timestamp": err.timestamp.isoformat(),
                "pattern_name": err.pattern_name,
                "error_message": err.error_message,
                "severity": err.severity.value,
                "context": err.context,
                "recovery_attempted": [action.value for action in err.recovery_attempted],
                "recovery_successful": err.recovery_successful
            } for err in recent]


# Global error-recovery manager
_error_recovery_manager: Optional[ErrorRecoveryManager] = None
_recovery_lock = threading.Lock()


def get_error_recovery_manager() -> ErrorRecoveryManager:
    """
    Singleton accessor for the global error-recovery manager.

    Returns:
        ErrorRecoveryManager: the global error-recovery manager
    """
    global _error_recovery_manager

    with _recovery_lock:
        if _error_recovery_manager is None:
            _error_recovery_manager = ErrorRecoveryManager()
        return _error_recovery_manager


def start_error_monitoring():
    """Starts error monitoring"""
    manager = get_error_recovery_manager()
    manager.start_monitoring()


def stop_error_monitoring():
    """Stops error monitoring"""
    manager = get_error_recovery_manager()
    manager.stop_monitoring()


def force_error_check(log_message: Optional[str] = None):
    """Forces a manual error check of a single log line"""
    if log_message:
        manager = get_error_recovery_manager()
        manager._analyze_log_line(log_message, "manual_check")
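

# Minimal usage sketch (illustrative; assumes running this module standalone is
# acceptable). It feeds one synthetic log line through the detector and prints
# the resulting statistics; with auto-recovery enabled, a matching pattern
# would also trigger its recovery handlers.
if __name__ == "__main__":
    demo_manager = get_error_recovery_manager()

    # Keep the demo side-effect free: detection is logged, but no handler runs.
    demo_manager.config["auto_recovery_enabled"] = False

    # Simulate a database error line as it might appear in logs/app/app.log;
    # it matches the default "database_lock" pattern.
    force_error_check("OperationalError: database is locked")

    print(json.dumps(demo_manager.get_error_statistics(), indent=2))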