🎉 Improved core system functionality, refactored error recovery, enhanced hardware integration, optimized timeout force quit manager, improved watchdog manager, and resolved Windows-specific issues. 🖥️📈

This commit is contained in:
2025-06-11 13:10:36 +02:00
parent 85cd38830a
commit cb7dc6d95c
6 changed files with 843 additions and 3056 deletions

File diff suppressed because it is too large Load Diff

View File

@ -1,641 +0,0 @@
#!/usr/bin/env python3
"""
Robustes Error-Recovery-System für wartungsfreien Produktionsbetrieb
Automatische Fehlererkennung, -behebung und -prävention
"""
import os
import sys
import time
import threading
import traceback
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Callable, Any
from dataclasses import dataclass, field
from enum import Enum
import logging
import json
import subprocess
import psutil
from contextlib import contextmanager
import signal
# Logging-Setup
try:
from utils.logging_config import get_logger
recovery_logger = get_logger("error_recovery")
except ImportError:
logging.basicConfig(level=logging.INFO)
recovery_logger = logging.getLogger("error_recovery")
class ErrorSeverity(Enum):
    """Severity levels for detected errors, in ascending order of urgency."""
    LOW = "low"            # informational; no action expected
    MEDIUM = "medium"      # degraded behavior, recoverable
    HIGH = "high"          # serious failure, recovery required
    CRITICAL = "critical"  # system stability at risk
class RecoveryAction(Enum):
    """Available recovery actions, roughly ordered by invasiveness."""
    LOG_ONLY = "log_only"                    # record the error, take no further action
    RESTART_SERVICE = "restart_service"      # restart the main application service
    RESTART_COMPONENT = "restart_component"  # restart a sub-component (e.g. kiosk display)
    CLEAR_CACHE = "clear_cache"              # drop application/system caches
    RESET_DATABASE = "reset_database"        # run database cleanup/reset
    RESTART_SYSTEM = "restart_system"        # schedule a full machine reboot
    EMERGENCY_STOP = "emergency_stop"        # immediate forced shutdown
@dataclass
class ErrorPattern:
    """Defines an error pattern and the recovery actions associated with it."""
    name: str                   # unique key into ErrorRecoveryManager.error_patterns
    patterns: List[str]         # regex patterns used to detect the error in log lines
    severity: ErrorSeverity
    actions: List[RecoveryAction]  # actions tried on a normal occurrence
    max_occurrences: int = 3    # occurrences within time_window before escalation
    time_window: int = 300      # sliding window in seconds for counting occurrences
    escalation_actions: List[RecoveryAction] = field(default_factory=list)  # actions once max_occurrences is reached
    description: str = ""       # short human-readable summary
@dataclass
class ErrorOccurrence:
    """A single recorded occurrence of a detected error."""
    timestamp: datetime
    pattern_name: str           # name of the ErrorPattern that matched
    error_message: str          # raw message (e.g. the matching log line)
    severity: ErrorSeverity
    context: Dict[str, Any] = field(default_factory=dict)  # e.g. source log file, matched regex
    recovery_attempted: List[RecoveryAction] = field(default_factory=list)  # actions tried, in order
    recovery_successful: bool = False  # True once any attempted action succeeded
class ErrorRecoveryManager:
    """
    Central manager for automatic error detection and recovery.

    Continuously monitors log files, system metrics and service status in a
    background thread, matches findings against configured ErrorPattern
    entries and executes the associated RecoveryAction handlers. Repeated
    occurrences of the same pattern within its time window trigger the
    pattern's escalation actions instead.
    """

    def __init__(self):
        # Monitoring state
        self.is_active = False
        self.error_patterns: Dict[str, ErrorPattern] = {}
        self.error_history: List[ErrorOccurrence] = []
        self.recovery_handlers: Dict[RecoveryAction, Callable] = {}
        self.monitoring_thread: Optional[threading.Thread] = None
        # Guards error_history and pattern lookup across threads
        self.lock = threading.Lock()

        # Runtime configuration
        self.config = {
            "check_interval": 30,  # seconds between monitoring passes
            "max_history_size": 1000,
            "auto_recovery_enabled": True,
            "critical_error_threshold": 5,
            "system_restart_threshold": 10,
            "log_file_paths": [
                "logs/app/app.log",
                "logs/errors/errors.log",
                "logs/database/database.log"
            ]
        }

        # Register default error patterns and action handlers
        self._init_default_patterns()
        self._init_recovery_handlers()
        recovery_logger.info("🛡️ Error-Recovery-Manager initialisiert")

    def _init_default_patterns(self):
        """Registers default error patterns for common failure modes."""
        patterns = [
            # Database errors
            ErrorPattern(
                name="database_lock",
                patterns=[
                    r"database is locked",
                    r"SQLite.*locked",
                    r"OperationalError.*locked"
                ],
                severity=ErrorSeverity.HIGH,
                actions=[RecoveryAction.RESET_DATABASE],
                max_occurrences=3,
                escalation_actions=[RecoveryAction.RESTART_SERVICE],
                description="Datenbank-Sperrung"
            ),
            # Memory errors
            ErrorPattern(
                name="memory_exhausted",
                patterns=[
                    r"MemoryError",
                    r"Out of memory",
                    r"Cannot allocate memory"
                ],
                severity=ErrorSeverity.CRITICAL,
                actions=[RecoveryAction.CLEAR_CACHE, RecoveryAction.RESTART_SERVICE],
                max_occurrences=2,
                escalation_actions=[RecoveryAction.RESTART_SYSTEM],
                description="Speicher erschöpft"
            ),
            # Network errors
            ErrorPattern(
                name="connection_error",
                patterns=[
                    r"ConnectionError",
                    r"Network is unreachable",
                    r"Connection refused"
                ],
                severity=ErrorSeverity.MEDIUM,
                actions=[RecoveryAction.RESTART_COMPONENT],
                max_occurrences=5,
                escalation_actions=[RecoveryAction.RESTART_SERVICE],
                description="Netzwerk-Verbindungsfehler"
            ),
            # Kiosk errors
            ErrorPattern(
                name="kiosk_crash",
                patterns=[
                    r"chromium.*crashed",
                    r"firefox.*crashed",
                    r"X11.*error",
                    r"Display.*not found"
                ],
                severity=ErrorSeverity.HIGH,
                actions=[RecoveryAction.RESTART_COMPONENT],
                max_occurrences=3,
                escalation_actions=[RecoveryAction.RESTART_SYSTEM],
                description="Kiosk-Display Fehler"
            ),
            # Service errors
            ErrorPattern(
                name="service_failure",
                patterns=[
                    r"systemctl.*failed",
                    r"Service.*not found",
                    r"Failed to start"
                ],
                severity=ErrorSeverity.HIGH,
                actions=[RecoveryAction.RESTART_SERVICE],
                max_occurrences=3,
                escalation_actions=[RecoveryAction.RESTART_SYSTEM],
                description="System-Service Fehler"
            ),
            # Disk errors
            ErrorPattern(
                name="disk_full",
                patterns=[
                    r"No space left on device",
                    r"Disk full",
                    r"OSError.*28"
                ],
                severity=ErrorSeverity.CRITICAL,
                actions=[RecoveryAction.CLEAR_CACHE],
                max_occurrences=1,
                escalation_actions=[RecoveryAction.EMERGENCY_STOP],
                description="Festplatte voll"
            ),
            # Flask errors
            ErrorPattern(
                name="flask_error",
                patterns=[
                    r"Internal Server Error",
                    r"500 Internal Server Error",
                    r"Application failed to start"
                ],
                severity=ErrorSeverity.HIGH,
                actions=[RecoveryAction.RESTART_SERVICE],
                max_occurrences=3,
                escalation_actions=[RecoveryAction.RESTART_SYSTEM],
                description="System-Überlastung fehlt nicht mehr"
            ) if False else ErrorPattern(
                name="flask_error",
                patterns=[
                    r"Internal Server Error",
                    r"500 Internal Server Error",
                    r"Application failed to start"
                ],
                severity=ErrorSeverity.HIGH,
                actions=[RecoveryAction.RESTART_SERVICE],
                max_occurrences=3,
                escalation_actions=[RecoveryAction.RESTART_SYSTEM],
                description="Flask-Anwendungsfehler"
            ),
            # System overload.
            # Bug fix: _check_system_metrics reports pattern "system_overload",
            # but no such pattern was registered, so every load spike was
            # rejected as an unknown pattern and never recorded or handled.
            ErrorPattern(
                name="system_overload",
                patterns=[
                    r"System-Last kritisch"
                ],
                severity=ErrorSeverity.MEDIUM,
                actions=[RecoveryAction.LOG_ONLY],
                max_occurrences=5,
                escalation_actions=[RecoveryAction.RESTART_SERVICE],
                description="System-Überlastung"
            )
        ]
        for pattern in patterns:
            self.error_patterns[pattern.name] = pattern

    def _init_recovery_handlers(self):
        """Maps each RecoveryAction to its handler method."""
        self.recovery_handlers = {
            RecoveryAction.LOG_ONLY: self._handle_log_only,
            RecoveryAction.RESTART_SERVICE: self._handle_restart_service,
            RecoveryAction.RESTART_COMPONENT: self._handle_restart_component,
            RecoveryAction.CLEAR_CACHE: self._handle_clear_cache,
            RecoveryAction.RESET_DATABASE: self._handle_reset_database,
            RecoveryAction.RESTART_SYSTEM: self._handle_restart_system,
            RecoveryAction.EMERGENCY_STOP: self._handle_emergency_stop
        }

    def start_monitoring(self):
        """Starts continuous monitoring in a daemon thread (idempotent)."""
        if self.is_active:
            recovery_logger.warning("Monitoring bereits aktiv")
            return
        self.is_active = True
        self.monitoring_thread = threading.Thread(
            target=self._monitor_loop,
            daemon=True,
            name="ErrorRecoveryMonitor"
        )
        self.monitoring_thread.start()
        recovery_logger.info("🔍 Error-Monitoring gestartet")

    def stop_monitoring(self):
        """Stops monitoring and waits briefly for the thread to finish."""
        self.is_active = False
        if self.monitoring_thread and self.monitoring_thread.is_alive():
            self.monitoring_thread.join(timeout=5)
        recovery_logger.info("🛑 Error-Monitoring gestoppt")

    def _monitor_loop(self):
        """Main loop: checks logs, metrics and services every check_interval seconds."""
        while self.is_active:
            try:
                self._check_log_files()
                self._check_system_metrics()
                self._check_service_status()
                self._cleanup_old_entries()
                time.sleep(self.config["check_interval"])
            except Exception as e:
                recovery_logger.error(f"Fehler in Monitor-Loop: {e}")
                time.sleep(5)  # short back-off so a persistent error can't spin

    def _check_log_files(self):
        """Scans configured log files for known error patterns."""
        for log_path in self.config["log_file_paths"]:
            try:
                if not os.path.exists(log_path):
                    continue
                # Simplified: re-read the file and inspect only the tail
                with open(log_path, 'r', encoding='utf-8') as f:
                    lines = f.readlines()
                # Negative slice handles short files without the length check
                for line in lines[-1000:]:
                    self._analyze_log_line(line, log_path)
            except Exception as e:
                recovery_logger.debug(f"Fehler beim Lesen von {log_path}: {e}")

    def _analyze_log_line(self, line: str, source: str):
        """Matches a single log line against all registered patterns."""
        import re  # local import; cached by the interpreter after first call
        for pattern_name, pattern in self.error_patterns.items():
            for regex in pattern.patterns:
                try:
                    if re.search(regex, line, re.IGNORECASE):
                        self._handle_error_detection(
                            pattern_name=pattern_name,
                            error_message=line.strip(),
                            context={"source": source, "pattern": regex}
                        )
                        break  # one hit per pattern is enough for this line
                except Exception as e:
                    recovery_logger.debug(f"Regex-Fehler für {regex}: {e}")

    def _check_system_metrics(self):
        """Checks memory, disk and load metrics against critical thresholds."""
        try:
            # Memory check
            memory = psutil.virtual_memory()
            if memory.percent > 95:
                self._handle_error_detection(
                    pattern_name="memory_exhausted",
                    error_message=f"Speicherverbrauch kritisch: {memory.percent:.1f}%",
                    context={"memory_percent": memory.percent}
                )
            # Disk check
            disk = psutil.disk_usage('/')
            if disk.percent > 98:
                self._handle_error_detection(
                    pattern_name="disk_full",
                    error_message=f"Festplatte fast voll: {disk.percent:.1f}%",
                    context={"disk_percent": disk.percent}
                )
            # Load check (getloadavg is not available on all platforms)
            if hasattr(psutil, 'getloadavg'):
                load_avg = psutil.getloadavg()[0]
                if load_avg > 5.0:  # very high load
                    self._handle_error_detection(
                        pattern_name="system_overload",
                        error_message=f"System-Last kritisch: {load_avg:.2f}",
                        context={"load_average": load_avg}
                    )
        except Exception as e:
            recovery_logger.debug(f"System-Metrics-Check fehlgeschlagen: {e}")

    def _check_service_status(self):
        """Checks that critical systemd services are active."""
        services = ["myp-https.service", "myp-kiosk.service"]
        for service in services:
            try:
                result = subprocess.run(
                    ["sudo", "systemctl", "is-active", service],
                    capture_output=True, text=True, timeout=10
                )
                if result.returncode != 0:
                    self._handle_error_detection(
                        pattern_name="service_failure",
                        error_message=f"Service {service} nicht aktiv: {result.stdout.strip()}",
                        context={"service": service, "status": result.stdout.strip()}
                    )
            except Exception as e:
                recovery_logger.debug(f"Service-Check für {service} fehlgeschlagen: {e}")

    def _handle_error_detection(self, pattern_name: str, error_message: str,
                                context: Optional[Dict[str, Any]] = None):
        """Records a detected error and triggers recovery (or escalation)."""
        with self.lock:
            if pattern_name not in self.error_patterns:
                recovery_logger.warning(f"Unbekanntes Fehlermuster: {pattern_name}")
                return
            pattern = self.error_patterns[pattern_name]
            # Count occurrences within the pattern's sliding window
            recent_occurrences = self._count_recent_occurrences(pattern_name, pattern.time_window)
            occurrence = ErrorOccurrence(
                timestamp=datetime.now(),
                pattern_name=pattern_name,
                error_message=error_message,
                severity=pattern.severity,
                context=context or {}
            )
            self.error_history.append(occurrence)
            recovery_logger.warning(f"🚨 Fehler erkannt: {pattern_name} - {error_message}")
            # Escalate if the pattern keeps recurring, otherwise run normal actions
            if recent_occurrences >= pattern.max_occurrences:
                actions = pattern.escalation_actions
                recovery_logger.error(f"🔥 Eskalation für {pattern_name}: {recent_occurrences} Vorkommen in {pattern.time_window}s")
            else:
                actions = pattern.actions
            if self.config["auto_recovery_enabled"]:
                self._execute_recovery_actions(occurrence, actions)

    def _count_recent_occurrences(self, pattern_name: str, time_window: int) -> int:
        """Counts occurrences of a pattern within the last time_window seconds."""
        cutoff_time = datetime.now() - timedelta(seconds=time_window)
        return sum(1 for err in self.error_history
                   if err.pattern_name == pattern_name and err.timestamp > cutoff_time)

    def _execute_recovery_actions(self, occurrence: ErrorOccurrence, actions: List[RecoveryAction]):
        """Runs recovery actions in order, stopping at the first success."""
        for action in actions:
            try:
                recovery_logger.info(f"🔧 Führe Recovery-Aktion aus: {action.value}")
                handler = self.recovery_handlers.get(action)
                if handler:
                    success = handler(occurrence)
                    occurrence.recovery_attempted.append(action)
                    if success:
                        occurrence.recovery_successful = True
                        recovery_logger.info(f"✅ Recovery erfolgreich: {action.value}")
                        break  # stop after the first successful recovery
                    else:
                        recovery_logger.warning(f"❌ Recovery fehlgeschlagen: {action.value}")
                else:
                    recovery_logger.error(f"Kein Handler für Recovery-Aktion: {action.value}")
            except Exception as e:
                recovery_logger.error(f"Fehler bei Recovery-Aktion {action.value}: {e}")

    def _handle_log_only(self, occurrence: ErrorOccurrence) -> bool:
        """Handler: log only, no further action. Always succeeds."""
        recovery_logger.info(f"📝 Log-Only für: {occurrence.error_message}")
        return True

    def _handle_restart_service(self, occurrence: ErrorOccurrence) -> bool:
        """Handler: schedules a service restart via the system control manager."""
        try:
            from utils.system_control import get_system_control_manager, SystemOperation
            manager = get_system_control_manager()
            result = manager.schedule_operation(
                SystemOperation.SERVICE_RESTART,
                delay_seconds=5,
                reason=f"Automatische Recovery für: {occurrence.pattern_name}"
            )
            return result.get("success", False)
        except Exception as e:
            recovery_logger.error(f"Service-Neustart fehlgeschlagen: {e}")
            return False

    def _handle_restart_component(self, occurrence: ErrorOccurrence) -> bool:
        """Handler: schedules a component restart (e.g. the kiosk display)."""
        try:
            from utils.system_control import get_system_control_manager, SystemOperation
            manager = get_system_control_manager()
            result = manager.schedule_operation(
                SystemOperation.KIOSK_RESTART,
                delay_seconds=5,
                reason=f"Automatische Recovery für: {occurrence.pattern_name}"
            )
            return result.get("success", False)
        except Exception as e:
            recovery_logger.error(f"Komponenten-Neustart fehlgeschlagen: {e}")
            return False

    def _handle_clear_cache(self, occurrence: ErrorOccurrence) -> bool:
        """Handler: clears application caches and flushes OS buffers."""
        try:
            # Application caches
            from app import clear_user_cache, clear_printer_status_cache
            clear_user_cache()
            clear_printer_status_cache()
            # System-level flush (POSIX only)
            if os.name != 'nt':
                subprocess.run(["sudo", "sync"], timeout=10)
            return True
        except Exception as e:
            recovery_logger.error(f"Cache-Clearing fehlgeschlagen: {e}")
            return False

    def _handle_reset_database(self, occurrence: ErrorOccurrence) -> bool:
        """Handler: runs a safe database cleanup/reset."""
        try:
            from utils.database_cleanup import safe_database_cleanup
            result = safe_database_cleanup(force_mode_switch=True)
            return result.get("success", False)
        except Exception as e:
            recovery_logger.error(f"Database-Reset fehlgeschlagen: {e}")
            return False

    def _handle_restart_system(self, occurrence: ErrorOccurrence) -> bool:
        """Handler: schedules a full system restart with a grace period."""
        try:
            from utils.system_control import schedule_system_restart
            result = schedule_system_restart(
                delay_seconds=60,
                reason=f"Automatische Recovery für kritischen Fehler: {occurrence.pattern_name}",
                force=True
            )
            return result.get("success", False)
        except Exception as e:
            recovery_logger.error(f"System-Neustart fehlgeschlagen: {e}")
            return False

    def _handle_emergency_stop(self, occurrence: ErrorOccurrence) -> bool:
        """Handler: immediate forced shutdown as a last resort."""
        try:
            recovery_logger.critical(f"🚨 NOTFALL-STOPP: {occurrence.error_message}")
            from utils.shutdown_manager import get_shutdown_manager
            shutdown_manager = get_shutdown_manager()
            shutdown_manager.force_shutdown(1)
            return True
        except Exception as e:
            recovery_logger.error(f"Notfall-Stopp fehlgeschlagen: {e}")
            return False

    def _cleanup_old_entries(self):
        """Trims error history to the configured maximum size."""
        with self.lock:
            if len(self.error_history) > self.config["max_history_size"]:
                self.error_history = self.error_history[-self.config["max_history_size"]:]

    def get_error_statistics(self) -> Dict[str, Any]:
        """Returns aggregate error statistics (counts, rates, status)."""
        with self.lock:
            total_errors = len(self.error_history)
            # Errors by severity
            by_severity = {}
            for severity in ErrorSeverity:
                by_severity[severity.value] = sum(1 for err in self.error_history
                                                  if err.severity == severity)
            # Errors by pattern
            by_pattern = {}
            for pattern_name in self.error_patterns.keys():
                by_pattern[pattern_name] = sum(1 for err in self.error_history
                                               if err.pattern_name == pattern_name)
            # Last 24 hours
            last_24h = datetime.now() - timedelta(hours=24)
            recent_errors = sum(1 for err in self.error_history
                                if err.timestamp > last_24h)
            # Recovery success rate
            attempted_recoveries = sum(1 for err in self.error_history
                                       if err.recovery_attempted)
            successful_recoveries = sum(1 for err in self.error_history
                                        if err.recovery_successful)
            success_rate = (successful_recoveries / attempted_recoveries * 100) if attempted_recoveries > 0 else 0
            return {
                "total_errors": total_errors,
                "errors_last_24h": recent_errors,
                "by_severity": by_severity,
                "by_pattern": by_pattern,
                "recovery_success_rate": round(success_rate, 1),
                "monitoring_active": self.is_active,
                "auto_recovery_enabled": self.config["auto_recovery_enabled"]
            }

    def get_recent_errors(self, limit: int = 50) -> List[Dict[str, Any]]:
        """Returns the most recent errors as JSON-serializable dicts."""
        with self.lock:
            recent = self.error_history[-limit:] if limit else self.error_history
            return [{
                "timestamp": err.timestamp.isoformat(),
                "pattern_name": err.pattern_name,
                "error_message": err.error_message,
                "severity": err.severity.value,
                "context": err.context,
                "recovery_attempted": [action.value for action in err.recovery_attempted],
                "recovery_successful": err.recovery_successful
            } for err in recent]
# Global error-recovery manager (lazily created singleton)
_error_recovery_manager: Optional[ErrorRecoveryManager] = None
_recovery_lock = threading.Lock()

def get_error_recovery_manager() -> ErrorRecoveryManager:
    """Return the process-wide ErrorRecoveryManager, creating it on first use.

    Thread-safe: creation is serialized through a module-level lock.

    Returns:
        ErrorRecoveryManager: the global error-recovery manager instance
    """
    global _error_recovery_manager
    with _recovery_lock:
        if _error_recovery_manager is None:
            _error_recovery_manager = ErrorRecoveryManager()
    return _error_recovery_manager
def start_error_monitoring():
    """Start background error monitoring on the global manager."""
    get_error_recovery_manager().start_monitoring()
def stop_error_monitoring():
    """Stop background error monitoring on the global manager."""
    get_error_recovery_manager().stop_monitoring()
def force_error_check(log_message: Optional[str] = None):
    """Force a manual error check of a single message.

    Fix: annotation was ``str = None`` (implicit Optional, disallowed by
    PEP 484); now spelled explicitly as ``Optional[str]``.

    Args:
        log_message: line to analyze; nothing happens when None or empty.
    """
    if log_message:
        manager = get_error_recovery_manager()
        manager._analyze_log_line(log_message, "manual_check")

View File

@ -27,10 +27,22 @@ hardware_logger = get_logger("hardware_integration")
# ===== TAPO SMART PLUG CONTROLLER =====
class TapoController:
"""TP-Link Tapo Smart Plug Controller"""
"""TP-Link Tapo Smart Plug Controller - Konsolidiert aus tapo_controller.py"""
def __init__(self):
self.default_username = "till.tomczak@mercedes-benz.com"
"""Initialisiere den Tapo Controller"""
from utils.settings import TAPO_USERNAME, TAPO_PASSWORD, DEFAULT_TAPO_IPS, TAPO_TIMEOUT, TAPO_RETRY_COUNT
self.username = TAPO_USERNAME
self.password = TAPO_PASSWORD
self.timeout = TAPO_TIMEOUT
self.retry_count = TAPO_RETRY_COUNT
self.auto_discovered = False
if not TAPO_AVAILABLE:
hardware_logger.error("❌ PyP100-modul nicht installiert - tapo-funktionalität eingeschränkt")
else:
hardware_logger.info("✅ tapo controller initialisiert") self.default_username = "till.tomczak@mercedes-benz.com"
self.default_password = "744563017196A"
hardware_logger.info("🔌 Tapo Controller initialisiert")

View File

@ -1,647 +0,0 @@
#!/usr/bin/env python3
"""
Timeout Force-Quit Manager mit Terminal-Countdown
Spezialisierter Manager für Force-Quit-Timeouts mit visueller Terminal-Anzeige
und robuster Datenbankbereinigung (WAL/SHM-Dateien).
Funktionen:
- Terminal-Countdown mit Fortschrittsbalken
- Automatische Datenbankbereinigung
- Force-Quit bei Timeout
- Integration mit bestehendem Timer-System
- Robuste WAL/SHM-Dateibereinigung
Autor: System
Erstellt: 2025
"""
import os
import sys
import threading
import time
import signal
import shutil
from datetime import datetime, timedelta
from typing import Optional, Callable, Dict, Any
from contextlib import contextmanager
# Logging
try:
from utils.logging_config import get_logger
logger = get_logger("timeout_force_quit")
except ImportError:
import logging
logger = logging.getLogger("timeout_force_quit")
logging.basicConfig(level=logging.INFO)
# Timer-System Integration
try:
from utils.timer_manager import (
get_timer_manager, TimerType, ForceQuitAction, TimerStatus
)
from models import SystemTimer, get_cached_session
TIMER_SYSTEM_AVAILABLE = True
except ImportError:
logger.warning("Timer-System nicht verfügbar - verwende Fallback-Implementation")
TIMER_SYSTEM_AVAILABLE = False
# Datenbank-Cleanup
try:
from utils.database_cleanup import safe_database_cleanup
DATABASE_CLEANUP_AVAILABLE = True
except ImportError:
logger.warning("Database-Cleanup-Manager nicht verfügbar - verwende Basis-Cleanup")
DATABASE_CLEANUP_AVAILABLE = False
class TimeoutForceQuitManager:
    """
    Manager for timeout-based force-quit operations with a terminal countdown.

    Provides:
    - a visual countdown on the terminal (when stdout is a TTY)
    - automatic database cleanup before exit
    - robust WAL/SHM sidecar-file cleanup
    - configurable timeout actions

    Bug fixes in this revision:
    - the timeout worker now polls the deadline instead of a single fixed
      sleep, so extend_timeout() actually delays the force-quit
    - the countdown progress bar used empty string literals and therefore
      never rendered; block characters restored
    """

    def __init__(self,
                 timeout_seconds: int = 45,
                 warning_seconds: int = 15,
                 database_cleanup: bool = True,
                 force_wal_cleanup: bool = True):
        """
        Initialize the timeout force-quit manager.

        Args:
            timeout_seconds: total timeout in seconds
            warning_seconds: warning period before force-quit, in seconds
            database_cleanup: enable database cleanup on force-quit
            force_wal_cleanup: aggressively remove WAL/SHM sidecar files
        """
        self.timeout_seconds = timeout_seconds
        self.warning_seconds = warning_seconds
        self.database_cleanup = database_cleanup
        self.force_wal_cleanup = force_wal_cleanup
        # Countdown state
        self.is_active = False
        self.start_time = None
        self.timer_thread = None
        self.countdown_thread = None
        self.shutdown_callback: Optional[Callable] = None
        # Terminal control
        self.show_terminal_countdown = True
        self.terminal_lock = threading.Lock()
        logger.info(f"🔧 Timeout Force-Quit Manager initialisiert - Timeout: {timeout_seconds}s, Warnung: {warning_seconds}s")

    def set_shutdown_callback(self, callback: Callable):
        """Registers a callback invoked right before the force-quit."""
        self.shutdown_callback = callback
        logger.debug("Shutdown-Callback registriert")

    def start_timeout(self, reason: str = "System-Timeout") -> bool:
        """
        Starts the timeout countdown.

        Args:
            reason: reason for the timeout (logged and shown in warnings)

        Returns:
            bool: True if successfully started, False if already active or on error
        """
        if self.is_active:
            logger.warning("Timeout bereits aktiv")
            return False
        try:
            self.is_active = True
            self.start_time = datetime.now()
            logger.warning(f"🚨 TIMEOUT GESTARTET - {reason}")
            logger.warning(f"⏱️ Force-Quit in {self.timeout_seconds} Sekunden")
            # Timer thread that eventually triggers the force-quit
            self.timer_thread = threading.Thread(
                target=self._timeout_worker,
                args=(reason,),
                name="TimeoutForceQuit-Timer",
                daemon=True
            )
            self.timer_thread.start()
            # Visual countdown only when stdout is an interactive terminal
            if self.show_terminal_countdown and sys.stdout.isatty():
                self.countdown_thread = threading.Thread(
                    target=self._terminal_countdown_worker,
                    name="TimeoutForceQuit-Countdown",
                    daemon=True
                )
                self.countdown_thread.start()
            # Optional integration with the existing timer system
            if TIMER_SYSTEM_AVAILABLE:
                self._create_system_timer(reason)
            return True
        except Exception as e:
            logger.error(f"❌ Fehler beim Starten des Timeouts: {e}")
            self.is_active = False
            return False

    def cancel_timeout(self) -> bool:
        """
        Cancels the running timeout.

        Returns:
            bool: True if successfully cancelled, False if none was active
        """
        if not self.is_active:
            return False
        try:
            self.is_active = False
            logger.info("✅ Timeout abgebrochen")
            # Clear the countdown line from the terminal
            if self.show_terminal_countdown and sys.stdout.isatty():
                with self.terminal_lock:
                    print("\r" + " " * 80 + "\r", end="", flush=True)
                    print("✅ Timeout abgebrochen")
            return True
        except Exception as e:
            logger.error(f"❌ Fehler beim Abbrechen des Timeouts: {e}")
            return False

    def extend_timeout(self, additional_seconds: int) -> bool:
        """
        Extends the running timeout.

        Effective because _timeout_worker re-checks the deadline instead of
        sleeping a fixed duration (bug fix).

        Args:
            additional_seconds: extra seconds to add to the deadline

        Returns:
            bool: True if successfully extended
        """
        if not self.is_active:
            logger.warning("Kein aktiver Timeout zum Verlängern")
            return False
        try:
            self.timeout_seconds += additional_seconds
            logger.info(f"⏰ Timeout um {additional_seconds} Sekunden verlängert")
            return True
        except Exception as e:
            logger.error(f"❌ Fehler beim Verlängern des Timeouts: {e}")
            return False

    def _timeout_worker(self, reason: str):
        """Worker thread that triggers the force-quit once the deadline passes.

        Bug fix: polls the (mutable) deadline in small steps instead of a
        single time.sleep(self.timeout_seconds), so extend_timeout() and
        cancel_timeout() take effect immediately.
        """
        try:
            while self.is_active:
                elapsed = (datetime.now() - self.start_time).total_seconds()
                if elapsed >= self.timeout_seconds:
                    break
                time.sleep(0.25)  # short poll interval
            if self.is_active:
                logger.critical(f"🚨 FORCE-QUIT TIMEOUT ERREICHT - {reason}")
                self._execute_force_quit()
        except Exception as e:
            logger.error(f"❌ Fehler im Timeout-Worker: {e}")

    def _terminal_countdown_worker(self):
        """Worker thread for the visual terminal countdown."""
        try:
            while self.is_active:
                elapsed = (datetime.now() - self.start_time).total_seconds()
                remaining = max(0, self.timeout_seconds - elapsed)
                if remaining <= 0:
                    break
                # Progress bar and countdown
                progress = 1.0 - (remaining / self.timeout_seconds)
                bar_width = 40
                filled_width = int(bar_width * progress)
                # Warning state
                is_warning = remaining <= self.warning_seconds
                warning_icon = "🚨" if is_warning else ""
                # Terminal output under lock.
                # Bug fix: both bar segments were empty string literals,
                # rendering a blank bar; block characters restored.
                with self.terminal_lock:
                    bar = "█" * filled_width + "░" * (bar_width - filled_width)
                    countdown_text = (
                        f"\r{warning_icon} FORCE-QUIT in: {int(remaining):3d}s "
                        f"[{bar}] {progress*100:6.1f}% "
                    )
                    print(countdown_text, end="", flush=True)
                # Emit a log warning every 5 seconds during the warning phase
                if is_warning and int(remaining) % 5 == 0:
                    logger.warning(f"⚠️ WARNUNG: Force-Quit in {int(remaining)} Sekunden!")
                time.sleep(0.1)  # 100ms update interval
            # Final output
            if self.is_active:
                with self.terminal_lock:
                    print("\r🚨 FORCE-QUIT WIRD AUSGEFÜHRT!" + " " * 30, flush=True)
        except Exception as e:
            logger.error(f"❌ Fehler im Terminal-Countdown: {e}")

    def _create_system_timer(self, reason: str):
        """Creates a SystemTimer for integration with the existing timer system."""
        try:
            timer_manager = get_timer_manager()
            timer_name = f"force_quit_{int(time.time())}"
            timer = timer_manager.create_timer(
                name=timer_name,
                timer_type=TimerType.SYSTEM,
                duration_seconds=self.timeout_seconds,
                force_quit_action=ForceQuitAction.SHUTDOWN,
                auto_start=True,
                warning_message=f"Force-Quit wegen: {reason}",
                force_quit_warning_seconds=self.warning_seconds
            )
            if timer:
                logger.debug(f"System-Timer '{timer_name}' erstellt")
        except Exception as e:
            logger.warning(f"System-Timer konnte nicht erstellt werden: {e}")

    def _execute_force_quit(self):
        """Executes the force-quit: callback, DB cleanup, then hard exit."""
        try:
            logger.critical("🚨 FORCE-QUIT WIRD AUSGEFÜHRT")
            # Stop the countdown display
            self.is_active = False
            if self.show_terminal_countdown and sys.stdout.isatty():
                with self.terminal_lock:
                    print("\r🚨 FORCE-QUIT AKTIV - DATENBANKBEREINIGUNG..." + " " * 20, flush=True)
            # 1. Run the shutdown callback, if registered
            if self.shutdown_callback:
                try:
                    logger.info("📞 Führe Shutdown-Callback aus...")
                    self.shutdown_callback()
                except Exception as e:
                    logger.error(f"❌ Fehler im Shutdown-Callback: {e}")
            # 2. Database cleanup
            if self.database_cleanup:
                self._perform_database_cleanup()
            # 3. Terminate the process
            logger.critical("💀 FORCE-QUIT ABGESCHLOSSEN - SYSTEM WIRD BEENDET")
            if self.show_terminal_countdown and sys.stdout.isatty():
                with self.terminal_lock:
                    print("💀 FORCE-QUIT ABGESCHLOSSEN", flush=True)
            # Brief delay so log output can flush
            time.sleep(1)
            # Hard exit: skips atexit handlers by design
            os._exit(1)
        except Exception as e:
            logger.critical(f"❌ KRITISCHER FEHLER IM FORCE-QUIT: {e}")
            # Emergency exit
            os._exit(1)

    def _perform_database_cleanup(self):
        """Performs robust database cleanup before process exit."""
        try:
            logger.info("💾 Starte Datenbankbereinigung...")
            if self.show_terminal_countdown and sys.stdout.isatty():
                with self.terminal_lock:
                    print("\r💾 Datenbankbereinigung läuft..." + " " * 30, flush=True)
            # 1. Prefer the modern DatabaseCleanupManager when available
            if DATABASE_CLEANUP_AVAILABLE:
                logger.info("🔧 Verwende DatabaseCleanupManager...")
                result = safe_database_cleanup(
                    force_mode_switch=True,  # aggressive cleanup
                    max_cleanup_time=10      # hard 10-second budget
                )
                if result.get("success", False):
                    logger.info(f"✅ Database-Cleanup erfolgreich: {', '.join(result.get('operations', []))}")
                else:
                    logger.warning(f"⚠️ Database-Cleanup mit Problemen: {', '.join(result.get('errors', []))}")
                    # Fall back to the direct SQLite cleanup
                    self._fallback_database_cleanup()
            else:
                # 2. Fallback: direct SQLite cleanup
                self._fallback_database_cleanup()
            # 3. Optionally remove WAL/SHM sidecar files by force
            if self.force_wal_cleanup:
                self._force_wal_shm_cleanup()
            logger.info("✅ Datenbankbereinigung abgeschlossen")
        except Exception as e:
            logger.error(f"❌ Fehler bei Datenbankbereinigung: {e}")
            # Still attempt the WAL/SHM cleanup on a best-effort basis
            if self.force_wal_cleanup:
                try:
                    self._force_wal_shm_cleanup()
                except Exception:
                    pass

    def _fallback_database_cleanup(self):
        """Fallback database cleanup using direct SQLite PRAGMA commands."""
        try:
            from models import create_optimized_engine
            from sqlalchemy import text
            logger.info("🔄 Fallback Database-Cleanup...")
            engine = create_optimized_engine()
            with engine.connect() as conn:
                # WAL checkpoint (TRUNCATE mode fully resets the WAL file)
                result = conn.execute(text("PRAGMA wal_checkpoint(TRUNCATE)")).fetchone()
                if result and result[1] > 0:
                    logger.info(f"WAL-Checkpoint: {result[1]} Seiten übertragen")
                # Commit any pending transactions
                conn.commit()
                # Let SQLite optimize internal state
                conn.execute(text("PRAGMA optimize"))
                logger.info("✅ Fallback Database-Cleanup abgeschlossen")
            # Dispose the engine cleanly
            engine.dispose()
        except Exception as e:
            logger.error(f"❌ Fehler im Fallback Database-Cleanup: {e}")

    def _remove_sidecar_file(self, path: str, label: str):
        """Deletes one WAL/SHM sidecar file, falling back to rename-then-delete.

        Extracted helper: the WAL and SHM branches were duplicated verbatim.
        Log messages are preserved exactly via the label parameter.
        """
        if not os.path.exists(path):
            return
        try:
            # Try a plain delete first
            os.remove(path)
            logger.info(f"✅ {label}-Datei gelöscht: {path}")
        except OSError:
            # If blocked, try rename followed by delete
            try:
                backup_path = path + f".backup_{int(time.time())}"
                shutil.move(path, backup_path)
                os.remove(backup_path)
                logger.info(f"✅ {label}-Datei über Backup gelöscht: {path}")
            except Exception as e:
                logger.warning(f"⚠️ {label}-Datei konnte nicht gelöscht werden: {e}")

    def _force_wal_shm_cleanup(self):
        """Aggressively removes the SQLite WAL and SHM sidecar files."""
        try:
            from utils.settings import DATABASE_PATH
            logger.info("🧹 Force WAL/SHM-Cleanup...")
            if self.show_terminal_countdown and sys.stdout.isatty():
                with self.terminal_lock:
                    print("\r🧹 WAL/SHM-Dateien werden bereinigt..." + " " * 20, flush=True)
            # Brief pause to let any remaining DB connections close
            time.sleep(0.5)
            self._remove_sidecar_file(DATABASE_PATH + "-wal", "WAL")
            self._remove_sidecar_file(DATABASE_PATH + "-shm", "SHM")
            logger.info("✅ Force WAL/SHM-Cleanup abgeschlossen")
        except Exception as e:
            logger.error(f"❌ Fehler bei Force WAL/SHM-Cleanup: {e}")

    def get_status(self) -> Dict[str, Any]:
        """Returns the current countdown status as a plain dict."""
        if not self.is_active:
            return {
                "active": False,
                "remaining_seconds": 0,
                "progress_percent": 0.0
            }
        elapsed = (datetime.now() - self.start_time).total_seconds()
        remaining = max(0, self.timeout_seconds - elapsed)
        progress = 1.0 - (remaining / self.timeout_seconds) if self.timeout_seconds > 0 else 1.0
        return {
            "active": True,
            "remaining_seconds": int(remaining),
            "progress_percent": round(progress * 100, 1),
            "is_warning": remaining <= self.warning_seconds,
            "start_time": self.start_time.isoformat() if self.start_time else None
        }
# ===== GLOBAL MANAGER AND UTILITY FUNCTIONS =====
_timeout_manager: Optional[TimeoutForceQuitManager] = None
_manager_lock = threading.Lock()

def get_timeout_manager(timeout_seconds: int = 45,
                        warning_seconds: int = 15,
                        database_cleanup: bool = True,
                        force_wal_cleanup: bool = True) -> TimeoutForceQuitManager:
    """Return the process-wide TimeoutForceQuitManager (singleton).

    The configuration arguments are only honoured on first creation;
    subsequent calls return the existing instance unchanged.

    Args:
        timeout_seconds: total timeout in seconds
        warning_seconds: warning period before force-quit
        database_cleanup: enable database cleanup on force-quit
        force_wal_cleanup: aggressive WAL/SHM cleanup

    Returns:
        TimeoutForceQuitManager: the global timeout manager
    """
    global _timeout_manager
    with _manager_lock:
        if _timeout_manager is None:
            _timeout_manager = TimeoutForceQuitManager(
                timeout_seconds=timeout_seconds,
                warning_seconds=warning_seconds,
                database_cleanup=database_cleanup,
                force_wal_cleanup=force_wal_cleanup,
            )
    return _timeout_manager
def start_force_quit_timeout(reason: str = "System-Timeout",
                             timeout_seconds: int = 45,
                             warning_seconds: int = 15,
                             database_cleanup: bool = True,
                             force_wal_cleanup: bool = True) -> bool:
    """Start a force-quit countdown with a terminal progress display.

    Args:
        reason: Human-readable reason shown for the timeout.
        timeout_seconds: Total timeout in seconds.
        warning_seconds: Warning lead time before the force quit.
        database_cleanup: Run database cleanup on expiry.
        force_wal_cleanup: Aggressively clean WAL/SHM files on expiry.

    Returns:
        bool: True if the countdown was started successfully.
    """
    manager = get_timeout_manager(
        timeout_seconds, warning_seconds, database_cleanup, force_wal_cleanup
    )
    return manager.start_timeout(reason)
def cancel_force_quit_timeout() -> bool:
    """Abort the currently running force-quit countdown, if any.

    Returns:
        bool: True if a countdown manager existed and got cancelled.
    """
    manager = _timeout_manager
    if manager:
        return manager.cancel_timeout()
    return False
def extend_force_quit_timeout(additional_seconds: int) -> bool:
    """Extend the currently running force-quit countdown.

    Args:
        additional_seconds: Extra seconds to add to the deadline.

    Returns:
        bool: True if a countdown manager existed and got extended.
    """
    manager = _timeout_manager
    if manager:
        return manager.extend_timeout(additional_seconds)
    return False
def get_force_quit_status() -> Dict[str, Any]:
    """Return status information about the current force-quit countdown.

    Returns:
        Dict: Status of the active manager, or an inactive default status.
    """
    manager = _timeout_manager
    if manager:
        return manager.get_status()
    return {"active": False, "remaining_seconds": 0, "progress_percent": 0.0}
@contextmanager
def timeout_context(timeout_seconds: int = 45,
                    reason: str = "Operation-Timeout",
                    auto_cancel: bool = True):
    """Context manager guarding a long-running operation with a force-quit timeout.

    Args:
        timeout_seconds: Timeout in seconds.
        reason: Human-readable reason for the timeout.
        auto_cancel: Cancel the countdown automatically on context exit.

    Usage:
        with timeout_context(30, "Datenbank-Migration"):
            # long operation...
            pass
    """
    manager = get_timeout_manager(timeout_seconds)
    started = manager.start_timeout(reason)
    try:
        yield manager
    finally:
        # Only cancel what we actually started, and only when requested.
        if auto_cancel and started:
            manager.cancel_timeout()
def register_shutdown_callback(callback: Callable):
    """Register a callback to be executed during shutdown.

    Args:
        callback: Callable invoked when the shutdown is performed.
    """
    get_timeout_manager().set_shutdown_callback(callback)
# ===== INTEGRATION MIT SHUTDOWN-MANAGER =====
def integrate_with_shutdown_manager():
    """Hook the timeout manager into the existing shutdown manager, if present.

    Registers a high-priority cleanup step that cancels a still-running
    countdown so it cannot interfere with the shutdown sequence. Missing
    shutdown-manager module is tolerated silently (debug log only).
    """
    try:
        from utils.shutdown_manager import get_shutdown_manager

        shutdown_manager = get_shutdown_manager()

        def timeout_cleanup():
            # Stop an active countdown before the rest of the shutdown runs.
            global _timeout_manager
            if _timeout_manager and _timeout_manager.is_active:
                logger.info("🔄 Timeout-Manager wird im Shutdown-Prozess gestoppt")
                _timeout_manager.cancel_timeout()

        shutdown_manager.register_cleanup_function(
            func=timeout_cleanup,
            name="Timeout Force-Quit Manager",
            priority=1,  # high priority: run early in the shutdown sequence
            timeout=5,
        )
        logger.debug("✅ Timeout-Manager in Shutdown-Manager integriert")
    except ImportError:
        logger.debug("Shutdown-Manager nicht verfügbar - keine Integration")
    except Exception as e:
        logger.warning(f"Fehler bei Shutdown-Manager-Integration: {e}")
# Automatische Integration beim Import
integrate_with_shutdown_manager()

View File

@ -1,590 +0,0 @@
#!/usr/bin/env python3
"""
Intelligenter Watchdog-Manager für MYP Druckerverwaltung
Erweiterte Überwachung mit Python für bessere Fehlerbehandlung und Logging
Optimiert für Debian/Linux-Systeme im Kiosk-Modus
"""
import os
import sys
import time
import json
import logging
import subprocess
import threading
import signal
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Callable
import psutil
import requests
from urllib3.exceptions import InsecureRequestWarning
# SSL-Warnungen unterdrücken für localhost
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
class WatchdogConfig:
    """Configuration store for the watchdog manager.

    Settings live in ``<app_dir>/config/watchdog.json``; values from the
    file are merged over built-in defaults. A missing file is created and
    pre-filled with the defaults on first load.
    """

    def __init__(self, app_dir: str = "/opt/myp"):
        self.app_dir = Path(app_dir)
        self.config_file = self.app_dir / "config" / "watchdog.json"
        # Built-in fallback settings (also used to seed a fresh config file).
        self.defaults = {
            "https_service": "myp-https",
            "kiosk_service": "myp-kiosk",
            "kiosk_user": "kiosk",
            "https_url": "https://localhost:443",
            "check_interval": 30,
            "https_timeout": 10,
            "restart_delay": 15,
            "max_memory_percent": 85,
            "cert_expire_days": 7,
            "log_rotation_size_mb": 10,
            "max_restart_attempts": 3,
            "restart_cooldown": 300,
            "enable_auto_cleanup": True,
            "enable_performance_monitoring": True
        }
        self.config = self.load_config()

    def load_config(self) -> Dict:
        """Load the configuration file, merged over the defaults."""
        try:
            if not self.config_file.exists():
                # First run: persist the defaults so admins can edit them.
                self.save_config(self.defaults)
                return self.defaults.copy()
            with open(self.config_file, 'r', encoding='utf-8') as handle:
                file_settings = json.load(handle)
            merged = self.defaults.copy()
            merged.update(file_settings)
            return merged
        except Exception as e:
            logging.error(f"Fehler beim Laden der Konfiguration: {e}")
            return self.defaults.copy()

    def save_config(self, config: Dict) -> None:
        """Write *config* as pretty-printed UTF-8 JSON to disk."""
        try:
            self.config_file.parent.mkdir(parents=True, exist_ok=True)
            with open(self.config_file, 'w', encoding='utf-8') as handle:
                json.dump(config, handle, indent=2, ensure_ascii=False)
        except Exception as e:
            logging.error(f"Fehler beim Speichern der Konfiguration: {e}")

    def get(self, key: str, default=None):
        """Return a configuration value, or *default* if absent."""
        return self.config.get(key, default)

    def set(self, key: str, value) -> None:
        """Set a configuration value and persist the whole configuration."""
        self.config[key] = value
        self.save_config(self.config)
class ServiceMonitor:
"""Überwacht systemd-Services"""
def __init__(self, config: WatchdogConfig):
self.config = config
self.restart_counts = {}
self.last_restart_times = {}
def is_service_active(self, service_name: str) -> bool:
"""Prüft ob Service aktiv ist"""
try:
result = subprocess.run(
["systemctl", "is-active", "--quiet", service_name],
capture_output=True
)
return result.returncode == 0
except Exception:
return False
def is_service_enabled(self, service_name: str) -> bool:
"""Prüft ob Service aktiviert ist"""
try:
result = subprocess.run(
["systemctl", "is-enabled", "--quiet", service_name],
capture_output=True
)
return result.returncode == 0
except Exception:
return False
def restart_service(self, service_name: str) -> bool:
"""Startet Service neu mit Cooldown-Logik"""
now = datetime.now()
# Prüfe Restart-Cooldown
if service_name in self.last_restart_times:
time_since_last = (now - self.last_restart_times[service_name]).total_seconds()
if time_since_last < self.config.get("restart_cooldown", 300):
logging.warning(f"Service {service_name} im Cooldown ({time_since_last:.0f}s)")
return False
# Prüfe maximale Restart-Versuche
restart_count = self.restart_counts.get(service_name, 0)
max_attempts = self.config.get("max_restart_attempts", 3)
if restart_count >= max_attempts:
logging.error(f"Service {service_name} erreichte maximale Restart-Versuche ({max_attempts})")
return False
try:
logging.info(f"Starte Service neu: {service_name} (Versuch {restart_count + 1}/{max_attempts})")
result = subprocess.run(
["systemctl", "restart", service_name],
capture_output=True,
text=True,
timeout=30
)
if result.returncode == 0:
self.restart_counts[service_name] = restart_count + 1
self.last_restart_times[service_name] = now
time.sleep(self.config.get("restart_delay", 15))
logging.info(f"Service {service_name} erfolgreich neugestartet")
return True
else:
logging.error(f"Service-Neustart fehlgeschlagen: {result.stderr}")
return False
except subprocess.TimeoutExpired:
logging.error(f"Service-Neustart Timeout: {service_name}")
return False
except Exception as e:
logging.error(f"Service-Neustart Fehler: {e}")
return False
def reset_restart_counter(self, service_name: str) -> None:
"""Setzt Restart-Zähler zurück"""
if service_name in self.restart_counts:
del self.restart_counts[service_name]
if service_name in self.last_restart_times:
del self.last_restart_times[service_name]
class HTTPSMonitor:
    """Monitors the HTTPS backend and its self-signed TLS certificate."""

    def __init__(self, config: WatchdogConfig):
        self.config = config
        self.session = requests.Session()
        # The local backend uses a self-signed certificate.
        self.session.verify = False

    def check_connectivity(self) -> bool:
        """Return True when the backend answers with a non-5xx status."""
        try:
            target_url = self.config.get("https_url", "https://localhost:443")
            request_timeout = self.config.get("https_timeout", 10)
            response = self.session.get(
                target_url,
                timeout=request_timeout,
                allow_redirects=True
            )
            return response.status_code < 500
        except Exception as e:
            logging.debug(f"HTTPS-Konnektivitätsprüfung fehlgeschlagen: {e}")
            return False

    def check_ssl_certificate(self) -> bool:
        """Return True if the certificate exists and stays valid long enough."""
        try:
            cert_file = self.config.app_dir / "certs" / "localhost" / "localhost.crt"
            if not cert_file.exists():
                return False
            expire_days = self.config.get("cert_expire_days", 7)
            # openssl -checkend expects the remaining lifetime in seconds.
            check_window = str(expire_days * 86400)
            proc = subprocess.run([
                "openssl", "x509",
                "-in", str(cert_file),
                "-noout", "-checkend", check_window
            ], capture_output=True)
            return proc.returncode == 0
        except Exception as e:
            logging.error(f"SSL-Zertifikat-Prüfung fehlgeschlagen: {e}")
            return False

    def regenerate_ssl_certificate(self) -> bool:
        """Force-regenerate the SSL certificate via the app's ssl_config module."""
        try:
            logging.info("Regeneriere SSL-Zertifikat...")
            # Make the application package importable, then delegate.
            sys.path.insert(0, str(self.config.app_dir))
            from utils.ssl_config import ensure_ssl_certificates
            success = ensure_ssl_certificates(str(self.config.app_dir), force_regenerate=True)
            if success:
                logging.info("SSL-Zertifikat erfolgreich regeneriert")
            else:
                logging.error("SSL-Zertifikat-Regenerierung fehlgeschlagen")
            return success
        except Exception as e:
            logging.error(f"SSL-Zertifikat-Regenerierung Fehler: {e}")
            return False
class KioskMonitor:
    """Monitors the kiosk user session, X server, display and Chromium browser."""

    def __init__(self, config: WatchdogConfig):
        self.config = config
        self.kiosk_user = config.get("kiosk_user", "kiosk")

    def check_user_session(self) -> bool:
        """Return True if any process runs under the kiosk user."""
        try:
            return any(
                proc.info['username'] == self.kiosk_user
                for proc in psutil.process_iter(['username'])
            )
        except Exception:
            return False

    def check_chromium_process(self) -> bool:
        """Return True if a Chromium process in kiosk mode runs as the kiosk user."""
        try:
            for proc in psutil.process_iter(['username', 'cmdline']):
                cmdline = proc.info['cmdline']
                if proc.info['username'] != self.kiosk_user or not cmdline:
                    continue
                # Same argument must mention both 'chromium' and 'kiosk'.
                if any('chromium' in arg and 'kiosk' in arg for arg in cmdline):
                    return True
            return False
        except Exception:
            return False

    def check_x_server(self) -> bool:
        """Return True if an X server for display :0 appears to be running."""
        try:
            for proc in psutil.process_iter(['cmdline']):
                cmdline = proc.info['cmdline']
                if cmdline and any('X' in arg and ':0' in arg for arg in cmdline):
                    return True
            return False
        except Exception:
            return False

    def check_display_availability(self) -> bool:
        """Return True if display :0 answers xdpyinfo queries."""
        try:
            proc = subprocess.run(
                ["xdpyinfo"],
                env={"DISPLAY": ":0"},
                capture_output=True,
                timeout=5
            )
            return proc.returncode == 0
        except Exception:
            return False

    def restart_kiosk_session(self) -> bool:
        """Terminate all kiosk processes and restart the autologin getty."""
        try:
            logging.info("Starte Kiosk-Session neu...")
            # Graceful TERM first, then force KILL for stragglers.
            subprocess.run(["pkill", "-u", self.kiosk_user, "-TERM"], timeout=10)
            time.sleep(5)
            subprocess.run(["pkill", "-u", self.kiosk_user, "-KILL"], timeout=5)
            time.sleep(2)
            # Restarting getty on tty1 re-triggers the autologin session.
            subprocess.run(["systemctl", "restart", "getty@tty1.service"], timeout=15)
            time.sleep(self.config.get("restart_delay", 15))
            logging.info("Kiosk-Session neugestartet")
            return True
        except Exception as e:
            logging.error(f"Kiosk-Session-Neustart fehlgeschlagen: {e}")
            return False
class SystemMonitor:
"""Überwacht Systemressourcen"""
def __init__(self, config: WatchdogConfig):
self.config = config
def get_memory_usage(self) -> float:
"""Gibt Speichernutzung in Prozent zurück"""
try:
return psutil.virtual_memory().percent
except Exception:
return 0.0
def get_cpu_usage(self) -> float:
"""Gibt CPU-Nutzung in Prozent zurück"""
try:
return psutil.cpu_percent(interval=1)
except Exception:
return 0.0
def get_disk_usage(self) -> float:
"""Gibt Festplatten-Nutzung in Prozent zurück"""
try:
return psutil.disk_usage('/').percent
except Exception:
return 0.0
def cleanup_system_resources(self) -> None:
"""Bereinigt Systemressourcen"""
try:
memory_before = self.get_memory_usage()
logging.info(f"Bereinige Systemressourcen (Speicher: {memory_before:.1f}%)")
kiosk_user = self.config.get("kiosk_user", "kiosk")
app_dir = self.config.app_dir
# Browser-Cache bereinigen
cache_dirs = [
f"/home/{kiosk_user}/.chromium-kiosk/Default/Cache",
f"/home/{kiosk_user}/.cache"
]
for cache_dir in cache_dirs:
if os.path.exists(cache_dir):
subprocess.run(["rm", "-rf", f"{cache_dir}/*"], shell=True)
# Temporäre Dateien bereinigen
temp_dirs = [
"/tmp",
str(app_dir / "uploads" / "temp")
]
for temp_dir in temp_dirs:
if os.path.exists(temp_dir):
subprocess.run([
"find", temp_dir, "-type", "f", "-atime", "+1", "-delete"
], timeout=30)
# System-Cache leeren
subprocess.run(["sync"])
with open("/proc/sys/vm/drop_caches", "w") as f:
f.write("3")
memory_after = self.get_memory_usage()
logging.info(f"Systemressourcen bereinigt (Speicher: {memory_after:.1f}%)")
except Exception as e:
logging.error(f"Systemressourcen-Bereinigung fehlgeschlagen: {e}")
class WatchdogManager:
    """Top-level watchdog: wires all monitors together and runs the main loop."""

    def __init__(self, app_dir: str = "/opt/myp"):
        self.config = WatchdogConfig(app_dir)
        self.service_monitor = ServiceMonitor(self.config)
        self.https_monitor = HTTPSMonitor(self.config)
        self.kiosk_monitor = KioskMonitor(self.config)
        self.system_monitor = SystemMonitor(self.config)
        self.running = False
        self.setup_logging()
        self.setup_signal_handlers()

    def setup_logging(self) -> None:
        """Configure combined file + console logging."""
        log_file = Path("/var/log/kiosk-watchdog-python.log")
        log_file.parent.mkdir(parents=True, exist_ok=True)
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s [%(levelname)s] %(message)s',
            handlers=[
                logging.FileHandler(log_file),
                logging.StreamHandler()
            ]
        )

    def setup_signal_handlers(self) -> None:
        """Install SIGTERM/SIGINT handlers that stop the main loop cleanly."""
        def signal_handler(signum, frame):
            logging.info(f"Signal {signum} empfangen - beende Watchdog...")
            self.running = False
        signal.signal(signal.SIGTERM, signal_handler)
        signal.signal(signal.SIGINT, signal_handler)

    def rotate_log_if_needed(self) -> None:
        """Truncate the log file to its last 1000 lines once it exceeds the limit."""
        try:
            log_file = Path("/var/log/kiosk-watchdog-python.log")
            max_size = self.config.get("log_rotation_size_mb", 10) * 1024 * 1024
            if log_file.exists() and log_file.stat().st_size > max_size:
                tmp_path = Path(f"{log_file}.tmp")
                # Fix: open the temp file in a context manager so the handle is
                # closed before renaming (previously the handle was leaked).
                with open(tmp_path, "w") as tmp_out:
                    subprocess.run(
                        ["tail", "-n", "1000", str(log_file)],
                        stdout=tmp_out
                    )
                log_file.unlink()
                tmp_path.rename(log_file)
                logging.info("Log-Datei rotiert (>10MB)")
        except Exception as e:
            logging.error(f"Log-Rotation fehlgeschlagen: {e}")

    def check_https_backend(self) -> None:
        """Ensure the HTTPS service is active and reachable; restart if not."""
        service_name = self.config.get("https_service", "myp-https")
        if not self.service_monitor.is_service_active(service_name):
            logging.error("HTTPS-Service nicht aktiv")
            self.service_monitor.restart_service(service_name)
        elif not self.https_monitor.check_connectivity():
            logging.error("HTTPS Backend nicht erreichbar")
            self.service_monitor.restart_service(service_name)
        else:
            # Healthy again: allow future restarts by clearing the counters.
            self.service_monitor.reset_restart_counter(service_name)

    def check_ssl_certificate(self) -> None:
        """Log expiring/missing SSL certificates and try to regenerate them."""
        if not self.https_monitor.check_ssl_certificate():
            cert_file = self.config.app_dir / "certs" / "localhost" / "localhost.crt"
            if cert_file.exists():
                expire_days = self.config.get("cert_expire_days", 7)
                logging.warning(f"SSL-Zertifikat läuft in {expire_days} Tagen ab")
            else:
                logging.error("SSL-Zertifikat fehlt")
            if self.https_monitor.regenerate_ssl_certificate():
                # A new certificate only takes effect after a service restart.
                service_name = self.config.get("https_service", "myp-https")
                self.service_monitor.restart_service(service_name)

    def check_kiosk_session(self) -> None:
        """Verify session, X server, display and browser; repair as needed."""
        if not self.kiosk_monitor.check_user_session():
            logging.error("Kiosk-Benutzer-Session nicht aktiv")
            self.kiosk_monitor.restart_kiosk_session()
        elif not self.kiosk_monitor.check_x_server():
            logging.error("X-Server nicht verfügbar")
            self.kiosk_monitor.restart_kiosk_session()
        elif not self.kiosk_monitor.check_display_availability():
            logging.error("Display :0 nicht verfügbar")
            self.kiosk_monitor.restart_kiosk_session()
        elif not self.kiosk_monitor.check_chromium_process():
            logging.warning("Chromium-Kiosk-Prozess nicht gefunden")
            # Prefer the dedicated kiosk service if it is enabled.
            kiosk_service = self.config.get("kiosk_service", "myp-kiosk")
            if self.service_monitor.is_service_enabled(kiosk_service):
                subprocess.run(["systemctl", "--user", "start", kiosk_service])
            else:
                # Fallback: launch the browser directly in kiosk mode.
                https_url = self.config.get("https_url", "https://localhost:443")
                kiosk_user = self.config.get("kiosk_user", "kiosk")
                # NOTE(review): env={"DISPLAY": ":0"} replaces the child's whole
                # environment (including PATH); this seems to rely on the exec
                # default search path finding sudo — confirm this is intended.
                subprocess.Popen([
                    "sudo", "-u", kiosk_user,
                    "DISPLAY=:0", "chromium",
                    "--kiosk", "--no-sandbox", "--ignore-certificate-errors",
                    https_url
                ], env={"DISPLAY": ":0"})
            time.sleep(self.config.get("restart_delay", 15))

    def check_system_resources(self) -> None:
        """Trigger a resource cleanup when memory usage exceeds the limit."""
        if not self.config.get("enable_performance_monitoring", True):
            return
        memory_usage = self.system_monitor.get_memory_usage()
        max_memory = self.config.get("max_memory_percent", 85)
        if memory_usage > max_memory:
            logging.warning(f"Hohe Speichernutzung: {memory_usage:.1f}%")
            if self.config.get("enable_auto_cleanup", True):
                self.system_monitor.cleanup_system_resources()

    def run_monitoring_cycle(self) -> None:
        """Run one full pass over all health checks and maintenance tasks."""
        try:
            self.check_https_backend()
            self.check_ssl_certificate()
            self.check_kiosk_session()
            self.check_system_resources()
            self.rotate_log_if_needed()
        except Exception as e:
            logging.error(f"Fehler im Überwachungszyklus: {e}")

    def run(self) -> None:
        """Main loop: run monitoring cycles until stopped by a signal or Ctrl+C."""
        self.running = True
        check_interval = self.config.get("check_interval", 30)
        logging.info(f"Kiosk-Watchdog gestartet (PID: {os.getpid()})")
        logging.info(f"Überwachungsintervall: {check_interval}s")
        while self.running:
            try:
                self.run_monitoring_cycle()
                time.sleep(check_interval)
            except KeyboardInterrupt:
                logging.info("Watchdog durch Benutzer beendet")
                break
            except Exception as e:
                logging.error(f"Unerwarteter Fehler: {e}")
                time.sleep(check_interval)
        logging.info("Kiosk-Watchdog beendet")
def main():
    """CLI entry point: parse arguments and run the watchdog."""
    import argparse

    parser = argparse.ArgumentParser(description="MYP Kiosk Watchdog Manager")
    parser.add_argument("--app-dir", default="/opt/myp", help="Anwendungsverzeichnis")
    parser.add_argument("--config", help="Konfigurationsdatei")
    parser.add_argument("--daemon", action="store_true", help="Als Daemon ausführen")
    args = parser.parse_args()

    try:
        watchdog = WatchdogManager(args.app_dir)
        if not args.daemon:
            # Interactive mode: print a hint before entering the loop.
            print("Starte Watchdog... (Strg+C zum Beenden)")
        watchdog.run()
    except Exception as e:
        logging.error(f"Watchdog-Start fehlgeschlagen: {e}")
        sys.exit(1)
if __name__ == "__main__":
main()

View File

@ -1,398 +0,0 @@
"""
Windows-spezifische Fixes für Thread- und Socket-Probleme
Behebt bekannte Issues mit Flask Auto-Reload auf Windows.
"""
import os
import sys
import signal
import threading
import time
import atexit
from typing import List, Callable
from utils.logging_config import get_logger
# Logger für Windows-Fixes
windows_logger = get_logger("windows_fixes")
# Exportierte Funktionen
__all__ = [
'WindowsThreadManager',
'get_windows_thread_manager',
'fix_windows_socket_issues',
'apply_safe_socket_options',
'setup_windows_environment',
'is_flask_reloader_process',
'apply_all_windows_fixes',
'safe_subprocess_run',
'patch_subprocess',
'apply_global_subprocess_patch',
'apply_encoding_fixes',
'apply_threading_fixes',
'apply_signal_fixes'
]
# Globale Flags um doppelte Anwendung zu verhindern
_windows_fixes_applied = False
_socket_patches_applied = False
class WindowsThreadManager:
    """Tracks threads and cleanup callbacks so they can be shut down in an
    orderly fashion on Windows.

    Works around socket errors triggered by Flask's auto-reload.
    """

    def __init__(self):
        self.managed_threads: List[threading.Thread] = []
        self.cleanup_functions: List[Callable] = []
        self.shutdown_event = threading.Event()
        self._lock = threading.Lock()
        self._is_shutting_down = False
        if os.name == 'nt':
            # Signal handlers are only installed on Windows.
            self._register_signal_handlers()

    def _register_signal_handlers(self):
        """Install Windows-specific signal handlers (incl. SIGBREAK if present)."""
        try:
            for sig_name in ('SIGINT', 'SIGTERM', 'SIGBREAK'):
                sig = getattr(signal, sig_name, None)
                if sig is not None:
                    signal.signal(sig, self._signal_handler)
            windows_logger.debug("✅ Windows Signal-Handler registriert")
        except Exception as e:
            windows_logger.warning(f"⚠️ Signal-Handler konnten nicht registriert werden: {str(e)}")

    def _signal_handler(self, sig, frame):
        """Trigger a single orderly shutdown on the first received signal."""
        if not self._is_shutting_down:
            windows_logger.warning(f"🛑 Windows Signal {sig} empfangen - initiiere Shutdown")
            self.shutdown_all()

    def register_thread(self, thread: threading.Thread):
        """Track *thread* so shutdown_all() can join it later."""
        with self._lock:
            if thread not in self.managed_threads:
                self.managed_threads.append(thread)
                windows_logger.debug(f"📝 Thread {thread.name} registriert")

    def register_cleanup_function(self, func: Callable):
        """Track a cleanup callback to run during shutdown."""
        with self._lock:
            if func not in self.cleanup_functions:
                self.cleanup_functions.append(func)
                windows_logger.debug(f"📝 Cleanup-Funktion registriert")

    def shutdown_all(self):
        """Run cleanup callbacks and join all tracked threads, exactly once."""
        if self._is_shutting_down:
            return
        with self._lock:
            self._is_shutting_down = True

        windows_logger.info("🔄 Starte Windows Thread-Shutdown...")
        self.shutdown_event.set()

        for cleanup in self.cleanup_functions:
            try:
                windows_logger.debug(f"🧹 Führe Cleanup-Funktion aus: {cleanup.__name__}")
                cleanup()
            except Exception as e:
                windows_logger.error(f"❌ Fehler bei Cleanup-Funktion {cleanup.__name__}: {str(e)}")

        alive_threads = [t for t in self.managed_threads if t.is_alive()]
        if alive_threads:
            windows_logger.info(f"⏳ Warte auf {len(alive_threads)} aktive Threads...")
            for worker in alive_threads:
                try:
                    windows_logger.debug(f"🔄 Beende Thread: {worker.name}")
                    worker.join(timeout=5)
                    if worker.is_alive():
                        windows_logger.warning(f"⚠️ Thread {worker.name} konnte nicht ordnungsgemäß beendet werden")
                    else:
                        windows_logger.debug(f"✅ Thread {worker.name} erfolgreich beendet")
                except Exception as e:
                    windows_logger.error(f"❌ Fehler beim Beenden von Thread {worker.name}: {str(e)}")

        windows_logger.info("✅ Windows Thread-Shutdown abgeschlossen")
# Globale Instanz
_windows_thread_manager = None
def get_windows_thread_manager() -> WindowsThreadManager:
    """Return the lazily-created global WindowsThreadManager singleton."""
    global _windows_thread_manager
    if _windows_thread_manager is None:
        _windows_thread_manager = WindowsThreadManager()
    return _windows_thread_manager
def fix_windows_socket_issues():
    """Apply Windows-specific socket optimisations without monkey-patching.

    Adds an opt-in ``windows_bind_with_reuse`` helper to ``socket.socket``
    and sets a global default timeout. Idempotent; no-op on non-Windows.
    """
    global _socket_patches_applied

    if os.name != 'nt':
        return
    if _socket_patches_applied:
        windows_logger.debug("⏭️ Socket-Patches bereits angewendet")
        return

    try:
        import socket

        if not hasattr(socket.socket, 'windows_bind_with_reuse'):
            def windows_bind_with_reuse(self, address):
                """Windows-optimised bind that enables SO_REUSEADDR first."""
                try:
                    self.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                    windows_logger.debug(f"SO_REUSEADDR aktiviert für Socket {address}")
                except Exception as e:
                    windows_logger.debug(f"SO_REUSEADDR konnte nicht gesetzt werden: {str(e)}")
                return self.bind(address)

            # Attach as an extra helper; the original bind stays untouched.
            socket.socket.windows_bind_with_reuse = windows_bind_with_reuse

        # Global default timeout improves Windows compatibility.
        socket.setdefaulttimeout(30)

        _socket_patches_applied = True
        windows_logger.debug("✅ Windows Socket-Optimierungen angewendet (sicher)")
    except Exception as e:
        windows_logger.warning(f"⚠️ Socket-Optimierungen konnten nicht angewendet werden: {str(e)}")
def apply_safe_socket_options():
    """Apply safe socket option defaults on Windows, without monkey-patching."""
    if os.name != 'nt':
        return
    try:
        import socket
        if hasattr(socket, 'TCP_NODELAY'):
            # TCP_NODELAY would improve latency, but it must be enabled per
            # socket rather than globally - so nothing is changed here.
            pass
        windows_logger.debug("✅ Sichere Socket-Optionen angewendet")
    except Exception as e:
        windows_logger.debug(f"Socket-Optionen konnten nicht gesetzt werden: {str(e)}")
def setup_windows_environment():
    """Set environment variables that improve Flask compatibility on Windows."""
    if os.name != 'nt':
        return
    try:
        os.environ['PYTHONIOENCODING'] = 'utf-8'
        os.environ['PYTHONUTF8'] = '1'
        windows_logger.debug("✅ Windows-Umgebung optimiert")
    except Exception as e:
        windows_logger.warning(f"⚠️ Windows-Umgebung konnte nicht optimiert werden: {str(e)}")
def is_flask_reloader_process() -> bool:
    """Return True while running in Werkzeug's reloader monitor process.

    Werkzeug sets ``WERKZEUG_RUN_MAIN`` to ``'true'`` only in the child
    process that actually serves requests.
    """
    run_main_flag = os.environ.get('WERKZEUG_RUN_MAIN')
    return run_main_flag != 'true'
# ===== ENCODING-FIXES =====
def apply_encoding_fixes():
    """Force UTF-8 I/O encoding via environment variables."""
    try:
        for var_name, var_value in (('PYTHONIOENCODING', 'utf-8'), ('PYTHONUTF8', '1')):
            os.environ[var_name] = var_value
        windows_logger.debug("✅ Windows-Encoding-Fixes angewendet")
    except Exception as e:
        windows_logger.warning(f"⚠️ Encoding-Fixes konnten nicht angewendet werden: {str(e)}")
# ===== THREADING-FIXES =====
def apply_threading_fixes():
    """Initialise the thread manager and apply the socket-related fixes."""
    try:
        get_windows_thread_manager()
        fix_windows_socket_issues()
        apply_safe_socket_options()
        windows_logger.debug("✅ Windows-Threading-Fixes angewendet")
    except Exception as e:
        windows_logger.warning(f"⚠️ Threading-Fixes konnten nicht angewendet werden: {str(e)}")
# ===== SIGNAL-FIXES =====
def apply_signal_fixes():
    """Log that signal fixes are in place.

    The actual handlers are installed by WindowsThreadManager on creation.
    """
    try:
        windows_logger.debug("✅ Windows-Signal-Fixes angewendet")
    except Exception as e:
        windows_logger.warning(f"⚠️ Signal-Fixes konnten nicht angewendet werden: {str(e)}")
# ===== SICHERE SUBPROCESS-WRAPPER =====
def safe_subprocess_run(*args, **kwargs):
    """subprocess.run wrapper with safe UTF-8 defaults for Windows.

    Forces ``encoding='utf-8'`` / ``errors='replace'`` for text-mode calls and
    applies a default 30s timeout, preventing charmap errors and hangs. On a
    UnicodeDecodeError the command is re-run in binary mode as a fallback.
    """
    import subprocess

    if kwargs.get('text', False) and 'encoding' not in kwargs:
        kwargs['encoding'] = 'utf-8'
        kwargs['errors'] = 'replace'
    kwargs.setdefault('timeout', 30)

    try:
        return subprocess.run(*args, **kwargs)
    except subprocess.TimeoutExpired as e:
        cmd_repr = ' '.join(args[0]) if args and isinstance(args[0], list) else str(args)
        windows_logger.warning(f"Subprocess-Timeout nach {kwargs.get('timeout', 30)}s: {cmd_repr}")
        raise e
    except UnicodeDecodeError as e:
        windows_logger.error(f"Unicode-Decode-Fehler in subprocess: {str(e)}")
        # Fallback: retry in binary mode without text/encoding settings.
        retry_kwargs = {k: v for k, v in kwargs.items()
                        if k not in ('text', 'encoding', 'errors')}
        return subprocess.run(*args, **retry_kwargs)
    except Exception as e:
        windows_logger.error(f"Subprocess-Fehler: {str(e)}")
        raise e
# ===== SUBPROCESS-MONKEY-PATCH =====
def patch_subprocess():
    """Monkey-patch subprocess.run/Popen to default to UTF-8 in text mode."""
    import subprocess

    # Keep the originals exactly once so repeated patching stays idempotent.
    if not hasattr(subprocess, '_original_run'):
        subprocess._original_run = subprocess.run
        subprocess._original_popen = subprocess.Popen

    def _ensure_utf8(kwargs, flag):
        # Only inject encoding when the caller did not specify one.
        if kwargs.get(flag, False) and 'encoding' not in kwargs:
            kwargs['encoding'] = 'utf-8'
            kwargs['errors'] = 'replace'

    def patched_run(*args, **kwargs):
        _ensure_utf8(kwargs, 'text')
        return subprocess._original_run(*args, **kwargs)

    def patched_popen(*args, **kwargs):
        _ensure_utf8(kwargs, 'text')
        # universal_newlines is the legacy spelling of text=True.
        _ensure_utf8(kwargs, 'universal_newlines')
        return subprocess._original_popen(*args, **kwargs)

    subprocess.run = patched_run
    subprocess.Popen = patched_popen
    windows_logger.info("✅ Subprocess automatisch gepatcht für UTF-8 Encoding (run + Popen)")
# ===== GLOBALER SUBPROCESS-PATCH =====
def apply_global_subprocess_patch():
    """Apply the subprocess UTF-8 patch globally, including modules that
    imported subprocess before the patch was installed.

    The patched run/Popen are attributes of the shared subprocess module, so
    rebinding ``module.subprocess`` is mostly a belt-and-braces step; the log
    line documents which modules were touched.
    """
    import sys
    import subprocess

    # Patch the subprocess module itself first.
    patch_subprocess()

    # Fix: snapshot the module table before iterating - iterating
    # sys.modules.items() directly can raise RuntimeError when an import on
    # another thread (or triggered by a patched function) resizes the dict.
    for module_name, module in list(sys.modules.items()):
        if module is not None and getattr(module, 'subprocess', None) is subprocess:
            # Module uses the shared subprocess module - rebind it so it sees
            # the patched run/Popen attributes.
            module.subprocess = subprocess
            windows_logger.debug(f"✅ Subprocess in Modul {module_name} gepatcht")

    windows_logger.info("✅ Globaler subprocess-Patch angewendet")
def apply_all_windows_fixes():
    """Apply every Windows-specific fix exactly once (idempotent)."""
    global _windows_fixes_applied

    if _windows_fixes_applied:
        return

    try:
        windows_logger.info("🔧 Wende Windows-spezifische Fixes an...")
        # Order matters: encoding first, then threads/sockets, then signals,
        # finally the subprocess patches (module-local, then global).
        for fix in (apply_encoding_fixes,
                    apply_threading_fixes,
                    apply_signal_fixes,
                    patch_subprocess,
                    apply_global_subprocess_patch):
            fix()
        _windows_fixes_applied = True
        windows_logger.info("✅ Alle Windows-Fixes erfolgreich angewendet")
    except Exception as e:
        windows_logger.error(f"❌ Fehler beim Anwenden der Windows-Fixes: {str(e)}")
        raise e
# Automatisch Windows-Fixes beim Import anwenden (nur einmal)
if os.name == 'nt' and not _windows_fixes_applied:
try:
apply_all_windows_fixes()
except Exception as e:
windows_logger.warning(f"⚠️ Windows-Fixes konnten nicht automatisch angewendet werden: {str(e)}")