Projektarbeit-MYP/backend/monitoring.py

330 lines
10 KiB
Python

"""
Monitoring und Health Check Module für die MYP Flask-Anwendung.
Bietet Endpunkte für Systemüberwachung und Performance-Metriken.
"""
from flask import Blueprint, jsonify, current_app
import psutil
import os
import sqlite3
import datetime
import threading
import time
from collections import defaultdict
# Blueprint that groups every monitoring endpoint under the /monitoring prefix.
monitoring_bp = Blueprint('monitoring', __name__, url_prefix='/monitoring')

# In-process metrics store. It is mutated by the record_*/update_* helpers in
# this module and read by the /metrics and /info endpoints.
metrics = dict(
    requests_total=defaultdict(int),       # "<METHOD>_<endpoint>" -> request count
    request_duration=defaultdict(list),    # "<METHOD>_<endpoint>" -> list of durations
    database_queries=0,                    # running count of recorded DB queries
    active_jobs=0,                         # last value passed to update_active_jobs()
    error_count=defaultdict(int),          # str(status_code) -> count, for codes >= 400
    startup_time=datetime.datetime.now(),  # naive local time of module import
)
class HealthCheck:
    """Static health checks for the individual system components.

    Every check returns a dict containing at least a ``status`` key. Checks
    never raise: any exception is caught and reported as
    ``{'status': 'unhealthy', 'message': ...}``.
    """

    @staticmethod
    def check_database():
        """Verify that the configured SQLite database can be opened and queried.

        Returns:
            dict: ``status`` and ``message``; on success additionally
            ``database_path`` and ``database_size_bytes``.
        """
        try:
            db_path = current_app.config.get('DATABASE', 'instance/myp.db')

            # In-memory database (used for tests): there is no file to inspect.
            if db_path == ':memory:':
                return {'status': 'healthy', 'message': 'In-Memory-Datenbank aktiv'}

            # sqlite3.connect() would create a missing file, so the existence
            # check has to happen before the connection attempt.
            if not os.path.exists(db_path):
                return {'status': 'unhealthy', 'message': 'Datenbankdatei nicht gefunden'}

            # Run a trivial query; always release the connection, even when
            # the query fails (e.g. locked or corrupt database).
            conn = sqlite3.connect(db_path, timeout=5)
            try:
                cursor = conn.cursor()
                cursor.execute('SELECT 1')
            finally:
                conn.close()

            db_size = os.path.getsize(db_path)
            return {
                'status': 'healthy',
                'message': 'Datenbankverbindung erfolgreich',
                'database_path': db_path,
                'database_size_bytes': db_size
            }
        except Exception as e:
            return {
                'status': 'unhealthy',
                'message': f'Datenbankfehler: {str(e)}'
            }

    @staticmethod
    def check_disk_space():
        """Report disk usage of the filesystem holding the working directory.

        Returns:
            dict: ``status`` ('healthy', 'warning' above 80 % used,
            'critical' above 90 % used) plus ``free_gb``, ``total_gb`` and
            ``used_percent``.
        """
        try:
            disk_usage = psutil.disk_usage('.')
            free_gb = disk_usage.free / (1024**3)
            total_gb = disk_usage.total / (1024**3)
            used_percent = (disk_usage.used / disk_usage.total) * 100

            status = 'healthy'
            if used_percent > 90:
                status = 'critical'
            elif used_percent > 80:
                status = 'warning'

            return {
                'status': status,
                'free_gb': round(free_gb, 2),
                'total_gb': round(total_gb, 2),
                'used_percent': round(used_percent, 2)
            }
        except Exception as e:
            return {
                'status': 'unhealthy',
                'message': f'Festplattenfehler: {str(e)}'
            }

    @staticmethod
    def check_memory():
        """Report system memory usage.

        Returns:
            dict: ``status`` ('healthy', 'warning' above 80 % used,
            'critical' above 90 % used) plus ``total_gb``, ``available_gb``
            and ``used_percent``.
        """
        try:
            memory = psutil.virtual_memory()

            status = 'healthy'
            if memory.percent > 90:
                status = 'critical'
            elif memory.percent > 80:
                status = 'warning'

            return {
                'status': status,
                'total_gb': round(memory.total / (1024**3), 2),
                'available_gb': round(memory.available / (1024**3), 2),
                'used_percent': round(memory.percent, 2)
            }
        except Exception as e:
            return {
                'status': 'unhealthy',
                'message': f'Speicherfehler: {str(e)}'
            }

    @staticmethod
    def check_background_threads():
        """Report the live threads and whether a job-checker thread is among them.

        A thread counts as the job checker when its name contains
        ``'job_checker'``.

        Returns:
            dict: ``status`` ('healthy' if a job-checker thread is alive,
            otherwise 'warning'), ``job_checker_running``,
            ``active_threads`` (thread names) and ``thread_count``.
        """
        try:
            active_threads = [t.name for t in threading.enumerate() if t.is_alive()]
            job_checker_running = any('job_checker' in name for name in active_threads)
            return {
                'status': 'healthy' if job_checker_running else 'warning',
                'job_checker_running': job_checker_running,
                'active_threads': active_threads,
                'thread_count': len(active_threads)
            }
        except Exception as e:
            return {
                'status': 'unhealthy',
                'message': f'Thread-Fehler: {str(e)}'
            }
@monitoring_bp.route('/health')
def health_check():
    """Run every component check and aggregate the results.

    The overall status is 'unhealthy' if any check is 'unhealthy',
    'degraded' if any check is 'warning' or 'critical', and 'healthy'
    otherwise.

    Returns:
        tuple: JSON response with ``status``, ``timestamp`` and ``checks``,
        and the HTTP status code (200 when healthy, 503 otherwise).
    """
    checks = {
        'database': HealthCheck.check_database(),
        'disk_space': HealthCheck.check_disk_space(),
        'memory': HealthCheck.check_memory(),
        'background_threads': HealthCheck.check_background_threads()
    }

    # Derive the overall status; 'unhealthy' outranks 'degraded'.
    statuses = [result['status'] for result in checks.values()]
    if 'unhealthy' in statuses:
        overall_status = 'unhealthy'
    elif any(state in ('warning', 'critical') for state in statuses):
        overall_status = 'degraded'
    else:
        overall_status = 'healthy'

    payload = {
        'status': overall_status,
        'timestamp': datetime.datetime.now().isoformat(),
        'checks': checks
    }
    http_code = 503 if overall_status != 'healthy' else 200
    return jsonify(payload), http_code
@monitoring_bp.route('/health/simple')
def simple_health_check():
    """Lightweight liveness probe intended for load balancers.

    Performs no component checks.

    Returns:
        JSON response with a constant ``status`` of 'ok' and the current
        ``timestamp``.
    """
    payload = {
        'status': 'ok',
        'timestamp': datetime.datetime.now().isoformat(),
    }
    return jsonify(payload)
@monitoring_bp.route('/metrics')
def get_metrics():
    """Collect system and application performance metrics.

    Returns:
        JSON response with a ``system`` section (CPU, memory, disk, uptime)
        and an ``application`` section (counters from the module-level
        ``metrics`` store). On any failure the error is logged and a JSON
        error body with HTTP 500 is returned.
    """
    try:
        # System figures. cpu_percent(interval=1) samples over one second,
        # so this call blocks for that long.
        cpu_load = psutil.cpu_percent(interval=1)
        mem = psutil.virtual_memory()
        disk_stats = psutil.disk_usage('.')

        # Time elapsed since the metrics store was created.
        started_at = metrics['startup_time']
        elapsed = datetime.datetime.now() - started_at

        system_section = {
            'cpu_percent': cpu_load,
            'memory_percent': mem.percent,
            'disk_percent': (disk_stats.used / disk_stats.total) * 100,
            'uptime_seconds': elapsed.total_seconds()
        }
        # defaultdicts are copied into plain dicts for the response.
        application_section = {
            'requests_total': dict(metrics['requests_total']),
            'database_queries_total': metrics['database_queries'],
            'active_jobs': metrics['active_jobs'],
            'error_count': dict(metrics['error_count']),
            'startup_time': metrics['startup_time'].isoformat()
        }
        return jsonify({
            'system': system_section,
            'application': application_section
        })
    except Exception as e:
        current_app.logger.error(f"Fehler beim Sammeln der Metriken: {e}")
        return jsonify({'error': 'Metriken nicht verfügbar'}), 500
@monitoring_bp.route('/info')
def get_info():
    """Return general information about the running application.

    Returns:
        JSON response with application name, version, environment, debug
        flag, startup time, Python version and selected configuration
        values.
    """
    # Use the public ``sys`` module directly; ``os.sys`` only works because
    # ``os`` happens to import ``sys`` internally.
    import sys

    # NOTE(review): this response includes the database path and the debug
    # flag - confirm the endpoint is not reachable by untrusted clients.
    return jsonify({
        'application': 'MYP Backend',
        'version': '2.0.0',
        'flask_env': current_app.config.get('FLASK_ENV', 'unknown'),
        'debug': current_app.debug,
        'startup_time': metrics['startup_time'].isoformat(),
        'python_version': sys.version,
        'config': {
            'database': current_app.config.get('DATABASE'),
            'job_check_interval': current_app.config.get('JOB_CHECK_INTERVAL'),
            'security_enabled': current_app.config.get('SECURITY_ENABLED', False),
            'rate_limit_enabled': current_app.config.get('RATE_LIMIT_ENABLED', False)
        }
    })
def record_request_metric(endpoint, method, status_code, duration, max_samples=1000):
    """Record the metrics of one handled request in the module-level store.

    Args:
        endpoint: API endpoint name.
        method: HTTP method.
        status_code: HTTP status code of the response.
        duration: Request duration in seconds.
        max_samples: Maximum number of duration samples kept per
            method/endpoint key; the oldest samples are dropped first.
            Pass ``None`` to keep every sample (unbounded growth).
    """
    key = f"{method}_{endpoint}"
    metrics['requests_total'][key] += 1

    durations = metrics['request_duration'][key]
    durations.append(duration)
    # Cap the history so a long-running process does not accumulate one
    # float per request forever.
    if max_samples is not None:
        overflow = len(durations) - max_samples
        if overflow > 0:
            del durations[:overflow]

    # Client and server errors are counted per status code.
    if status_code >= 400:
        metrics['error_count'][str(status_code)] += 1
def record_database_query():
    """Increment the database query counter in the metrics store by one."""
    metrics['database_queries'] = metrics['database_queries'] + 1
def update_active_jobs(count):
    """Store the current number of active jobs in the metrics store.

    Args:
        count: Number of active jobs; replaces the previous value.
    """
    metrics.update({'active_jobs': count})
class RequestMetricsMiddleware:
    """Flask extension-style helper that records metrics for every request."""

    def __init__(self, app=None):
        """Remember ``app`` and, when one is given, register the hooks on it."""
        self.app = app
        if app is None:
            return
        self.init_app(app)

    def init_app(self, app):
        """Register the before/after request hooks on the given Flask app."""
        app.before_request(self.before_request)
        app.after_request(self.after_request)

    def before_request(self):
        """Stamp the request start time onto Flask's ``g`` object."""
        from flask import g

        g.start_time = time.time()

    def after_request(self, response):
        """Record method, endpoint, status and duration of the finished request.

        Nothing is recorded when no start time was stamped. The response is
        always returned unchanged.
        """
        from flask import g, request

        if not hasattr(g, 'start_time'):
            return response

        elapsed = time.time() - g.start_time
        endpoint_name = request.endpoint or 'unknown'
        record_request_metric(
            endpoint_name,
            request.method,
            response.status_code,
            elapsed
        )
        return response
# Global middleware instance; it is created without an app, so no request
# hooks are registered until init_app(app) is called on it.
request_metrics = RequestMetricsMiddleware()