first commit
This commit is contained in:
284
backend/app/services/monitoring.py
Normal file
284
backend/app/services/monitoring.py
Normal file
@@ -0,0 +1,284 @@
|
||||
"""监控服务,负责系统监控、性能指标收集和告警"""
|
||||
|
||||
from typing import Dict, Any, List, Optional
|
||||
from datetime import datetime, timedelta
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from collections import defaultdict, deque
|
||||
import psutil
|
||||
import os
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy import func
|
||||
|
||||
from app.models.models import AlgorithmCall, User, Algorithm
|
||||
from app.services.service_manager import service_manager
|
||||
|
||||
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MetricsCollector:
    """Collect system and business metrics and keep a bounded history of them.

    Every sample produced by the ``collect_*`` methods is appended to
    ``metrics_history`` under the key ``"system"`` or ``"business"``; each
    key keeps only its most recent 1000 samples.
    """

    def __init__(self):
        # One deque per metric type, created on first append; ``maxlen`` drops
        # the oldest sample once 1000 are stored.
        self.metrics_history = defaultdict(lambda: deque(maxlen=1000))
        # Reference point for the reported "uptime" (collector creation time).
        self.start_time = datetime.utcnow()

    def collect_system_metrics(self, cpu_interval: Optional[float] = 1) -> Dict[str, Any]:
        """Sample host CPU, memory, disk and process statistics via psutil.

        Args:
            cpu_interval: Value forwarded to ``psutil.cpu_percent(interval=...)``.
                With the default of 1 the call waits for that many seconds
                while measuring; pass ``None`` or ``0`` for a non-blocking
                reading (see the psutil documentation for its semantics).

        Returns:
            A flat dict with an ISO timestamp, CPU/memory/disk figures for the
            ``/`` mount, the number of PIDs, and the seconds elapsed since
            this collector was created. The same dict is appended to
            ``metrics_history['system']``.
        """
        cpu_percent = psutil.cpu_percent(interval=cpu_interval)
        memory_info = psutil.virtual_memory()
        disk_usage = psutil.disk_usage('/')
        # Taken after sampling so timestamp and uptime share one clock reading.
        now = datetime.utcnow()

        metrics = {
            "timestamp": now.isoformat(),
            "cpu_percent": cpu_percent,
            "memory_percent": memory_info.percent,
            "memory_available": memory_info.available,
            "memory_total": memory_info.total,
            "disk_percent": disk_usage.percent,
            "disk_free": disk_usage.free,
            "disk_total": disk_usage.total,
            "process_count": len(psutil.pids()),
            "uptime": (now - self.start_time).total_seconds(),
        }

        self.metrics_history['system'].append(metrics)
        return metrics

    def collect_business_metrics(self, db: Session) -> Dict[str, Any]:
        """Query call, user and algorithm counts from the database.

        Args:
            db: SQLAlchemy session used for every query below.

        Returns:
            ``{"timestamp": <iso str>, "business": {...}}`` where the inner
            dict holds the counters, the per-status call counts and the mean
            ``response_time`` of calls created in the last hour (``None``
            when there are none). The same dict is appended to
            ``metrics_history['business']``.
        """
        now = datetime.utcnow()
        # An explicit midnight datetime is bound instead of a ``date`` so the
        # comparison does not depend on how the backend coerces date objects.
        # NOTE(review): assumes created_at stores naive UTC datetimes - confirm.
        day_start = now.replace(hour=0, minute=0, second=0, microsecond=0)
        hour_ago = now - timedelta(hours=1)

        # Call statistics.
        total_calls = db.query(func.count(AlgorithmCall.id)).scalar()
        today_calls = (
            db.query(func.count(AlgorithmCall.id))
            .filter(AlgorithmCall.created_at >= day_start)
            .scalar()
        )

        # User statistics.
        total_users = db.query(func.count(User.id)).scalar()
        active_users = (
            db.query(func.count(User.id)).filter(User.status == 'active').scalar()
        )

        # Algorithm statistics.
        total_algorithms = db.query(func.count(Algorithm.id)).scalar()
        active_algorithms = (
            db.query(func.count(Algorithm.id))
            .filter(Algorithm.status == 'active')
            .scalar()
        )

        # Calls grouped by status.
        status_counts = (
            db.query(AlgorithmCall.status, func.count(AlgorithmCall.id))
            .group_by(AlgorithmCall.status)
            .all()
        )
        status_dict = {status: count for status, count in status_counts}

        # Mean response time over the last hour.
        recent_calls = (
            db.query(AlgorithmCall.response_time)
            .filter(
                AlgorithmCall.response_time.isnot(None),
                AlgorithmCall.created_at >= hour_ago,
            )
            .all()
        )
        response_times = [
            row.response_time for row in recent_calls if row.response_time is not None
        ]
        avg_response_time = (
            sum(response_times) / len(response_times) if response_times else None
        )

        metrics = {
            "timestamp": datetime.utcnow().isoformat(),
            "business": {
                "total_calls": total_calls,
                "today_calls": today_calls,
                "total_users": total_users,
                "active_users": active_users,
                "total_algorithms": total_algorithms,
                "active_algorithms": active_algorithms,
                "call_status_counts": status_dict,
                "avg_response_time_recent_hour": avg_response_time,
            },
        }

        self.metrics_history['business'].append(metrics)
        return metrics

    def get_metric_history(self, metric_type: str, limit: int = 100) -> List[Dict[str, Any]]:
        """Return up to ``limit`` of the newest samples for ``metric_type``.

        Args:
            metric_type: History key, e.g. ``"system"`` or ``"business"``.
            limit: Maximum number of samples to return; values ``<= 0``
                yield an empty list.

        Returns:
            A new list ordered oldest to newest. Unknown metric types return
            an empty list and are not added to ``metrics_history``.
        """
        if limit <= 0:
            # ``history[-0:]`` would be the whole list, so guard explicitly.
            return []
        # ``.get`` avoids the defaultdict inserting a deque for unknown keys.
        samples = self.metrics_history.get(metric_type)
        if not samples:
            return []
        return list(samples)[-limit:]

    def get_current_metrics(self, db: Session) -> Dict[str, Any]:
        """Collect and return fresh system and business metrics.

        Returns:
            ``{"system": <collect_system_metrics()>,
            "business": <collect_business_metrics(db)>}``. Note that the
            business counters therefore live under ``["business"]["business"]``.
        """
        return {
            "system": self.collect_system_metrics(),
            "business": self.collect_business_metrics(db),
        }
|
||||
|
||||
|
||||
class AlertManager:
    """Evaluate threshold rules against metrics and track alert state.

    Rules are plain dicts holding a name, a predicate, a severity and a
    ``triggered`` flag. An alert is emitted only on the transition from
    not-triggered to triggered, and is removed from ``active_alerts`` on the
    reverse transition.
    """

    def __init__(self):
        self.alert_rules = []
        # Currently firing alerts, keyed by rule name.
        self.active_alerts = {}
        # Every alert ever raised; only the newest 1000 are kept.
        self.alert_history = deque(maxlen=1000)

    def add_alert_rule(self, name: str, condition_func, severity: str = "warning"):
        """Register a rule.

        Args:
            name: Rule name; also the key used in ``active_alerts``.
            condition_func: Callable taking the metrics dict and returning a
                truthy value when the alert condition holds.
            severity: Label copied into every alert raised by this rule.
        """
        self.alert_rules.append(
            {
                "name": name,
                "condition": condition_func,
                "severity": severity,
                "triggered": False,
            }
        )

    def check_alerts(self, metrics: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Evaluate every rule against ``metrics``.

        Args:
            metrics: Dict passed unchanged to each rule's condition and
                stored (by reference) in any alert that fires.

        Returns:
            The alerts that fired for the first time during this call. Rules
            that were already triggered and still hold produce nothing. A
            rule whose condition raises is logged and skipped, leaving its
            state untouched.
        """
        triggered_alerts = []

        for rule in self.alert_rules:
            name = rule["name"]
            try:
                is_triggered = rule["condition"](metrics)
            except Exception as e:
                # Best effort: one faulty rule must not stop the others.
                logger.exception("Error checking alert rule %s: %s", name, e)
                continue

            if is_triggered and not rule["triggered"]:
                # First time this rule fires.
                alert = {
                    "name": name,
                    "severity": rule["severity"],
                    "timestamp": datetime.utcnow().isoformat(),
                    "metrics": metrics,
                }
                self.active_alerts[name] = alert
                self.alert_history.append(alert)
                triggered_alerts.append(alert)
                rule["triggered"] = True
                logger.warning("Alert triggered: %s - %s", name, alert)
            elif not is_triggered and rule["triggered"]:
                # Condition no longer holds: clear the alert.
                logger.info("Alert cleared: %s", name)
                rule["triggered"] = False
                self.active_alerts.pop(name, None)

        return triggered_alerts

    def get_active_alerts(self) -> List[Dict[str, Any]]:
        """Return the currently firing alerts as a new list."""
        return list(self.active_alerts.values())

    def get_alert_history(self, limit: int = 100) -> List[Dict[str, Any]]:
        """Return up to ``limit`` of the most recently raised alerts.

        Args:
            limit: Maximum number of alerts to return; values ``<= 0`` yield
                an empty list.

        Returns:
            A new list ordered oldest to newest.
        """
        if limit <= 0:
            # ``history[-0:]`` would be the whole list, so guard explicitly.
            return []
        return list(self.alert_history)[-limit:]
|
||||
|
||||
|
||||
class MonitoringService:
    """Combine a MetricsCollector and an AlertManager into one service.

    Provides a polling coroutine that collects metrics and evaluates alert
    rules, plus read helpers for health and dashboard views.
    """

    def __init__(self):
        self.metrics_collector = MetricsCollector()
        self.alert_manager = AlertManager()
        # Placeholder for a task handle; nothing in this class assigns it.
        self.monitoring_task = None
        self.is_monitoring = False

        # Register the built-in alert rules.
        self._setup_default_alerts()

    @staticmethod
    def _failure_rate(metrics: Dict[str, Any]) -> float:
        """Return failed calls divided by all calls from ``metrics``.

        The polling loop and ``get_dashboard_data`` both read the business
        counters at ``metrics["business"]["business"]``, so that nested
        shape is looked up first; a flat ``metrics["business"]`` dict that
        carries ``call_status_counts`` directly is accepted as well.
        Returns ``0.0`` when no calls are recorded.
        """
        business = metrics.get("business") or {}
        nested = business.get("business")
        if isinstance(nested, dict):
            business = nested
        status_counts = business.get("call_status_counts") or {}
        total = sum(status_counts.values())
        if total <= 0:
            return 0.0
        return status_counts.get("failed", 0) / total

    @staticmethod
    def _classify_health(cpu_percent: float, memory_percent: float) -> str:
        """Map CPU and memory usage percentages to a health label."""
        if cpu_percent > 90 or memory_percent > 95:
            return "critical"
        if cpu_percent > 80 or memory_percent > 85:
            return "warning"
        return "healthy"

    def _health_report(self, system_metrics: Dict[str, Any]) -> Dict[str, Any]:
        """Build the health dict from an already collected system sample."""
        return {
            "status": self._classify_health(
                system_metrics["cpu_percent"], system_metrics["memory_percent"]
            ),
            "timestamp": datetime.utcnow().isoformat(),
            "system_metrics": system_metrics,
            "active_alerts": len(self.alert_manager.active_alerts),
            "uptime": system_metrics["uptime"],
        }

    def _setup_default_alerts(self):
        """Register the default CPU, memory and failure-rate alert rules."""

        # CPU usage above 80%.
        def cpu_high_condition(metrics):
            return metrics.get("system", {}).get("cpu_percent", 0) > 80

        # Memory usage above 85%.
        def memory_high_condition(metrics):
            return metrics.get("system", {}).get("memory_percent", 0) > 85

        # More than 10% of recorded calls have status "failed".
        def high_failure_rate_condition(metrics):
            return self._failure_rate(metrics) > 0.1

        self.alert_manager.add_alert_rule("High CPU Usage", cpu_high_condition, "warning")
        self.alert_manager.add_alert_rule("High Memory Usage", memory_high_condition, "warning")
        self.alert_manager.add_alert_rule("High Failure Rate", high_failure_rate_condition, "critical")

    async def start_monitoring(self, db: Session, interval: int = 60):
        """Run the collect / check / log loop until monitoring is stopped.

        Args:
            db: SQLAlchemy session reused for every iteration.
            interval: Seconds to sleep between iterations.

        Returns immediately (with a warning) if a loop is already running.
        Errors inside one iteration are logged and the loop continues. When
        the coroutine exits for any reason, including cancellation,
        ``is_monitoring`` is reset so monitoring can be started again.

        NOTE(review): metric collection is called synchronously here, so it
        runs on the event loop's thread; consider an executor if that
        matters for the host application.
        """
        if self.is_monitoring:
            logger.warning("Monitoring already started")
            return

        self.is_monitoring = True
        logger.info("Starting monitoring...")

        try:
            while self.is_monitoring:
                try:
                    # Collect metrics, then evaluate alert rules (which log
                    # their own transitions).
                    metrics = self.metrics_collector.get_current_metrics(db)
                    self.alert_manager.check_alerts(metrics)

                    logger.info(f"Collected metrics - CPU: {metrics['system']['cpu_percent']:.1f}%, "
                                f"Memory: {metrics['system']['memory_percent']:.1f}%, "
                                f"Total calls: {metrics['business']['business']['total_calls']}")
                except Exception as e:
                    logger.exception(f"Error in monitoring loop: {str(e)}")

                # Kept outside the handler above so cancellation raised by
                # the sleep always propagates.
                await asyncio.sleep(interval)
        finally:
            self.is_monitoring = False

    async def stop_monitoring(self):
        """Ask the polling loop to exit after its current sleep finishes."""
        self.is_monitoring = False
        logger.info("Monitoring stopped")

    def get_system_health(self) -> Dict[str, Any]:
        """Collect a fresh system sample and classify it.

        Returns:
            Dict with ``status`` (``"healthy"``, ``"warning"`` or
            ``"critical"``), an ISO ``timestamp``, the raw
            ``system_metrics``, the number of ``active_alerts`` and
            ``uptime`` in seconds.
        """
        return self._health_report(self.metrics_collector.collect_system_metrics())

    def get_dashboard_data(self, db: Session) -> Dict[str, Any]:
        """Assemble metrics, alerts and health for a dashboard view.

        Args:
            db: SQLAlchemy session used for the business metric queries.

        Returns:
            Dict with the system sample, the inner business counters, the
            active alert count, up to five active alerts and a health
            report. The health report is derived from the same system
            sample rather than collecting a second one.
        """
        current_metrics = self.metrics_collector.get_current_metrics(db)
        active_alerts = self.alert_manager.get_active_alerts()
        system_metrics = current_metrics["system"]

        return {
            "timestamp": datetime.utcnow().isoformat(),
            "system": system_metrics,
            "business": current_metrics["business"]["business"],
            "active_alerts_count": len(active_alerts),
            "recent_alerts": active_alerts[-5:],  # last five active alerts
            "system_health": self._health_report(system_metrics),
        }
|
||||
|
||||
|
||||
# Global monitoring service instance, created when this module is imported.
monitoring_service = MonitoringService()
|
||||
Reference in New Issue
Block a user