"""Monitoring service: system monitoring, performance metric collection, and alerting."""
|
||
|
||
from typing import Dict, Any, List, Optional
|
||
from datetime import datetime, timedelta
|
||
import asyncio
|
||
import logging
|
||
import time
|
||
from collections import defaultdict, deque
|
||
import psutil
|
||
import os
|
||
|
||
from sqlalchemy.orm import Session
|
||
from sqlalchemy import func
|
||
|
||
from app.models.models import AlgorithmCall, User, Algorithm
|
||
from app.services.service_manager import service_manager
|
||
|
||
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
||
|
||
|
||
class MetricsCollector:
    """Collect system and business metrics and keep a bounded in-memory history.

    Samples are stored per metric type ("system", "business") in deques capped
    at 1000 entries, so the oldest samples are discarded automatically.
    """

    def __init__(self):
        # One bounded buffer per metric type, created on first append.
        self.metrics_history = defaultdict(lambda: deque(maxlen=1000))
        # Reference point for the "uptime" value reported with system metrics.
        self.start_time = datetime.utcnow()

    def collect_system_metrics(self) -> Dict[str, Any]:
        """Sample CPU, memory, disk and process figures via psutil.

        Note: ``psutil.cpu_percent(interval=1)`` blocks the calling thread for
        about one second while it measures.

        Returns:
            The sample as a dict; the same dict is also appended to the
            "system" history.
        """
        cpu_percent = psutil.cpu_percent(interval=1)
        memory_info = psutil.virtual_memory()
        disk_usage = psutil.disk_usage('/')
        # Taken after sampling so the timestamp and uptime describe the same instant.
        now = datetime.utcnow()

        metrics = {
            "timestamp": now.isoformat(),
            "cpu_percent": cpu_percent,
            "memory_percent": memory_info.percent,
            "memory_available": memory_info.available,
            "memory_total": memory_info.total,
            "disk_percent": disk_usage.percent,
            "disk_free": disk_usage.free,
            "disk_total": disk_usage.total,
            "process_count": len(psutil.pids()),
            "uptime": (now - self.start_time).total_seconds(),
        }

        self.metrics_history['system'].append(metrics)
        return metrics

    def collect_business_metrics(self, db: Session) -> Dict[str, Any]:
        """Query call, user and algorithm counters from the database.

        Args:
            db: SQLAlchemy session used for all queries.

        Returns:
            ``{"timestamp": ..., "business": {...}}``; the counters live in the
            inner "business" dict. The same dict is appended to the "business"
            history.
        """
        now = datetime.utcnow()
        # Explicit midnight datetime rather than a bare date, so the comparison
        # against the datetime column does not rely on implicit date coercion.
        start_of_day = now.replace(hour=0, minute=0, second=0, microsecond=0)
        one_hour_ago = now - timedelta(hours=1)

        # Algorithm call counters.
        total_calls = db.query(func.count(AlgorithmCall.id)).scalar()
        today_calls = db.query(func.count(AlgorithmCall.id)).filter(
            AlgorithmCall.created_at >= start_of_day
        ).scalar()

        # User counters.
        total_users = db.query(func.count(User.id)).scalar()
        active_users = db.query(func.count(User.id)).filter(
            User.status == 'active'
        ).scalar()

        # Algorithm counters.
        total_algorithms = db.query(func.count(Algorithm.id)).scalar()
        active_algorithms = db.query(func.count(Algorithm.id)).filter(
            Algorithm.status == 'active'
        ).scalar()

        # Calls grouped by status.
        status_counts = db.query(
            AlgorithmCall.status, func.count(AlgorithmCall.id)
        ).group_by(AlgorithmCall.status).all()
        status_dict = {status: count for status, count in status_counts}

        # Mean response time over the last hour; None when there are no samples.
        recent_calls = db.query(AlgorithmCall.response_time).filter(
            AlgorithmCall.response_time.isnot(None),
            AlgorithmCall.created_at >= one_hour_ago
        ).all()
        response_times = [
            row.response_time for row in recent_calls
            if row.response_time is not None
        ]
        avg_response_time = (
            sum(response_times) / len(response_times) if response_times else None
        )

        metrics = {
            "timestamp": now.isoformat(),
            "business": {
                "total_calls": total_calls,
                "today_calls": today_calls,
                "total_users": total_users,
                "active_users": active_users,
                "total_algorithms": total_algorithms,
                "active_algorithms": active_algorithms,
                "call_status_counts": status_dict,
                "avg_response_time_recent_hour": avg_response_time,
            },
        }

        self.metrics_history['business'].append(metrics)
        return metrics

    def get_metric_history(self, metric_type: str, limit: int = 100) -> List[Dict[str, Any]]:
        """Return up to ``limit`` of the most recent samples, oldest first.

        Args:
            metric_type: History bucket to read, e.g. "system" or "business".
            limit: Maximum number of samples; zero or negative yields ``[]``.

        Returns:
            A new list; empty when the bucket is unknown or has no samples.
        """
        # Guard first: history[-0:] would return the whole list and a negative
        # limit would slice from the wrong end.
        if limit <= 0:
            return []
        # .get() instead of [] so that querying an unknown type does not make
        # the defaultdict create (and keep) an empty buffer for it.
        samples = self.metrics_history.get(metric_type)
        if not samples:
            return []
        return list(samples)[-limit:]

    def get_current_metrics(self, db: Session) -> Dict[str, Any]:
        """Collect a fresh system sample and a fresh business sample.

        Returns:
            ``{"system": <system sample>, "business": <business sample>}``.
        """
        return {
            "system": self.collect_system_metrics(),
            "business": self.collect_business_metrics(db),
        }
|
||
|
||
|
||
class AlertManager:
    """Evaluate threshold rules against metrics and track alert state.

    A rule fires once when its condition turns true and is cleared when the
    condition turns false again; fired alerts are also kept in a history
    capped at 1000 entries.
    """

    def __init__(self):
        self.alert_rules = []
        # Currently firing alerts, keyed by rule name.
        self.active_alerts = {}
        self.alert_history = deque(maxlen=1000)

    def add_alert_rule(self, name: str, condition_func, severity: str = "warning"):
        """Register a rule.

        Args:
            name: Rule name; also used as the key in ``active_alerts``.
            condition_func: Callable taking the metrics dict and returning a
                truthy value when the alert should fire.
            severity: Label copied into alerts produced by this rule.
        """
        self.alert_rules.append({
            "name": name,
            "condition": condition_func,
            "severity": severity,
            "triggered": False,
        })

    def check_alerts(self, metrics: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Evaluate every rule against ``metrics``.

        A rule whose condition raises is logged and skipped, leaving its
        state unchanged.

        Returns:
            Alerts that fired for the first time during this call.
        """
        triggered_alerts = []

        for rule in self.alert_rules:
            name = rule["name"]
            try:
                is_triggered = bool(rule["condition"](metrics))
            except Exception as exc:
                # logger.exception keeps the traceback, which a plain error
                # message would lose.
                logger.exception("Error checking alert rule %s: %s", name, exc)
                continue

            if is_triggered and not rule["triggered"]:
                # Transition: quiet -> firing.
                alert = {
                    "name": name,
                    "severity": rule["severity"],
                    "timestamp": datetime.utcnow().isoformat(),
                    "metrics": metrics,
                }
                self.active_alerts[name] = alert
                self.alert_history.append(alert)
                triggered_alerts.append(alert)
                rule["triggered"] = True
                logger.warning(f"Alert triggered: {name} - {alert}")
            elif rule["triggered"] and not is_triggered:
                # Transition: firing -> cleared.
                logger.info(f"Alert cleared: {name}")
                rule["triggered"] = False
                self.active_alerts.pop(name, None)

        return triggered_alerts

    def get_active_alerts(self) -> List[Dict[str, Any]]:
        """Return the currently firing alerts as a list."""
        return list(self.active_alerts.values())

    def get_alert_history(self, limit: int = 100) -> List[Dict[str, Any]]:
        """Return up to ``limit`` of the most recent alerts, oldest first.

        Zero or negative ``limit`` yields an empty list.
        """
        # Guard first: history[-0:] would return the whole list and a negative
        # limit would slice from the wrong end.
        if limit <= 0:
            return []
        return list(self.alert_history)[-limit:]
|
||
|
||
|
||
class MonitoringService:
    """Top-level monitoring facade: metric collection, alerting and health views."""

    def __init__(self):
        self.metrics_collector = MetricsCollector()
        self.alert_manager = AlertManager()
        self.monitoring_task = None
        self.is_monitoring = False

        # Register the built-in alert rules.
        self._setup_default_alerts()

    def _setup_default_alerts(self):
        """Register the default CPU, memory and failure-rate alert rules."""

        def cpu_high_condition(metrics):
            # Fires above 80% CPU.
            return metrics.get("system", {}).get("cpu_percent", 0) > 80

        def memory_high_condition(metrics):
            # Fires above 85% memory.
            return metrics.get("system", {}).get("memory_percent", 0) > 85

        def high_failure_rate_condition(metrics):
            business = metrics.get("business") or {}
            # The business counters sit one level down, under an inner
            # "business" key (the same nesting start_monitoring and
            # get_dashboard_data read from). Reading the outer dict directly
            # never finds "call_status_counts", so the rule could not fire.
            # A flat dict is still accepted as a fallback.
            counters = business.get("business", business)
            status_counts = counters.get("call_status_counts") or {}
            total = sum(status_counts.values())
            if total <= 0:
                return False
            # Fires when more than 10% of calls have status "failed".
            return status_counts.get("failed", 0) / total > 0.1

        self.alert_manager.add_alert_rule("High CPU Usage", cpu_high_condition, "warning")
        self.alert_manager.add_alert_rule("High Memory Usage", memory_high_condition, "warning")
        self.alert_manager.add_alert_rule("High Failure Rate", high_failure_rate_condition, "critical")

    async def start_monitoring(self, db: Session, interval: int = 60):
        """Run the collect/check loop until ``stop_monitoring`` is called.

        Args:
            db: SQLAlchemy session passed to the metrics collector each cycle.
            interval: Seconds to sleep between cycles.

        NOTE(review): metric collection is synchronous, so each cycle runs on
        the event loop thread. Offloading it would move ``db`` to another
        thread; confirm that is safe for this session before changing it.
        """
        if self.is_monitoring:
            logger.warning("Monitoring already started")
            return

        self.is_monitoring = True
        logger.info("Starting monitoring...")

        try:
            while self.is_monitoring:
                try:
                    metrics = self.metrics_collector.get_current_metrics(db)
                    self.alert_manager.check_alerts(metrics)
                    logger.info(
                        "Collected metrics - CPU: %.1f%%, Memory: %.1f%%, Total calls: %s",
                        metrics['system']['cpu_percent'],
                        metrics['system']['memory_percent'],
                        metrics['business']['business']['total_calls'],
                    )
                except Exception as exc:
                    # Keep the loop alive; one bad cycle must not end monitoring.
                    logger.exception("Error in monitoring loop: %s", exc)

                await asyncio.sleep(interval)
        finally:
            # Also reached on task cancellation, which `except Exception` does
            # not catch; without this reset the service could never restart.
            self.is_monitoring = False

    async def stop_monitoring(self):
        """Ask the monitoring loop to stop; it exits at its next loop check."""
        self.is_monitoring = False
        logger.info("Monitoring stopped")

    def _build_health_report(self, system_metrics: Dict[str, Any]) -> Dict[str, Any]:
        """Classify an existing system sample as healthy, warning or critical."""
        cpu = system_metrics["cpu_percent"]
        memory = system_metrics["memory_percent"]

        if cpu > 90 or memory > 95:
            health_status = "critical"
        elif cpu > 80 or memory > 85:
            health_status = "warning"
        else:
            health_status = "healthy"

        return {
            "status": health_status,
            "timestamp": datetime.utcnow().isoformat(),
            "system_metrics": system_metrics,
            "active_alerts": len(self.alert_manager.active_alerts),
            "uptime": system_metrics["uptime"],
        }

    def get_system_health(self) -> Dict[str, Any]:
        """Take a fresh system sample and return its health report.

        Returns:
            Dict with "status", "timestamp", "system_metrics", "active_alerts"
            (count) and "uptime".
        """
        return self._build_health_report(
            self.metrics_collector.collect_system_metrics()
        )

    def get_dashboard_data(self, db: Session) -> Dict[str, Any]:
        """Assemble the dashboard payload from one round of metric collection.

        Args:
            db: SQLAlchemy session passed to the metrics collector.
        """
        current_metrics = self.metrics_collector.get_current_metrics(db)
        active_alerts = self.alert_manager.get_active_alerts()

        return {
            "timestamp": datetime.utcnow().isoformat(),
            "system": current_metrics["system"],
            "business": current_metrics["business"]["business"],
            "active_alerts_count": len(active_alerts),
            "recent_alerts": active_alerts[-5:],  # last five active alerts
            # Reuse the sample collected above; sampling again would append a
            # second history entry and repeat the collection work.
            "system_health": self._build_health_report(current_metrics["system"]),
        }
|
||
|
||
|
||
# Module-level MonitoringService instance, created when this module is imported.
monitoring_service = MonitoringService()