# algorithm/backend/app/services/monitoring.py

"""监控服务，负责系统监控、性能指标收集和告警"""

from typing import Dict, Any, List, Optional
from datetime import datetime, timedelta
import asyncio
import logging
import time
from collections import defaultdict, deque
import psutil
import os

from sqlalchemy.orm import Session
from sqlalchemy import func

from app.models.models import AlgorithmCall, User, Algorithm
from app.services.service_manager import service_manager

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


class MetricsCollector:
    """Collect system and business metrics and keep a bounded history of them.

    Snapshots are stored per metric type ("system", "business") in deques
    capped at 1000 entries, so the oldest snapshots are discarded first.
    """

    def __init__(self):
        # One bounded deque per metric type (keeps the latest 1000 snapshots).
        self.metrics_history = defaultdict(lambda: deque(maxlen=1000))
        self.start_time = datetime.utcnow()

    def collect_system_metrics(self) -> Dict[str, Any]:
        """Sample host metrics via psutil and append them to the history.

        Returns:
            A dict with CPU, memory and disk figures for ``/``, the number of
            process IDs, an ISO-formatted UTC timestamp, and the seconds
            elapsed since this collector was created (``uptime``).
        """
        # NOTE: with interval=1 psutil blocks for about one second while it
        # measures CPU utilisation, so this call is slow by design.
        cpu_percent = psutil.cpu_percent(interval=1)
        memory_info = psutil.virtual_memory()
        disk_usage = psutil.disk_usage('/')

        # Single clock read so "timestamp" and "uptime" describe the same instant.
        now = datetime.utcnow()

        metrics = {
            "timestamp": now.isoformat(),
            "cpu_percent": cpu_percent,
            "memory_percent": memory_info.percent,
            "memory_available": memory_info.available,
            "memory_total": memory_info.total,
            "disk_percent": disk_usage.percent,
            "disk_free": disk_usage.free,
            "disk_total": disk_usage.total,
            "process_count": len(psutil.pids()),
            "uptime": (now - self.start_time).total_seconds(),
        }

        self.metrics_history['system'].append(metrics)
        return metrics

    def collect_business_metrics(self, db: Session) -> Dict[str, Any]:
        """Query call/user/algorithm statistics and append them to the history.

        Args:
            db: SQLAlchemy session used for all queries.

        Returns:
            ``{"timestamp": ..., "business": {...}}`` where the inner dict
            holds the counters, the per-status call counts and the average
            response time of the last hour (``None`` when there is no data).
        """
        now = datetime.utcnow()
        # Explicit midnight datetime: avoids relying on the database driver to
        # coerce a bare ``date`` when comparing against a datetime column.
        start_of_today = now.replace(hour=0, minute=0, second=0, microsecond=0)
        one_hour_ago = now - timedelta(hours=1)

        # Algorithm call counters.
        total_calls = db.query(func.count(AlgorithmCall.id)).scalar()
        today_calls = db.query(func.count(AlgorithmCall.id)).filter(
            AlgorithmCall.created_at >= start_of_today
        ).scalar()

        # User counters.
        total_users = db.query(func.count(User.id)).scalar()
        active_users = db.query(func.count(User.id)).filter(
            User.status == 'active'
        ).scalar()

        # Algorithm counters.
        total_algorithms = db.query(func.count(Algorithm.id)).scalar()
        active_algorithms = db.query(func.count(Algorithm.id)).filter(
            Algorithm.status == 'active'
        ).scalar()

        # Calls grouped by status.
        status_rows = db.query(
            AlgorithmCall.status, func.count(AlgorithmCall.id)
        ).group_by(AlgorithmCall.status).all()
        status_dict = {status: count for status, count in status_rows}

        # Average response time over the last hour.
        recent_calls = db.query(AlgorithmCall.response_time).filter(
            AlgorithmCall.response_time.isnot(None),
            AlgorithmCall.created_at >= one_hour_ago
        ).all()
        response_times = [
            row.response_time for row in recent_calls
            if row.response_time is not None
        ]
        avg_response_time = (
            sum(response_times) / len(response_times) if response_times else None
        )

        metrics = {
            "timestamp": datetime.utcnow().isoformat(),
            "business": {
                "total_calls": total_calls,
                "today_calls": today_calls,
                "total_users": total_users,
                "active_users": active_users,
                "total_algorithms": total_algorithms,
                "active_algorithms": active_algorithms,
                "call_status_counts": status_dict,
                "avg_response_time_recent_hour": avg_response_time,
            },
        }

        self.metrics_history['business'].append(metrics)
        return metrics

    def get_metric_history(self, metric_type: str, limit: int = 100) -> List[Dict[str, Any]]:
        """Return up to ``limit`` of the most recent snapshots, oldest first.

        Args:
            metric_type: History key, e.g. ``"system"`` or ``"business"``.
            limit: Maximum number of snapshots; zero or negative yields ``[]``.

        Returns:
            A new list; empty when nothing has been recorded for the type.
        """
        # ``history[-0:]`` would be the whole list, so reject non-positive
        # limits explicitly instead of slicing with them.
        if limit <= 0:
            return []
        # ``.get`` does not invoke the defaultdict factory, so querying an
        # unknown metric type no longer leaves an empty deque behind.
        history = self.metrics_history.get(metric_type)
        if not history:
            return []
        return list(history)[-limit:]

    def get_current_metrics(self, db: Session) -> Dict[str, Any]:
        """Collect and return fresh system and business metrics.

        Note that the ``"business"`` value is the full business snapshot, so
        the counters are found under ``result["business"]["business"]``.
        """
        return {
            "system": self.collect_system_metrics(),
            "business": self.collect_business_metrics(db),
        }


class AlertManager:
    """Evaluate threshold rules against metrics and track alert state.

    A rule fires once when its condition becomes true and is cleared when the
    condition becomes false again. The last 1000 fired alerts are kept in
    ``alert_history``.
    """

    def __init__(self):
        self.alert_rules = []
        # Currently firing alerts, keyed by rule name.
        self.active_alerts = {}
        self.alert_history = deque(maxlen=1000)

    def add_alert_rule(self, name: str, condition_func, severity: str = "warning"):
        """Register an alert rule.

        Args:
            name: Rule name; also used as the key in ``active_alerts``.
            condition_func: Callable taking the metrics dict and returning a
                truthy value when the alert should be firing.
            severity: Severity label copied into fired alerts.
        """
        self.alert_rules.append({
            "name": name,
            "condition": condition_func,
            "severity": severity,
            "triggered": False,
        })

    def check_alerts(self, metrics: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Evaluate every rule and return the alerts that newly fired.

        A rule whose condition raises is logged and skipped; its triggered
        state is left untouched.

        Args:
            metrics: Metrics dict handed to each rule's condition.

        Returns:
            Alerts that transitioned from not-triggered to triggered during
            this call (already-active alerts are not repeated).
        """
        triggered_alerts = []

        for rule in self.alert_rules:
            name = rule["name"]

            # Only the user-supplied condition is expected to fail, so keep
            # the guarded region limited to that call.
            try:
                is_triggered = rule["condition"](metrics)
            except Exception as e:
                logger.error(f"Error checking alert rule {rule['name']}: {str(e)}")
                continue

            if is_triggered and not rule["triggered"]:
                # First time the condition holds: record and report the alert.
                alert = {
                    "name": name,
                    "severity": rule["severity"],
                    "timestamp": datetime.utcnow().isoformat(),
                    "metrics": metrics,
                }
                self.active_alerts[name] = alert
                self.alert_history.append(alert)
                triggered_alerts.append(alert)
                rule["triggered"] = True
                logger.warning(f"Alert triggered: {rule['name']} - {alert}")

            elif not is_triggered and rule["triggered"]:
                # Condition no longer holds: clear the alert.
                logger.info(f"Alert cleared: {rule['name']}")
                rule["triggered"] = False
                self.active_alerts.pop(name, None)

        return triggered_alerts

    def get_active_alerts(self) -> List[Dict[str, Any]]:
        """Return the currently firing alerts as a new list."""
        return list(self.active_alerts.values())

    def get_alert_history(self, limit: int = 100) -> List[Dict[str, Any]]:
        """Return up to ``limit`` of the most recently fired alerts, oldest first.

        Args:
            limit: Maximum number of alerts; zero or negative yields ``[]``.
        """
        # ``history[-0:]`` would be the whole list, so reject non-positive
        # limits explicitly instead of slicing with them.
        if limit <= 0:
            return []
        return list(self.alert_history)[-limit:]


class MonitoringService:
    """Top-level monitoring service tying metric collection to alerting."""

    def __init__(self):
        self.metrics_collector = MetricsCollector()
        self.alert_manager = AlertManager()
        self.monitoring_task = None
        self.is_monitoring = False

        # Register the built-in alert rules.
        self._setup_default_alerts()

    @staticmethod
    def _cpu_high_condition(metrics: Dict[str, Any]) -> bool:
        """Return True when CPU usage in ``metrics["system"]`` exceeds 80%."""
        return metrics.get("system", {}).get("cpu_percent", 0) > 80

    @staticmethod
    def _memory_high_condition(metrics: Dict[str, Any]) -> bool:
        """Return True when memory usage in ``metrics["system"]`` exceeds 85%."""
        return metrics.get("system", {}).get("memory_percent", 0) > 85

    @staticmethod
    def _high_failure_rate_condition(metrics: Dict[str, Any]) -> bool:
        """Return True when more than 10% of all counted calls are "failed".

        The metrics produced by ``get_current_metrics`` nest the counters
        under ``metrics["business"]["business"]``; a flat
        ``metrics["business"]`` dict is accepted as well.
        """
        business = metrics.get("business", {})
        # Unwrap the inner "business" payload when present. Reading the
        # counts from the outer snapshot always found nothing, so the rule
        # could never fire.
        business = business.get("business", business)
        status_counts = business.get("call_status_counts") or {}
        total = sum(status_counts.values())
        if total <= 0:
            return False
        return status_counts.get("failed", 0) / total > 0.1

    @staticmethod
    def _classify_health(system_metrics: Dict[str, Any]) -> str:
        """Map CPU/memory percentages to "healthy", "warning" or "critical"."""
        cpu = system_metrics["cpu_percent"]
        memory = system_metrics["memory_percent"]
        if cpu > 90 or memory > 95:
            return "critical"
        if cpu > 80 or memory > 85:
            return "warning"
        return "healthy"

    def _setup_default_alerts(self):
        """Register the default CPU, memory and failure-rate alert rules."""
        self.alert_manager.add_alert_rule(
            "High CPU Usage", self._cpu_high_condition, "warning")
        self.alert_manager.add_alert_rule(
            "High Memory Usage", self._memory_high_condition, "warning")
        self.alert_manager.add_alert_rule(
            "High Failure Rate", self._high_failure_rate_condition, "critical")

    async def start_monitoring(self, db: Session, interval: int = 60):
        """Run the monitoring loop until stopped or cancelled.

        Each iteration collects metrics, evaluates the alert rules, logs a
        summary line and then sleeps for ``interval`` seconds. Errors raised
        while collecting or checking are logged and the loop continues.

        Args:
            db: SQLAlchemy session used for the business metrics queries.
            interval: Seconds to sleep between iterations.
        """
        if self.is_monitoring:
            logger.warning("Monitoring already started")
            return

        self.is_monitoring = True
        logger.info("Starting monitoring...")

        try:
            while self.is_monitoring:
                try:
                    # NOTE(review): metric collection is synchronous (psutil
                    # sampling and database queries) and runs on the event
                    # loop; consider offloading it if loop latency matters.
                    metrics = self.metrics_collector.get_current_metrics(db)
                    self.alert_manager.check_alerts(metrics)

                    logger.info(f"Collected metrics - CPU: {metrics['system']['cpu_percent']:.1f}%, "
                               f"Memory: {metrics['system']['memory_percent']:.1f}%, "
                               f"Total calls: {metrics['business']['business']['total_calls']}")
                except Exception as e:
                    logger.error(f"Error in monitoring loop: {str(e)}")

                # Sleep outside the error handler so task cancellation is
                # never swallowed by the broad ``except`` above.
                await asyncio.sleep(interval)
        finally:
            # Reset the flag even when the task is cancelled; otherwise a
            # later start_monitoring() call would refuse to run.
            self.is_monitoring = False

    async def stop_monitoring(self):
        """Ask the monitoring loop to exit after its current sleep."""
        self.is_monitoring = False
        logger.info("Monitoring stopped")

    def _build_health_report(self, system_metrics: Dict[str, Any]) -> Dict[str, Any]:
        """Build the health summary dict from an existing system snapshot."""
        return {
            "status": self._classify_health(system_metrics),
            "timestamp": datetime.utcnow().isoformat(),
            "system_metrics": system_metrics,
            "active_alerts": len(self.alert_manager.active_alerts),
            "uptime": system_metrics["uptime"],
        }

    def get_system_health(self) -> Dict[str, Any]:
        """Collect fresh system metrics and return a health summary.

        Returns:
            A dict with ``status`` ("healthy", "warning" or "critical"),
            ``timestamp``, the raw ``system_metrics``, the number of
            ``active_alerts`` and the collector ``uptime`` in seconds.
        """
        system_metrics = self.metrics_collector.collect_system_metrics()
        return self._build_health_report(system_metrics)

    def get_dashboard_data(self, db: Session) -> Dict[str, Any]:
        """Return the combined payload for the monitoring dashboard.

        Args:
            db: SQLAlchemy session used for the business metrics queries.

        Returns:
            A dict with system metrics, business counters, active alert
            information and a ``system_health`` summary.
        """
        current_metrics = self.metrics_collector.get_current_metrics(db)
        active_alerts = self.alert_manager.get_active_alerts()
        system_metrics = current_metrics["system"]

        return {
            "timestamp": datetime.utcnow().isoformat(),
            "system": system_metrics,
            "business": current_metrics["business"]["business"],
            "active_alerts_count": len(active_alerts),
            "recent_alerts": active_alerts[-5:],  # last five active alerts
            # Reuse the snapshot collected above rather than sampling the
            # system a second time (another blocking CPU sample and a
            # duplicate history entry).
            "system_health": self._build_health_report(system_metrics),
        }


# Global monitoring service instance, created when this module is imported.
monitoring_service = MonitoringService()