345 lines
11 KiB
Python
345 lines
11 KiB
Python
"""监控与日志路由,提供系统监控、指标收集和日志查询功能"""
|
||
|
||
from fastapi import APIRouter, HTTPException, status, Depends
|
||
from typing import List, Dict, Any, Optional
|
||
from datetime import datetime, timedelta
|
||
import json
|
||
|
||
from app.services.monitoring import monitoring_service
|
||
from app.utils.logger import structured_logger, log_query
|
||
from app.models.database import get_db
|
||
from app.routes.user import get_current_active_user
|
||
|
||
router = APIRouter(prefix="/monitoring", tags=["monitoring"])
|
||
|
||
|
||
@router.get("/health")
|
||
async def get_system_health():
|
||
"""获取系统健康状况"""
|
||
health = monitoring_service.get_system_health()
|
||
return health
|
||
|
||
|
||
@router.get("/dashboard")
|
||
async def get_dashboard_data(
|
||
current_user: dict = Depends(get_current_active_user),
|
||
db = Depends(get_db)
|
||
):
|
||
"""获取仪表板数据"""
|
||
# 只有管理员可以访问仪表板
|
||
if current_user.get("role") not in ["admin", "manager"]:
|
||
raise HTTPException(status_code=403, detail="Insufficient permissions")
|
||
|
||
dashboard_data = monitoring_service.get_dashboard_data(db)
|
||
return dashboard_data
|
||
|
||
|
||
@router.get("/metrics/system")
|
||
async def get_system_metrics(
|
||
current_user: dict = Depends(get_current_active_user)
|
||
):
|
||
"""获取系统指标"""
|
||
if current_user.get("role") not in ["admin", "manager"]:
|
||
raise HTTPException(status_code=403, detail="Insufficient permissions")
|
||
|
||
from app.services.monitoring import MetricsCollector
|
||
collector = MetricsCollector()
|
||
metrics = collector.collect_system_metrics()
|
||
return metrics
|
||
|
||
|
||
@router.get("/metrics/business")
|
||
async def get_business_metrics(
|
||
current_user: dict = Depends(get_current_active_user),
|
||
db = Depends(get_db)
|
||
):
|
||
"""获取业务指标"""
|
||
if current_user.get("role") not in ["admin", "manager"]:
|
||
raise HTTPException(status_code=403, detail="Insufficient permissions")
|
||
|
||
from app.services.monitoring import MetricsCollector
|
||
collector = MetricsCollector()
|
||
metrics = collector.collect_business_metrics(db)
|
||
return metrics
|
||
|
||
|
||
@router.get("/metrics/history")
|
||
async def get_metrics_history(
|
||
metric_type: str = "system",
|
||
limit: int = 100,
|
||
current_user: dict = Depends(get_current_active_user)
|
||
):
|
||
"""获取指标历史"""
|
||
if current_user.get("role") not in ["admin", "manager"]:
|
||
raise HTTPException(status_code=403, detail="Insufficient permissions")
|
||
|
||
if metric_type not in ["system", "business"]:
|
||
raise HTTPException(status_code=400, detail="Invalid metric type. Use 'system' or 'business'")
|
||
|
||
from app.services.monitoring import MetricsCollector
|
||
collector = MetricsCollector()
|
||
history = collector.get_metric_history(metric_type, limit)
|
||
return {"history": history}
|
||
|
||
|
||
@router.get("/alerts/active")
|
||
async def get_active_alerts(
|
||
current_user: dict = Depends(get_current_active_user)
|
||
):
|
||
"""获取当前激活的告警"""
|
||
if current_user.get("role") not in ["admin", "manager"]:
|
||
raise HTTPException(status_code=403, detail="Insufficient permissions")
|
||
|
||
active_alerts = monitoring_service.alert_manager.get_active_alerts()
|
||
return {"active_alerts": active_alerts}
|
||
|
||
|
||
@router.get("/alerts/history")
|
||
async def get_alert_history(
|
||
limit: int = 100,
|
||
current_user: dict = Depends(get_current_active_user)
|
||
):
|
||
"""获取告警历史"""
|
||
if current_user.get("role") not in ["admin", "manager"]:
|
||
raise HTTPException(status_code=403, detail="Insufficient permissions")
|
||
|
||
history = monitoring_service.alert_manager.get_alert_history(limit)
|
||
return {"alert_history": history}
|
||
|
||
|
||
@router.post("/monitoring/start")
|
||
async def start_monitoring(
|
||
interval: int = 60,
|
||
current_user: dict = Depends(get_current_active_user),
|
||
db = Depends(get_db)
|
||
):
|
||
"""启动监控"""
|
||
if current_user.get("role") not in ["admin", "manager"]:
|
||
raise HTTPException(status_code=403, detail="Insufficient permissions")
|
||
|
||
# 注意:在实际应用中,我们不会在这里启动一个长时间运行的协程
|
||
# 这通常会在应用启动时完成
|
||
# 这里仅作为示例返回确认信息
|
||
return {
|
||
"message": "Monitoring started",
|
||
"interval": interval,
|
||
"timestamp": datetime.utcnow().isoformat()
|
||
}
|
||
|
||
|
||
@router.post("/monitoring/stop")
|
||
async def stop_monitoring(
|
||
current_user: dict = Depends(get_current_active_user)
|
||
):
|
||
"""停止监控"""
|
||
if current_user.get("role") not in ["admin", "manager"]:
|
||
raise HTTPException(status_code=403, detail="Insufficient permissions")
|
||
|
||
await monitoring_service.stop_monitoring()
|
||
return {
|
||
"message": "Monitoring stopped",
|
||
"timestamp": datetime.utcnow().isoformat()
|
||
}
|
||
|
||
|
||
@router.post("/logs/event")
|
||
async def log_custom_event(
|
||
event_type: str,
|
||
user_id: Optional[str] = None,
|
||
algorithm_id: Optional[str] = None,
|
||
extra_data: Dict[str, Any] = {},
|
||
current_user: dict = Depends(get_current_active_user)
|
||
):
|
||
"""记录自定义事件日志"""
|
||
# 普通用户只能记录自己的事件
|
||
if current_user.get("role") not in ["admin", "manager"]:
|
||
if user_id and user_id != current_user.get("id"):
|
||
raise HTTPException(status_code=403, detail="Cannot log events for other users")
|
||
user_id = current_user.get("id")
|
||
|
||
structured_logger.log_event(
|
||
event_type=event_type,
|
||
user_id=user_id,
|
||
algorithm_id=algorithm_id,
|
||
extra_data=extra_data
|
||
)
|
||
|
||
return {
|
||
"message": "Event logged successfully",
|
||
"event_type": event_type,
|
||
"timestamp": datetime.utcnow().isoformat()
|
||
}
|
||
|
||
|
||
@router.post("/logs/api-call")
|
||
async def log_api_call(
|
||
user_id: str,
|
||
algorithm_id: str,
|
||
version_id: str,
|
||
input_size: int,
|
||
response_time: float,
|
||
success: bool,
|
||
error_msg: Optional[str] = None,
|
||
current_user: dict = Depends(get_current_active_user)
|
||
):
|
||
"""记录API调用日志"""
|
||
# 管理员或用户自己可以记录日志
|
||
if current_user.get("role") not in ["admin", "manager"]:
|
||
if user_id != current_user.get("id"):
|
||
raise HTTPException(status_code=403, detail="Cannot log API calls for other users")
|
||
|
||
structured_logger.log_api_call(
|
||
user_id=user_id,
|
||
algorithm_id=algorithm_id,
|
||
version_id=version_id,
|
||
input_size=input_size,
|
||
response_time=response_time,
|
||
success=success,
|
||
error_msg=error_msg
|
||
)
|
||
|
||
return {
|
||
"message": "API call logged successfully",
|
||
"success": success,
|
||
"timestamp": datetime.utcnow().isoformat()
|
||
}
|
||
|
||
|
||
@router.get("/logs/search")
|
||
async def search_logs(
|
||
start_date: Optional[str] = None,
|
||
end_date: Optional[str] = None,
|
||
event_types: Optional[str] = None, # 逗号分隔的事件类型
|
||
user_ids: Optional[str] = None, # 逗号分隔的用户ID
|
||
algorithm_ids: Optional[str] = None, # 逗号分隔的算法ID
|
||
log_levels: Optional[str] = None, # 逗号分隔的日志级别
|
||
limit: int = 100,
|
||
current_user: dict = Depends(get_current_active_user)
|
||
):
|
||
"""搜索日志"""
|
||
# 普通用户只能搜索自己的日志
|
||
if current_user.get("role") not in ["admin", "manager"]:
|
||
# 如果指定了其他用户ID,则只允许查看自己的
|
||
if user_ids:
|
||
user_id_list = user_ids.split(',')
|
||
if current_user.get("id") not in user_id_list:
|
||
raise HTTPException(status_code=403, detail="Cannot search logs for other users")
|
||
else:
|
||
user_ids = current_user.get("id")
|
||
|
||
# 解析日期
|
||
start_dt = None
|
||
end_dt = None
|
||
if start_date:
|
||
try:
|
||
start_dt = datetime.fromisoformat(start_date.replace('Z', '+00:00'))
|
||
except ValueError:
|
||
raise HTTPException(status_code=400, detail="Invalid start_date format")
|
||
|
||
if end_date:
|
||
try:
|
||
end_dt = datetime.fromisoformat(end_date.replace('Z', '+00:00'))
|
||
except ValueError:
|
||
raise HTTPException(status_code=400, detail="Invalid end_date format")
|
||
|
||
# 解析数组参数
|
||
event_type_list = event_types.split(',') if event_types else None
|
||
user_id_list = user_ids.split(',') if user_ids else None
|
||
algorithm_id_list = algorithm_ids.split(',') if algorithm_ids else None
|
||
log_level_list = log_levels.split(',') if log_levels else None
|
||
|
||
# 执行搜索
|
||
results = log_query.search_logs(
|
||
start_date=start_dt,
|
||
end_date=end_dt,
|
||
event_types=event_type_list,
|
||
user_ids=user_id_list,
|
||
algorithm_ids=algorithm_id_list,
|
||
log_levels=log_level_list,
|
||
limit=limit
|
||
)
|
||
|
||
return {
|
||
"logs": results,
|
||
"count": len(results),
|
||
"limit": limit
|
||
}
|
||
|
||
|
||
@router.get("/logs/stats")
|
||
async def get_log_stats(
|
||
days: int = 7,
|
||
current_user: dict = Depends(get_current_active_user)
|
||
):
|
||
"""获取日志统计信息"""
|
||
if current_user.get("role") not in ["admin", "manager"]:
|
||
raise HTTPException(status_code=403, detail="Insufficient permissions")
|
||
|
||
stats = log_query.get_log_stats(days=days)
|
||
return stats
|
||
|
||
|
||
@router.get("/performance/algorithm/{algorithm_id}")
|
||
async def get_algorithm_performance(
|
||
algorithm_id: str,
|
||
days: int = 7,
|
||
current_user: dict = Depends(get_current_active_user),
|
||
db = Depends(get_db)
|
||
):
|
||
"""获取算法性能指标"""
|
||
# 用户只能查看自己有权访问的算法
|
||
if current_user.get("role") not in ["admin", "manager"]:
|
||
# 这里应该检查用户是否有权访问该算法
|
||
# 简单起见,我们假设用户可以查看任何算法
|
||
pass
|
||
|
||
from sqlalchemy import func
|
||
from app.models.models import AlgorithmCall
|
||
|
||
# 计算性能指标
|
||
start_date = datetime.utcnow() - timedelta(days=days)
|
||
|
||
# 总调用次数
|
||
total_calls = db.query(func.count(AlgorithmCall.id)).filter(
|
||
AlgorithmCall.algorithm_id == algorithm_id,
|
||
AlgorithmCall.created_at >= start_date
|
||
).scalar()
|
||
|
||
# 成功调用次数
|
||
success_calls = db.query(func.count(AlgorithmCall.id)).filter(
|
||
AlgorithmCall.algorithm_id == algorithm_id,
|
||
AlgorithmCall.status == 'success',
|
||
AlgorithmCall.created_at >= start_date
|
||
).scalar()
|
||
|
||
# 平均响应时间
|
||
avg_response_time = db.query(func.avg(AlgorithmCall.response_time)).filter(
|
||
AlgorithmCall.algorithm_id == algorithm_id,
|
||
AlgorithmCall.response_time.isnot(None),
|
||
AlgorithmCall.created_at >= start_date
|
||
).scalar()
|
||
|
||
# 按状态分组
|
||
status_counts = db.query(
|
||
AlgorithmCall.status,
|
||
func.count(AlgorithmCall.id)
|
||
).filter(
|
||
AlgorithmCall.algorithm_id == algorithm_id,
|
||
AlgorithmCall.created_at >= start_date
|
||
).group_by(AlgorithmCall.status).all()
|
||
|
||
status_dict = {status: count for status, count in status_counts}
|
||
|
||
success_rate = (success_calls / total_calls * 100) if total_calls > 0 else 0
|
||
|
||
return {
|
||
"algorithm_id": algorithm_id,
|
||
"period_days": days,
|
||
"total_calls": total_calls,
|
||
"success_calls": success_calls,
|
||
"failed_calls": total_calls - success_calls,
|
||
"success_rate": round(success_rate, 2),
|
||
"average_response_time": round(avg_response_time, 3) if avg_response_time else None,
|
||
"status_distribution": status_dict,
|
||
"timestamp": datetime.utcnow().isoformat()
|
||
} |