Initial commit for tst
This commit is contained in:
512
AIMonitor/deploy_升腾.md
Normal file
512
AIMonitor/deploy_升腾.md
Normal file
@@ -0,0 +1,512 @@
|
||||
# AI监控系统 - 昇腾服务器部署指南
|
||||
|
||||
## 📋 部署概述
|
||||
|
||||
本指南专门针对华为昇腾(Ascend)NPU服务器环境,介绍如何部署AI监控系统,充分利用昇腾NPU的AI加速能力。
|
||||
|
||||
## 🔧 系统要求
|
||||
|
||||
### 硬件要求
|
||||
- **CPU**: x86_64 或 ARM64 架构
|
||||
- **NPU**: 昇腾 Atlas 系列芯片(310P、300I、800等)
|
||||
- **内存**: 16GB+ 推荐
|
||||
- **存储**: 100GB+ 可用空间(用于视频存储)
|
||||
- **网络**: 千兆网络接口
|
||||
|
||||
### 软件要求
|
||||
- **操作系统**: Ubuntu 20.04+ / CentOS 7.6+ / openEuler 20.03+
|
||||
- **Python**: 3.7-3.9(推荐3.8)
|
||||
- **昇腾软件栈**: CANN 5.0.2+
|
||||
- **Docker**: 20.10+(可选)
|
||||
|
||||
## 🚀 快速部署
|
||||
|
||||
### 方案一:直接部署(推荐)
|
||||
|
||||
#### 1. 准备昇腾环境
|
||||
|
||||
```bash
|
||||
# 安装昇腾驱动(以Atlas 200I DK为例,与下方下载链接一致;其他型号请替换为对应驱动包)
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y gcc g++ make cmake
|
||||
|
||||
# 下载并安装昇腾驱动
|
||||
wget https://ascend-repo.huawei.com/Atlas%20200I%20DK/Ascend-hdk-23.0.0-ubuntu20.04.aarch64.run
|
||||
sudo bash Ascend-hdk-23.0.0-ubuntu20.04.aarch64.run
|
||||
|
||||
# 安装CANN开发套件
|
||||
wget https://ascend-repo.huawei.com/CANN/CANN%205.0.2/Ascend-cann-toolkit_5.0.2_linux-aarch64.run
|
||||
sudo bash Ascend-cann-toolkit_5.0.2_linux-aarch64.run
|
||||
|
||||
# 配置环境变量
|
||||
echo "source /usr/local/Ascend/ascend-toolkit/set_env.sh" >> ~/.bashrc
|
||||
source ~/.bashrc
|
||||
|
||||
# 验证安装
|
||||
npu-smi
|
||||
```
|
||||
|
||||
#### 2. 部署AI监控系统
|
||||
|
||||
```bash
|
||||
# 克隆项目
|
||||
git clone <your-repo>
|
||||
cd AIMonitor
|
||||
|
||||
# 创建虚拟环境
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
|
||||
# 安装PyTorch(CPU版本;昇腾NPU加速由下方ONNX Runtime的CANN后端提供)
|
||||
pip install torch==2.0.1+cpu torchaudio==2.0.2 --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
|
||||
# 安装ONNX Runtime(昇腾支持)
|
||||
pip install onnxruntime==1.15.1
|
||||
|
||||
# 安装项目依赖
|
||||
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple/
|
||||
|
||||
# 配置昇腾推理
|
||||
export ASCEND_SLOG_PRINT_TO_STDOUT=1
|
||||
export ASCEND_GLOBAL_LOG_LEVEL=0
|
||||
```
|
||||
|
||||
#### 3. 配置昇腾AI模型
|
||||
|
||||
```bash
|
||||
# 确保使用昇腾支持的ONNX模型
|
||||
ls -la YOLO_Weight/
|
||||
# 应该包含: yolov8n.onnx
|
||||
|
||||
# 验证模型格式
|
||||
python3 -c "
|
||||
import onnx
|
||||
model = onnx.load('YOLO_Weight/yolov8n.onnx')
|
||||
print(f'模型输入: {model.graph.input[0].name}')
|
||||
print(f'输入形状: {model.graph.input[0].type.tensor_type.shape.dim}')
|
||||
"
|
||||
```
|
||||
|
||||
#### 4. 启动服务
|
||||
|
||||
```bash
|
||||
# 启动后端服务
|
||||
python3 rtsp_service_ws.py &
|
||||
|
||||
# 启动HTTP服务
|
||||
python3 static_server.py &
|
||||
|
||||
# 启动GUI(如果需要图形界面)
|
||||
python3 monitor_gui.py
|
||||
```
|
||||
|
||||
### 方案二:Docker部署
|
||||
|
||||
#### 1. 构建昇腾Docker镜像
|
||||
|
||||
```dockerfile
|
||||
# Dockerfile
|
||||
FROM swr.cn-north-4.myhuaweicloud.com/atlas/pytorch:2.0.1-aarch64
|
||||
|
||||
# 设置工作目录
|
||||
WORKDIR /app
|
||||
|
||||
# 安装系统依赖
|
||||
RUN apt-get update && apt-get install -y \
|
||||
python3-pip \
|
||||
python3-dev \
|
||||
libglib2.0-0 \
|
||||
libsm6 \
|
||||
libxext6 \
|
||||
libxrender-dev \
|
||||
libgomp1 \
|
||||
wget \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# 复制项目文件
|
||||
COPY . /app/
|
||||
|
||||
# 安装Python依赖
|
||||
RUN pip3 install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple/
|
||||
|
||||
# 配置昇腾环境
|
||||
ENV ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest
|
||||
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/driver
|
||||
ENV PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/ascend-toolkit/latest/pyACL/python/site-packages
|
||||
|
||||
# 创建必要目录
|
||||
RUN mkdir -p /app/videos /app/YOLO_Pipe_results
|
||||
|
||||
# 暴露端口
|
||||
EXPOSE 8765 5000
|
||||
|
||||
# 启动命令
|
||||
CMD ["python3", "rtsp_service_ws.py"]
|
||||
```
|
||||
|
||||
#### 2. 构建和运行容器
|
||||
|
||||
```bash
|
||||
# 构建镜像
|
||||
docker build -t aimonitor:ascend .
|
||||
|
||||
# 运行容器
|
||||
docker run -d \
|
||||
--name aimonitor \
|
||||
--device=/dev/davinci0 \
|
||||
--device=/dev/davinci_manager \
|
||||
--device=/dev/devmm_svm \
|
||||
--device=/dev/hisi_hdc \
|
||||
-v $(pwd)/videos:/app/videos \
|
||||
-v $(pwd)/YOLO_Weight:/app/YOLO_Weight \
|
||||
-p 8765:8765 \
|
||||
-p 5000:5000 \
|
||||
aimonitor:ascend
|
||||
```
|
||||
|
||||
## ⚙️ 优化配置
|
||||
|
||||
### 1. 昇腾推理优化
|
||||
|
||||
修改 `npu_yolo_onnx.py` 中的配置:
|
||||
|
||||
```python
|
||||
class YOLOv8_ONNX:
|
||||
def __init__(self, onnx_path, conf_threshold=0.25, iou_threshold=0.45):
|
||||
# 昇腾NPU优化配置
|
||||
providers = [("CANNExecutionProvider", {
|
||||
"device_id": 0,
|
||||
"arena_extend_strategy": "kNextPowerOfTwo",
|
||||
"npu_mem_limit": 16 * 1024 * 1024 * 1024, # 16GB
|
||||
"precision_mode": "allow_fp32_to_fp16",
|
||||
"op_select_impl_mode": "high_precision",
|
||||
"enable_cann_graph": True,
|
||||
"graph_optimization_level": "enable_all",
|
||||
})]
|
||||
|
||||
# 添加CPU作为备选
|
||||
providers.append(("CPUExecutionProvider", {}))
|
||||
|
||||
self.session = ort.InferenceSession(onnx_path, providers=providers)
|
||||
|
||||
# 检查是否使用昇腾
|
||||
actual_providers = self.session.get_providers()
|
||||
if "CANNExecutionProvider" in actual_providers:
|
||||
print("✓ 使用昇腾NPU加速推理")
|
||||
else:
|
||||
print("⚠ 使用CPU推理,昇腾加速未启用")
|
||||
```
|
||||
|
||||
### 2. 性能监控
|
||||
|
||||
```bash
|
||||
# 监控昇腾NPU使用情况
|
||||
watch -n 1 npu-smi
|
||||
|
||||
# 监控系统资源
|
||||
htop
|
||||
|
||||
# 监控网络连接
|
||||
netstat -an | grep -E "(8765|5000)"
|
||||
```
|
||||
|
||||
### 3. 日志配置
|
||||
|
||||
```bash
|
||||
# 配置昇腾日志级别
|
||||
export ASCEND_GLOBAL_LOG_LEVEL=1  # 0: DEBUG, 1: INFO, 2: WARNING, 3: ERROR(以CANN官方文档为准)
|
||||
|
||||
# 配置日志文件
|
||||
export ASCEND_SLOG_PRINT_TO_STDOUT=0
|
||||
export ASCEND_SLOG_PATH=/var/log/npu/
|
||||
```
|
||||
|
||||
## 🔒 安全配置
|
||||
|
||||
### 1. 防火墙设置
|
||||
|
||||
```bash
|
||||
# 配置防火墙规则
|
||||
sudo ufw allow 8765/tcp # WebSocket
|
||||
sudo ufw allow 5000/tcp # HTTP
|
||||
sudo ufw enable
|
||||
```
|
||||
|
||||
### 2. 访问控制
|
||||
|
||||
```python
|
||||
# 在 rtsp_service_ws.py 中添加IP白名单(需先 import ipaddress)
|
||||
ALLOWED_IPS = ['192.168.1.0/24', '10.0.0.0/8']
|
||||
|
||||
async def _ws_handler(self, websocket, path):
|
||||
client_ip = websocket.remote_address[0]
|
||||
|
||||
# 检查IP白名单
|
||||
if not any(ipaddress.ip_address(client_ip) in ipaddress.ip_network(network)
|
||||
for network in ALLOWED_IPS):
|
||||
await websocket.close(code=1008, reason="IP not allowed")
|
||||
return
|
||||
```
|
||||
|
||||
## 📊 性能调优
|
||||
|
||||
### 1. NPU资源优化
|
||||
|
||||
```python
|
||||
# 调整并发推理数量
|
||||
MAX_CONCURRENT_INFERENCES = 4 # 根据NPU型号调整
|
||||
|
||||
# 批处理优化
|
||||
BATCH_SIZE = 8 # 提高吞吐量
|
||||
|
||||
# 内存池管理
|
||||
arena_extend_strategy = "kSameAsRequested" # 减少内存碎片
|
||||
```
|
||||
|
||||
### 2. 视频流优化
|
||||
|
||||
```python
|
||||
# 调整处理参数
|
||||
RTSP_TARGET_FPS = 15.0 # 昇腾可支持更高帧率
|
||||
FRAMES_PER_SEGMENT = 1200 # 增加视频段长度
|
||||
QUEUE_MAX_SIZE = 1000 # 增大队列大小
|
||||
```
|
||||
|
||||
### 3. 存储优化
|
||||
|
||||
```bash
|
||||
# 配置视频存储策略
|
||||
# 1. 使用SSD存储热数据
|
||||
mkdir -p /ssd/videos
|
||||
ln -s /ssd/videos ./videos
|
||||
|
||||
# 2. 定期清理旧视频
|
||||
find ./videos -name "*.mp4" -mtime +7 -delete
|
||||
|
||||
# 3. 压缩历史视频
|
||||
ffmpeg -i input.mp4 -c:v libx264 -crf 28 output.mp4
|
||||
```
|
||||
|
||||
## 🚨 故障排除
|
||||
|
||||
### 1. 常见问题
|
||||
|
||||
#### 昇腾驱动未加载
|
||||
```bash
|
||||
# 检查驱动状态
|
||||
lsmod | grep npu
|
||||
dmesg | grep ascend
|
||||
|
||||
# 重新加载驱动
|
||||
sudo rmmod npu
|
||||
sudo modprobe npu
|
||||
```
|
||||
|
||||
#### CANN环境配置错误
|
||||
```bash
|
||||
# 检查环境变量
|
||||
echo $LD_LIBRARY_PATH
|
||||
echo $PYTHONPATH
|
||||
|
||||
# 重新配置
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh
|
||||
```
|
||||
|
||||
#### 推理性能差
|
||||
```python
|
||||
# 检查是否使用NPU
|
||||
providers = session.get_providers()
|
||||
print("当前使用的推理后端:", providers)
|
||||
|
||||
# 强制使用昇腾
|
||||
providers = [("CANNExecutionProvider", {
|
||||
"device_id": 0,
|
||||
"precision_mode": "force_fp16" # 强制FP16精度
|
||||
})]
|
||||
```
|
||||
|
||||
### 2. 日志分析
|
||||
|
||||
```bash
|
||||
# 查看昇腾日志
|
||||
tail -f /var/log/npu/slog/device-0/slog_info.log
|
||||
|
||||
# 查看应用日志
|
||||
tail -f npu_yolo_inference.log
|
||||
|
||||
# 性能分析
|
||||
npu-smi dump -i 0 -t 100 -d performance
|
||||
```
|
||||
|
||||
### 3. 性能基准测试
|
||||
|
||||
```python
|
||||
# 测试推理速度
|
||||
import time
|
||||
import numpy as np
|
||||
|
||||
# 创建测试数据
|
||||
test_input = np.random.rand(1, 3, 640, 640).astype(np.float32)
|
||||
|
||||
# 运行基准测试
|
||||
times = []
|
||||
for _ in range(100):
|
||||
start = time.time()
|
||||
outputs = session.run(None, {input_name: test_input})
|
||||
times.append(time.time() - start)
|
||||
|
||||
print(f"平均推理时间: {np.mean(times)*1000:.2f}ms")
|
||||
print(f"推理吞吐量: {1/np.mean(times):.2f} FPS")
|
||||
```
|
||||
|
||||
## 🔄 监控和维护
|
||||
|
||||
### 1. 系统监控脚本
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# monitor_aimonitor.sh
|
||||
|
||||
echo "=== AI监控系统状态 ==="
|
||||
echo "时间: $(date)"
|
||||
|
||||
# 检查进程状态
|
||||
if pgrep -f "rtsp_service_ws" > /dev/null; then
|
||||
echo "✓ RTSP服务运行正常"
|
||||
else
|
||||
echo "✗ RTSP服务异常,正在重启..."
|
||||
python3 /path/to/rtsp_service_ws.py &
|
||||
fi
|
||||
|
||||
if pgrep -f "static_server" > /dev/null; then
|
||||
echo "✓ HTTP服务运行正常"
|
||||
else
|
||||
echo "✗ HTTP服务异常,正在重启..."
|
||||
python3 /path/to/static_server.py &
|
||||
fi
|
||||
|
||||
# 检查NPU状态
|
||||
if npu-smi | grep -q "OK"; then
|
||||
echo "✓ 昇腾NPU工作正常"
|
||||
else
|
||||
echo "✗ 昇腾NPU异常"
|
||||
fi
|
||||
|
||||
# 检查磁盘空间
|
||||
DISK_USAGE=$(df ./videos | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||
if [ "$DISK_USAGE" -gt 80 ]; then
|
||||
echo "⚠ 磁盘空间不足: ${DISK_USAGE}%"
|
||||
else
|
||||
echo "✓ 磁盘空间充足: ${DISK_USAGE}%"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
```
|
||||
|
||||
### 2. 自动重启脚本
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# auto_restart.sh
|
||||
|
||||
SERVICE_NAME="AI监控系统"
|
||||
LOG_FILE="/var/log/aimonitor_restart.log"
|
||||
|
||||
while true; do
|
||||
sleep 30
|
||||
|
||||
if ! pgrep -f "rtsp_service_ws" > /dev/null; then
|
||||
echo "$(date): $SERVICE_NAME 异常,正在重启..." >> "$LOG_FILE"
|
||||
cd /path/to/AIMonitor
|
||||
python3 rtsp_service_ws.py >> "$LOG_FILE" 2>&1 &
|
||||
fi
|
||||
done
|
||||
```
|
||||
|
||||
### 3. 定时任务配置
|
||||
|
||||
```bash
|
||||
# 添加到crontab
|
||||
crontab -e
|
||||
|
||||
# 每5分钟检查服务状态
|
||||
*/5 * * * * /path/to/monitor_aimonitor.sh
|
||||
|
||||
# 每天凌晨清理旧视频
|
||||
0 2 * * * find /path/to/videos -name "*.mp4" -mtime +7 -delete
|
||||
|
||||
# 每小时生成性能报告
|
||||
0 * * * * /path/to/performance_report.sh
|
||||
```
|
||||
|
||||
## 📈 扩展部署
|
||||
|
||||
### 1. 多节点部署
|
||||
|
||||
```yaml
|
||||
# docker-compose.yml
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
aimonitor-master:
|
||||
build: .
|
||||
ports:
|
||||
- "8765:8765"
|
||||
- "5000:5000"
|
||||
volumes:
|
||||
- ./videos:/app/videos
|
||||
environment:
|
||||
- ROLE=master
|
||||
- DEVICE_ID=0
|
||||
|
||||
aimonitor-worker:
|
||||
build: .
|
||||
volumes:
|
||||
- ./videos:/app/videos
|
||||
environment:
|
||||
- ROLE=worker
|
||||
- DEVICE_ID=1
|
||||
depends_on:
|
||||
- aimonitor-master
|
||||
```
|
||||
|
||||
### 2. 负载均衡配置
|
||||
|
||||
```nginx
|
||||
# nginx.conf
|
||||
upstream aimonitor {
|
||||
server 192.168.1.10:8765;
|
||||
server 192.168.1.11:8765;
|
||||
server 192.168.1.12:8765;
|
||||
}
|
||||
|
||||
server {
|
||||
listen 80;
|
||||
location / {
|
||||
proxy_pass http://aimonitor;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Upgrade $http_upgrade;
|
||||
proxy_set_header Connection "upgrade";
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## 🎯 部署检查清单
|
||||
|
||||
- [ ] 昇腾驱动安装完成
|
||||
- [ ] CANN工具包配置正确
|
||||
- [ ] Python环境准备就绪
|
||||
- [ ] 依赖包安装完成
|
||||
- [ ] 模型文件格式正确
|
||||
- [ ] 配置文件设置合理
|
||||
- [ ] 防火墙规则配置
|
||||
- [ ] 监控脚本就位
|
||||
- [ ] 日志收集启动
|
||||
- [ ] 性能测试通过
|
||||
|
||||
---
|
||||
|
||||
**文档版本**: v1.0
|
||||
**更新日期**: 2024-12-10
|
||||
**适用硬件**: 昇腾Atlas 310P/300I/800系列
|
||||
**支持系统**: Ubuntu/CentOS/openEuler
|
||||
Reference in New Issue
Block a user