# File: npu_yolo_onnx.py
"""YOLOv8 ONNX Runtime inference wrapper with Ascend NPU (CANN) support.

Provider preference order is CANN (Ascend NPU) -> CUDA -> CPU; ONNX Runtime
silently falls back down the list when a provider is unavailable.
"""

import cv2
import numpy as np
import onnxruntime as ort
import os
import time


def letterbox(img, new_shape=(640, 640), color=(114, 114, 114)):
    """Resize `img` preserving aspect ratio, then pad to `new_shape`.

    Args:
        img: BGR image, shape (H, W, 3).
        new_shape: target (height, width).
        color: padding color (BGR).

    Returns:
        (padded_img, ratio, (dw, dh)) where `ratio` is the uniform scale
        applied and (dw, dh) are the per-side paddings (may be fractional;
        halves of the total padding).
    """
    shape = img.shape[:2]  # (h, w)
    # Uniform scale so the resized image fits inside new_shape.
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))  # (w, h)
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]
    # Split padding evenly between the two sides.
    dw /= 2
    dh /= 2
    if shape[::-1] != new_unpad:
        img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
    # +/-0.1 rounding trick distributes an odd pixel of padding consistently.
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    img = cv2.copyMakeBorder(img, top, bottom, left, right,
                             cv2.BORDER_CONSTANT, value=color)
    return img, r, (dw, dh)


class YOLOv8_ONNX:
    """YOLOv8 detector backed by an ONNX Runtime session.

    Call the instance with a BGR frame to get a list of detections:
    [x1, y1, x2, y2, confidence, class_id].
    """

    def __init__(self, onnx_path, conf_threshold=0.25, iou_threshold=0.45,
                 input_size=640):
        """Create the inference session, preferring CANN, then CUDA, then CPU.

        Args:
            onnx_path: path to the exported YOLOv8 ONNX model.
            conf_threshold: minimum confidence kept before/after NMS.
            iou_threshold: NMS IoU threshold.
            input_size: square input side, or an explicit (h, w) tuple.
        """
        providers = [
            ("CANNExecutionProvider", {
                "device_id": 0,
                "arena_extend_strategy": "kNextPowerOfTwo",
                "npu_mem_limit": 16 * 1024 * 1024 * 1024,
                "precision_mode": "allow_fp32_to_fp16",
                "op_select_impl_mode": "high_precision",
                "enable_cann_graph": True,
            }),
            "CUDAExecutionProvider",
            "CPUExecutionProvider",
        ]
        self.session = ort.InferenceSession(onnx_path, providers=providers)
        actual_providers = self.session.get_providers()
        print("YOLO Providers:", actual_providers)
        if "CANNExecutionProvider" in actual_providers:
            print("[INFO] YOLO 使用 CANNExecutionProvider(昇腾 NPU)")
        elif 'CUDAExecutionProvider' in actual_providers:
            print("[INFO] YOLO 使用 CUDAExecutionProvider(NVIDIA GPU)")
        else:
            print("[INFO] YOLO 使用 CPUExecutionProvider")
        self.conf_threshold = conf_threshold
        self.iou_threshold = iou_threshold
        self.input_name = self.session.get_inputs()[0].name
        # Normalize int -> square (h, w) tuple.
        self.input_size = ((input_size, input_size)
                           if isinstance(input_size, int) else input_size)
        print(f"模型输入名称: {self.input_name}")
        print(f"模型输入形状: {self.session.get_inputs()[0].shape}")
        print(f"模型输出形状: {self.session.get_outputs()[0].shape}")

    def preprocess(self, img):
        """Letterbox + BGR->RGB + CHW + [0,1] normalize + batch dim.

        Stores the letterbox ratio/offsets on `self` for `postprocess`.
        """
        self.orig_shape = img.shape[:2]
        img, self.ratio, (self.dw, self.dh) = letterbox(img, self.input_size)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = img.transpose(2, 0, 1).astype(np.float32)
        img /= 255.0
        img = np.expand_dims(img, axis=0)
        # FIX: transpose produces a non-contiguous view; ONNX Runtime expects
        # a contiguous buffer (otherwise it copies or some builds reject it).
        return np.ascontiguousarray(img)

    def postprocess(self, pred, im0_shape):
        """Decode raw YOLOv8 output into [x1, y1, x2, y2, conf, cls] boxes.

        Args:
            pred: raw model output, shape [1, 4 + num_classes, num_anchors].
            im0_shape: original frame shape (h, w) used for clipping.

        Returns:
            List of detections in original-image coordinates.
        """
        # 1. Transpose: [1, 4+cls, 8400] -> [8400, 4+cls]
        pred = pred[0].T
        # 2. Split box geometry from class scores.
        boxes = pred[:, :4]   # cx, cy, w, h
        scores = pred[:, 4:]
        # 3. Best class and its confidence per anchor.
        conf = np.max(scores, axis=1)
        class_pred = np.argmax(scores, axis=1)
        # 4. Confidence pre-filter.
        mask = conf > self.conf_threshold
        if not mask.any():
            return []
        boxes = boxes[mask]
        conf = conf[mask]
        class_pred = class_pred[mask]

        # =========================================================
        # Undo the letterbox transform (coordinates back to original image).
        # =========================================================
        boxes[:, 0] = (boxes[:, 0] - self.dw) / self.ratio  # cx
        boxes[:, 1] = (boxes[:, 1] - self.dh) / self.ratio  # cy
        boxes[:, 2] = boxes[:, 2] / self.ratio              # w
        boxes[:, 3] = boxes[:, 3] / self.ratio              # h

        # Center (cx, cy) -> top-left (x, y).
        x = boxes[:, 0] - boxes[:, 2] / 2
        y = boxes[:, 1] - boxes[:, 3] / 2
        w = boxes[:, 2]
        h = boxes[:, 3]
        # Unshifted boxes, used for the final output.
        bboxes_original = np.stack([x, y, w, h], axis=1)

        # =========================================================
        # Class-aware NMS via the offset trick: shift each class's boxes by a
        # distinct large offset so boxes of different classes can never
        # overlap — prevents e.g. a "car" box suppressing a "person" box.
        # =========================================================
        max_wh = 4096  # any value larger than the image resolution works
        class_offset = class_pred * max_wh
        # NMS-only coordinates (with the class offset applied).
        bboxes_for_nms = bboxes_original.copy()
        bboxes_for_nms[:, 0] += class_offset
        bboxes_for_nms[:, 1] += class_offset

        # =========================================================
        # Run NMS.
        # =========================================================
        indices = cv2.dnn.NMSBoxes(
            bboxes_for_nms.tolist(),
            conf.tolist(),
            self.conf_threshold,
            self.iou_threshold
        )

        result = []
        if len(indices) > 0:
            # FIX: NMSBoxes may return an (N,1) ndarray, a flat ndarray, or a
            # tuple/list depending on the OpenCV version; `.flatten()` alone
            # crashes on the tuple form. Normalize to a flat index array.
            indices = np.asarray(indices).reshape(-1)
            for i in indices:
                # Read geometry from bboxes_original (without the NMS offset).
                bx, by, bw, bh = bboxes_original[i]
                # Convert to clipped x1, y1, x2, y2 for downstream drawing.
                x1 = np.clip(bx, 0, im0_shape[1])
                y1 = np.clip(by, 0, im0_shape[0])
                x2 = np.clip(bx + bw, 0, im0_shape[1])
                y2 = np.clip(by + bh, 0, im0_shape[0])
                result.append([
                    float(x1), float(y1), float(x2), float(y2),
                    float(conf[i]), int(class_pred[i])
                ])
        return result

    def __call__(self, frame):
        """Run full inference on a BGR frame; returns decoded detections."""
        input_data = self.preprocess(frame)
        pred = self.session.run(None, {self.input_name: input_data})[0]
        results = self.postprocess(pred, frame.shape[:2])
        return results