# File: npu_yolo_onnx.py
"""YOLOv8 ONNX Runtime inference wrapper with Ascend NPU (CANN) support.

Provider preference order is CANN (Ascend NPU) -> CUDA -> CPU; ONNX Runtime
silently falls back down the list when a provider is unavailable.
"""

import cv2
import numpy as np
import onnxruntime as ort
import os
import time


def letterbox(img, new_shape=(640, 640), color=(114, 114, 114)):
    """Resize `img` preserving aspect ratio, then pad to `new_shape`.

    Args:
        img: BGR image, shape (H, W, 3).
        new_shape: target (height, width).
        color: padding color (BGR).

    Returns:
        (padded_img, ratio, (dw, dh)) where `ratio` is the uniform scale
        applied and (dw, dh) are the per-side paddings (may be fractional;
        halves of the total padding).
    """
    shape = img.shape[:2]  # (h, w)
    # Uniform scale so the resized image fits inside new_shape.
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))  # (w, h)
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]
    # Split padding evenly between the two sides.
    dw /= 2
    dh /= 2
    if shape[::-1] != new_unpad:
        img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
    # +/-0.1 rounding trick distributes an odd pixel of padding consistently.
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    img = cv2.copyMakeBorder(img, top, bottom, left, right,
                             cv2.BORDER_CONSTANT, value=color)
    return img, r, (dw, dh)


class YOLOv8_ONNX:
    """YOLOv8 detector backed by an ONNX Runtime session.

    Call the instance with a BGR frame to get a list of detections:
    [x1, y1, x2, y2, confidence, class_id].
    """

    def __init__(self, onnx_path, conf_threshold=0.25, iou_threshold=0.45,
                 input_size=640):
        """Create the inference session, preferring CANN, then CUDA, then CPU.

        Args:
            onnx_path: path to the exported YOLOv8 ONNX model.
            conf_threshold: minimum confidence kept before/after NMS.
            iou_threshold: NMS IoU threshold.
            input_size: square input side, or an explicit (h, w) tuple.
        """
        providers = [
            ("CANNExecutionProvider", {
                "device_id": 0,
                "arena_extend_strategy": "kNextPowerOfTwo",
                "npu_mem_limit": 16 * 1024 * 1024 * 1024,
                "precision_mode": "allow_fp32_to_fp16",
                "op_select_impl_mode": "high_precision",
                "enable_cann_graph": True,
            }),
            "CUDAExecutionProvider",
            "CPUExecutionProvider",
        ]
        self.session = ort.InferenceSession(onnx_path, providers=providers)
        actual_providers = self.session.get_providers()
        print("YOLO Providers:", actual_providers)
        if "CANNExecutionProvider" in actual_providers:
            print("[INFO] YOLO 使用 CANNExecutionProvider(昇腾 NPU)")
        elif 'CUDAExecutionProvider' in actual_providers:
            print("[INFO] YOLO 使用 CUDAExecutionProvider(NVIDIA GPU)")
        else:
            print("[INFO] YOLO 使用 CPUExecutionProvider")
        self.conf_threshold = conf_threshold
        self.iou_threshold = iou_threshold
        self.input_name = self.session.get_inputs()[0].name
        # Normalize int -> square (h, w) tuple.
        self.input_size = ((input_size, input_size)
                           if isinstance(input_size, int) else input_size)
        print(f"模型输入名称: {self.input_name}")
        print(f"模型输入形状: {self.session.get_inputs()[0].shape}")
        print(f"模型输出形状: {self.session.get_outputs()[0].shape}")

    def preprocess(self, img):
        """Letterbox + BGR->RGB + CHW + [0,1] normalize + batch dim.

        Stores the letterbox ratio/offsets on `self` for `postprocess`.
        """
        self.orig_shape = img.shape[:2]
        img, self.ratio, (self.dw, self.dh) = letterbox(img, self.input_size)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = img.transpose(2, 0, 1).astype(np.float32)
        img /= 255.0
        img = np.expand_dims(img, axis=0)
        # FIX: transpose produces a non-contiguous view; ONNX Runtime expects
        # a contiguous buffer (otherwise it copies or some builds reject it).
        return np.ascontiguousarray(img)

    def postprocess(self, pred, im0_shape):
        """Decode raw YOLOv8 output into [x1, y1, x2, y2, conf, cls] boxes.

        Args:
            pred: raw model output, shape [1, 4 + num_classes, num_anchors].
            im0_shape: original frame shape (h, w) used for clipping.

        Returns:
            List of detections in original-image coordinates.
        """
        # 1. Transpose: [1, 4+cls, 8400] -> [8400, 4+cls]
        pred = pred[0].T
        # 2. Split box geometry from class scores.
        boxes = pred[:, :4]   # cx, cy, w, h
        scores = pred[:, 4:]
        # 3. Best class and its confidence per anchor.
        conf = np.max(scores, axis=1)
        class_pred = np.argmax(scores, axis=1)
        # 4. Confidence pre-filter.
        mask = conf > self.conf_threshold
        if not mask.any():
            return []
        boxes = boxes[mask]
        conf = conf[mask]
        class_pred = class_pred[mask]

        # =========================================================
        # Undo the letterbox transform (coordinates back to original image).
        # =========================================================
        boxes[:, 0] = (boxes[:, 0] - self.dw) / self.ratio  # cx
        boxes[:, 1] = (boxes[:, 1] - self.dh) / self.ratio  # cy
        boxes[:, 2] = boxes[:, 2] / self.ratio              # w
        boxes[:, 3] = boxes[:, 3] / self.ratio              # h

        # Center (cx, cy) -> top-left (x, y).
        x = boxes[:, 0] - boxes[:, 2] / 2
        y = boxes[:, 1] - boxes[:, 3] / 2
        w = boxes[:, 2]
        h = boxes[:, 3]
        # Unshifted boxes, used for the final output.
        bboxes_original = np.stack([x, y, w, h], axis=1)

        # =========================================================
        # Class-aware NMS via the offset trick: shift each class's boxes by a
        # distinct large offset so boxes of different classes can never
        # overlap — prevents e.g. a "car" box suppressing a "person" box.
        # =========================================================
        max_wh = 4096  # any value larger than the image resolution works
        class_offset = class_pred * max_wh
        # NMS-only coordinates (with the class offset applied).
        bboxes_for_nms = bboxes_original.copy()
        bboxes_for_nms[:, 0] += class_offset
        bboxes_for_nms[:, 1] += class_offset

        # =========================================================
        # Run NMS.
        # =========================================================
        indices = cv2.dnn.NMSBoxes(
            bboxes_for_nms.tolist(),
            conf.tolist(),
            self.conf_threshold,
            self.iou_threshold
        )

        result = []
        if len(indices) > 0:
            # FIX: NMSBoxes may return an (N,1) ndarray, a flat ndarray, or a
            # tuple/list depending on the OpenCV version; `.flatten()` alone
            # crashes on the tuple form. Normalize to a flat index array.
            indices = np.asarray(indices).reshape(-1)
            for i in indices:
                # Read geometry from bboxes_original (without the NMS offset).
                bx, by, bw, bh = bboxes_original[i]
                # Convert to clipped x1, y1, x2, y2 for downstream drawing.
                x1 = np.clip(bx, 0, im0_shape[1])
                y1 = np.clip(by, 0, im0_shape[0])
                x2 = np.clip(bx + bw, 0, im0_shape[1])
                y2 = np.clip(by + bh, 0, im0_shape[0])
                result.append([
                    float(x1), float(y1), float(x2), float(y2),
                    float(conf[i]), int(class_pred[i])
                ])
        return result

    def __call__(self, frame):
        """Run full inference on a BGR frame; returns decoded detections."""
        input_data = self.preprocess(frame)
        pred = self.session.run(None, {self.input_name: input_data})[0]
        results = self.postprocess(pred, frame.shape[:2])
        return results