观看这个系列的朋友们,非常抱歉这个系列之后很长一段时间可能要暂停更新了。在这周主播的N6板主控烧了短路了。由于STM32N6570-DK板子渠道很少并且现在非官方渠道价格实在太贵了(博主没有报销途径几乎全是自费)
非常戏剧性的是,就在完成这篇文章的代码的凌晨,我还非常期待第二天拿到线下去测试。
并且就在我的所有代码都完成的时候(参加嵌入式大赛)已经调试完了目标检测、二维云台定位、GPS定位和4G传输,刚准备去实机测试的时候,惨案就发生了。因此二维云台定位的视频都还没来得及拍。也是非常感慨,从24年末拿到这块N6,到自己手上也是经过了这么长时间的使用和探索,但是终究是倒在了黎明之前。
上上期介绍到我们在STM32N6中部署了一个Yolov8n的模型,完成了量化并实现了后处理的过程,本期我们在Yolov8n模型的基础上,扩充我们的数据集。
1、模型制作
首先还是准备我们制作好的yolo格式图片和对应的标签作为训练数据。
import os
from ultralytics import YOLO
def run_embedded_optimization():
    """Fine-tune a stock YOLOv8n model for embedded (STM32N6) deployment.

    Trains at 320x320 (the on-device input resolution) with AdamW and
    early stopping, writing results under runs/detect/STM32N6.
    """
    # 1. Load the pretrained nano model as the starting point
    model = YOLO('yolov8n.pt')
    # 2. Launch training; dataset paths/classes come from dataset.yaml
    model.train(
        data='./dataset.yaml',
        epochs=300,
        imgsz=320,        # must match the 320x320 input fed by DCMIPP on-device
        batch=16,
        name='STM32N6',
        patience=50,      # early-stop after 50 epochs without improvement
        device=0,
        optimizer="AdamW",
        lr0=0.001,
        lrf=0.01,         # final LR = lr0 * lrf
    )


if __name__ == '__main__':
    run_embedded_optimization()
训练一个标准的Yolov8n模型,之后利用训练完的best.pt测试图片是否正确:
import cv2
import os
import glob
from ultralytics import YOLO
def main():
    """Interactive viewer: run best.pt over every image in ./images.

    Keys: SPACE = next, BACKSPACE = previous, r = re-run on same image,
    s = save annotated frame to ./detection_results, ESC = quit.
    """
    model = YOLO('best.pt')
    image_dir = 'images'
    image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.bmp']
    image_files = []
    for ext in image_extensions:
        image_files.extend(glob.glob(os.path.join(image_dir, ext)))
    # Fixed: original had the typo `ifnot`
    if not image_files:
        print(f"在 {image_dir} 目录中没有找到图片文件")
        return
    print(f"找到 {len(image_files)} 张图片用于测试")
    cv2.namedWindow('YOLO Detection', cv2.WINDOW_NORMAL)
    cv2.resizeWindow('YOLO Detection', 800, 600)
    current_index = 0
    while True:
        image_path = image_files[current_index]
        image = cv2.imread(image_path)
        if image is None:
            print(f"无法读取图片: {image_path}")
            current_index = (current_index + 1) % len(image_files)
            continue
        # imgsz=320 mirrors the on-device input size so results are comparable
        results = model.predict(image, conf=0.25, imgsz=320)
        annotated_image = results[0].plot()
        filename = os.path.basename(image_path)
        # Fixed: `filename` was computed but never interpolated in the overlay text
        info_text = f"图片: {filename} ({current_index + 1}/{len(image_files)})"
        cv2.putText(annotated_image, info_text, (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 255), 2)
        # NOTE(review): cv2.putText cannot render CJK glyphs; these overlays
        # will show as '???' unless drawn with PIL — confirm intended.
        help_text = "按 SPACE 键显示下一张 | 按 ESC 键退出"
        cv2.putText(annotated_image, help_text, (10, annotated_image.shape[0] - 20),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 255), 2)
        cv2.imshow('YOLO Detection', annotated_image)
        key = cv2.waitKey(0)
        if key == 27:       # ESC
            break
        elif key == 32:     # SPACE -> next image
            current_index = (current_index + 1) % len(image_files)
        elif key == 8:      # BACKSPACE -> previous (Python % keeps it in range)
            current_index = (current_index - 1) % len(image_files)
        elif key == ord('r'):
            continue        # redraw / re-run same image
        elif key == ord('s'):
            output_dir = 'detection_results'
            os.makedirs(output_dir, exist_ok=True)
            # Fixed: original saved every image as the literal name "detected_(unknown)"
            output_path = os.path.join(output_dir, f"detected_{filename}")
            cv2.imwrite(output_path, annotated_image)
    cv2.destroyAllWindows()


if __name__ == "__main__":
    main()
之后我们利用Yolo官方的导出脚本来导出tflite格式并进行int8量化:
from ultralytics import YOLO

# Export the fine-tuned checkpoint to an int8-quantized TFLite model.
# dataset.yaml supplies the calibration images used for quantization;
# half-precision is disabled because the target consumes full int8.
best_model = YOLO('best.pt')
best_model.export(
    format='tflite',
    imgsz=320,
    data='dataset.yaml',
    int8=True,
    half=False,
)
需要注意的是,这个模型并不是用 TensorFlow 直接导出的,而是利用 Ultralytics 的内置导出功能。目前该方式只能导出为 int8 输入、int8 输出的形式,所以需要对模型的输入和输出分别做预处理和后处理。
2、STM32上的部署和使用
接着在STM32CubeMX中,在CubeAI中间件导入量化好的tflite格式模型
接着配置DCMIPP和CSI,接收摄像头数据并将其中一路作为背景显示,另一路缩放到320*320大小送入神经网络模型进行计算:
/* Configure the DCMIPP hardware downscaler on two pipes:
 *   - PIPE1: camera frame -> 800x480 for the background display
 *   - PIPE2: camera frame -> 320x320 fed to the neural network
 * NOTE(review): the HRatio/VRatio/DivFactor constants appear to be
 * precomputed for a specific sensor resolution — confirm against the
 * CSI camera's native frame size. */
DCMIPP_DownsizeTypeDef DownsizeConf = {0};
DownsizeConf.HRatio = 25656;
DownsizeConf.VRatio = 33161;
DownsizeConf.HSize = 800;
DownsizeConf.VSize = 480;
DownsizeConf.HDivFactor = 316;
DownsizeConf.VDivFactor = 253;
if(HAL_DCMIPP_PIPE_SetDownsizeConfig(&hdcmipp, DCMIPP_PIPE1, &DownsizeConf) != HAL_OK)
{
Error_Handler();
}
if(HAL_DCMIPP_PIPE_EnableDownsize(&hdcmipp, DCMIPP_PIPE1) != HAL_OK)
{
Error_Handler();
}
DownsizeConf.HRatio = 65532; /* horizontal scaling ratio: 65532 */
DownsizeConf.VRatio = 49152; /* vertical scaling ratio: 49152 */
DownsizeConf.HSize = 320; /* output width */
DownsizeConf.VSize = 320; /* output height */
DownsizeConf.HDivFactor = 128; /* horizontal division factor: 128 */
DownsizeConf.VDivFactor = 180; /* vertical division factor: 180 */
if(HAL_DCMIPP_PIPE_SetDownsizeConfig(&hdcmipp, DCMIPP_PIPE2, &DownsizeConf) != HAL_OK)
{
Error_Handler();
}
if(HAL_DCMIPP_PIPE_EnableDownsize(&hdcmipp, DCMIPP_PIPE2) != HAL_OK)
{
Error_Handler();
}
进入到app_x-cube-ai.c文件中,完成输入数据的预处理和输出数据的后处理。
/* Bind the input/output buffers exposed by the generated ATON network
 * interface, capture one 320x320x3 frame from DCMIPP PIPE2 directly into
 * the model input buffer, then convert each pixel from uint8 to the int8
 * representation the quantized model expects. */
uint32_t buff_in_len, buff_out_len;
LL_ATON_RT_RetValues_t ll_aton_rt_ret = LL_ATON_RT_DONE;
const LL_Buffer_InfoTypeDef * ibuffersInfos = NN_Interface_Default.input_buffers_info();
const LL_Buffer_InfoTypeDef * obuffersInfos = NN_Interface_Default.output_buffers_info();
buffer_in = (uint8_t *)LL_Buffer_addr_start(&ibuffersInfos[0]);
buffer_out = (uint8_t *)LL_Buffer_addr_start(&obuffersInfos[0]);
buff_in_len = ibuffersInfos->offset_end - ibuffersInfos->offset_start;
buff_out_len = obuffersInfos->offset_end - obuffersInfos->offset_start;
uint32_t len = 320 * 320 * 3; /* RGB888, one full network input frame */
HAL_DCMIPP_CSI_PIPE_Start(&hdcmipp, DCMIPP_PIPE2, DCMIPP_VIRTUAL_CHANNEL0, (uint32_t)buffer_in, DCMIPP_MODE_SNAPSHOT);
SCB_CleanDCache_by_Addr((uint32_t*)buffer_in, buff_in_len);       /* write back cached lines to memory */
SCB_InvalidateDCache_by_Addr((uint32_t*)buffer_in, buff_in_len);  /* force re-read from memory (DMA wrote it) */
/* NOTE(review): a fixed 5 ms delay assumes the snapshot DMA has finished;
 * waiting on the DCMIPP frame-complete callback would be race-free. */
HAL_Delay(5);
for (uint32_t i = 0; i < len; i++)
{
  /* uint8 [0,255] -> int8 via +128 wrap (equivalent to subtracting the
   * 128 zero point); normalization itself happens inside the model. */
  buffer_in[i] = (buffer_in[i]+128);
}
SCB_CleanDCache_by_Addr((uint32_t*)buffer_in, buff_in_len); /* fixed: original snippet was missing the ';' */
模型接收320*320*3的int8类型输入,归一化步骤是在模型中实现的,这里只需要加上偏置128即可。
/* Run a single inference: initialize the network instance, then step
 * through its epoch blocks until the ATON runtime reports completion,
 * sleeping via WFE whenever the runtime is waiting on a hardware event. */
for (int inferenceNb = 0; inferenceNb < 1; ++inferenceNb) {
  LL_ATON_RT_Init_Network(&NN_Instance_Default); /* initialize passed network instance object */
  do {
    /* Execute first/next step */
    ll_aton_rt_ret = LL_ATON_RT_RunEpochBlock(&NN_Instance_Default);
    /* Wait for next event */
    if (ll_aton_rt_ret == LL_ATON_RT_WFE) {
      LL_ATON_OSAL_WFE();
    }
  } while (ll_aton_rt_ret != LL_ATON_RT_DONE); /* fixed: original snippet was missing ');' and the loop's closing brace */
}
接着等待模型运算完成。
/* Post-process the quantized YOLO output (5 x 2100 values, channel-major:
 * cx, cy, w, h, conf): dequantize, threshold by confidence, convert
 * cxcywh -> corner coordinates, then greedily suppress overlaps (NMS). */
int8_t *floatout = (int8_t *)buffer_out;
float scale = *obuffersInfos->scale;
/* NOTE(review): `off` is read but never used — the hardcoded +116 below
 * is presumably this zero point baked in by hand; verify it matches the
 * exported model's quantization parameters. */
uint16_t off = *obuffersInfos->offset;
int valid_count = 0;
for (int i = 0; i < 2100; ++i) {
  int8_t cx = floatout[i + 0 * 2100];
  int8_t cy = floatout[i + 1 * 2100];
  int8_t w  = floatout[i + 2 * 2100];
  int8_t h  = floatout[i + 3 * 2100];
  /* dequantize: real = scale * (q - zero_point), with zero_point = -116 */
  float conf = (float)(scale * (floatout[i + 4 * 2100] + 116));
  if (conf > 0.7f && valid_count < 2100) {
    float cx_input = (float)(scale * (cx + 116));
    float cy_input = (float)(scale * (cy + 116));
    float w_input  = (float)(scale * (w + 116));
    float h_input  = (float)(scale * (h + 116));
    boxes[valid_count].x1 = cx_input - w_input / 2.0f;
    boxes[valid_count].y1 = cy_input - h_input / 2.0f;
    boxes[valid_count].x2 = cx_input + w_input / 2.0f;
    boxes[valid_count].y2 = cy_input + h_input / 2.0f;
    boxes[valid_count].conf = conf;
    boxes[valid_count].keep = 1;
    valid_count++;
  }
}
/* Greedy pairwise NMS: each surviving box suppresses later boxes whose
 * IoU with it exceeds 0.2. NOTE(review): boxes are in detection order,
 * not sorted by confidence — a higher-confidence box can be suppressed
 * by an earlier lower-confidence one; sort by conf first if that matters. */
for (int i = 0; i < valid_count; i++) {
  if (boxes[i].keep) {
    for (int j = i + 1; j < valid_count; j++) {
      if (boxes[j].keep) {
        float x1 = (boxes[i].x1 > boxes[j].x1) ? boxes[i].x1 : boxes[j].x1;
        float y1 = (boxes[i].y1 > boxes[j].y1) ? boxes[i].y1 : boxes[j].y1;
        float x2 = (boxes[i].x2 < boxes[j].x2) ? boxes[i].x2 : boxes[j].x2;
        float y2 = (boxes[i].y2 < boxes[j].y2) ? boxes[i].y2 : boxes[j].y2;
        /* Fixed: clamp width and height independently. The original
         * computed (x2-x1)*(y2-y1) and clamped only the product, so two
         * boxes disjoint on BOTH axes (negative * negative = positive)
         * produced a bogus positive intersection and wrong suppression. */
        float iw = x2 - x1;
        float ih = y2 - y1;
        if (iw < 0.0f) iw = 0.0f;
        if (ih < 0.0f) ih = 0.0f;
        float intersection = iw * ih;
        float area_i = (boxes[i].x2 - boxes[i].x1) * (boxes[i].y2 - boxes[i].y1);
        float area_j = (boxes[j].x2 - boxes[j].x1) * (boxes[j].y2 - boxes[j].y1);
        float union_area = area_i + area_j - intersection;
        float iou = (union_area > 0) ? (intersection / union_area) : 0;
        if (iou > 0.2f) {
          boxes[j].keep = 0; /* suppress the later overlapping box */
        }
      }
    }
  }
}
后处理步骤主要是对2100*5个数据值的处理,包括反归一化和NMS处理,挑出符合目标的检测框进行显示。
196