观看这个系列的朋友们,非常抱歉这个系列之后很长一段时间可能要暂停更新了。在这周主播的N6板主控烧了短路了。由于STM32N6570-DK板子渠道很少并且现在非官方渠道价格实在太贵了(博主没有报销途径几乎全是自费)
非常戏剧性的是,就在完成这篇文章的代码的凌晨,我还非常期待第二天拿到线下去测试。
并且就在我的所有代码都完成的时候(参加嵌入式大赛)已经调试完了目标检测、二维云台定位、GPS定位和4G传输,刚准备去实机测试的时候,惨案就发生了。因此二维云台定位的视频都还没来得及拍。也是非常感慨,从24年末拿到这块N6,到自己手上也是经过了这么长时间的使用和探索,但是终究是倒在了黎明之前。
上上期介绍到我们在STM32N6中部署了一个Yolov8n的模型,完成了量化并实现了后处理的过程,本期我们在Yolov8n模型的基础上,扩充我们的数据集。
1、模型制作
首先还是准备我们制作好的yolo格式图片和对应的标签作为训练数据。
import os
from ultralytics import YOLO
def run_embedded_optimization():
    """Fine-tune a stock YOLOv8n model for embedded (STM32N6) deployment.

    Trains at 320x320 (the on-device input resolution) with AdamW and
    early stopping, writing results under runs/detect/STM32N6.
    """
    # 1. Load the pretrained nano model as the starting point
    model = YOLO('yolov8n.pt')
    # 2. Launch training; dataset paths/classes come from dataset.yaml
    model.train(
        data='./dataset.yaml',
        epochs=300,
        imgsz=320,        # must match the 320x320 input fed by DCMIPP on-device
        batch=16,
        name='STM32N6',
        patience=50,      # early-stop after 50 epochs without improvement
        device=0,
        optimizer="AdamW",
        lr0=0.001,
        lrf=0.01,         # final LR = lr0 * lrf
    )


if __name__ == '__main__':
    run_embedded_optimization()
训练一个标准的Yolov8n模型,之后利用训练完的best.pt测试图片是否正确:
import cv2
import os
import glob
from ultralytics import YOLO
def main():
    """Interactive viewer: run best.pt over every image in ./images.

    Keys: SPACE = next, BACKSPACE = previous, r = re-run on same image,
    s = save annotated frame to ./detection_results, ESC = quit.
    """
    model = YOLO('best.pt')
    image_dir = 'images'
    image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.bmp']
    image_files = []
    for ext in image_extensions:
        image_files.extend(glob.glob(os.path.join(image_dir, ext)))
    # Fixed: original had the typo `ifnot`
    if not image_files:
        print(f"在 {image_dir} 目录中没有找到图片文件")
        return
    print(f"找到 {len(image_files)} 张图片用于测试")
    cv2.namedWindow('YOLO Detection', cv2.WINDOW_NORMAL)
    cv2.resizeWindow('YOLO Detection', 800, 600)
    current_index = 0
    while True:
        image_path = image_files[current_index]
        image = cv2.imread(image_path)
        if image is None:
            print(f"无法读取图片: {image_path}")
            current_index = (current_index + 1) % len(image_files)
            continue
        # imgsz=320 mirrors the on-device input size so results are comparable
        results = model.predict(image, conf=0.25, imgsz=320)
        annotated_image = results[0].plot()
        filename = os.path.basename(image_path)
        # Fixed: `filename` was computed but never interpolated in the overlay text
        info_text = f"图片: {filename} ({current_index + 1}/{len(image_files)})"
        cv2.putText(annotated_image, info_text, (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 255), 2)
        # NOTE(review): cv2.putText cannot render CJK glyphs; these overlays
        # will show as '???' unless drawn with PIL — confirm intended.
        help_text = "按 SPACE 键显示下一张 | 按 ESC 键退出"
        cv2.putText(annotated_image, help_text, (10, annotated_image.shape[0] - 20),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 255), 2)
        cv2.imshow('YOLO Detection', annotated_image)
        key = cv2.waitKey(0)
        if key == 27:       # ESC
            break
        elif key == 32:     # SPACE -> next image
            current_index = (current_index + 1) % len(image_files)
        elif key == 8:      # BACKSPACE -> previous (Python % keeps it in range)
            current_index = (current_index - 1) % len(image_files)
        elif key == ord('r'):
            continue        # redraw / re-run same image
        elif key == ord('s'):
            output_dir = 'detection_results'
            os.makedirs(output_dir, exist_ok=True)
            # Fixed: original saved every image as the literal name "detected_(unknown)"
            output_path = os.path.join(output_dir, f"detected_{filename}")
            cv2.imwrite(output_path, annotated_image)
    cv2.destroyAllWindows()


if __name__ == "__main__":
    main()
之后我们利用Yolo官方的导出脚本来导出tflite格式并进行int8量化:
from ultralytics import YOLO

# Export the fine-tuned checkpoint to an int8-quantized TFLite model.
# dataset.yaml supplies the calibration images used for quantization;
# half-precision is disabled because the target consumes full int8.
best_model = YOLO('best.pt')
best_model.export(
    format='tflite',
    imgsz=320,
    data='dataset.yaml',
    int8=True,
    half=False,
)
需要注意的是,这个模型并不是用 TensorFlow 直接导出的,而是利用 Ultralytics 的内置导出功能。目前该方式只能导出为 int8 输入、int8 输出的形式,所以需要对模型的输入和输出分别做预处理和后处理。
2、STM32上的部署和使用
接着在STM32CubeMX中,在CubeAI中间件导入量化好的tflite格式模型
接着配置DCMIPP和CSI,接收摄像头数据并将其中一路作为背景显示,另一路缩放到320*320大小送入神经网络模型进行计算:
/* Configure the DCMIPP hardware downscaler on two pipes:
 *   - PIPE1: camera frame -> 800x480 for the background display
 *   - PIPE2: camera frame -> 320x320 fed to the neural network
 * NOTE(review): the HRatio/VRatio/DivFactor constants appear to be
 * precomputed for a specific sensor resolution — confirm against the
 * CSI camera's native frame size. */
DCMIPP_DownsizeTypeDef DownsizeConf = {0};
DownsizeConf.HRatio = 25656;
DownsizeConf.VRatio = 33161;
DownsizeConf.HSize = 800;
DownsizeConf.VSize = 480;
DownsizeConf.HDivFactor = 316;
DownsizeConf.VDivFactor = 253;
if(HAL_DCMIPP_PIPE_SetDownsizeConfig(&hdcmipp, DCMIPP_PIPE1, &DownsizeConf) != HAL_OK)
{
Error_Handler();
}
if(HAL_DCMIPP_PIPE_EnableDownsize(&hdcmipp, DCMIPP_PIPE1) != HAL_OK)
{
Error_Handler();
}
DownsizeConf.HRatio = 65532; /* horizontal scaling ratio: 65532 */
DownsizeConf.VRatio = 49152; /* vertical scaling ratio: 49152 */
DownsizeConf.HSize = 320; /* output width */
DownsizeConf.VSize = 320; /* output height */
DownsizeConf.HDivFactor = 128; /* horizontal division factor: 128 */
DownsizeConf.VDivFactor = 180; /* vertical division factor: 180 */
if(HAL_DCMIPP_PIPE_SetDownsizeConfig(&hdcmipp, DCMIPP_PIPE2, &DownsizeConf) != HAL_OK)
{
Error_Handler();
}
if(HAL_DCMIPP_PIPE_EnableDownsize(&hdcmipp, DCMIPP_PIPE2) != HAL_OK)
{
Error_Handler();
}
进入到app_x-cube-ai.c文件中,完成输入数据的预处理和输出数据的后处理。
/* Bind the input/output buffers exposed by the generated ATON network
 * interface, capture one 320x320x3 frame from DCMIPP PIPE2 directly into
 * the model input buffer, then convert each pixel from uint8 to the int8
 * representation the quantized model expects. */
uint32_t buff_in_len, buff_out_len;
LL_ATON_RT_RetValues_t ll_aton_rt_ret = LL_ATON_RT_DONE;
const LL_Buffer_InfoTypeDef * ibuffersInfos = NN_Interface_Default.input_buffers_info();
const LL_Buffer_InfoTypeDef * obuffersInfos = NN_Interface_Default.output_buffers_info();
buffer_in = (uint8_t *)LL_Buffer_addr_start(&ibuffersInfos[0]);
buffer_out = (uint8_t *)LL_Buffer_addr_start(&obuffersInfos[0]);
buff_in_len = ibuffersInfos->offset_end - ibuffersInfos->offset_start;
buff_out_len = obuffersInfos->offset_end - obuffersInfos->offset_start;
uint32_t len = 320 * 320 * 3; /* RGB888, one full network input frame */
HAL_DCMIPP_CSI_PIPE_Start(&hdcmipp, DCMIPP_PIPE2, DCMIPP_VIRTUAL_CHANNEL0, (uint32_t)buffer_in, DCMIPP_MODE_SNAPSHOT);
SCB_CleanDCache_by_Addr((uint32_t*)buffer_in, buff_in_len);       /* write back cached lines to memory */
SCB_InvalidateDCache_by_Addr((uint32_t*)buffer_in, buff_in_len);  /* force re-read from memory (DMA wrote it) */
/* NOTE(review): a fixed 5 ms delay assumes the snapshot DMA has finished;
 * waiting on the DCMIPP frame-complete callback would be race-free. */
HAL_Delay(5);
for (uint32_t i = 0; i < len; i++)
{
  /* uint8 [0,255] -> int8 via +128 wrap (equivalent to subtracting the
   * 128 zero point); normalization itself happens inside the model. */
  buffer_in[i] = (buffer_in[i]+128);
}
SCB_CleanDCache_by_Addr((uint32_t*)buffer_in, buff_in_len); /* fixed: original snippet was missing the ';' */
模型接收320*320*3的int8类型输入,归一化步骤是在模型中实现的,这里只需要加上偏置128即可。
/* Run a single inference: initialize the network instance, then step
 * through its epoch blocks until the ATON runtime reports completion,
 * sleeping via WFE whenever the runtime is waiting on a hardware event. */
for (int inferenceNb = 0; inferenceNb < 1; ++inferenceNb) {
  LL_ATON_RT_Init_Network(&NN_Instance_Default); /* initialize passed network instance object */
  do {
    /* Execute first/next step */
    ll_aton_rt_ret = LL_ATON_RT_RunEpochBlock(&NN_Instance_Default);
    /* Wait for next event */
    if (ll_aton_rt_ret == LL_ATON_RT_WFE) {
      LL_ATON_OSAL_WFE();
    }
  } while (ll_aton_rt_ret != LL_ATON_RT_DONE); /* fixed: original snippet was missing ');' and the loop's closing brace */
}
接着等待模型运算完成。
/* Post-process the quantized YOLO output (5 x 2100 values, channel-major:
 * cx, cy, w, h, conf): dequantize, threshold by confidence, convert
 * cxcywh -> corner coordinates, then greedily suppress overlaps (NMS). */
int8_t *floatout = (int8_t *)buffer_out;
float scale = *obuffersInfos->scale;
/* NOTE(review): `off` is read but never used — the hardcoded +116 below
 * is presumably this zero point baked in by hand; verify it matches the
 * exported model's quantization parameters. */
uint16_t off = *obuffersInfos->offset;
int valid_count = 0;
for (int i = 0; i < 2100; ++i) {
  int8_t cx = floatout[i + 0 * 2100];
  int8_t cy = floatout[i + 1 * 2100];
  int8_t w  = floatout[i + 2 * 2100];
  int8_t h  = floatout[i + 3 * 2100];
  /* dequantize: real = scale * (q - zero_point), with zero_point = -116 */
  float conf = (float)(scale * (floatout[i + 4 * 2100] + 116));
  if (conf > 0.7f && valid_count < 2100) {
    float cx_input = (float)(scale * (cx + 116));
    float cy_input = (float)(scale * (cy + 116));
    float w_input  = (float)(scale * (w + 116));
    float h_input  = (float)(scale * (h + 116));
    boxes[valid_count].x1 = cx_input - w_input / 2.0f;
    boxes[valid_count].y1 = cy_input - h_input / 2.0f;
    boxes[valid_count].x2 = cx_input + w_input / 2.0f;
    boxes[valid_count].y2 = cy_input + h_input / 2.0f;
    boxes[valid_count].conf = conf;
    boxes[valid_count].keep = 1;
    valid_count++;
  }
}
/* Greedy pairwise NMS: each surviving box suppresses later boxes whose
 * IoU with it exceeds 0.2. NOTE(review): boxes are in detection order,
 * not sorted by confidence — a higher-confidence box can be suppressed
 * by an earlier lower-confidence one; sort by conf first if that matters. */
for (int i = 0; i < valid_count; i++) {
  if (boxes[i].keep) {
    for (int j = i + 1; j < valid_count; j++) {
      if (boxes[j].keep) {
        float x1 = (boxes[i].x1 > boxes[j].x1) ? boxes[i].x1 : boxes[j].x1;
        float y1 = (boxes[i].y1 > boxes[j].y1) ? boxes[i].y1 : boxes[j].y1;
        float x2 = (boxes[i].x2 < boxes[j].x2) ? boxes[i].x2 : boxes[j].x2;
        float y2 = (boxes[i].y2 < boxes[j].y2) ? boxes[i].y2 : boxes[j].y2;
        /* Fixed: clamp width and height independently. The original
         * computed (x2-x1)*(y2-y1) and clamped only the product, so two
         * boxes disjoint on BOTH axes (negative * negative = positive)
         * produced a bogus positive intersection and wrong suppression. */
        float iw = x2 - x1;
        float ih = y2 - y1;
        if (iw < 0.0f) iw = 0.0f;
        if (ih < 0.0f) ih = 0.0f;
        float intersection = iw * ih;
        float area_i = (boxes[i].x2 - boxes[i].x1) * (boxes[i].y2 - boxes[i].y1);
        float area_j = (boxes[j].x2 - boxes[j].x1) * (boxes[j].y2 - boxes[j].y1);
        float union_area = area_i + area_j - intersection;
        float iou = (union_area > 0) ? (intersection / union_area) : 0;
        if (iou > 0.2f) {
          boxes[j].keep = 0; /* suppress the later overlapping box */
        }
      }
    }
  }
}
后处理步骤主要是对2100*5个数据值的处理,包括反归一化和NMS处理,挑出符合目标的检测框进行显示。
196