本帖最后由 donatello 于 2025-3-3 15:31 编辑
1. 作品简介:作品采用在得捷商城购买的树莓派5 8GB内存开发板,通过TTL串口与SX1278模块进行通信,并接入一些必要的外部硬件模块;软件程序接入百度语音云,从而实现一个集无线通信、语音识别与反馈控制于一体的系统。
2.系统框图(图文结合)
树莓派5 8GB内存开发板通过TTL串口与SX1278模块进行通信,模块的模拟输出端接功放模块,功放模块接树莓派5 8GB内存开发板的GPIO模拟输入引脚,另一SX1278模块通过无线信号与SX1278模块进行通信,并接入一个麦克风模块用于声音录入。树莓派5 开发板通过摄像头麦克风录入声音,通过以太网连接百度语音云实现反馈控制。
3.流程图(图文结合)
树莓派5处理无线模块的声音模拟数据是以帧(frame)为单位的,每一帧可以是256/512/1024/2048字节;每一帧数据量越大,读取一帧数据的时间就越长。以本作品采用的1024字节一帧为例,每秒大约可以处理16帧数据,也就是16384字节的声音模拟数据。这一指标也与设备的码率挂钩:以USB摄像头自带的麦克风为例,市面上常用的USB免驱摄像头采集声音模拟数据时一般采用16384或者44000的码率,码率越大,每秒发给USB主机的声音模拟数据量就越多,但是声音质量也会有所下降。树莓派5接收到一帧声音模拟数据之后,立刻进行FFT。众所周知,FFT的结果是频域结果,横轴是频率,纵轴是强度。现在的AI声纹大模型就是对频域结果进行处理,通过分析每个频率的强度得出多方面的信息,比如男声还是女声、汉字的平仄、英文的语气等等,这是一套非常复杂且完善的流程。我这里只做简单的声纹判断,也就是区分人的语音和环境杂音,只要有人说话就开始进行语音识别:
有人语音的帧即为有效帧,将所有连续的有效帧保存就能生成语音文件。
百度语音云的短语音识别是通过上传WAV声音文件来实现的,因此只需将上述步骤生成的、只包含有效帧的语音文件通过TCP协议发送到百度语音云平台,即可完成语音识别;树莓派5开发板再根据平台返回的识别内容进行相应操作。
4.各部分功能说明(图文结合)首先是将有效帧识别并保存为WAV文件的步骤:
- // Record audio frames into SAVE_WAV_FILE and draw each frame's spectrum on the LCD.
- // This excerpt uses names declared outside it: i, count, pcm, handle, pcm_handle,
- // sample_rate, frames_per_period and buffer_fft_seg_aver_64_sum.
- FILE *file = fopen(SAVE_WAV_FILE , "wb");
- if (!file)
- {
-     printf("ERROR: Can't open output file.\n");
-     return -1;
- }
- // First header is written with a last argument of 0; it is rewritten at the end with
- // total_bytes (presumably the data length -- confirm against Write_Wav_Header).
- Write_Wav_Header(file, CHANNELS, sample_rate, BITS_PER_SAMPLE, 0);
- int buffer_size = frames_per_period * CHANNELS * BITS_PER_SAMPLE / 8;
- //printf("frames_per_period = %d.\n" , frames_per_period);
- //printf("buffer_size = %d.\n" , buffer_size);
- float buffer_float[1024];
- float buffer_fft[512];
- uint32_t buffer_fft_256[256];
- uint32_t buffer_fft_seg_aver_64[64];
- // The loops below index the fixed-size arrays with values derived from buffer_size
- // (buffer_size, buffer_size / 4), so a larger size would write past their ends.
- if (buffer_size <= 0 || buffer_size > (int) (sizeof(buffer_float) / sizeof(buffer_float[0])))
- {
-     printf("ERROR: Unsupported buffer size %d.\n" , buffer_size);
-     fclose(file);
-     return -1;
- }
- // Cast kept on purpose: the surrounding project may be compiled as C++.
- char *buffer = (char *) malloc(buffer_size);
- if (!buffer)
- {
-     printf("ERROR: Can't allocate capture buffer.\n");
-     fclose(file);
-     return -1;
- }
- int total_bytes = 0;
- count = 0;
- // NOTE(review): the bound uses the SAMPLE_RATE macro while the header uses the
- // sample_rate variable -- confirm they are meant to be the same value.
- while (total_bytes < SAMPLE_RATE * CHANNELS * BITS_PER_SAMPLE / 8)
- //while(1)
- {
-     // The fill value of memset is an int; 0 clears the array (the original passed NULL).
-     memset(buffer_fft_seg_aver_64_sum , 0 , sizeof(buffer_fft_seg_aver_64_sum));
-     pcm = gpio_ad_read(handle , buffer , frames_per_period);
-     // Every byte of the frame becomes one float input value for the FFT.
-     for(i = 0 ; i < buffer_size ; i ++)
-     {
-         //printf("%d " , buffer[i]);
-         buffer_float[i] = buffer[i] * 1.0;
-     }
-     FFTW_Mag_Test(buffer_float , buffer_fft);
-     //printf("buffer_size = %d.\n" , buffer_size);
-     //buffer_size == 1024
-     //buffer_fft_32[512]
-     //buffer_fft_seg_aver_16[32]
-     // Scale the first buffer_size / 4 FFT values down to integers for the line graph.
-     for(i = 0 ; i < buffer_size / 4 ; i ++)
-     {
-         buffer_fft_256[i] = (uint32_t)(buffer_fft[i] / 50.0);
-     }
-     // Collapse each group of 4 adjacent FFT values into one bar; the same group sum is
-     // stored twice with different scale factors (320 and 200).
-     for(i = 0 ; i < buffer_size / 4 ; i += 4)
-     {
-         float group_sum = buffer_fft[i] +
-                           buffer_fft[i + 1] +
-                           buffer_fft[i + 2] +
-                           buffer_fft[i + 3];
-         buffer_fft_seg_aver_64[i / 4] = (uint32_t)(group_sum / 320.0);
-         buffer_fft_seg_aver_64_sum[i / 4] += (uint32_t)(group_sum / 200.0);
-         //printf("buffer_fft_seg_aver_8 i:%d %d.\n" , i / 8 , buffer_fft_seg_aver_8[i / 8]);
-     }
-     LCD_Waveform_Graph_Whole_Add_Single(100 , 256 , 20 , 300 , buffer_fft_256 , LCD_COLOR_WHITE_32 , 0 , LCD_COLOR_RED_32 , 1);
-     LCD_Waveform_Chart(100 , 1700 , 660 , 300 , buffer_fft_seg_aver_64_sum , 64 , 22 , 5 , LCD_COLOR_WHITE_32 , 0 , LCD_COLOR_BLUE_32);
-
-     // Axis labels drawn one character at a time: 0, 5, 10, 15, 20, 30, 40, 50, 60, 63.
-     LCD_Show_ASCII_32(100 , 980 , LCD_COLOR_BLUE_32 , LCD_COLOR_CYAN_32 , '0');
-     LCD_Show_ASCII_32(222 , 980 , LCD_COLOR_BLUE_32 , LCD_COLOR_CYAN_32 , '5');
-     LCD_Show_ASCII_32(330 , 980 , LCD_COLOR_BLUE_32 , LCD_COLOR_CYAN_32 , '1');
-     LCD_Show_ASCII_32(342 , 980 , LCD_COLOR_BLUE_32 , LCD_COLOR_CYAN_32 , '0');
-     LCD_Show_ASCII_32(444 , 980 , LCD_COLOR_BLUE_32 , LCD_COLOR_CYAN_32 , '1');
-     LCD_Show_ASCII_32(460 , 980 , LCD_COLOR_BLUE_32 , LCD_COLOR_CYAN_32 , '5');
-     LCD_Show_ASCII_32(558 , 980 , LCD_COLOR_BLUE_32 , LCD_COLOR_CYAN_32 , '2');
-     LCD_Show_ASCII_32(574 , 980 , LCD_COLOR_BLUE_32 , LCD_COLOR_CYAN_32 , '0');
-     LCD_Show_ASCII_32(788 , 980 , LCD_COLOR_BLUE_32 , LCD_COLOR_CYAN_32 , '3');
-     LCD_Show_ASCII_32(804 , 980 , LCD_COLOR_BLUE_32 , LCD_COLOR_CYAN_32 , '0');
-     LCD_Show_ASCII_32(1018 , 980 , LCD_COLOR_BLUE_32 , LCD_COLOR_CYAN_32 , '4');
-     LCD_Show_ASCII_32(1034 , 980 , LCD_COLOR_BLUE_32 , LCD_COLOR_CYAN_32 , '0');
-     LCD_Show_ASCII_32(1248 , 980 , LCD_COLOR_BLUE_32 , LCD_COLOR_CYAN_32 , '5');
-     LCD_Show_ASCII_32(1264 , 980 , LCD_COLOR_BLUE_32 , LCD_COLOR_CYAN_32 , '0');
-     LCD_Show_ASCII_32(1480 , 980 , LCD_COLOR_BLUE_32 , LCD_COLOR_CYAN_32 , '6');
-     LCD_Show_ASCII_32(1496 , 980 , LCD_COLOR_BLUE_32 , LCD_COLOR_CYAN_32 , '0');
-     LCD_Show_ASCII_32(1548 , 980 , LCD_COLOR_BLUE_32 , LCD_COLOR_CYAN_32 , '6');
-     LCD_Show_ASCII_32(1564 , 980 , LCD_COLOR_BLUE_32 , LCD_COLOR_CYAN_32 , '3');
-
-     LCD_Effect_32("/dev/fb0");
-     // NOTE(review): the read result is only examined here, after the FFT and drawing
-     // above have already used the buffer; a failed read is processed with stale data.
-     // A read that keeps failing also never advances total_bytes, so the loop would not
-     // terminate -- confirm whether a retry limit is wanted.
-     if (pcm == -EPIPE)
-     {
-         printf("ERROR: XRUN.\n");
-         // NOTE(review): reads use `handle` but recovery uses `pcm_handle` -- confirm
-         // both refer to the intended device.
-         snd_pcm_prepare(pcm_handle);
-     } else if (pcm < 0)
-     {
-         printf("ERROR: Can't read from PCM device.\n");
-     }
-     else
-     {
-         // Count only what actually reached the file so the final header stays truthful.
-         size_t written = fwrite(buffer, 1, buffer_size, file);
-         total_bytes += (int) written;
-         if (written < (size_t) buffer_size)
-         {
-             printf("ERROR: Short write to output file.\n");
-             break;
-         }
-         count ++;
-         //printf("count = %d total_bytes = %d\n" , count , total_bytes);
-     }
-     //usleep(50*1000);
- }
- uint32_t sum_6_and_8 = 0;
- uint32_t sum_10_31 = 0;
- uint32_t sum_60_62 = 0;
- // The bars are cleared at the top of every iteration, so these are the values of the
- // last frame handled by the loop.
- sum_6_and_8 = buffer_fft_seg_aver_64_sum[6] + buffer_fft_seg_aver_64_sum[8];
-
- // for(i = 1 ; i <= 4 ; i ++)
- // {
- // //printf("i = %d %d.\n" , i , buffer_fft_seg_aver_8_sum[i]);
- // sum_1_4 += buffer_fft_seg_aver_8_sum[i];
- // }
- // for(i = 10 ; i <= 31 ; i ++)
- // {
- // //printf("i = %d %d.\n" , i , buffer_fft_seg_aver_8_sum[i]);
- // sum_10_31 += buffer_fft_seg_aver_8_sum[i];
- // }
- printf("%d %d %d %d .\n" , buffer_fft_seg_aver_64_sum[2] , buffer_fft_seg_aver_64_sum[4] , buffer_fft_seg_aver_64_sum[6]
-        , buffer_fft_seg_aver_64_sum[8]);
- //printf("1 = %d 2 = %d 3 = %d 4 = %d.\n" , buffer_fft_seg_aver_8_sum[1] , buffer_fft_seg_aver_8_sum[2] ,
- //buffer_fft_seg_aver_8_sum[3] , buffer_fft_seg_aver_8_sum[4]);
- // if(sum_1_2 >= 9000 && sum_12_22 >= 17000 && sum_12_22 <= 30000 && sum_49_54 >= 9500 && sum_60_62 >= 9000)
- // {
- // printf("Human Voice 1 , while.\n");
- // }
- // else if(sum_1_2 >= 7500 && sum_12_22 >= 20000 && sum_12_22 <= 30000 && sum_49_54 >= 11500 && sum_60_62 >= 8000)
- // {
- // printf("Human Voice 2 , while.\n");
- // }
- // else if(sum_1_2 >= 6500 && sum_12_22 >= 23000 && sum_12_22 <= 25000 && sum_49_54 >= 12000 && sum_60_62 >= 8500)
- // {
- // printf("Human Voice 3 , while.\n");
- // }
- // else if(sum_1_2 >= 8000 && sum_12_22 >= 21000 && sum_12_22 <= 30000 && sum_49_54 >= 10000 && sum_60_62 >= 9000)
- // {
- // printf("Human Voice 4 , while.\n");
- // }
- // Go back to the start of the file and rewrite the header with the byte count written.
- fseek(file, 0, SEEK_SET);
- Write_Wav_Header(file , CHANNELS , sample_rate , BITS_PER_SAMPLE , total_bytes);
- free(buffer);
- fclose(file);
复制代码
上述代码中FFTW_Mag_Test()函数就是执行FFT的函数,将时域信号转为频域信号。FFT结果数组为buffer_fft[],是浮点类型的,需要转为整型才可以进行汇总;汇总结果数组为buffer_fft_seg_aver_64_sum[],其每个元素是buffer_fft[]数组中相邻若干个数据的求和结果,也就是信号处理领域常见的频谱数组。
然后是接入百度语音云并进行语音识别的函数,这个直接照抄百度语音云给出的示例代码即可:
- // Upload an audio file to the speech-recognition endpoint (baidu_tsr_url) and store the
- // first entry of the JSON "result" array in the global baidu_xtts_string.
- //
- // baidu_api_key / baidu_secret_key / baidu_tsr_scope: passed to speech_get_token().
- // baidu_cuid:         client id, URL-escaped and sent as the `cuid` query parameter.
- // baidu_tsr_filename: path of the audio file to upload.
- // baidu_tsr_format:   audio format name placed in the Content-Type header.
- // baidu_tsr_dev_pid:  sent as the `dev_pid` query parameter.
- // baidu_tsr_rate:     sample rate placed in the Content-Type header.
- //
- // Returns ERROR_ASR_FILE_NOT_EXIST when the file cannot be opened, the result of
- // speech_get_token() when it is not RETURN_OK, ERROR_ASR_CURL when the transfer
- // fails, and RETURN_OK otherwise. On failure g_demo_error_msg holds a description.
- RETURN_CODE Baidu_TSR(char baidu_api_key[] , char baidu_secret_key[] , char baidu_cuid[] , char baidu_tsr_filename[] , char baidu_tsr_format[] , uint32_t baidu_tsr_dev_pid ,
-                       char baidu_tsr_scope[] , uint32_t baidu_tsr_rate)
- {
-     char token[MAX_TOKEN_SIZE];
-     char url[300];
-     char header[50];
-     Json::Reader reader;
-     Json::Value root;
-     RETURN_CODE res;
-
-     // Open the file before initialising curl so the early return below does not leave
-     // curl_global_init() without a matching curl_global_cleanup(). Audio is binary
-     // data, hence "rb".
-     FILE *fp = fopen(baidu_tsr_filename , "rb");
-     if (fp == NULL)
-     {
-         char cwd[200];
-         if (getcwd(cwd , sizeof(cwd)) == NULL)
-         {
-             cwd[0] = '\0'; // keep the message well-defined when the directory is unknown
-         }
-         snprintf(g_demo_error_msg , BUFFER_ERROR_SIZE,
-                  "current running directory does not contain file %s, %s", baidu_tsr_filename , cwd);
-         return ERROR_ASR_FILE_NOT_EXIST;
-     }
-     curl_global_init(CURL_GLOBAL_ALL);
-
-     res = speech_get_token(baidu_api_key , baidu_secret_key , baidu_tsr_scope , token);
-     if (res == RETURN_OK)
-     {
-         CURL *curl = curl_easy_init();
-         if (curl == NULL)
-         {
-             snprintf(g_demo_error_msg, BUFFER_ERROR_SIZE, "curl_easy_init failed.\n");
-             res = ERROR_ASR_CURL;
-         }
-         else
-         {
-             // Put the escaped cuid into the URL (the original escaped it, then sent the
-             // raw string anyway). Fall back to the raw value if escaping fails.
-             char *cuid = curl_easy_escape(curl , baidu_cuid , (int) strlen(baidu_cuid));
-             snprintf(url, sizeof(url), "%s?cuid=%s&token=%s&dev_pid=%u",
-                      baidu_tsr_url , (cuid != NULL) ? cuid : baidu_cuid , token , (unsigned int) baidu_tsr_dev_pid);
-             curl_free(cuid);
-             struct curl_slist *headerlist = NULL;
-             snprintf(header, sizeof(header), "Content-Type: audio/%s; rate=%u", baidu_tsr_format ,
-                      (unsigned int) baidu_tsr_rate);
-             headerlist = curl_slist_append(headerlist, header);
-             int content_len = 0;
-             char *result = NULL;
-             char *audio_data = read_file_data(fp , &content_len);
-             // Numeric curl options are read as `long`; pass long values explicitly.
-             curl_easy_setopt(curl, CURLOPT_URL, url);
-             curl_easy_setopt(curl, CURLOPT_POST, 1L);
-             curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 5L); // 5 s connect timeout
-             curl_easy_setopt(curl, CURLOPT_TIMEOUT, 60L); // 60 s timeout for the whole request
-             curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headerlist); // adds the Content-Type header
-             curl_easy_setopt(curl, CURLOPT_POSTFIELDS, audio_data);
-             curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, (long) content_len);
-             curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writefunc_tsr);
-             curl_easy_setopt(curl, CURLOPT_WRITEDATA, &result);
-             CURLcode res_curl = curl_easy_perform(curl);
-             baidu_xtts_string.clear();
-             if (res_curl != CURLE_OK)
-             {
-                 // Report the curl error code itself; `res` is still RETURN_OK here.
-                 snprintf(g_demo_error_msg, BUFFER_ERROR_SIZE, "perform curl error:%d, %s.\n" , (int) res_curl , curl_easy_strerror(res_curl));
-                 res = ERROR_ASR_CURL;
-             }
-             else
-             {
-                 //printf("result = %s\n", result);
-                 // result is still NULL if it was never assigned through WRITEDATA;
-                 // do not hand a null pointer to the parser.
-                 if (result != NULL && reader.parse(result , root))
-                 {
-                     baidu_xtts_string = root["result"][0].asString();
-                     printf("baidu_xtts_string = %s.\n" , baidu_xtts_string.c_str());
-                 }
-             }
-             curl_slist_free_all(headerlist);
-             free(audio_data);
-             free(result);
-             curl_easy_cleanup(curl);
-         }
-     }
-
-     fclose(fp);
-     curl_global_cleanup();
-     return res;
- }
复制代码
最后就是根据语音返回结果来进行各种操作了,这个就不多赘述,只是简单的字符串判断。
5. 结语:总的来说,树莓派5性能强大,无论是处理语音数据输入、运行FFT算法,还是访问百度语音云,都非常轻松,整体流程非常顺畅,不存在卡顿的情况;而且由于其本身是MPU,性能比MCU平台强大得多。非常感谢主办方得捷和协办方与非网,也感谢协办方工作人员&联络员Seven,日天兄辛苦了。祝得捷平台、与非网平台今后举办的活动更加红火,让广大的电子爱好者充分发挥才能。