import pyaudio
import time
import struct
import wave
import base64
import json
import requests
import webrtcvad
import ffmpeg
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
from tools.log import CustomLogger
import os

current_file_path = os.path.abspath(__file__)
Language_Path = os.path.dirname(os.path.dirname(os.path.dirname(current_file_path)))
MassageRobot_Dobot_Path = os.path.dirname(Language_Path)
print("MassageRobot_Dobot_Path:", MassageRobot_Dobot_Path)
sys.path.append(MassageRobot_Dobot_Path)
from VortXDB.client import VTXClient

# Directory four levels above this file; added to sys.path so sibling
# packages resolve, and used as the root for the temporary WAV file.
parent_path = Path(__file__).resolve().parent.parent.parent.parent
sys.path.append(str(parent_path))
tmp_path = parent_path / 'tmp' / 'speech_audio_16k16bit.wav'


class SpeechRecognizer:
    """Microphone capture + Huawei Cloud SIS short-audio speech recognition.

    Configuration (service URLs, credentials, token, save options) is pulled
    from the VTXClient config store at construction time.
    """

    def __init__(self):
        vtxdb = VTXClient()
        self.audio = pyaudio.PyAudio()
        self.recognize_url = vtxdb.get("robot_config", "Language.Speech_processor.huaweiyun_recognize_config.recognize_url")
        self.token_url = vtxdb.get("robot_config", "Language.Speech_processor.huaweiyun_recognize_config.token_url")
        self.token_access = vtxdb.get("robot_config", "Language.Speech_processor.huaweiyun_recognize_config.token_access")
        self.token_secret = vtxdb.get("robot_config", "Language.Speech_processor.huaweiyun_recognize_config.token_secret")
        self.save_path = vtxdb.get("robot_config", "Language.Speech_processor.huaweiyun_recognize_config.save_path")
        # save_wav toggles between file-based and in-memory audio handling.
        self.save_wav = vtxdb.get("robot_config", "Language.Speech_processor.huaweiyun_recognize_config.save_wav")
        self.logger = CustomLogger()
        # Token is read from config rather than fetched live; see
        # get_Speech_Recognize_token() for the online alternative.
        # self.token = self.get_Speech_Recognize_token()
        self.token = vtxdb.get("robot_config", "Language.Speech_processor.huaweiyun_recognize_config.token_HW")
        # self.logger.log_info(f"token_HW:{self.token}")
        self.token_success = True

    def save_wave_file(self, filename, data):
        """Write raw 16 kHz / 16-bit / mono PCM frames to a WAV file.

        :param filename: destination path.
        :param data: iterable of bytes chunks (as captured by speech_record).
        """
        # Context manager guarantees the handle is closed even if a write fails.
        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(1)      # mono
            wf.setsampwidth(2)      # 16-bit samples
            wf.setframerate(16000)  # 16 kHz
            wf.writeframes(b"".join(data))

    def speech_record(self, timeout=5):
        """Record one utterance from the microphone using WebRTC VAD.

        Listens in 20 ms frames until speech is detected, then stops after
        0.5 s of trailing silence; gives up after `timeout` seconds with no
        speech. Segments shorter than 0.7 s are discarded and listening
        continues.

        :param timeout: seconds to wait before declaring a timeout.
        :return: if self.save_wav is truthy: `if_timeout` (bool) and the
                 audio is written to self.save_path; otherwise the tuple
                 `(recorded_buffer, if_timeout)`.
        """
        try:
            # 320 samples @ 16 kHz = 20 ms per frame, the size WebRTC VAD expects.
            stream = self.audio.open(format=pyaudio.paInt16, channels=1, rate=16000,
                                     input=True, frames_per_buffer=320)
        except OSError as e:
            self.logger.log_info("无法打开麦克风,请检查麦克风连接是否正常。")
            # FIX: original returned bare None here, which the caller either
            # misreads as "no timeout" or fails to unpack. Report as timeout
            # in the same shape as the success paths.
            if self.save_wav:
                return True
            return [], True

        self.logger.log_info("开始检测...")

        vad = webrtcvad.Vad()
        vad.set_mode(3)  # mode 3 = most aggressive filtering (least likely to flag noise as speech)

        SILENCE_DURATION = 0.5     # trailing silence (s) that ends an utterance
        MIN_SPEECH_DURATION = 0.7  # utterances shorter than this (s) are dropped

        speech_detected = False
        speech_start_time = 0
        silence_start_time = 0
        recorded_buffer = []
        detect_start_time = time.time()
        if_timeout = False

        while True:
            data = stream.read(320)
            is_speech = vad.is_speech(data, sample_rate=16000)
            if is_speech:
                if not speech_detected:
                    speech_detected = True
                    speech_start_time = time.time()
                    silence_start_time = 0
                    recorded_buffer = []  # reset audio buffer at utterance start
                recorded_buffer.append(data)
            elif time.time() - detect_start_time > timeout:
                # NOTE(review): the timeout is checked on every non-speech frame,
                # so a long utterance followed by silence past `timeout` also
                # lands here — preserved from the original logic.
                self.logger.log_info("检测超时")
                if_timeout = True
                break

            if speech_detected:
                if not is_speech:
                    if silence_start_time == 0:
                        silence_start_time = time.time()
                    elif time.time() - silence_start_time >= SILENCE_DURATION:
                        if time.time() - speech_start_time < MIN_SPEECH_DURATION:
                            self.logger.log_info("语音片段过短,跳过发送")
                            speech_detected = False
                            continue  # segment too short: drop it, keep listening
                        break
                else:
                    silence_start_time = 0

        stream.stop_stream()
        stream.close()

        if self.save_wav:
            self.save_wave_file(self.save_path, recorded_buffer)
            return if_timeout
        else:
            return recorded_buffer, if_timeout

    def test_token(self):
        """Probe the recognition endpoint with a local fixture WAV to check
        whether the configured token is still valid; sets self.token_success.
        """
        current_file_path = os.path.abspath(__file__)
        current_directory = os.path.dirname(current_file_path)
        file_path = os.path.join(current_directory, "test_token.wav")

        # Read the fixture file and Base64-encode it.
        try:
            with open(file_path, 'rb') as f:
                data = f.read()
                base64_data = base64.b64encode(data).decode('utf-8')
        except FileNotFoundError:
            print(f"错误:文件 {file_path} 未找到!")
            return
        except Exception as e:
            print(f"读取文件失败: {e}")
            return

        # Build the request.
        header = {
            'Content-Type': 'application/json',
            'X-Auth-Token': self.token
        }
        body = {
            'data': base64_data,
            'config': {
                'property': 'chinese_8k_common',
                'audio_format': 'pcm8k16bit'
            }
        }

        # Send the request.
        try:
            # timeout added for consistency with the other HTTP calls in this class.
            resp = requests.post(self.recognize_url, data=json.dumps(body),
                                 headers=header, timeout=10)
            if resp.status_code == 200:
                self.token_success = True
                try:
                    response_json = resp.json()
                    if "result" in response_json:
                        print("识别成功:", response_json["result"])
                    else:
                        print("识别失败,未返回 'result' 字段:", response_json)
                except json.JSONDecodeError:
                    print("服务器返回的不是 JSON 格式:", resp.text)
            else:
                self.token_success = False
                print(f"请求失败,状态码: {resp.status_code}, 响应内容: {resp.text}")
        except requests.exceptions.RequestException as e:
            print(f"请求失败: {e}")
        return

    def speech_recognize(self, timeout=5):
        """Record one utterance and recognize it via Huawei Cloud SIS.

        :param timeout: seconds allowed for speech detection.
        :return: (text, if_timeout, remaining_time) — recognized text ('' on
                 timeout or error), whether detection timed out, and the
                 unused portion of `timeout` (0 on timeout or error).
        """
        try:
            time1 = time.time()
            if self.save_wav:
                if_timeout = self.speech_record(timeout=timeout)
            else:
                recorded_buffer, if_timeout = self.speech_record(timeout=timeout)
            time2 = time.time()
            self.logger.log_info(f'record_time:{time2-time1}')

            text = ''
            remaining_time = 0
            if not if_timeout:
                if self.save_wav:
                    with open(self.save_path, 'rb') as f:
                        audio_data = f.read()
                    base64_data = base64.b64encode(audio_data).decode('utf-8')
                else:
                    base64_data = base64.b64encode(b"".join(recorded_buffer)).decode('utf-8')

                header = {
                    'Content-Type': 'application/json',
                    'X-Auth-Token': self.token
                }
                body = {
                    'data': base64_data,
                    'config': {
                        'property': 'chinese_16k_common',
                        'audio_format': 'pcm16k16bit',
                        'add_punc': 'yes',
                        # 'vocabulary_id': '5f2bb507-2524-4a0d-8ced-2b64ab464099'
                    }
                }
                # Speech recognition request.
                resp = requests.post(self.recognize_url, data=json.dumps(body),
                                     headers=header, timeout=10)
                time3 = time.time()
                json_data = resp.json()
                text = json_data["result"]["text"]
                self.logger.log_info(f"recognize_time:{time3 - time2}")
                # How much of the caller's budget is left.
                total_time_spent = time3 - time1
                remaining_time = max(0, timeout - total_time_spent)
            return text, if_timeout, remaining_time
        except Exception as e:
            self.logger.log_error(f"语音识别问题:{e}")
            # FIX: original implicitly returned None here, crashing callers
            # that unpack three values. Return the error-equivalent tuple.
            return '', True, 0

    def speech_recognize_UI(self, file_path):
        """Transcode a UI-supplied audio file (e.g. mp3) to 16 kHz/16-bit mono
        WAV via ffmpeg, then recognize it.

        :param file_path: path to the input audio file.
        :return: recognized text, or None if any step fails (error is logged).
        """
        try:
            ffmpeg.input(file_path).output(str(tmp_path), acodec='pcm_s16le',
                                           ar='16000', ac=1, y=None).run(cmd='/usr/bin/ffmpeg')
            file_path = str(tmp_path)
            with open(file_path, 'rb') as f:
                data = f.read()
            base64_data = str(base64.b64encode(data), 'utf-8')
            header = {
                'Content-Type': 'application/json',
                'X-Auth-Token': self.token
            }
            body = {
                'data': base64_data,
                'config': {
                    'property': 'chinese_16k_common',
                    'audio_format': 'pcm16k16bit'
                }
            }
            resp = requests.post(self.recognize_url, data=json.dumps(body),
                                 headers=header, timeout=10)
            json_data = resp.json()
            text = json_data["result"]["text"]
            self.logger.log_info(f"UI发送语音识别结果:{text}")
            return text
        except Exception as e:
            self.logger.log_error(f"UI发送mp3转为wav语音识别出现问题{e}")

    def get_Speech_Recognize_token(self):
        """Fetch a fresh IAM token (AK/SK auth, project cn-east-3) for the
        speech-recognition service.

        :return: token string from the X-Subject-Token response header.
        :raises: re-raises any request/parsing failure after logging it.
        """
        payload = json.dumps({
            "auth": {
                "identity": {
                    "methods": ["hw_ak_sk"],
                    "hw_ak_sk": {
                        "access": {"key": self.token_access},
                        "secret": {"key": self.token_secret}
                    }
                },
                "scope": {
                    "project": {"name": "cn-east-3"}
                }
            }
        })
        headers = {'Content-Type': 'application/json'}
        try:
            # NOTE(security): verify=False disables TLS certificate checking
            # for this credential exchange — confirm this is intentional.
            response = requests.request("POST", self.token_url, headers=headers,
                                        data=payload, verify=False)
            self.logger.log_info("Successfully get Speech Recognize token!!!")
            return response.headers["X-Subject-Token"]
        except Exception as e:
            print("Error occurred while getting Speech Recognize token")
            print(f"Exception: {e}")
            self.logger.log_error(f"{e}")
            raise


if __name__ == '__main__':
    import argparse
    from tools.yaml_operator import read_yaml

    def parse_args():
        parser = argparse.ArgumentParser(description='Speech processor')
        parser.add_argument('--recognizer_config_path', type=str,
                            default='Speech_processor/config/huaweiyun_recognize_config.yaml')
        args = parser.parse_args()
        return args

    args = parse_args()
    # config = read_yaml(args.recognizer_config_path)
    recognizer = SpeechRecognizer()
    # Live microphone recognition:
    # text, if_timeout, remaining_time = recognizer.speech_recognize()
    # print(text, if_timeout)
    # File-based recognition:
    # recognizer.speech_recognize_UI("/home/jsfb/jsfb_ws/MassageRobot_Dobot/tmp/speech_audio.mp3")
    recognizer.test_token()