300 lines
12 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pyaudio
import time
import struct
import wave
import base64
import json
import requests
import webrtcvad
import ffmpeg
import sys
from pathlib import Path
# Make the directory three levels above this file importable; presumably the
# `tools` package imported on the next line lives there -- TODO confirm.
sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
from tools.log import CustomLogger
import os
current_file_path = os.path.abspath(__file__)
# dirname() applied three times: the directory three levels above this file.
Language_Path = os.path.dirname(os.path.dirname(os.path.dirname(current_file_path)))
# One level further up; printed at import time and added to sys.path so that
# the `VortXDB` import below can be attempted from there.
MassageRobot_Dobot_Path = os.path.dirname(Language_Path)
print("MassageRobot_Dobot_Path:",MassageRobot_Dobot_Path)
sys.path.append(MassageRobot_Dobot_Path)
from VortXDB.client import VTXClient
# Directory three levels above this file's parent directory (four `.parent`
# hops from the file itself).
parent_path = Path(__file__).resolve().parent.parent.parent.parent
# print(parent_path)
# Add that directory to sys.path as well.
# NOTE(review): this looks like the same directory as MassageRobot_Dobot_Path,
# differing only in resolve() vs abspath() symlink handling -- confirm.
sys.path.append(str(parent_path))
# Output file of the ffmpeg conversion performed in SpeechRecognizer.speech_recognize_UI.
tmp_path = parent_path / 'tmp' / 'speech_audio_16k16bit.wav'
class SpeechRecognizer:
    """Record speech from the microphone and send it to a recognition service.

    All settings (service URLs, credentials, the access token and the
    recording options) are read from the ``robot_config`` table of VortXDB
    under the ``Language.Speech_processor.huaweiyun_recognize_config`` prefix.
    """

    _CONFIG_PREFIX = "Language.Speech_processor.huaweiyun_recognize_config"
    _SAMPLE_RATE = 16000      # Hz, used for capture and for the saved WAV
    _FRAME_SAMPLES = 320      # samples per read: 20 ms at 16 kHz
    _REQUEST_TIMEOUT = 10     # seconds, applied to every HTTP request

    def __init__(self):
        """Load the configuration and create the PyAudio and logger objects."""
        vtxdb = VTXClient()

        def cfg(key):
            # Every setting lives under the same key prefix.
            return vtxdb.get("robot_config", f"{self._CONFIG_PREFIX}.{key}")

        self.audio = pyaudio.PyAudio()
        self.recognize_url = cfg("recognize_url")
        self.token_url = cfg("token_url")
        self.token_access = cfg("token_access")
        self.token_secret = cfg("token_secret")
        self.save_path = cfg("save_path")
        self.save_wav = cfg("save_wav")
        self.logger = CustomLogger()
        # The token is taken from the configuration instead of being requested
        # at start-up through get_Speech_Recognize_token().
        self.token = cfg("token_HW")
        # Updated by test_token() according to the HTTP status it receives.
        self.token_success = True

    def save_wave_file(self, filename, data):
        """Write *data* to *filename* as a mono, 16-bit, 16 kHz WAV file.

        Args:
            filename: Destination path of the WAV file.
            data: Iterable of ``bytes`` chunks holding the raw samples.
        """
        # Join first so that invalid data fails before the target is truncated.
        payload = b"".join(data)
        # The context manager closes the file even if writing fails.
        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(1)                    # mono
            wf.setsampwidth(2)                    # 2 bytes = 16-bit samples
            wf.setframerate(self._SAMPLE_RATE)
            wf.writeframes(payload)

    def speech_record(self, timeout=5):
        """Capture one utterance from the microphone using WebRTC VAD.

        Only frames classified as speech are kept. Recording ends once speech
        has been followed by 0.5 s without speech, or when a non-speech frame
        arrives more than *timeout* seconds after detection started.

        Args:
            timeout: Detection timeout in seconds.

        Returns:
            ``None`` if the microphone cannot be opened. Otherwise, when
            ``self.save_wav`` is truthy the frames are written to
            ``self.save_path`` and only ``if_timeout`` is returned; when it is
            falsy the tuple ``(recorded_buffer, if_timeout)`` is returned.
        """
        try:
            stream = self.audio.open(format=pyaudio.paInt16,
                                     channels=1,
                                     rate=self._SAMPLE_RATE,
                                     input=True,
                                     frames_per_buffer=self._FRAME_SAMPLES)
        except OSError:
            self.logger.log_info("无法打开麦克风,请检查麦克风连接是否正常。")
            return None
        self.logger.log_info("开始检测...")

        vad = webrtcvad.Vad()
        vad.set_mode(3)  # mode 3 is the most aggressive webrtcvad setting

        SILENCE_DURATION = 0.5     # seconds without speech that end an utterance
        MIN_SPEECH_DURATION = 0.7  # utterances shorter than this are discarded

        speech_detected = False
        speech_start_time = 0
        silence_start_time = 0     # 0 means "no silence currently being timed"
        recorded_buffer = []
        detect_start_time = time.time()
        if_timeout = False

        # try/finally guarantees the stream is released even when reading
        # from the device or the VAD raises.
        try:
            while True:
                frame = stream.read(self._FRAME_SAMPLES)
                is_speech = vad.is_speech(frame, sample_rate=self._SAMPLE_RATE)

                if is_speech:
                    if not speech_detected:
                        # A new utterance starts: drop anything kept so far.
                        speech_detected = True
                        speech_start_time = time.time()
                        silence_start_time = 0
                        recorded_buffer = []
                    recorded_buffer.append(frame)
                elif time.time() - detect_start_time > timeout:
                    # NOTE(review): this also fires during the trailing
                    # silence of an utterance that began before the deadline;
                    # confirm that this is the intended behaviour.
                    self.logger.log_info("检测超时")
                    if_timeout = True
                    break

                if speech_detected:
                    if not is_speech:
                        if silence_start_time == 0:
                            silence_start_time = time.time()
                        elif time.time() - silence_start_time >= SILENCE_DURATION:
                            if time.time() - speech_start_time < MIN_SPEECH_DURATION:
                                # Too short to be worth sending; keep listening.
                                self.logger.log_info("语音片段过短,跳过发送")
                                speech_detected = False
                                continue
                            break
                    else:
                        # Speech resumed, so stop timing the silence.
                        silence_start_time = 0
        finally:
            stream.stop_stream()
            stream.close()

        if self.save_wav:
            self.save_wave_file(self.save_path, recorded_buffer)
            return if_timeout
        return recorded_buffer, if_timeout

    def test_token(self):
        """Check ``self.token`` by recognising the bundled ``test_token.wav``.

        Sets ``self.token_success`` to ``True`` on HTTP 200 and to ``False``
        on any other status; it is left unchanged when the sample file cannot
        be read or the request itself fails. Results are printed.
        """
        current_directory = os.path.dirname(os.path.abspath(__file__))
        file_path = os.path.join(current_directory, "test_token.wav")

        # Read the sample file and Base64-encode it.
        try:
            with open(file_path, 'rb') as f:
                base64_data = base64.b64encode(f.read()).decode('utf-8')
        except FileNotFoundError:
            print(f"错误:文件 {file_path} 未找到!")
            return
        except Exception as e:
            print(f"读取文件失败: {e}")
            return

        header = {
            'Content-Type': 'application/json',
            'X-Auth-Token': self.token
        }
        body = {
            'data': base64_data,
            'config': {
                'property': 'chinese_8k_common',
                'audio_format': 'pcm8k16bit'
            }
        }

        try:
            # A timeout keeps an unreachable service from blocking forever,
            # matching the other recognition requests of this class.
            resp = requests.post(self.recognize_url, data=json.dumps(body),
                                 headers=header, timeout=self._REQUEST_TIMEOUT)
            if resp.status_code == 200:
                self.token_success = True
                try:
                    response_json = resp.json()
                    if "result" in response_json:
                        print("识别成功:", response_json["result"])
                    else:
                        print("识别失败,未返回 'result' 字段:", response_json)
                except json.JSONDecodeError:
                    print("服务器返回的不是 JSON 格式:", resp.text)
            else:
                self.token_success = False
                print(f"请求失败,状态码: {resp.status_code}, 响应内容: {resp.text}")
        except requests.exceptions.RequestException as e:
            print(f"请求失败: {e}")
            return

    def speech_recognize(self, timeout=5):
        """Record one utterance and return its recognised text.

        Args:
            timeout: Detection timeout in seconds, passed to speech_record().

        Returns:
            ``(text, if_timeout, remaining_time)``. ``text`` is ``''`` and
            ``remaining_time`` is ``0`` when recording timed out; otherwise
            ``remaining_time`` is ``max(0, timeout - elapsed)``. Returns
            ``None`` after logging the error if anything fails.
        """
        try:
            time1 = time.time()
            record_result = self.speech_record(timeout=timeout)
            if record_result is None:
                # The microphone could not be opened. Without this check the
                # save_wav mode would go on to upload a stale file.
                raise RuntimeError("microphone unavailable, nothing was recorded")
            if self.save_wav:
                recorded_buffer, if_timeout = None, record_result
            else:
                recorded_buffer, if_timeout = record_result
            time2 = time.time()
            self.logger.log_info(f'record_time:{time2-time1}')

            text = ''
            remaining_time = 0
            if not if_timeout:
                if self.save_wav:
                    with open(self.save_path, 'rb') as f:
                        audio_data = f.read()
                else:
                    audio_data = b"".join(recorded_buffer)
                base64_data = base64.b64encode(audio_data).decode('utf-8')

                header = {
                    'Content-Type': 'application/json',
                    'X-Auth-Token': self.token
                }
                body = {
                    'data': base64_data,
                    'config': {
                        'property': 'chinese_16k_common',
                        'audio_format': 'pcm16k16bit',
                        'add_punc': 'yes',
                        # 'vocabulary_id': '5f2bb507-2524-4a0d-8ced-2b64ab464099'
                    }
                }
                resp = requests.post(self.recognize_url, data=json.dumps(body),
                                     headers=header, timeout=self._REQUEST_TIMEOUT)
                time3 = time.time()
                json_data = resp.json()
                if "result" not in json_data:
                    # Put the service's answer in the log instead of a bare
                    # KeyError.
                    raise ValueError(f"HTTP {resp.status_code}: {resp.text}")
                text = json_data["result"]["text"]
                self.logger.log_info(f"recognize_time:{time3 - time2}")

                # Time left out of the caller's budget.
                total_time_spent = time3 - time1
                remaining_time = max(0, timeout - total_time_spent)
            return text, if_timeout, remaining_time
        except Exception as e:
            self.logger.log_error(f"语音识别问题:{e}")
            return None

    def speech_recognize_UI(self, file_path):
        """Convert an audio file to 16 kHz mono PCM WAV and recognise it.

        The file is converted with ffmpeg into the module-level ``tmp_path``
        and the converted file is sent to the recognition service.

        Args:
            file_path: Path of the audio file to recognise.

        Returns:
            The recognised text, or ``None`` after logging if anything fails.
        """
        try:
            # ffmpeg cannot create the output directory by itself.
            tmp_path.parent.mkdir(parents=True, exist_ok=True)
            ffmpeg.input(file_path).output(
                str(tmp_path), acodec='pcm_s16le', ar='16000', ac=1, y=None
            ).run(cmd='/usr/bin/ffmpeg')

            with open(str(tmp_path), 'rb') as f:
                base64_data = base64.b64encode(f.read()).decode('utf-8')

            header = {
                'Content-Type': 'application/json',
                'X-Auth-Token': self.token
            }
            body = {
                'data': base64_data,
                'config': {
                    'property': 'chinese_16k_common',
                    'audio_format': 'pcm16k16bit'
                }
            }
            resp = requests.post(self.recognize_url, data=json.dumps(body),
                                 headers=header, timeout=self._REQUEST_TIMEOUT)
            json_data = resp.json()
            if "result" not in json_data:
                # Put the service's answer in the log instead of a bare
                # KeyError.
                raise ValueError(f"HTTP {resp.status_code}: {resp.text}")
            text = json_data["result"]["text"]
            self.logger.log_info(f"UI发送语音识别结果:{text}")
            return text
        except Exception as e:
            self.logger.log_error(f"UI发送mp3转为wav语音识别出现问题{e}")
            return None

    def get_Speech_Recognize_token(self):
        """Request a token for the speech-recognition service.

        Posts ``self.token_access`` / ``self.token_secret`` to
        ``self.token_url`` using the ``hw_ak_sk`` identity method.

        Returns:
            The value of the ``X-Subject-Token`` response header.

        Raises:
            Exception: Any error from the request or a missing token header is
                printed, logged and re-raised.
        """
        payload = json.dumps({
            "auth": {
                "identity": {
                    "methods": [
                        "hw_ak_sk"
                    ],
                    "hw_ak_sk": {
                        "access": {
                            "key": self.token_access
                        },
                        "secret": {
                            "key": self.token_secret
                        }
                    }
                },
                "scope": {
                    "project": {
                        "name": "cn-east-3"
                    }
                }
            }
        })
        headers = {
            'Content-Type': 'application/json'
        }
        try:
            # NOTE(security): verify=False disables TLS certificate checks on
            # a request that carries credentials. It is kept for compatibility
            # with the existing deployment; enable verification if possible.
            response = requests.request("POST", self.token_url, headers=headers,
                                        data=payload, verify=False,
                                        timeout=self._REQUEST_TIMEOUT)
            # Read the header before logging, so success is only reported
            # when a token was actually returned.
            token = response.headers["X-Subject-Token"]
            self.logger.log_info("Successfully get Speech Recognize token!!!")
            return token
        except Exception as e:
            print("Error occurred while getting Speech Recognize token")
            print(f"Exception: {e}")
            self.logger.log_error(f"{e}")
            raise
if __name__ == '__main__':
    # Manual smoke test: build a recognizer and check the configured token.
    import argparse
    from tools.yaml_operator import read_yaml

    cli = argparse.ArgumentParser(description='Speech processor')
    cli.add_argument(
        '--recognizer_config_path',
        type=str,
        default='Speech_processor/config/huaweiyun_recognize_config.yaml',
    )
    args = cli.parse_args()
    # The YAML configuration is currently not loaded:
    # config = read_yaml(args.recognizer_config_path)

    recognizer = SpeechRecognizer()

    # Recognition straight from the microphone:
    # text,if_timeout,remaining_time = recognizer.speech_recognize()
    # print(text,if_timeout)

    # Recognition of an existing audio file:
    # recognizer.speech_recognize_UI("/home/jsfb/jsfb_ws/MassageRobot_Dobot/tmp/speech_audio.mp3")

    recognizer.test_token()