import pyaudio
import time
import struct
import wave
import base64
import json
import requests
import webrtcvad
import ffmpeg
import sys
from pathlib import Path

# Make the grand-parent package importable so that `tools.log` resolves.
sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
from tools.log import CustomLogger
import os

# Resolve the project root so the sibling VortXDB package can be imported:
# this file -> .../<Language_Path>/... ; project root is one level above it.
current_file_path = os.path.abspath(__file__)
Language_Path = os.path.dirname(os.path.dirname(os.path.dirname(current_file_path)))
MassageRobot_Dobot_Path = os.path.dirname(Language_Path)
print("MassageRobot_Dobot_Path:",MassageRobot_Dobot_Path)
sys.path.append(MassageRobot_Dobot_Path)
from VortXDB.client import VTXClient

# Path three levels above this file's parent directory.
parent_path = Path(__file__).resolve().parent.parent.parent.parent
# print(parent_path)
# Add the parent path to sys.path as well.
sys.path.append(str(parent_path))
# Temporary WAV written when UI-supplied audio is transcoded to 16 kHz/16-bit.
tmp_path = parent_path / 'tmp' / 'speech_audio_16k16bit.wav'
|
||
class SpeechRecognizer:
    """Mandarin speech recognition through the Huawei Cloud SIS REST API.

    Captures microphone audio with WebRTC-VAD endpointing and posts it to
    the configured recognition endpoint. All endpoints and credentials are
    read from the VortXDB configuration store.
    """

    def __init__(self):
        vtxdb = VTXClient()
        prefix = "Language.Speech_processor.huaweiyun_recognize_config"

        def cfg(key):
            # Shorthand: read one value from the shared config subtree.
            return vtxdb.get("robot_config", f"{prefix}.{key}")

        self.audio = pyaudio.PyAudio()
        self.recognize_url = cfg("recognize_url")
        self.token_url = cfg("token_url")
        self.token_access = cfg("token_access")
        self.token_secret = cfg("token_secret")
        self.save_path = cfg("save_path")
        self.save_wav = cfg("save_wav")
        self.logger = CustomLogger()
        # self.token = self.get_Speech_Recognize_token()
        self.token = cfg("token_HW")
        # self.logger.log_info(f"token_HW:{self.token}")
        self.token_success = True
|
||
|
||
def save_wave_file(self,filename,data):
|
||
wf = wave.open(filename,'wb') # 打开WAV文档
|
||
wf.setnchannels(1) #配置声道数
|
||
wf.setsampwidth(2) #配置量化位数
|
||
wf.setframerate(16000) #采样频率
|
||
wf.writeframes(b"".join(data)) # 将wav_data转换为二进制数据写入文件
|
||
wf.close()
|
||
|
||
    def speech_record(self, timeout=5):
        """Record one utterance from the default microphone with VAD endpointing.

        Reads 20 ms frames (320 samples @ 16 kHz, 16-bit mono) and uses
        WebRTC VAD (most aggressive mode) to detect speech start/end. The
        utterance ends after 0.5 s of trailing silence; segments shorter
        than 0.7 s are discarded and detection restarts.

        Args:
            timeout: Seconds to wait before giving up (see NOTE below).

        Returns:
            If ``self.save_wav`` is truthy: ``if_timeout`` (bool), with the
            audio written to ``self.save_path``. Otherwise the tuple
            ``(recorded_buffer, if_timeout)`` where ``recorded_buffer`` is a
            list of raw PCM frames.
            NOTE(review): returns None when the microphone cannot be opened,
            which is inconsistent with the bool/tuple returns above — callers
            that unpack will raise TypeError; confirm intended.
        """
        # Open the microphone stream.
        try:
            stream = self.audio.open(format=pyaudio.paInt16,
                                     channels=1,
                                     rate=16000,
                                     input=True,
                                     frames_per_buffer=320)  # 320 samples = 20 ms per read
        except OSError as e:
            self.logger.log_info("无法打开麦克风,请检查麦克风连接是否正常。")
            return
        self.logger.log_info("开始检测...")

        # Initialize WebRTC VAD.
        vad = webrtcvad.Vad()
        vad.set_mode(3)  # mode 3 = most aggressive speech/non-speech filtering

        SILENCE_DURATION = 0.5     # trailing-silence threshold ending an utterance (s)
        MIN_SPEECH_DURATION = 0.7  # minimum utterance length worth sending (s)

        speech_detected = False
        speech_start_time = 0
        silence_start_time = 0
        recorded_buffer = []
        detect_start_time = time.time()
        if_timeout = False

        while True:
            data = stream.read(320)
            is_speech = vad.is_speech(data, sample_rate=16000)

            if is_speech:
                if not speech_detected:
                    # First voiced frame: start a new utterance.
                    speech_detected = True
                    speech_start_time = time.time()
                    silence_start_time = 0
                    recorded_buffer = []  # reset any previously buffered audio
                recorded_buffer.append(data)
            # NOTE(review): this timeout check also fires during silent gaps
            # *inside* an utterance once `timeout` has elapsed since detection
            # started — confirm that truncating long utterances is intended.
            elif time.time() - detect_start_time > timeout:
                self.logger.log_info("检测超时")
                if_timeout = True
                break

            if speech_detected:
                if not is_speech:
                    if silence_start_time == 0:
                        # First silent frame after speech: start silence timer.
                        silence_start_time = time.time()
                    elif time.time() - silence_start_time >= SILENCE_DURATION:
                        if time.time() - speech_start_time < MIN_SPEECH_DURATION:
                            # Segment too short: drop it and keep listening.
                            self.logger.log_info("语音片段过短,跳过发送")
                            speech_detected = False
                            continue
                        break
                else:
                    # Speech resumed: cancel the silence timer.
                    silence_start_time = 0

        stream.stop_stream()
        stream.close()

        if self.save_wav:
            self.save_wave_file(self.save_path, recorded_buffer)
            return if_timeout
        else:
            return recorded_buffer, if_timeout
|
||
|
||
    def test_token(self):
        """Sanity-check the cached Huawei Cloud token against a bundled WAV.

        Sends ``test_token.wav`` (expected next to this file) to the
        recognition endpoint; sets ``self.token_success`` True/False from the
        HTTP status and prints the outcome. Returns None.
        """
        current_file_path = os.path.abspath(__file__)
        current_directory = os.path.dirname(current_file_path)
        file_path = os.path.join(current_directory, "test_token.wav")

        # Read the probe file and Base64-encode it.
        try:
            with open(file_path, 'rb') as f:
                data = f.read()
                base64_data = base64.b64encode(data).decode('utf-8')
        except FileNotFoundError:
            print(f"错误:文件 {file_path} 未找到!")
            return
        except Exception as e:
            print(f"读取文件失败: {e}")
            return

        # Build the request.
        header = {
            'Content-Type': 'application/json',
            'X-Auth-Token': self.token
        }
        body = {
            'data': base64_data,
            'config': {
                # NOTE(review): 8 kHz profile here vs 16 kHz everywhere else —
                # presumably the bundled test file is 8 kHz PCM; confirm.
                'property': 'chinese_8k_common',
                'audio_format': 'pcm8k16bit'
            }
        }

        # Send the request.
        try:
            resp = requests.post(self.recognize_url, data=json.dumps(body), headers=header)
            # Check the HTTP status: 200 implies the token was accepted.
            if resp.status_code == 200:
                self.token_success=True
                try:
                    response_json = resp.json()  # parse the JSON response
                    if "result" in response_json:
                        print("识别成功:", response_json["result"])
                    else:
                        print("识别失败,未返回 'result' 字段:", response_json)
                except json.JSONDecodeError:
                    print("服务器返回的不是 JSON 格式:", resp.text)
            else:
                self.token_success=False
                print(f"请求失败,状态码: {resp.status_code}, 响应内容: {resp.text}")
        except requests.exceptions.RequestException as e:
            print(f"请求失败: {e}")
            return
|
||
|
||
|
||
    def speech_recognize(self, timeout=5):
        """Record one utterance and transcribe it via Huawei Cloud SIS.

        Args:
            timeout: Seconds the recorder waits for speech before giving up.

        Returns:
            ``(text, if_timeout, remaining_time)`` — the transcript ('' when
            detection timed out), the timeout flag from ``speech_record``, and
            the unused portion of ``timeout`` after record+recognize.
            NOTE(review): on any exception this logs the error and implicitly
            returns None — callers unpacking three values will raise; confirm.
        """
        try:
            time1 = time.time()
            # speech_record's return shape depends on save_wav (see its docs).
            if self.save_wav:
                if_timeout = self.speech_record(timeout=timeout)
            else:
                recorded_buffer, if_timeout = self.speech_record(timeout=timeout)
            time2 = time.time()
            self.logger.log_info(f'record_time:{time2-time1}')

            text = ''
            remaining_time = 0
            if not if_timeout:
                # Base64-encode the captured audio, from disk or from memory.
                if self.save_wav:
                    with open(self.save_path, 'rb') as f:
                        audio_data = f.read()
                        base64_data = base64.b64encode(audio_data).decode('utf-8')
                else:
                    base64_data = base64.b64encode(b"".join(recorded_buffer)).decode('utf-8')
                header = {
                    'Content-Type': 'application/json',
                    'X-Auth-Token': self.token
                }
                body = {
                    'data': base64_data,
                    'config': {
                        'property': 'chinese_16k_common',
                        'audio_format': 'pcm16k16bit',
                        'add_punc': 'yes',
                        # 'vocabulary_id': '5f2bb507-2524-4a0d-8ced-2b64ab464099'
                    }
                }

                # Speech-recognition request.
                resp = requests.post(self.recognize_url, data=json.dumps(body), headers=header,timeout=10)
                time3 = time.time()
                json_data = resp.json()
                text = json_data["result"]["text"]
                self.logger.log_info(f"recognize_time:{time3 - time2}")

                # Compute the leftover time budget.
                total_time_spent = time3 - time1
                remaining_time = max(0, timeout - total_time_spent)
            return text, if_timeout, remaining_time
        except Exception as e:
            self.logger.log_error(f"语音识别问题:{e}")
|
||
|
||
    def speech_recognize_UI(self,file_path):
        """Transcribe an audio file uploaded from the UI.

        Transcodes *file_path* (e.g. MP3) to mono 16 kHz/16-bit PCM WAV at
        the module-level ``tmp_path`` using ffmpeg, then posts it to the
        recognition endpoint.

        Args:
            file_path: Path to the source audio file.

        Returns:
            The recognized text, or None when conversion or recognition
            fails (the error is logged).
        """
        try:
            # Transcode to the recognizer's expected format; y=None passes
            # ffmpeg's -y flag to overwrite the temp file if it exists.
            ffmpeg.input(file_path).output(str(tmp_path), acodec='pcm_s16le', ar='16000', ac=1, y=None).run(cmd='/usr/bin/ffmpeg')
            file_path=str(tmp_path)
            with open(file_path, 'rb') as f:
                data = f.read()
                base64_data = str(base64.b64encode(data), 'utf-8')
            header = {
                'Content-Type': 'application/json',
                'X-Auth-Token': self.token
            }
            body = {
                'data': base64_data,
                'config': {
                    'property': 'chinese_16k_common',
                    'audio_format': 'pcm16k16bit'
                }
            }
            resp = requests.post(self.recognize_url, data=json.dumps(body), headers=header,timeout=10)
            json_data = resp.json()
            text = json_data["result"]["text"]
            self.logger.log_info(f"UI发送语音识别结果:{text}")
            return text
        except Exception as e:
            self.logger.log_error(f"UI发送mp3转为wav语音识别出现问题{e}")
|
||
|
||
    def get_Speech_Recognize_token(self):
        """Obtain an IAM token for the speech service via Huawei AK/SK auth.

        Posts the AK/SK credential payload (project scope ``cn-east-3``) to
        ``self.token_url``.

        Returns:
            The token from the ``X-Subject-Token`` response header.

        Raises:
            Re-raises any request/response failure after logging it.
        """
        payload = json.dumps({
            "auth": {
                "identity": {
                    "methods": [
                        "hw_ak_sk"
                    ],
                    "hw_ak_sk": {
                        "access": {
                            "key": self.token_access
                        },
                        "secret": {
                            "key": self.token_secret
                        }
                    }
                },
                "scope": {
                    "project": {
                        "name": "cn-east-3"
                    }
                }
            }
        })
        headers = {
            'Content-Type': 'application/json'
        }
        try:
            # NOTE(review): verify=False disables TLS certificate validation —
            # a security risk unless the endpoint's cert truly cannot be
            # verified in this deployment; confirm.
            response = requests.request("POST", self.token_url, headers=headers, data=payload, verify=False)
            self.logger.log_info("Successfully get Speech Recognize token!!!")
            return response.headers["X-Subject-Token"]
        except Exception as e:
            print("Error occurred while getting Speech Recognize token")
            print(f"Exception: {e}")
            self.logger.log_error(f"{e}")
            raise
|
||
|
||
if __name__ == '__main__':
    import argparse
    from tools.yaml_operator import read_yaml

    def parse_args():
        """Parse CLI arguments (config path is currently unused — see below)."""
        parser = argparse.ArgumentParser(description='Speech processor')
        parser.add_argument('--recognizer_config_path', type=str, default='Speech_processor/config/huaweiyun_recognize_config.yaml')
        args = parser.parse_args()
        return args

    args = parse_args()
    # config = read_yaml(args.recognizer_config_path)
    recognizer = SpeechRecognizer()
    ## Direct microphone recognition:
    # text,if_timeout,remaining_time = recognizer.speech_recognize()
    # print(text,if_timeout)
    ## Recognition of an audio file:
    # recognizer.speech_recognize_UI("/home/jsfb/jsfb_ws/MassageRobot_Dobot/tmp/speech_audio.mp3")
    recognizer.test_token()