MassageRobot_Dobot/Language/Speech_processor/scripts/Recognizer.py

import pyaudio
import time
import struct
import wave
import base64
import json
import requests
import webrtcvad
import ffmpeg
import sys
from pathlib import Path
# Add the directory three `.parent` hops above this file (i.e. two levels
# above the folder containing this script) to sys.path. The `tools` package
# imported on the next line is presumably located there -- TODO confirm.
sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
from tools.log import CustomLogger
import os
# Same walk done with os.path: Language_Path is three dirname() hops above
# this file, MassageRobot_Dobot_Path is one level above that. The latter is
# printed and appended to sys.path before the VortXDB import below.
current_file_path = os.path.abspath(__file__)
Language_Path = os.path.dirname(os.path.dirname(os.path.dirname(current_file_path)))
MassageRobot_Dobot_Path = os.path.dirname(Language_Path)
print("MassageRobot_Dobot_Path:",MassageRobot_Dobot_Path)
sys.path.append(MassageRobot_Dobot_Path)
from VortXDB.client import VTXClient

# Resolve the directory four `.parent` hops above this file (three levels
# above the folder containing it). Barring symlinks this is the same directory
# as MassageRobot_Dobot_Path above, so it ends up on sys.path twice.
parent_path = Path(__file__).resolve().parent.parent.parent.parent
# print(parent_path)
# Add that directory to sys.path.
sys.path.append(str(parent_path))
# Location of the temporary WAV file that SpeechRecognizer.speech_recognize_UI
# asks ffmpeg to produce (mono, 16 kHz, pcm_s16le) before uploading it.
tmp_path = parent_path / 'tmp' / 'speech_audio_16k16bit.wav'
class SpeechRecognizer:
    """Record speech from the microphone and transcribe it over HTTP.

    Endpoint URLs, credentials, the access token and the WAV-saving options
    are read from VortXDB under
    ``robot_config`` / ``Language.Speech_processor.huaweiyun_recognize_config``.

    Audio is captured as mono, 16-bit, 16 kHz PCM in 20 ms frames and
    segmented with WebRTC VAD before being sent to ``recognize_url``.
    """

    # Location of this component's settings inside VortXDB.
    _CONFIG_TABLE = "robot_config"
    _CONFIG_PREFIX = "Language.Speech_processor.huaweiyun_recognize_config."

    # Capture format shared by the recorder and the WAV writer.
    CHANNELS = 1
    SAMPLE_WIDTH = 2        # bytes per sample (16-bit)
    SAMPLE_RATE = 16000     # Hz
    FRAME_SAMPLES = 320     # samples per read: 320 / 16000 Hz = 20 ms

    # Voice-activity segmentation thresholds, in seconds.
    SILENCE_DURATION = 0.5      # silence that terminates an utterance
    MIN_SPEECH_DURATION = 0.7   # shorter segments are discarded

    # Timeout (seconds) applied to every HTTP request made by this class.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Load the configuration from VortXDB and initialise PyAudio."""
        vtxdb = VTXClient()
        self.audio = pyaudio.PyAudio()

        def cfg(key):
            # All settings live under the same table and key prefix.
            return vtxdb.get(self._CONFIG_TABLE, self._CONFIG_PREFIX + key)

        self.recognize_url = cfg("recognize_url")
        self.token_url = cfg("token_url")
        self.token_access = cfg("token_access")
        self.token_secret = cfg("token_secret")
        self.save_path = cfg("save_path")
        self.save_wav = cfg("save_wav")
        self.logger = CustomLogger()
        # The token is taken from the stored configuration instead of being
        # requested at start-up:
        # self.token = self.get_Speech_Recognize_token()
        self.token = cfg("token_HW")
        # self.logger.log_info(f"token_HW:{self.token}")
        # Updated by test_token() according to the HTTP status it receives.
        self.token_success = True

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _auth_headers(self):
        """Return the JSON request headers carrying the current token."""
        return {
            'Content-Type': 'application/json',
            'X-Auth-Token': self.token
        }

    def _recognize(self, base64_data, config):
        """POST base64 audio to ``recognize_url`` and return the recognised text.

        Args:
            base64_data: Base64-encoded audio as a ``str``.
            config: The ``config`` object of the request body.

        Returns:
            The value of ``result.text`` in the JSON response.

        Raises:
            RuntimeError: The response has no ``result.text``; the message
                carries the HTTP status code and the response body so the
                failure can be diagnosed from the log.
            requests.exceptions.RequestException: Network or timeout errors.
            ValueError: The response body is not valid JSON.
        """
        body = {'data': base64_data, 'config': config}
        resp = requests.post(self.recognize_url, data=json.dumps(body),
                             headers=self._auth_headers(),
                             timeout=self.REQUEST_TIMEOUT)
        json_data = resp.json()
        try:
            return json_data["result"]["text"]
        except (KeyError, TypeError) as e:
            raise RuntimeError(
                f"识别服务未返回结果，状态码: {resp.status_code}, 响应内容: {resp.text}"
            ) from e

    # ------------------------------------------------------------------
    # Recording
    # ------------------------------------------------------------------
    def save_wave_file(self, filename, data):
        """Write PCM frames to ``filename`` as a mono 16-bit 16 kHz WAV file.

        Args:
            filename: Destination passed to ``wave.open``.
            data: Iterable of ``bytes`` chunks; they are concatenated in order.
        """
        # The context manager closes the file even if writing fails.
        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(self.CHANNELS)
            wf.setsampwidth(self.SAMPLE_WIDTH)
            wf.setframerate(self.SAMPLE_RATE)
            wf.writeframes(b"".join(data))

    def speech_record(self, timeout=5):
        """Record one utterance from the microphone using voice-activity detection.

        Recording ends when an utterance is followed by ``SILENCE_DURATION``
        seconds of silence, or when a non-speech frame is read more than
        ``timeout`` seconds after detection started.

        Args:
            timeout: Detection deadline in seconds.

        Returns:
            * ``None`` if the microphone could not be opened.
            * ``if_timeout`` (bool) when ``self.save_wav`` is truthy; the
              recorded frames are written to ``self.save_path``.
            * ``(recorded_buffer, if_timeout)`` otherwise, where
              ``recorded_buffer`` is the list of speech frames (``bytes``).
        """
        # Open the microphone stream.
        try:
            stream = self.audio.open(format=pyaudio.paInt16,
                                     channels=self.CHANNELS,
                                     rate=self.SAMPLE_RATE,
                                     input=True,
                                     frames_per_buffer=self.FRAME_SAMPLES)
        except OSError as e:
            self.logger.log_error(f"无法打开麦克风，请检查麦克风连接是否正常。({e})")
            return None
        self.logger.log_info("开始检测...")

        # Mode 3 is WebRTC VAD's most aggressive setting for filtering out
        # non-speech.
        vad = webrtcvad.Vad()
        vad.set_mode(3)

        speech_detected = False
        speech_start_time = 0
        silence_start_time = 0      # 0 means "not currently in silence"
        recorded_buffer = []
        detect_start_time = time.time()
        if_timeout = False

        try:
            while True:
                data = stream.read(self.FRAME_SAMPLES)
                is_speech = vad.is_speech(data, sample_rate=self.SAMPLE_RATE)

                if is_speech:
                    if not speech_detected:
                        # A new utterance starts: drop anything kept so far.
                        speech_detected = True
                        speech_start_time = time.time()
                        silence_start_time = 0
                        recorded_buffer = []
                    # Only frames classified as speech are kept.
                    recorded_buffer.append(data)
                elif time.time() - detect_start_time > timeout:
                    # NOTE: the deadline is only evaluated on non-speech
                    # frames, so ongoing speech is never cut off; it is also
                    # evaluated during the silence that follows an utterance,
                    # in which case that utterance is reported as a timeout.
                    self.logger.log_info("检测超时")
                    if_timeout = True
                    break

                if speech_detected:
                    if not is_speech:
                        if silence_start_time == 0:
                            silence_start_time = time.time()
                        elif time.time() - silence_start_time >= self.SILENCE_DURATION:
                            # NOTE: measured from speech start until now, so
                            # the elapsed time includes the silence window.
                            if time.time() - speech_start_time < self.MIN_SPEECH_DURATION:
                                self.logger.log_info("语音片段过短，跳过发送")
                                speech_detected = False
                                continue  # too short: keep listening
                            break
                    else:
                        silence_start_time = 0
        finally:
            # Always release the input device, even if reading or VAD failed.
            try:
                stream.stop_stream()
            finally:
                stream.close()

        if self.save_wav:
            self.save_wave_file(self.save_path, recorded_buffer)
            return if_timeout
        return recorded_buffer, if_timeout

    # ------------------------------------------------------------------
    # Recognition
    # ------------------------------------------------------------------
    def test_token(self):
        """Check the current token by recognising the bundled ``test_token.wav``.

        The file is looked up next to this script. ``self.token_success`` is
        set to ``True`` on HTTP 200 and ``False`` on any other status code; it
        is left unchanged if the file cannot be read or the request itself
        fails. Results are printed to stdout.
        """
        current_directory = os.path.dirname(os.path.abspath(__file__))
        file_path = os.path.join(current_directory, "test_token.wav")

        # Read the file and Base64-encode it.
        try:
            with open(file_path, 'rb') as f:
                base64_data = base64.b64encode(f.read()).decode('utf-8')
        except FileNotFoundError:
            print(f"错误：文件 {file_path} 未找到！")
            return
        except Exception as e:
            print(f"读取文件失败: {e}")
            return

        body = {
            'data': base64_data,
            'config': {
                'property': 'chinese_8k_common',
                'audio_format': 'pcm8k16bit'
            }
        }

        # Send the request; the timeout keeps a dead network from blocking
        # the caller forever.
        try:
            resp = requests.post(self.recognize_url, data=json.dumps(body),
                                 headers=self._auth_headers(),
                                 timeout=self.REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as e:
            print(f"请求失败: {e}")
            return

        if resp.status_code != 200:
            self.token_success = False
            print(f"请求失败，状态码: {resp.status_code}, 响应内容: {resp.text}")
            return

        self.token_success = True
        try:
            response_json = resp.json()
        except ValueError:
            # ValueError covers json.JSONDecodeError as well as the decode
            # errors raised by alternative JSON backends used by requests.
            print("服务器返回的不是 JSON 格式:", resp.text)
            return
        if "result" in response_json:
            print("识别成功:", response_json["result"])
        else:
            print("识别失败，未返回 'result' 字段:", response_json)

    def speech_recognize(self, timeout=5):
        """Record one utterance and return its transcription.

        Args:
            timeout: Detection deadline in seconds, forwarded to
                :meth:`speech_record`.

        Returns:
            ``(text, if_timeout, remaining_time)`` on success.  ``text`` is
            ``''`` and ``remaining_time`` is ``0`` when recording timed out;
            otherwise ``remaining_time`` is ``timeout`` minus the time spent
            recording and recognising, floored at 0.
            ``None`` if the microphone is unavailable or any error occurs
            (the error is logged).
        """
        try:
            time1 = time.time()
            record_result = self.speech_record(timeout=timeout)
            if record_result is None:
                # The microphone could not be opened. Stop here instead of
                # recognising a stale file left at save_path by an earlier run.
                self.logger.log_error("语音识别问题：麦克风不可用，未进行录音")
                return None
            if self.save_wav:
                recorded_buffer, if_timeout = None, record_result
            else:
                recorded_buffer, if_timeout = record_result
            time2 = time.time()
            self.logger.log_info(f'record_time:{time2-time1}')

            text = ''
            remaining_time = 0
            if not if_timeout:
                if self.save_wav:
                    with open(self.save_path, 'rb') as f:
                        audio_bytes = f.read()
                else:
                    audio_bytes = b"".join(recorded_buffer)
                base64_data = base64.b64encode(audio_bytes).decode('utf-8')

                # Speech recognition request.
                text = self._recognize(base64_data, {
                    'property': 'chinese_16k_common',
                    'audio_format': 'pcm16k16bit',
                    'add_punc': 'yes',
                    # 'vocabulary_id': '5f2bb507-2524-4a0d-8ced-2b64ab464099'
                })
                time3 = time.time()
                self.logger.log_info(f"recognize_time:{time3 - time2}")

                # Time left of the caller's budget.
                remaining_time = max(0, timeout - (time3 - time1))
            return text, if_timeout, remaining_time
        except Exception as e:
            self.logger.log_error(f"语音识别问题：{e}")
            return None

    def speech_recognize_UI(self, file_path):
        """Transcribe an audio file (e.g. one uploaded by the UI).

        The file is first converted with ffmpeg to a mono 16 kHz
        ``pcm_s16le`` WAV at the module-level ``tmp_path``.

        Args:
            file_path: Path of the input audio file.

        Returns:
            The recognised text, or ``None`` if conversion or recognition
            failed (the error is logged).
        """
        try:
            # ffmpeg does not create missing output directories.
            tmp_path.parent.mkdir(parents=True, exist_ok=True)
            wav_path = str(tmp_path)
            ffmpeg.input(file_path).output(wav_path, acodec='pcm_s16le', ar='16000', ac=1, y=None).run(cmd='/usr/bin/ffmpeg')
            with open(wav_path, 'rb') as f:
                base64_data = str(base64.b64encode(f.read()), 'utf-8')
            text = self._recognize(base64_data, {
                'property': 'chinese_16k_common',
                'audio_format': 'pcm16k16bit'
            })
            self.logger.log_info(f"UI发送语音识别结果:{text}")
            return text
        except Exception as e:
            self.logger.log_error(f"UI发送mp3转为wav语音识别出现问题{e}")
            return None

    # ------------------------------------------------------------------
    # Token
    # ------------------------------------------------------------------
    def get_Speech_Recognize_token(self):
        """Request a new token from ``token_url`` using the AK/SK credentials.

        Returns:
            The value of the ``X-Subject-Token`` response header.

        Raises:
            KeyError: The response carries no ``X-Subject-Token`` header.
            requests.exceptions.RequestException: Network or timeout errors.
            Any exception is logged and re-raised.
        """
        payload = json.dumps({
            "auth": {
                "identity": {
                    "methods": ["hw_ak_sk"],
                    "hw_ak_sk": {
                        "access": {"key": self.token_access},
                        "secret": {"key": self.token_secret}
                    }
                },
                "scope": {
                    "project": {"name": "cn-east-3"}
                }
            }
        })
        headers = {'Content-Type': 'application/json'}
        try:
            # NOTE(security): verify=False disables TLS certificate checking
            # while credentials are being sent. Kept because the deployment
            # may depend on it -- review and remove if possible.
            response = requests.request("POST", self.token_url, headers=headers,
                                        data=payload, verify=False,
                                        timeout=self.REQUEST_TIMEOUT)
            # Read the header first so success is only logged when a token
            # was really returned.
            token = response.headers["X-Subject-Token"]
            self.logger.log_info("Successfully get Speech Recognize token!!!")
            return token
        except Exception as e:
            print("Error occurred while getting Speech Recognize token")
            print(f"Exception: {e}")
            self.logger.log_error(f"{e}")
            raise

if __name__ == '__main__':
    # Manual entry point: parse the command line, build a recognizer and
    # run the token check.
    import argparse
    from tools.yaml_operator import read_yaml

    cli = argparse.ArgumentParser(description='Speech processor')
    cli.add_argument(
        '--recognizer_config_path',
        type=str,
        default='Speech_processor/config/huaweiyun_recognize_config.yaml',
    )
    # The parsed value is currently unused (the YAML loading below is
    # disabled), but parsing still validates the command line.
    args = cli.parse_args()
    # config = read_yaml(args.recognizer_config_path)

    recognizer = SpeechRecognizer()

    ## Recognition straight from the microphone
    # text,if_timeout,remaining_time = recognizer.speech_recognize()
    # print(text,if_timeout)

    ## Recognition of an existing audio file
    # recognizer.speech_recognize_UI("/home/jsfb/jsfb_ws/MassageRobot_Dobot/tmp/speech_audio.mp3")

    recognizer.test_token()