import re

def extract_data_from_log(log_file_path, output_file_path):
    """Split a wake-word log into detection sections and write a report.

    The log is partitioned at every line containing the wake-word marker
    "检测到小悠小悠、小悠师傅".  For each section (from one marker up to the
    next marker, or to end-of-file for the last one) the recognize times,
    recognized texts, model run times and the concatenated accumulated text
    are collected and written to *output_file_path*, each section followed
    by a dashed separator line.

    Args:
        log_file_path: Path of the input log file (assumed UTF-8 encoded).
        output_file_path: Path of the text report to create/overwrite.
    """
    # Explicit encoding: the log contains Chinese text, so relying on the
    # platform default (e.g. cp1252/gbk on Windows) would fail or mangle it.
    with open(log_file_path, 'r', encoding='utf-8') as file:
        log_data = file.readlines()

    # Line indices of every wake-word marker occurrence.
    marker = "检测到小悠小悠、小悠师傅"
    detection_indices = [i for i, line in enumerate(log_data) if marker in line]

    # Each section spans from one marker (exclusive) to the next marker,
    # the final section running to end-of-file.  Lines before the first
    # marker are intentionally ignored, as in the original per-flag logic.
    extracted_data = []
    section_ends = detection_indices[1:] + [len(log_data)]
    for start, end in zip(detection_indices, section_ends):
        extracted_data.append(_extract_section(log_data[start + 1:end]))
        extracted_data.append('--------------------------------')

    _write_report(extracted_data, output_file_path)

    print(f"提取的数据已保存到 {output_file_path}")


def _extract_section(lines):
    """Collect the timing/text fields from one detection section of the log.

    Returns a dict with the lists of matched "recognize_time", "Recognized
    text" and "运行时间" fragments, plus all "accumulated_text" payloads
    joined with single spaces (prefix stripped).
    """
    recognize_times = []
    recognized_texts = []
    run_times = []
    accumulated_parts = []

    for line in lines:
        match = re.search(r"(recognize_time:\d+\.\d+)", line)
        if match:
            recognize_times.append(match.group(1))

        match = re.search(r"(Recognized text:.*?)(?=\n|$)", line)
        if match:
            recognized_texts.append(match.group(1))

        # NOTE: full-width colon (:) matches the log's Chinese formatting.
        match = re.search(r"(运行时间:\d+\.\d+)", line)
        if match:
            run_times.append(match.group(1))

        match = re.search(r"(accumulated_text:.*?)(?=\n|$)", line)
        if match:
            # Drop the "accumulated_text:" prefix before joining.
            text = match.group(1).replace("accumulated_text:", "").strip()
            accumulated_parts.append(text)

    return {
        "recognize_times": recognize_times,
        "recognized_texts": recognized_texts,
        "run_times": run_times,
        "accumulated_text": " ".join(accumulated_parts),
    }


def _write_report(extracted_data, output_file_path):
    """Write the extracted sections (dicts and separator strings) to a file."""
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for section in extracted_data:
            if isinstance(section, dict):
                output_file.write("Recognize Time:\n")
                for value in section["recognize_times"]:
                    output_file.write(value + '\n')

                output_file.write("\nRecognized Text:\n")
                for value in section["recognized_texts"]:
                    output_file.write(value + '\n')

                output_file.write("\nModel Time:\n")
                for value in section["run_times"]:
                    output_file.write(value + '\n')

                output_file.write("\nAccumulated Text:\n")
                output_file.write(section["accumulated_text"] + '\n')

                output_file.write("\n")
            elif section == '--------------------------------':
                output_file.write('--------------------------------\n')


if __name__ == "__main__":
    # Script entry point: run the extractor on the hard-coded paths.
    extract_data_from_log(
        'Language_2025-01-03_165625.log',  # replace with the actual log file path
        'extracted_data.txt',              # path of the generated report
    )