import re


def extract_data_from_log(log_file_path, output_file_path):
    """Split a log into wake-word sections and extract timing and text fields from each."""

    # Open and read the log file.
    with open(log_file_path, 'r') as file:
        log_data = file.readlines()

    def extract_section(start, end):
        # Collect recognize_time, Recognized text, 运行时间 and accumulated_text
        # from the log lines in the half-open range [start, end).
        recognize_times = []
        recognized_texts = []
        run_times = []
        accumulated_text = ""

        for j in range(start, end):
            line = log_data[j]

            # Extract recognize_time (e.g. "recognize_time:0.123").
            match_recognize_time = re.search(r"(recognize_time:\d+\.\d+)", line)
            if match_recognize_time:
                recognize_times.append(match_recognize_time.group(1))

            # Extract the recognized text.
            match_recognized_text = re.search(r"(Recognized text:.*?)(?=\n|$)", line)
            if match_recognized_text:
                recognized_texts.append(match_recognized_text.group(1))

            # Extract the model run time ("运行时间:...").
            match_run_time = re.search(r"(运行时间:\d+\.\d+)", line)
            if match_run_time:
                run_times.append(match_run_time.group(1))

            # Extract accumulated_text, strip its "accumulated_text:" prefix
            # and concatenate the pieces with spaces.
            match_accumulated_text = re.search(r"(accumulated_text:.*?)(?=\n|$)", line)
            if match_accumulated_text:
                text = match_accumulated_text.group(1).replace("accumulated_text:", "").strip()
                accumulated_text += text + " "

        return {
            "recognize_times": recognize_times,
            "recognized_texts": recognized_texts,
            "run_times": run_times,
            "accumulated_text": accumulated_text.strip()  # drop the trailing space
        }

    # Holds the extracted sections plus separator markers, in output order.
    extracted_data = []

    # True once the first "检测到小悠小悠、小悠师傅" wake-word line has been seen.
    in_detection_section = False
    last_detection_index = -1

    # Walk every line; each wake-word line closes the previous section.
    for i in range(len(log_data)):
        if "检测到小悠小悠、小悠师傅" in log_data[i]:
            # If we are already inside a section, extract the previous section's data.
            if in_detection_section:
                extracted_data.append(extract_section(last_detection_index + 1, i))
                extracted_data.append('--------------------------------')  # separator

            # Mark the start of a new section and remember where it begins.
            in_detection_section = True
            last_detection_index = i

    # Final pass: also capture the section from the last detection to the end of the file.
    if in_detection_section:
        extracted_data.append(extract_section(last_detection_index + 1, len(log_data)))
        extracted_data.append('--------------------------------')

    # Save the extracted data to the output file.
    with open(output_file_path, 'w') as output_file:
        for section in extracted_data:
            if isinstance(section, dict):
                output_file.write("Recognize Time:\n")
                for time in section["recognize_times"]:
                    output_file.write(time + '\n')

                output_file.write("\nRecognized Text:\n")
                for text in section["recognized_texts"]:
                    output_file.write(text + '\n')

                output_file.write("\nModel Time:\n")
                for run_time in section["run_times"]:
                    output_file.write(run_time + '\n')

                output_file.write("\nAccumulated Text:\n")
                output_file.write(section["accumulated_text"] + '\n')
                output_file.write("\n")
            elif section == '--------------------------------':
                output_file.write('--------------------------------\n')

    print(f"Extracted data saved to {output_file_path}")


if __name__ == "__main__":
    # Run the extractor with the input and output file paths.
    log_file_path = 'Language_2025-01-03_165625.log'  # replace with the actual log file path
    output_file_path = 'extracted_data.txt'  # output file path
    extract_data_from_log(log_file_path, output_file_path)