143 lines
6.1 KiB
Python
143 lines
6.1 KiB
Python
import re
|
||
|
||
# Wake-word line that opens a new section in the log.
_WAKE_MARKER = "检测到小悠小悠、小悠师傅"
# Line written after every section in the report.
_SECTION_SEPARATOR = '--------------------------------'

# Patterns applied to every line of a section.
_RECOGNIZE_TIME_RE = re.compile(r"(recognize_time:\d+\.\d+)")
_RECOGNIZED_TEXT_RE = re.compile(r"(Recognized text:.*?)(?=\n|$)")
_RUN_TIME_RE = re.compile(r"(运行时间:\d+\.\d+)")
# Captures only what follows the label, so the label is dropped exactly once.
_ACCUMULATED_TEXT_RE = re.compile(r"accumulated_text:(.*?)(?=\n|$)")


def _split_sections(lines):
    """Group lines into one list per wake marker, excluding the marker lines.

    Lines that appear before the first marker are discarded.
    """
    sections = []
    current = None
    for line in lines:
        if _WAKE_MARKER in line:
            current = []
            sections.append(current)
        elif current is not None:
            current.append(line)
    return sections


def _parse_section(lines):
    """Extract the timing and text fields from the lines of one section."""
    recognize_times = []
    recognized_texts = []
    run_times = []
    fragments = []

    for line in lines:
        # The checks are independent: a single line may carry several fields.
        match = _RECOGNIZE_TIME_RE.search(line)
        if match:
            recognize_times.append(match.group(1))

        match = _RECOGNIZED_TEXT_RE.search(line)
        if match:
            recognized_texts.append(match.group(1))

        match = _RUN_TIME_RE.search(line)
        if match:
            run_times.append(match.group(1))

        match = _ACCUMULATED_TEXT_RE.search(line)
        if match:
            fragments.append(match.group(1).strip())

    return {
        "recognize_times": recognize_times,
        "recognized_texts": recognized_texts,
        "run_times": run_times,
        "accumulated_text": " ".join(fragments).strip(),
    }


def _format_section(section):
    """Render one parsed section as the text block written to the report."""
    parts = ["Recognize Time:\n"]
    parts.extend(item + '\n' for item in section["recognize_times"])
    parts.append("\nRecognized Text:\n")
    parts.extend(item + '\n' for item in section["recognized_texts"])
    parts.append("\nModel Time:\n")
    parts.extend(item + '\n' for item in section["run_times"])
    parts.append("\nAccumulated Text:\n")
    parts.append(section["accumulated_text"] + '\n')
    parts.append("\n")
    return "".join(parts)


def extract_data_from_log(log_file_path, output_file_path, encoding="utf-8"):
    """Extract per-wake-word statistics from a log file into a text report.

    The log is split into sections, each starting right after a line that
    contains the wake marker and ending before the next such line (or at
    end of file). Lines before the first marker are ignored. For every
    section the report lists the ``recognize_time:<float>`` entries, the
    ``Recognized text:...`` entries, the ``运行时间:<float>`` entries and the
    space-joined ``accumulated_text:`` payloads, followed by a separator
    line.

    Args:
        log_file_path: Path of the log file to read.
        output_file_path: Path of the report file to (over)write.
        encoding: Text encoding used for both files. Defaults to UTF-8 so
            the result does not depend on the platform locale; pass
            ``None`` to fall back to the locale's preferred encoding.

    Raises:
        OSError: If either file cannot be opened.
        UnicodeDecodeError: If the log is not valid in ``encoding``.
    """
    with open(log_file_path, 'r', encoding=encoding) as log_file:
        sections = [_parse_section(chunk) for chunk in _split_sections(log_file)]

    with open(output_file_path, 'w', encoding=encoding) as output_file:
        for section in sections:
            output_file.write(_format_section(section))
            output_file.write(_SECTION_SEPARATOR + '\n')

    print(f"提取的数据已保存到 {output_file_path}")
|
||
|
||
|
||
if __name__ == "__main__":
    # Script entry point. The report is written to `output_file_path`;
    # replace `log_file_path` with the path of the actual log to process.
    output_file_path = 'extracted_data.txt'
    log_file_path = 'Language_2025-01-03_165625.log'

    extract_data_from_log(
        log_file_path=log_file_path,
        output_file_path=output_file_path,
    )
|