# type: ignore
"""
MinerU Document Parser Utility
This module provides functionality for parsing PDF, image and office documents using MinerU library,
and converts the parsing results into markdown and JSON formats
"""
from __future__ import annotations
__all__ = ["MineruParser"]
import os
import json
import argparse
from pathlib import Path
from typing import (
    Dict,
    List,
    Optional,
    Union,
    Tuple,
    Any,
    TypeVar,
    cast,
    TYPE_CHECKING,
    ClassVar,
)

# Type stubs for magic_pdf
FileBasedDataWriter = Any
FileBasedDataReader = Any
PymuDocDataset = Any
InferResult = Any
PipeResult = Any
SupportedPdfParseMethod = Any
doc_analyze = Any
read_local_office = Any
read_local_images = Any
if TYPE_CHECKING:
    from magic_pdf.data.data_reader_writer import (
        FileBasedDataWriter,
        FileBasedDataReader,
    )
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.config.enums import SupportedPdfParseMethod
    from magic_pdf.data.read_api import read_local_office, read_local_images
else:
    # MinerU imports
    from magic_pdf.data.data_reader_writer import (
        FileBasedDataWriter,
        FileBasedDataReader,
    )
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.config.enums import SupportedPdfParseMethod
    from magic_pdf.data.read_api import read_local_office, read_local_images

T = TypeVar("T")


class MineruParser:
"""
MinerU document parsing utility class
Supports parsing PDF, image and office documents (like Word, PPT, etc.),
converting the content into structured data and generating markdown and JSON output
"""
__slots__: ClassVar[Tuple[str, ...]] = ()
def __init__(self) -> None:
"""Initialize MineruParser"""
pass
@staticmethod
def safe_write(
writer: Any,
content: Union[str, bytes, Dict[str, Any], List[Any]],
filename: str,
) -> None:
"""
Safely write content to a file, ensuring the filename is valid
Args:
writer: The writer object to use
content: The content to write
filename: The filename to write to
"""
# Ensure the filename isn't too long
if len(filename) > 200: # Most filesystems have limits around 255 characters
# Truncate the filename while keeping the extension
base, ext = os.path.splitext(filename)
filename = base[:190] + ext # Leave room for the extension and some margin
# Handle specific content types
if isinstance(content, str):
# Ensure str content is encoded to bytes if required
try:
writer.write(content, filename)
except TypeError:
# If the writer expects bytes, convert string to bytes
writer.write(content.encode("utf-8"), filename)
else:
# For dict/list content, always encode as JSON string first
if isinstance(content, (dict, list)):
try:
writer.write(
json.dumps(content, ensure_ascii=False, indent=4), filename
)
except TypeError:
# If the writer expects bytes, convert JSON string to bytes
writer.write(
json.dumps(content, ensure_ascii=False, indent=4).encode(
"utf-8"
),
filename,
)
else:
# Regular content (assumed to be bytes or compatible)
writer.write(content, filename)

    @staticmethod
    def parse_pdf(
        pdf_path: Union[str, Path],
        output_dir: Optional[str] = None,
        use_ocr: bool = False,
    ) -> Tuple[List[Dict[str, Any]], str]:
        """
        Parse PDF document

        Args:
            pdf_path: Path to the PDF file
            output_dir: Output directory path
            use_ocr: Whether to force OCR parsing

        Returns:
            Tuple[List[Dict[str, Any]], str]: Tuple containing (content list JSON, Markdown text)
        """
        try:
            # Convert to Path object for easier handling
            pdf_path = Path(pdf_path)
            name_without_suff = pdf_path.stem

            # Prepare output directories - ensure file name is in path
            if output_dir:
                base_output_dir = Path(output_dir)
                local_md_dir = base_output_dir / name_without_suff
            else:
                local_md_dir = pdf_path.parent / name_without_suff
            local_image_dir = local_md_dir / "images"
            image_dir = local_image_dir.name

            # Create directories
            os.makedirs(local_image_dir, exist_ok=True)
            os.makedirs(local_md_dir, exist_ok=True)

            # Initialize writers and reader
            image_writer = FileBasedDataWriter(str(local_image_dir))  # type: ignore
            md_writer = FileBasedDataWriter(str(local_md_dir))  # type: ignore
            reader = FileBasedDataReader("")  # type: ignore

            # Read PDF bytes
            pdf_bytes = reader.read(str(pdf_path))  # type: ignore

            # Create dataset instance
            ds = PymuDocDataset(pdf_bytes)  # type: ignore

            # Process based on PDF type and user preference
            if use_ocr or ds.classify() == SupportedPdfParseMethod.OCR:  # type: ignore
                infer_result = ds.apply(doc_analyze, ocr=True)  # type: ignore
                pipe_result = infer_result.pipe_ocr_mode(image_writer)  # type: ignore
            else:
                infer_result = ds.apply(doc_analyze, ocr=False)  # type: ignore
                pipe_result = infer_result.pipe_txt_mode(image_writer)  # type: ignore

            # Draw visualizations
            try:
                infer_result.draw_model(
                    os.path.join(local_md_dir, f"{name_without_suff}_model.pdf")
                )  # type: ignore
                pipe_result.draw_layout(
                    os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf")
                )  # type: ignore
                pipe_result.draw_span(
                    os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf")
                )  # type: ignore
            except Exception as e:
                print(f"Warning: Failed to draw visualizations: {str(e)}")

            # Get data using API methods
            md_content = pipe_result.get_markdown(image_dir)  # type: ignore
            content_list = pipe_result.get_content_list(image_dir)  # type: ignore

            # Save files using dump methods (consistent with API)
            pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)  # type: ignore
            pipe_result.dump_content_list(
                md_writer, f"{name_without_suff}_content_list.json", image_dir
            )  # type: ignore
            pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json")  # type: ignore

            # Save model result - convert JSON string to bytes before writing
            model_inference_result = infer_result.get_infer_res()  # type: ignore
            json_str = json.dumps(model_inference_result, ensure_ascii=False, indent=4)
            try:
                # Try to write to a file manually to avoid FileBasedDataWriter issues
                model_file_path = os.path.join(
                    local_md_dir, f"{name_without_suff}_model.json"
                )
                with open(model_file_path, "w", encoding="utf-8") as f:
                    f.write(json_str)
            except Exception as e:
                print(
                    f"Warning: Failed to save model result using file write: {str(e)}"
                )
                try:
                    # If direct file write fails, try using the writer with bytes encoding
                    md_writer.write(
                        json_str.encode("utf-8"), f"{name_without_suff}_model.json"
                    )  # type: ignore
                except Exception as e2:
                    print(
                        f"Warning: Failed to save model result using writer: {str(e2)}"
                    )

            return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content))

        except Exception as e:
            print(f"Error in parse_pdf: {str(e)}")
            raise
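
    # Example (illustrative; the path and output directory are placeholders):
    # force OCR on a scanned PDF instead of relying on automatic classification.
    #
    #     blocks, md = MineruParser.parse_pdf("scanned.pdf", output_dir="./out", use_ocr=True)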

    @staticmethod
    def parse_office_doc(
        doc_path: Union[str, Path], output_dir: Optional[str] = None
    ) -> Tuple[List[Dict[str, Any]], str]:
        """
        Parse office document (Word, PPT, etc.)

        Args:
            doc_path: Path to the document file
            output_dir: Output directory path

        Returns:
            Tuple[List[Dict[str, Any]], str]: Tuple containing (content list JSON, Markdown text)
        """
        try:
            # Convert to Path object for easier handling
            doc_path = Path(doc_path)
            name_without_suff = doc_path.stem

            # Prepare output directories - ensure file name is in path
            if output_dir:
                base_output_dir = Path(output_dir)
                local_md_dir = base_output_dir / name_without_suff
            else:
                local_md_dir = doc_path.parent / name_without_suff
            local_image_dir = local_md_dir / "images"
            image_dir = local_image_dir.name

            # Create directories
            os.makedirs(local_image_dir, exist_ok=True)
            os.makedirs(local_md_dir, exist_ok=True)

            # Initialize writers
            image_writer = FileBasedDataWriter(str(local_image_dir))  # type: ignore
            md_writer = FileBasedDataWriter(str(local_md_dir))  # type: ignore

            # Read office document
            ds = read_local_office(str(doc_path))[0]  # type: ignore

            # Run the analysis once and reuse the pipeline result for every output
            # (follows the MS-Office example in the API docs)
            infer_result = ds.apply(doc_analyze, ocr=True)  # type: ignore
            pipe_result = infer_result.pipe_txt_mode(image_writer)  # type: ignore

            # Get data for return values and additional outputs
            md_content = pipe_result.get_markdown(image_dir)  # type: ignore
            content_list = pipe_result.get_content_list(image_dir)  # type: ignore

            # Save output files
            pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)  # type: ignore
            pipe_result.dump_content_list(
                md_writer, f"{name_without_suff}_content_list.json", image_dir
            )  # type: ignore
            pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json")  # type: ignore

            # Save model result - convert JSON string to bytes before writing
            model_inference_result = infer_result.get_infer_res()  # type: ignore
            json_str = json.dumps(model_inference_result, ensure_ascii=False, indent=4)
            try:
                # Try to write to a file manually to avoid FileBasedDataWriter issues
                model_file_path = os.path.join(
                    local_md_dir, f"{name_without_suff}_model.json"
                )
                with open(model_file_path, "w", encoding="utf-8") as f:
                    f.write(json_str)
            except Exception as e:
                print(
                    f"Warning: Failed to save model result using file write: {str(e)}"
                )
                try:
                    # If direct file write fails, try using the writer with bytes encoding
                    md_writer.write(
                        json_str.encode("utf-8"), f"{name_without_suff}_model.json"
                    )  # type: ignore
                except Exception as e2:
                    print(
                        f"Warning: Failed to save model result using writer: {str(e2)}"
                    )

            return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content))

        except Exception as e:
            print(f"Error in parse_office_doc: {str(e)}")
            raise

    @staticmethod
    def parse_image(
        image_path: Union[str, Path], output_dir: Optional[str] = None
    ) -> Tuple[List[Dict[str, Any]], str]:
        """
        Parse image document

        Args:
            image_path: Path to the image file
            output_dir: Output directory path

        Returns:
            Tuple[List[Dict[str, Any]], str]: Tuple containing (content list JSON, Markdown text)
        """
        try:
            # Convert to Path object for easier handling
            image_path = Path(image_path)
            name_without_suff = image_path.stem

            # Prepare output directories - ensure file name is in path
            if output_dir:
                base_output_dir = Path(output_dir)
                local_md_dir = base_output_dir / name_without_suff
            else:
                local_md_dir = image_path.parent / name_without_suff
            local_image_dir = local_md_dir / "images"
            image_dir = local_image_dir.name

            # Create directories
            os.makedirs(local_image_dir, exist_ok=True)
            os.makedirs(local_md_dir, exist_ok=True)

            # Initialize writers
            image_writer = FileBasedDataWriter(str(local_image_dir))  # type: ignore
            md_writer = FileBasedDataWriter(str(local_md_dir))  # type: ignore

            # Read image
            ds = read_local_images(str(image_path))[0]  # type: ignore

            # Run the analysis once and reuse the pipeline result for every output
            # (follows the image example in the API docs)
            infer_result = ds.apply(doc_analyze, ocr=True)  # type: ignore
            pipe_result = infer_result.pipe_ocr_mode(image_writer)  # type: ignore

            # Get data for return values and additional outputs
            md_content = pipe_result.get_markdown(image_dir)  # type: ignore
            content_list = pipe_result.get_content_list(image_dir)  # type: ignore

            # Save output files
            pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)  # type: ignore
            pipe_result.dump_content_list(
                md_writer, f"{name_without_suff}_content_list.json", image_dir
            )  # type: ignore
            pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json")  # type: ignore

            # Save model result - convert JSON string to bytes before writing
            model_inference_result = infer_result.get_infer_res()  # type: ignore
            json_str = json.dumps(model_inference_result, ensure_ascii=False, indent=4)
            try:
                # Try to write to a file manually to avoid FileBasedDataWriter issues
                model_file_path = os.path.join(
                    local_md_dir, f"{name_without_suff}_model.json"
                )
                with open(model_file_path, "w", encoding="utf-8") as f:
                    f.write(json_str)
            except Exception as e:
                print(
                    f"Warning: Failed to save model result using file write: {str(e)}"
                )
                try:
                    # If direct file write fails, try using the writer with bytes encoding
                    md_writer.write(
                        json_str.encode("utf-8"), f"{name_without_suff}_model.json"
                    )  # type: ignore
                except Exception as e2:
                    print(
                        f"Warning: Failed to save model result using writer: {str(e2)}"
                    )

            return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content))

        except Exception as e:
            print(f"Error in parse_image: {str(e)}")
            raise

    @staticmethod
    def parse_document(
        file_path: Union[str, Path],
        parse_method: str = "auto",
        output_dir: Optional[str] = None,
        save_results: bool = True,
    ) -> Tuple[List[Dict[str, Any]], str]:
        """
        Parse document using MinerU based on file extension

        Args:
            file_path: Path to the file to be parsed
            parse_method: Parsing method, supports "auto", "ocr", "txt", default is "auto"
            output_dir: Output directory path, if None, use the directory of the input file
            save_results: Whether to save parsing results to files

        Returns:
            Tuple[List[Dict[str, Any]], str]: Tuple containing (content list JSON, Markdown text)
        """
        # Convert to Path object
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File does not exist: {file_path}")

        # Get file extension
        ext = file_path.suffix.lower()

        # Choose appropriate parser based on file type
        if ext in [".pdf"]:
            return MineruParser.parse_pdf(
                file_path, output_dir, use_ocr=(parse_method == "ocr")
            )
        elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]:
            return MineruParser.parse_image(file_path, output_dir)
        elif ext in [".doc", ".docx", ".ppt", ".pptx"]:
            return MineruParser.parse_office_doc(file_path, output_dir)
        else:
            # For unsupported file types, default to PDF parsing
            print(
                f"Warning: Unsupported file extension '{ext}', trying generic PDF parser"
            )
            return MineruParser.parse_pdf(
                file_path, output_dir, use_ocr=(parse_method == "ocr")
            )
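
# Note: `parse_document` returns (content_list, md_content). The schema of each
# content-list entry is whatever MinerU's get_content_list() produces; the CLI
# below only assumes an optional "type" key. Illustrative filtering sketch
# (the path is a placeholder):
#
#     content_list, _ = MineruParser.parse_document("report.pdf")
#     text_blocks = [block for block in content_list if block.get("type") == "text"]
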
def main():
"""
Main function to run the MinerU parser from command line
"""
parser = argparse.ArgumentParser(description="Parse documents using MinerU")
parser.add_argument("file_path", help="Path to the document to parse")
parser.add_argument("--output", "-o", help="Output directory path")
parser.add_argument(
"--method",
"-m",
choices=["auto", "ocr", "txt"],
default="auto",
help="Parsing method (auto, ocr, txt)",
)
parser.add_argument(
"--stats", action="store_true", help="Display content statistics"
)
args = parser.parse_args()
try:
# Parse the document
content_list, md_content = MineruParser.parse_document(
file_path=args.file_path, parse_method=args.method, output_dir=args.output
)
# Display statistics if requested
if args.stats:
print("\nDocument Statistics:")
print(f"Total content blocks: {len(content_list)}")
# Count different types of content
content_types = {}
for item in content_list:
content_type = item.get("type", "unknown")
content_types[content_type] = content_types.get(content_type, 0) + 1
print("\nContent Type Distribution:")
for content_type, count in content_types.items():
print(f"- {content_type}: {count}")
except Exception as e:
print(f"Error: {str(e)}")
return 1
return 0


if __name__ == "__main__":
    exit(main())
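
# Example CLI invocations (illustrative; the document paths are placeholders):
#
#     python mineru_parser.py ./docs/report.pdf -o ./parsed_output --stats
#     python mineru_parser.py ./scans/page_001.png -o ./parsed_output
#     python mineru_parser.py ./slides/deck.pptx -m auto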