# type: ignore
"""
MinerU Document Parser Utility

This module provides functionality for parsing PDF, image and office documents
using the MinerU library, and converts the parsing results into markdown and
JSON formats.
"""

from __future__ import annotations

__all__ = ["MineruParser"]

import os
import json
import argparse
from pathlib import Path
from typing import (
    Dict,
    List,
    Optional,
    Union,
    Tuple,
    Any,
    TypeVar,
    cast,
    TYPE_CHECKING,
    ClassVar,
)

# Type stubs for magic_pdf
FileBasedDataWriter = Any
FileBasedDataReader = Any
PymuDocDataset = Any
InferResult = Any
PipeResult = Any
SupportedPdfParseMethod = Any
doc_analyze = Any
read_local_office = Any
read_local_images = Any

if TYPE_CHECKING:
    from magic_pdf.data.data_reader_writer import (
        FileBasedDataWriter,
        FileBasedDataReader,
    )
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.config.enums import SupportedPdfParseMethod
    from magic_pdf.data.read_api import read_local_office, read_local_images
else:
    # MinerU imports
    from magic_pdf.data.data_reader_writer import (
        FileBasedDataWriter,
        FileBasedDataReader,
    )
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.config.enums import SupportedPdfParseMethod
    from magic_pdf.data.read_api import read_local_office, read_local_images

T = TypeVar("T")


class MineruParser:
    """
    MinerU document parsing utility class.

    Supports parsing PDF, image and office documents (Word, PPT, etc.),
    converting the content into structured data and generating markdown and
    JSON output.
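
    Example:
        A minimal usage sketch (assumes the MinerU ``magic_pdf`` package is
        installed and ``sample.pdf`` is a placeholder path):

            content_list, md_text = MineruParser.parse_document("sample.pdf")
            print(f"Parsed {len(content_list)} content blocks")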
    """

    __slots__: ClassVar[Tuple[str, ...]] = ()

    def __init__(self) -> None:
        """Initialize MineruParser"""
        pass

    @staticmethod
    def safe_write(
        writer: Any,
        content: Union[str, bytes, Dict[str, Any], List[Any]],
        filename: str,
    ) -> None:
        """
        Safely write content to a file, ensuring the filename is valid

        Args:
            writer: The writer object to use
            content: The content to write
            filename: The filename to write to
        """
        # Ensure the filename isn't too long
        if len(filename) > 200:  # Most filesystems have limits around 255 characters
            # Truncate the filename while keeping the extension
            base, ext = os.path.splitext(filename)
            filename = base[:190] + ext  # Leave room for the extension and some margin

        # Handle specific content types
        if isinstance(content, str):
            # Ensure str content is encoded to bytes if required
            try:
                writer.write(content, filename)
            except TypeError:
                # If the writer expects bytes, convert string to bytes
                writer.write(content.encode("utf-8"), filename)
        else:
            # For dict/list content, always encode as JSON string first
            if isinstance(content, (dict, list)):
                try:
                    writer.write(
                        json.dumps(content, ensure_ascii=False, indent=4), filename
                    )
                except TypeError:
                    # If the writer expects bytes, convert JSON string to bytes
                    writer.write(
                        json.dumps(content, ensure_ascii=False, indent=4).encode(
                            "utf-8"
                        ),
                        filename,
                    )
            else:
                # Regular content (assumed to be bytes or compatible)
                writer.write(content, filename)

    @staticmethod
    def parse_pdf(
        pdf_path: Union[str, Path],
        output_dir: Optional[str] = None,
        use_ocr: bool = False,
    ) -> Tuple[List[Dict[str, Any]], str]:
        """
        Parse PDF document

        Args:
            pdf_path: Path to the PDF file
            output_dir: Output directory path
            use_ocr: Whether to force OCR parsing

        Returns:
            Tuple[List[Dict[str, Any]], str]: Tuple containing (content list JSON, Markdown text)
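
        Example:
            A minimal usage sketch (``paper.pdf`` is a placeholder path;
            ``use_ocr=True`` forces OCR and is only needed for scanned PDFs):

                blocks, md_text = MineruParser.parse_pdf("paper.pdf", use_ocr=True)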
        """
        try:
            # Convert to Path object for easier handling
            pdf_path = Path(pdf_path)
            name_without_suff = pdf_path.stem

            # Prepare output directories - ensure file name is in path
            if output_dir:
                base_output_dir = Path(output_dir)
                local_md_dir = base_output_dir / name_without_suff
            else:
                local_md_dir = pdf_path.parent / name_without_suff

            local_image_dir = local_md_dir / "images"
            image_dir = local_image_dir.name

            # Create directories
            os.makedirs(local_image_dir, exist_ok=True)
            os.makedirs(local_md_dir, exist_ok=True)

            # Initialize writers and reader
            image_writer = FileBasedDataWriter(str(local_image_dir))  # type: ignore
            md_writer = FileBasedDataWriter(str(local_md_dir))  # type: ignore
            reader = FileBasedDataReader("")  # type: ignore

            # Read PDF bytes
            pdf_bytes = reader.read(str(pdf_path))  # type: ignore

            # Create dataset instance
            ds = PymuDocDataset(pdf_bytes)  # type: ignore

            # Process based on PDF type and user preference
            if use_ocr or ds.classify() == SupportedPdfParseMethod.OCR:  # type: ignore
                infer_result = ds.apply(doc_analyze, ocr=True)  # type: ignore
                pipe_result = infer_result.pipe_ocr_mode(image_writer)  # type: ignore
            else:
                infer_result = ds.apply(doc_analyze, ocr=False)  # type: ignore
                pipe_result = infer_result.pipe_txt_mode(image_writer)  # type: ignore

            # Draw visualizations
            try:
                infer_result.draw_model(
                    os.path.join(local_md_dir, f"{name_without_suff}_model.pdf")
                )  # type: ignore
                pipe_result.draw_layout(
                    os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf")
                )  # type: ignore
                pipe_result.draw_span(
                    os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf")
                )  # type: ignore
            except Exception as e:
                print(f"Warning: Failed to draw visualizations: {str(e)}")

            # Get data using API methods
            md_content = pipe_result.get_markdown(image_dir)  # type: ignore
            content_list = pipe_result.get_content_list(image_dir)  # type: ignore

            # Save files using dump methods (consistent with API)
            pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)  # type: ignore
            pipe_result.dump_content_list(
                md_writer, f"{name_without_suff}_content_list.json", image_dir
            )  # type: ignore
            pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json")  # type: ignore

            # Save model result - convert JSON string to bytes before writing
            model_inference_result = infer_result.get_infer_res()  # type: ignore
            json_str = json.dumps(model_inference_result, ensure_ascii=False, indent=4)
            try:
                # Try to write to a file manually to avoid FileBasedDataWriter issues
                model_file_path = os.path.join(
                    local_md_dir, f"{name_without_suff}_model.json"
                )
                with open(model_file_path, "w", encoding="utf-8") as f:
                    f.write(json_str)
            except Exception as e:
                print(
                    f"Warning: Failed to save model result using file write: {str(e)}"
                )
                try:
                    # If direct file write fails, try using the writer with bytes encoding
                    md_writer.write(
                        json_str.encode("utf-8"), f"{name_without_suff}_model.json"
                    )  # type: ignore
                except Exception as e2:
                    print(
                        f"Warning: Failed to save model result using writer: {str(e2)}"
                    )

            return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content))

        except Exception as e:
            print(f"Error in parse_pdf: {str(e)}")
            raise

    @staticmethod
    def parse_office_doc(
        doc_path: Union[str, Path], output_dir: Optional[str] = None
    ) -> Tuple[List[Dict[str, Any]], str]:
        """
        Parse office document (Word, PPT, etc.)

        Args:
            doc_path: Path to the document file
            output_dir: Output directory path

        Returns:
            Tuple[List[Dict[str, Any]], str]: Tuple containing (content list JSON, Markdown text)
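
        Example:
            A minimal usage sketch (``slides.pptx`` is a placeholder path;
            ``output_dir`` is optional and defaults to the input file's directory):

                blocks, md_text = MineruParser.parse_office_doc(
                    "slides.pptx", output_dir="output"
                )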
        """
        try:
            # Convert to Path object for easier handling
            doc_path = Path(doc_path)
            name_without_suff = doc_path.stem

            # Prepare output directories - ensure file name is in path
            if output_dir:
                base_output_dir = Path(output_dir)
                local_md_dir = base_output_dir / name_without_suff
            else:
                local_md_dir = doc_path.parent / name_without_suff

            local_image_dir = local_md_dir / "images"
            image_dir = local_image_dir.name

            # Create directories
            os.makedirs(local_image_dir, exist_ok=True)
            os.makedirs(local_md_dir, exist_ok=True)

            # Initialize writers
            image_writer = FileBasedDataWriter(str(local_image_dir))  # type: ignore
            md_writer = FileBasedDataWriter(str(local_md_dir))  # type: ignore

            # Read office document
            ds = read_local_office(str(doc_path))[0]  # type: ignore

            # Run analysis once and reuse the pipeline result for all outputs
            # (follows the MS-Office example in the API docs)
            infer_result = ds.apply(doc_analyze, ocr=True)  # type: ignore
            pipe_result = infer_result.pipe_txt_mode(image_writer)  # type: ignore

            # Save markdown
            pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)  # type: ignore

            # Get data for return values and additional outputs
            md_content = pipe_result.get_markdown(image_dir)  # type: ignore
            content_list = pipe_result.get_content_list(image_dir)  # type: ignore

            # Save additional output files
            pipe_result.dump_content_list(
                md_writer, f"{name_without_suff}_content_list.json", image_dir
            )  # type: ignore
            pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json")  # type: ignore

            # Save model result - convert JSON string to bytes before writing
            model_inference_result = infer_result.get_infer_res()  # type: ignore
            json_str = json.dumps(model_inference_result, ensure_ascii=False, indent=4)
            try:
                # Try to write to a file manually to avoid FileBasedDataWriter issues
                model_file_path = os.path.join(
                    local_md_dir, f"{name_without_suff}_model.json"
                )
                with open(model_file_path, "w", encoding="utf-8") as f:
                    f.write(json_str)
            except Exception as e:
                print(
                    f"Warning: Failed to save model result using file write: {str(e)}"
                )
                try:
                    # If direct file write fails, try using the writer with bytes encoding
                    md_writer.write(
                        json_str.encode("utf-8"), f"{name_without_suff}_model.json"
                    )  # type: ignore
                except Exception as e2:
                    print(
                        f"Warning: Failed to save model result using writer: {str(e2)}"
                    )

            return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content))

        except Exception as e:
            print(f"Error in parse_office_doc: {str(e)}")
            raise

    @staticmethod
    def parse_image(
        image_path: Union[str, Path], output_dir: Optional[str] = None
    ) -> Tuple[List[Dict[str, Any]], str]:
        """
        Parse image document

        Args:
            image_path: Path to the image file
            output_dir: Output directory path

        Returns:
            Tuple[List[Dict[str, Any]], str]: Tuple containing (content list JSON, Markdown text)
        """
        try:
            # Convert to Path object for easier handling
            image_path = Path(image_path)
            name_without_suff = image_path.stem

            # Prepare output directories - ensure file name is in path
            if output_dir:
                base_output_dir = Path(output_dir)
                local_md_dir = base_output_dir / name_without_suff
            else:
                local_md_dir = image_path.parent / name_without_suff

            local_image_dir = local_md_dir / "images"
            image_dir = local_image_dir.name

            # Create directories
            os.makedirs(local_image_dir, exist_ok=True)
            os.makedirs(local_md_dir, exist_ok=True)

            # Initialize writers
            image_writer = FileBasedDataWriter(str(local_image_dir))  # type: ignore
            md_writer = FileBasedDataWriter(str(local_md_dir))  # type: ignore

            # Read image
            ds = read_local_images(str(image_path))[0]  # type: ignore

            # Run analysis once and reuse the pipeline result for all outputs
            # (follows the Image example in the API docs); images always use OCR mode
            infer_result = ds.apply(doc_analyze, ocr=True)  # type: ignore
            pipe_result = infer_result.pipe_ocr_mode(image_writer)  # type: ignore

            # Save markdown
            pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)  # type: ignore

            # Get data for return values and additional outputs
            md_content = pipe_result.get_markdown(image_dir)  # type: ignore
            content_list = pipe_result.get_content_list(image_dir)  # type: ignore

            # Save additional output files
            pipe_result.dump_content_list(
                md_writer, f"{name_without_suff}_content_list.json", image_dir
            )  # type: ignore
            pipe_result.dump_middle_json(md_writer, f"{name_without_suff}_middle.json")  # type: ignore

            # Save model result - convert JSON string to bytes before writing
            model_inference_result = infer_result.get_infer_res()  # type: ignore
            json_str = json.dumps(model_inference_result, ensure_ascii=False, indent=4)
            try:
                # Try to write to a file manually to avoid FileBasedDataWriter issues
                model_file_path = os.path.join(
                    local_md_dir, f"{name_without_suff}_model.json"
                )
                with open(model_file_path, "w", encoding="utf-8") as f:
                    f.write(json_str)
            except Exception as e:
                print(
                    f"Warning: Failed to save model result using file write: {str(e)}"
                )
                try:
                    # If direct file write fails, try using the writer with bytes encoding
                    md_writer.write(
                        json_str.encode("utf-8"), f"{name_without_suff}_model.json"
                    )  # type: ignore
                except Exception as e2:
                    print(
                        f"Warning: Failed to save model result using writer: {str(e2)}"
                    )

            return cast(Tuple[List[Dict[str, Any]], str], (content_list, md_content))

        except Exception as e:
            print(f"Error in parse_image: {str(e)}")
            raise

    @staticmethod
    def parse_document(
        file_path: Union[str, Path],
        parse_method: str = "auto",
        output_dir: Optional[str] = None,
        save_results: bool = True,
    ) -> Tuple[List[Dict[str, Any]], str]:
        """
        Parse document using MinerU based on file extension

        Args:
            file_path: Path to the file to be parsed
            parse_method: Parsing method, supports "auto", "ocr", "txt", default is "auto"
            output_dir: Output directory path, if None, use the directory of the input file
            save_results: Whether to save parsing results to files

        Returns:
            Tuple[List[Dict[str, Any]], str]: Tuple containing (content list JSON, Markdown text)
        """
        # Convert to Path object
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File does not exist: {file_path}")

        # Get file extension
        ext = file_path.suffix.lower()

        # Choose appropriate parser based on file type
        if ext in [".pdf"]:
            return MineruParser.parse_pdf(
                file_path, output_dir, use_ocr=(parse_method == "ocr")
            )
        elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]:
            return MineruParser.parse_image(file_path, output_dir)
        elif ext in [".doc", ".docx", ".ppt", ".pptx"]:
            return MineruParser.parse_office_doc(file_path, output_dir)
        else:
            # For unsupported file types, default to PDF parsing
            print(
                f"Warning: Unsupported file extension '{ext}', trying generic PDF parser"
            )
            return MineruParser.parse_pdf(
                file_path, output_dir, use_ocr=(parse_method == "ocr")
            )


def main():
    """
    Main function to run the MinerU parser from the command line.
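
    Example:
        A typical invocation (assuming this module is saved as ``mineru_parser.py``;
        ``document.pdf`` is a placeholder input):

            python mineru_parser.py document.pdf -o ./output --method auto --stats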
    """
    parser = argparse.ArgumentParser(description="Parse documents using MinerU")
    parser.add_argument("file_path", help="Path to the document to parse")
    parser.add_argument("--output", "-o", help="Output directory path")
    parser.add_argument(
        "--method",
        "-m",
        choices=["auto", "ocr", "txt"],
        default="auto",
        help="Parsing method (auto, ocr, txt)",
    )
    parser.add_argument(
        "--stats", action="store_true", help="Display content statistics"
    )

    args = parser.parse_args()

    try:
        # Parse the document
        content_list, md_content = MineruParser.parse_document(
            file_path=args.file_path, parse_method=args.method, output_dir=args.output
        )

        # Display statistics if requested
        if args.stats:
            print("\nDocument Statistics:")
            print(f"Total content blocks: {len(content_list)}")

            # Count different types of content
            content_types = {}
            for item in content_list:
                content_type = item.get("type", "unknown")
                content_types[content_type] = content_types.get(content_type, 0) + 1

            print("\nContent Type Distribution:")
            for content_type, count in content_types.items():
                print(f"- {content_type}: {count}")

    except Exception as e:
        print(f"Error: {str(e)}")
        return 1

    return 0


if __name__ == "__main__":
    exit(main())