diff --git a/docs/PAPER.md b/docs/PAPER.md
index 0f2857e..21b233c 100644
--- a/docs/PAPER.md
+++ b/docs/PAPER.md
@@ -1,5 +1,8 @@
 # WizardMerge Research Paper
-*Extracted via OCR from paper pages*
+
+*Extracted via OCR from paper pages in `docs/pages/`*
+
+**Note**: This document contains OCR-extracted text which may have some errors. The original page images are preserved in `docs/pages/` for reference. For critical information, please refer to the original images or the published paper.
 
 ---
 
diff --git a/requirements.txt b/requirements.txt
index 1a2cbe7..e2d4c91 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,7 @@
 PyQt6>=6.6
+
+# Optional: OCR dependencies for extracting text from documents
+# Uncomment if you need to run scripts/ocr_pages.py
+# pillow>=10.0
+# pytesseract>=0.3.10
+# System requirement: tesseract-ocr (install via: sudo apt-get install tesseract-ocr)
diff --git a/scripts/ocr_pages.py b/scripts/ocr_pages.py
index b6e1cf6..4bb8e85 100755
--- a/scripts/ocr_pages.py
+++ b/scripts/ocr_pages.py
@@ -1,5 +1,12 @@
 #!/usr/bin/env python3
-"""Extract text from page images using OCR and save as a markdown document."""
+"""Extract text from page images using OCR and save as a markdown document.
+
+Dependencies:
+    pip install pillow pytesseract
+
+System requirements:
+    tesseract-ocr (install via: sudo apt-get install tesseract-ocr)
+"""
 
 from pathlib import Path
 import pytesseract
@@ -13,10 +20,14 @@ def ocr_pages(pages_dir: Path, output_file: Path) -> None:
         raise FileNotFoundError(f"Pages directory not found: {pages_dir}")
 
     # Get all PNG files sorted by number
-    image_files = sorted(
-        pages_dir.glob("*.png"),
-        key=lambda p: int(p.stem.split("_")[-1])
-    )
+    def get_page_number(path: Path) -> int:
+        """Return the integer after the last "_" in the file stem, or 0 if it is not an integer."""
+        try:
+            return int(path.stem.split("_")[-1])
+        except ValueError:
+            return 0
+
+    image_files = sorted(pages_dir.glob("*.png"), key=get_page_number)
 
     if not image_files:
         raise ValueError(f"No PNG files found in {pages_dir}")