Improve OCR script robustness and document dependencies

Co-authored-by: johndoe6345789 <224850594+johndoe6345789@users.noreply.github.com>
2026-04-24 13:44:55 +00:00 · 2025-12-25 08:18:08 +00:00
parent 807cc09240
commit 21a17d8760
3 changed files with 26 additions and 6 deletions
--- a/docs/PAPER.md
+++ b/docs/PAPER.md
@@ -1,5 +1,8 @@
 # WizardMerge Research Paper
-*Extracted via OCR from paper pages*
+
+*Extracted via OCR from paper pages in `docs/pages/`*
+
+**Note**: This document contains OCR-extracted text, which may contain errors. The original page images are preserved in `docs/pages/` for reference. For critical information, please refer to the original images or the published paper.

 ---

--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,7 @@
 PyQt6>=6.6
+
+# Optional: OCR dependencies for extracting text from documents
+# Uncomment the pillow and pytesseract lines below if you need to run scripts/ocr_pages.py
+# pillow>=10.0
+# pytesseract>=0.3.10
+# System requirement: tesseract-ocr (on Debian/Ubuntu, install via: sudo apt-get install tesseract-ocr)
--- a/scripts/ocr_pages.py
+++ b/scripts/ocr_pages.py
@@ -1,5 +1,12 @@
 #!/usr/bin/env python3
-"""Extract text from page images using OCR and save as a markdown document."""
+"""Extract text from page images using OCR and save as a markdown document.
+
+Dependencies:
+    pip install pillow pytesseract
+    
+System requirements:
+    tesseract-ocr (install via: sudo apt-get install tesseract-ocr)
+"""

 from pathlib import Path
 import pytesseract
@@ -13,10 +20,14 @@ def ocr_pages(pages_dir: Path, output_file: Path) -> None:
        raise FileNotFoundError(f"Pages directory not found: {pages_dir}")
    
    # Get all PNG files sorted by number
-    image_files = sorted(
-        pages_dir.glob("*.png"),
-        key=lambda p: int(p.stem.split("_")[-1])
-    )
+    def get_page_number(path: Path) -> int:
+        """Extract page number from filename, defaulting to 0 if not found."""
+        try:
+            return int(path.stem.split("_")[-1])
+        except (ValueError, IndexError):
+            return 0
+    
+    image_files = sorted(pages_dir.glob("*.png"), key=get_page_number)
    
    if not image_files:
        raise ValueError(f"No PNG files found in {pages_dir}")