mirror of
https://github.com/johndoe6345789/WizardMerge.git
synced 2026-04-24 13:44:55 +00:00
Improve OCR script robustness and document dependencies
Co-authored-by: johndoe6345789 <224850594+johndoe6345789@users.noreply.github.com>
This commit is contained in:
@@ -1,5 +1,8 @@
|
||||
# WizardMerge Research Paper
|
||||
*Extracted via OCR from paper pages*
|
||||
|
||||
*Extracted via OCR from paper pages in `docs/pages/`*
|
||||
|
||||
**Note**: This document contains OCR-extracted text, which may contain errors. The original page images are preserved in `docs/pages/` for reference. For critical information, please refer to the original images or the published paper.
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -1 +1,7 @@
|
||||
PyQt6>=6.6
|
||||
|
||||
# Optional: OCR dependencies for extracting text from documents
|
||||
# Uncomment if you need to run scripts/ocr_pages.py
|
||||
# pillow>=10.0
|
||||
# pytesseract>=0.3.10
|
||||
# System requirement: tesseract-ocr (install via: sudo apt-get install tesseract-ocr)
|
||||
|
||||
@@ -1,5 +1,12 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Extract text from page images using OCR and save as a markdown document."""
|
||||
"""Extract text from page images using OCR and save as a markdown document.
|
||||
|
||||
Dependencies:
|
||||
pip install pillow pytesseract
|
||||
|
||||
System requirements:
|
||||
tesseract-ocr (install via: sudo apt-get install tesseract-ocr)
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
import pytesseract
|
||||
@@ -13,10 +20,14 @@ def ocr_pages(pages_dir: Path, output_file: Path) -> None:
|
||||
raise FileNotFoundError(f"Pages directory not found: {pages_dir}")
|
||||
|
||||
# Get all PNG files sorted by number
|
||||
image_files = sorted(
|
||||
pages_dir.glob("*.png"),
|
||||
key=lambda p: int(p.stem.split("_")[-1])
|
||||
)
|
||||
def get_page_number(path: Path) -> int:
    """Return the page number encoded in a page image's filename.

    The number is read from the text after the last underscore in the
    file stem (or from the whole stem when it has no underscore), e.g.
    ``page_12.png`` -> 12.

    Args:
        path: Path to a page image.

    Returns:
        The parsed page number, or 0 when the trailing segment is not a
        valid integer.
    """
    # rpartition always yields a (possibly empty) tail, so int() is the
    # only operation here that can fail.
    suffix = path.stem.rpartition("_")[2]
    try:
        return int(suffix)
    except ValueError:
        return 0
|
||||
|
||||
image_files = sorted(pages_dir.glob("*.png"), key=get_page_number)
|
||||
|
||||
if not image_files:
|
||||
raise ValueError(f"No PNG files found in {pages_dir}")
|
||||
|
||||
Reference in New Issue
Block a user