Implement Phase 1.1: Delete Python skeleton and create C++/TypeScript architecture

Co-authored-by: johndoe6345789 <224850594+johndoe6345789@users.noreply.github.com>
This commit is contained in:
copilot-swe-agent[bot]
2025-12-25 08:37:29 +00:00
parent 155a8b896c
commit 15e40ffd4c
36 changed files with 792 additions and 1112 deletions

View File

@@ -1,303 +0,0 @@
"""Extract image XObjects from wizardmerge.pdf and emit a JSON manifest.
The script avoids external dependencies so it can run in constrained environments.
Flate-encoded images are converted into PNG byte streams, while DCT-encoded
images are treated as JPEG. A companion ``images.json`` file captures every
image's metadata, a lightweight content analysis, and a base64 payload without
writing raw binaries to disk. Semantic file names are generated from the
analysis (color, contrast, orientation) so the manifest is easier to navigate.
"""
from __future__ import annotations
import base64
import json
import pathlib
import re
import struct
import zlib
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional, Tuple
# Input PDF to scan and the directory that receives the JSON manifest.
PDF_PATH = pathlib.Path("wizardmerge.pdf")
OUTPUT_DIR = pathlib.Path("extracted_graphics")
@dataclass
class ImageObject:
    """Metadata and raw bytes for a single PDF image object."""

    object_number: int  # PDF object id
    width: int
    height: int
    color_space: str  # e.g. "/DeviceRGB" or "/DeviceGray"
    bits_per_component: int
    filter: str  # e.g. "/FlateDecode", "/DCTDecode", or "" when absent
    stream: bytes  # raw (still encoded) stream payload

    @property
    def channels(self) -> int:
        """Interleaved samples per pixel implied by the color space."""
        for token, count in (("/DeviceRGB", 3), ("/DeviceGray", 1)):
            if token in self.color_space:
                return count
        raise ValueError(f"Unsupported colorspace {self.color_space!r}")
OBJECT_PATTERN = re.compile(rb"(\d+)\s+\d+\s+obj(.*?)endobj", re.DOTALL)
def _extract_stream(obj_bytes: bytes) -> bytes:
"""Return the raw stream bytes for a PDF object."""
stream_match = re.search(rb"stream\r?\n", obj_bytes)
if not stream_match:
raise ValueError("No stream found in object")
start = stream_match.end()
length_match = re.search(rb"/Length\s+(\d+)", obj_bytes)
if length_match:
length = int(length_match.group(1))
return obj_bytes[start : start + length]
end = obj_bytes.find(b"endstream", start)
return obj_bytes[start:end].rstrip(b"\r\n")
def iter_image_objects(pdf_bytes: bytes) -> Iterable[ImageObject]:
    """Yield every image XObject discovered in the raw PDF payload.

    Whitespace between a dictionary key and a name value is optional in PDF
    (e.g. ``/Subtype/Image`` and ``/Subtype /Image`` are both legal), so the
    detection uses ``\\s*`` instead of a hard-coded single space — the old
    literal ``b"/Subtype /Image"`` silently skipped compact writers.

    Raises:
        ValueError: when an image object lacks width/height/bit-depth.
    """
    for match in OBJECT_PATTERN.finditer(pdf_bytes):
        obj_bytes = match.group(0)
        if not re.search(rb"/Subtype\s*/Image", obj_bytes):
            continue
        object_number = int(match.group(1))

        def _lookup(name: bytes) -> Optional[str]:
            # Name values start with '/' and end at whitespace or a PDF
            # delimiter; excluding delimiters keeps '>>' out of the capture.
            found = re.search(rb"/" + name + rb"\s*(/[^\s/<>\[\]()]+)", obj_bytes)
            return found.group(1).decode("ascii") if found else None

        width_match = re.search(rb"/Width\s+(\d+)", obj_bytes)
        height_match = re.search(rb"/Height\s+(\d+)", obj_bytes)
        bits_match = re.search(rb"/BitsPerComponent\s+(\d+)", obj_bytes)
        if not (width_match and height_match and bits_match):
            raise ValueError(f"Image {object_number} missing dimension metadata")
        yield ImageObject(
            object_number=object_number,
            width=int(width_match.group(1)),
            height=int(height_match.group(1)),
            color_space=_lookup(b"ColorSpace") or "/DeviceRGB",
            bits_per_component=int(bits_match.group(1)),
            filter=_lookup(b"Filter") or "",
            stream=_extract_stream(obj_bytes),
        )
def _png_chunk(tag: bytes, payload: bytes) -> bytes:
length = struct.pack(">I", len(payload))
crc = struct.pack(">I", zlib.crc32(tag + payload) & 0xFFFFFFFF)
return length + tag + payload + crc
def _dominant_color_label(means: Tuple[float, ...]) -> str:
"""Return a coarse color label from per-channel means."""
if len(means) == 1:
gray = means[0]
if gray < 16:
return "black"
if gray < 64:
return "dark-gray"
if gray < 160:
return "mid-gray"
if gray < 224:
return "light-gray"
return "white"
red, green, blue = means
brightness = (red + green + blue) / 3
if max(red, green, blue) - min(red, green, blue) < 12:
# Essentially grayscale
return _dominant_color_label((brightness,))
dominant_channel = max(range(3), key=lambda idx: (red, green, blue)[idx])
channel_names = {0: "red", 1: "green", 2: "blue"}
brightness_label = _dominant_color_label((brightness,))
return f"{brightness_label}-{channel_names[dominant_channel]}"
def _orientation_tag(width: int, height: int) -> str:
if width == height:
return "square"
if width > height:
return "landscape"
return "portrait"
def analyse_flate_image(image: ImageObject) -> Dict[str, object]:
    """Compute basic color statistics for a Flate-decoded image.

    Uses Welford's online algorithm per channel, tracks min/max, and counts
    distinct pixel tuples up to a cap (beyond which palette size is None).
    Raises ValueError when the decompressed length does not match WxHxC.
    """
    pixels = zlib.decompress(image.stream)
    n_channels = image.channels
    expected_size = image.width * n_channels * image.height
    if len(pixels) != expected_size:
        raise ValueError(
            f"Unexpected data length for image {image.object_number}: "
            f"got {len(pixels)}, expected {expected_size}"
        )
    # Parallel per-channel accumulators for Welford's running statistics.
    counts = [0] * n_channels
    means = [0.0] * n_channels
    m2s = [0.0] * n_channels
    mins = [255] * n_channels
    maxs = [0] * n_channels
    seen: set = set()
    seen_cap = 1024  # stop counting distinct colors past this point
    for offset in range(0, len(pixels), n_channels):
        for ch in range(n_channels):
            value = pixels[offset + ch]
            counts[ch] += 1
            delta = value - means[ch]
            means[ch] += delta / counts[ch]
            m2s[ch] += delta * (value - means[ch])
            if value < mins[ch]:
                mins[ch] = value
            if value > maxs[ch]:
                maxs[ch] = value
        if len(seen) < seen_cap:
            seen.add(tuple(pixels[offset : offset + n_channels]))
    mean_tuple = tuple(means)
    return {
        "means": mean_tuple,
        "variances": tuple(m2s[ch] / max(counts[ch], 1) for ch in range(n_channels)),
        "min": tuple(mins),
        "max": tuple(maxs),
        "palette_size": len(seen) if len(seen) < seen_cap else None,
        "primary_color": _dominant_color_label(mean_tuple),
        "orientation": _orientation_tag(image.width, image.height),
    }
def semantic_name(image: ImageObject, mime: str, analysis: Optional[Dict[str, object]]) -> str:
    """Generate a more meaningful file name based on image analysis.

    The name encodes color, contrast class, orientation, dimensions, and
    the originating PDF object number; JPEGs without analysis get a plain
    "jpeg-<orientation>" prefix instead.
    """
    suffix = "png" if mime == "image/png" else "jpg"
    parts: List[str] = []
    if analysis:
        distinct = analysis.get("palette_size")
        variances: Tuple[float, ...] = analysis.get("variances", ())  # type: ignore[assignment]
        spread = sum(variances) / max(len(variances), 1)
        parts.append(analysis.get("primary_color") or "unknown")
        # Contrast bucket: exact palette first, then variance fallback.
        if distinct == 1:
            parts.append("solid")
        elif distinct and distinct <= 4:
            parts.append("two-tone")
        elif spread < 400:
            parts.append("low-contrast")
        else:
            parts.append("detailed")
        parts.append(str(analysis.get("orientation", "unknown")))
    else:
        parts.append("jpeg")
        parts.append(_orientation_tag(image.width, image.height))
    parts.append(f"{image.width}x{image.height}")
    parts.append(f"obj{image.object_number}")
    return "-".join(parts) + f".{suffix}"
def raw_to_png(image: ImageObject) -> tuple[bytes, Dict[str, object]]:
    """Convert a Flate-encoded image stream to PNG bytes plus its analysis.

    Only 8-bit-per-component images are supported; raises ValueError for
    anything else.
    """
    if image.bits_per_component != 8:
        raise ValueError(f"Unsupported bit depth: {image.bits_per_component}")
    analysis = analyse_flate_image(image)
    pixels = zlib.decompress(image.stream)
    stride = image.width * image.channels
    # PNG requires a filter byte (0 = None) at the start of every scanline.
    scanlines = bytearray()
    for row_start in range(0, len(pixels), stride):
        scanlines.append(0)
        scanlines.extend(pixels[row_start : row_start + stride])
    color_type = 0 if image.channels == 1 else 2  # 0 = grayscale, 2 = truecolor
    header = struct.pack(">IIBBBBB", image.width, image.height, 8, color_type, 0, 0, 0)
    parts = [
        b"\x89PNG\r\n\x1a\n",
        _png_chunk(b"IHDR", header),
        _png_chunk(b"IDAT", zlib.compress(bytes(scanlines))),
        _png_chunk(b"IEND", b""),
    ]
    return b"".join(parts), analysis
def save_images(images: List[ImageObject]) -> None:
    """Encode every image into a JSON manifest under ``OUTPUT_DIR``.

    Flate images are transcoded to PNG (with content analysis); DCT images
    are passed through as JPEG. Per-image failures are collected into
    ``errors.txt`` instead of aborting the whole run.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    manifest: List[dict[str, object]] = []
    errors: List[str] = []
    # Stable ordering: walk images by ascending PDF object number.
    # NOTE(review): `index` is currently unused in the loop body.
    for index, image in enumerate(sorted(images, key=lambda im: im.object_number), start=1):
        analysis: Optional[Dict[str, object]] = None
        try:
            if image.filter == "/FlateDecode":
                raw_bytes, analysis = raw_to_png(image)
                mime = "image/png"
            elif image.filter == "/DCTDecode":
                # DCT streams are already JPEG payloads; keep them as-is.
                raw_bytes = image.stream
                mime = "image/jpeg"
            else:
                raise ValueError(f"Unsupported filter {image.filter}")
        except Exception as exc:  # noqa: BLE001 - surface helpful error context
            # Record the failure and continue; one bad image should not
            # stop the extraction of the rest.
            placeholder = f"obj{image.object_number}"
            errors.append(f"{placeholder}: {exc}")
            print(f"Skipping {placeholder}: {exc}")
            continue
        name = semantic_name(image, mime, analysis)
        # Base64 keeps the manifest self-contained without raw binaries on disk.
        encoded = base64.b64encode(raw_bytes).decode("ascii")
        manifest.append(
            {
                "name": name,
                "object_number": image.object_number,
                "width": image.width,
                "height": image.height,
                "color_space": image.color_space,
                "bits_per_component": image.bits_per_component,
                "mime": mime,
                "base64": encoded,
                "analysis": analysis,
            }
        )
        print(f"Captured {name} ({image.width}x{image.height}, {mime})")
    images_path = OUTPUT_DIR / "images.json"
    images_path.write_text(json.dumps(manifest, indent=2))
    if errors:
        (OUTPUT_DIR / "errors.txt").write_text("\n".join(errors))
        print(f"Encountered errors for {len(errors)} image(s); see errors.txt")
    print(f"Wrote JSON manifest to {images_path}")
def main() -> None:
    """Entry point: read the PDF from disk and emit the extraction manifest."""
    document = PDF_PATH.read_bytes()
    save_images(list(iter_image_objects(document)))


if __name__ == "__main__":
    main()

View File

@@ -1,8 +0,0 @@
#!/usr/bin/env sh
# Install the Python requirements with the selected interpreter, either
# system-wide or into whatever environment is currently active.
set -eu

PYTHON_BIN="${PYTHON_BIN:-python3}"

"$PYTHON_BIN" -m pip install --upgrade pip
"$PYTHON_BIN" -m pip install -r requirements.txt

View File

@@ -1,71 +0,0 @@
#!/usr/bin/env python3
"""Extract text from page images using OCR and save as a markdown document.
Dependencies:
pip install pillow pytesseract
System requirements:
tesseract-ocr (install via: sudo apt-get install tesseract-ocr)
"""
from pathlib import Path
import pytesseract
from PIL import Image
def ocr_pages(pages_dir: Path, output_file: Path) -> None:
    """Perform OCR on all page images and create a single markdown document.

    Pages are read as ``*.png`` from *pages_dir*, ordered by the trailing
    ``_<n>`` in each stem, and concatenated with per-page headings into
    *output_file*. Raises FileNotFoundError / ValueError when the directory
    or images are missing.
    """
    pages_dir = pages_dir.resolve()
    if not pages_dir.exists():
        raise FileNotFoundError(f"Pages directory not found: {pages_dir}")

    def page_sort_key(path: Path) -> int:
        # Trailing "_<n>" in the stem orders the pages; unparsable names sort first.
        try:
            return int(path.stem.split("_")[-1])
        except (ValueError, IndexError):
            return 0

    image_files = sorted(pages_dir.glob("*.png"), key=page_sort_key)
    if not image_files:
        raise ValueError(f"No PNG files found in {pages_dir}")
    print(f"Found {len(image_files)} page images to process...")
    chunks = [
        "# WizardMerge Research Paper\n",
        "*Extracted via OCR from paper pages*\n\n",
        "---\n\n",
    ]
    total = len(image_files)
    for page_no, image_file in enumerate(image_files, start=1):
        print(f"Processing page {page_no}/{total}: {image_file.name}")
        try:
            page_text = pytesseract.image_to_string(Image.open(image_file))
            chunks.append(f"## Page {page_no}\n\n")
            chunks.append(page_text.strip())
            chunks.append("\n\n---\n\n")
        except Exception as e:
            # OCR errors are recorded inline so pagination stays intact.
            print(f" Error processing {image_file.name}: {e}")
            chunks.append(f"## Page {page_no}\n\n")
            chunks.append(f"*[OCR Error: {e}]*\n\n")
            chunks.append("---\n\n")
    output_file.write_text("".join(chunks))
    print(f"\nOCR complete! Output written to: {output_file}")
    print(f"Total pages processed: {total}")
if __name__ == "__main__":
    # Default layout: page PNGs live in ../docs/pages relative to this script;
    # the combined markdown lands in ../docs/PAPER.md.
    pages_dir = Path(__file__).parent.parent / "docs" / "pages"
    output_file = Path(__file__).parent.parent / "docs" / "PAPER.md"
    ocr_pages(pages_dir, output_file)

View File

@@ -1,13 +0,0 @@
#!/usr/bin/env sh
# Launch the WizardMerge GUI, activating the project virtualenv first when
# one exists so its interpreter and packages win on PATH.
set -eu

VENV_DIR="${VENV_DIR:-.venv}"
PYTHON_BIN="${PYTHON_BIN:-python3}"

if [ -d "$VENV_DIR" ]; then
    # shellcheck disable=SC1090
    . "$VENV_DIR/bin/activate"
fi

exec "$PYTHON_BIN" -m wizardmerge.app "$@"

View File

@@ -1,18 +0,0 @@
#!/usr/bin/env sh
# Create the local virtual environment (if needed) and install dependencies
# into it.
set -eu

VENV_DIR="${VENV_DIR:-.venv}"
PYTHON_BIN="${PYTHON_BIN:-python3}"

if [ ! -d "$VENV_DIR" ]; then
    "$PYTHON_BIN" -m venv "$VENV_DIR"
fi

# shellcheck disable=SC1090
. "$VENV_DIR/bin/activate"

pip install --upgrade pip
pip install -r requirements.txt

echo "Environment ready. Activate with: . $VENV_DIR/bin/activate"

View File

@@ -1,286 +0,0 @@
#!/usr/bin/env python3
"""
Unified TLC helper: replaces bootstrap.{sh,ps1} and run-tlc.{sh,ps1}.
Subcommands:
- bootstrap : download tla2tools.jar into tools/ (or custom dir)
- run : ensure jar exists, then run TLC and tee output to log
"""
from __future__ import annotations
import argparse
import shutil
import subprocess
import sys
from pathlib import Path
from typing import Iterable, List
from urllib.request import urlopen
DEFAULT_VERSION = "1.8.0"
DEFAULT_TOOLS_DIR = "tools"
DEFAULT_RESULTS_DIR = "ci-results"
DEFAULT_MODULE = "STLRepairAlgo"
DEFAULT_CONFIG = "models/STLRepairAlgo.cfg"
DEFAULT_SPEC_DIR = "spec"
DEFAULT_JAVA = "java"


def parse_args() -> argparse.Namespace:
    """Build the two-subcommand CLI (bootstrap, run) and parse sys.argv."""
    root = argparse.ArgumentParser(
        description="TLA+ TLC helper (bootstrap + run) in one script."
    )
    # Options shared by both subcommands live on the root parser.
    root.add_argument(
        "--tools-dir",
        default=DEFAULT_TOOLS_DIR,
        help=f"Directory for tla2tools.jar (default: {DEFAULT_TOOLS_DIR!r}).",
    )
    root.add_argument(
        "--version",
        default=DEFAULT_VERSION,
        help=f"TLA+ tools version tag (default: {DEFAULT_VERSION!r}).",
    )
    root.add_argument(
        "--url-template",
        default=(
            "https://github.com/tlaplus/tlaplus/releases/"
            "download/v{version}/tla2tools.jar"
        ),
        help="Template URL for tla2tools.jar; {version} will be substituted.",
    )
    commands = root.add_subparsers(dest="command", required=True, help="Subcommands.")
    boot_cmd = commands.add_parser(
        "bootstrap",
        help="Download tla2tools.jar into tools-dir if missing.",
    )
    boot_cmd.add_argument(
        "--force",
        action="store_true",
        help="Re-download even if tla2tools.jar already exists.",
    )
    run_cmd = commands.add_parser(
        "run",
        help="Run TLC on a TLA+ module, teeing output to a log file.",
    )
    run_cmd.add_argument(
        "module",
        nargs="?",
        default=DEFAULT_MODULE,
        help=f"TLA+ module name without .tla (default: {DEFAULT_MODULE!r}).",
    )
    run_cmd.add_argument(
        "-c",
        "--config",
        default=DEFAULT_CONFIG,
        help=f"Path to TLC config file (default: {DEFAULT_CONFIG!r}).",
    )
    run_cmd.add_argument(
        "--spec-dir",
        default=DEFAULT_SPEC_DIR,
        help=f"Directory containing .tla specs (default: {DEFAULT_SPEC_DIR!r}).",
    )
    run_cmd.add_argument(
        "--results-dir",
        default=DEFAULT_RESULTS_DIR,
        help=f"Directory for TLC log files (default: {DEFAULT_RESULTS_DIR!r}).",
    )
    run_cmd.add_argument(
        "--java",
        default=DEFAULT_JAVA,
        help=f"Java executable (default: {DEFAULT_JAVA!r}).",
    )
    run_cmd.add_argument(
        "--extra-java-arg",
        action="append",
        default=[],
        metavar="ARG",
        help="Extra argument to pass to Java (can be repeated).",
    )
    run_cmd.add_argument(
        "--no-bootstrap",
        action="store_true",
        help="Skip automatic bootstrap before running TLC.",
    )
    return root.parse_args()
def ensure_dir(path: Path) -> None:
    """Create *path* (including parents) if missing; no-op when it exists."""
    path.mkdir(parents=True, exist_ok=True)
def build_jar_url(version: str, url_template: str) -> str:
    """Substitute *version* into the release-URL template's {version} slot."""
    return url_template.format(version=version)
def download_tla_tools(url: str, target: Path, overwrite: bool = False) -> None:
    """Download tla2tools.jar from *url* into *target*.

    Skips the download when *target* already exists unless *overwrite* is
    set. Data is streamed to a ``.tmp`` sibling first and atomically renamed
    into place so a failed download never leaves a truncated jar behind.
    Raises SystemExit with a helpful message on any download failure.
    """
    if target.exists() and not overwrite:
        print(f"tla2tools.jar already present at {target}.")
        return
    ensure_dir(target.parent)
    tmp = target.with_suffix(target.suffix + ".tmp")
    print(f"Downloading tla2tools.jar from {url} ...")
    try:
        with urlopen(url) as resp, tmp.open("wb") as f:
            # copyfileobj streams in chunks without loading the whole jar,
            # replacing the previous hand-rolled read/write loop.
            shutil.copyfileobj(resp, f)
    except Exception as exc:
        if tmp.exists():
            tmp.unlink()
        raise SystemExit(f"Failed to download tla2tools.jar: {exc}") from exc
    tmp.replace(target)
    target.chmod(0o644)
    print(f"Saved tla2tools.jar to {target}.")
def ensure_java_available(java_exe: str) -> None:
    """Abort with a helpful message when *java_exe* cannot be resolved."""
    resolved = shutil.which(java_exe)
    if resolved is None:
        raise SystemExit(
            f"Java executable {java_exe!r} not found in PATH. "
            "Install Java or pass --java with a full path."
        )
def tee_process_output(
    proc: subprocess.Popen,
    log_path: Path,
) -> int:
    """Mirror *proc*'s stdout to the console and *log_path*; return its exit code."""
    ensure_dir(log_path.parent)
    with log_path.open("w", encoding="utf-8", errors="replace") as sink:
        assert proc.stdout is not None
        for line in proc.stdout:
            # Echo first, then persist; flush both so progress is live.
            sys.stdout.write(line)
            sys.stdout.flush()
            sink.write(line)
            sink.flush()
    return proc.wait()
def build_tlc_command(
    java_exe: str,
    extra_java_args: Iterable[str],
    jar_path: Path,
    cfg_path: Path,
    module_path: Path,
) -> List[str]:
    """Assemble the full Java argv that launches TLC on *module_path*."""
    tlc_tail = [
        "-cp",
        str(jar_path),
        "tlc2.TLC",
        "-config",
        str(cfg_path),
        str(module_path),
    ]
    return [java_exe, *extra_java_args, *tlc_tail]
def run_tlc(
    java_exe: str,
    extra_java_args: Iterable[str],
    tools_dir: Path,
    spec_dir: Path,
    module: str,
    cfg: Path,
    results_dir: Path,
) -> int:
    """Run TLC on *module* and tee its output to <results_dir>/<module>.tlc.log.

    Returns TLC's exit code. Raises SystemExit when Java, the jar, the
    spec file, or the config file cannot be found.
    """
    ensure_java_available(java_exe)
    jar_path = tools_dir / "tla2tools.jar"
    if not jar_path.exists():
        raise SystemExit(
            f"{jar_path} does not exist. Run with 'bootstrap' first "
            "or omit --no-bootstrap on the 'run' command."
        )
    module_path = spec_dir / f"{module}.tla"
    if not module_path.exists():
        raise SystemExit(f"Spec file not found: {module_path}")
    cfg_path = cfg
    if not cfg_path.exists():
        raise SystemExit(f"Config file not found: {cfg_path}")
    ensure_dir(results_dir)
    log_path = results_dir / f"{module}.tlc.log"
    cmd = build_tlc_command(
        java_exe=java_exe,
        extra_java_args=list(extra_java_args),
        jar_path=jar_path,
        cfg_path=cfg_path,
        module_path=module_path,
    )
    print("Running TLC with command:")
    print(" " + " ".join(str(c) for c in cmd))
    print(f"Logging output to {log_path}")
    # Merge stderr into stdout and line-buffer so the tee stays live.
    # NOTE(review): universal_newlines duplicates text=True; harmless but redundant.
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
        universal_newlines=True,
    )
    return tee_process_output(proc, log_path)
def main() -> None:
    """Dispatch the parsed subcommand (bootstrap or run)."""
    args = parse_args()
    tools_dir = Path(args.tools_dir)
    jar_target = tools_dir / "tla2tools.jar"
    jar_url = build_jar_url(args.version, args.url_template)
    if args.command == "bootstrap":
        download_tla_tools(jar_url, jar_target, overwrite=args.force)
        return
    if args.command == "run":
        if not args.no_bootstrap:
            # Fetch the jar if needed so 'run' works on a fresh checkout.
            download_tla_tools(jar_url, jar_target, overwrite=False)
        status = run_tlc(
            java_exe=args.java,
            extra_java_args=args.extra_java_arg,
            tools_dir=tools_dir,
            spec_dir=Path(args.spec_dir),
            module=args.module,
            cfg=Path(args.config),
            results_dir=Path(args.results_dir),
        )
        raise SystemExit(status)
    raise SystemExit("No command given; use --help for usage.")


if __name__ == "__main__":
    main()