From 552a606b4e8839ab06c36211d4f711cdaedaaf07 Mon Sep 17 00:00:00 2001
From: Nikos Livathinos <nli@zurich.ibm.com>
Date: Wed, 27 Aug 2025 16:07:49 +0200
Subject: [PATCH] chore: TMP script to debug heron

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
---
 debug_heron.py | 105 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)
 create mode 100644 debug_heron.py

diff --git a/debug_heron.py b/debug_heron.py
new file mode 100644
index 00000000..4fa8a382
--- /dev/null
+++ b/debug_heron.py
@@ -0,0 +1,105 @@
+import argparse
+import json
+from pathlib import Path
+import shutil
+
+from docling.datamodel.accelerator_options import AcceleratorDevice
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.datamodel.settings import settings
+
+
+def main():
+    """Convert the heron sample page with docling and collect debug output.
+
+    Converts ``<work-dir>/2305.03393v1-pg9.pdf`` with the cell, OCR, layout,
+    raw-layout and table debug flags set, then writes the JSON export, a local
+    ``debug/`` directory (if present) and page images to ``<work-dir>/<name>``.
+
+    Raises:
+        ValueError: If ``--device`` is neither ``cpu`` nor ``mps``.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-n", "--name", type=str, required=True, help="Branch name")
+    parser.add_argument(
+        "-d", "--device", type=str, default="cpu", help="Device to run the conversion"
+    )
+    parser.add_argument(
+        "-w", "--work-dir",
+        type=Path,
+        required=False,
+        # argparse runs string defaults through `type`, so this becomes a Path.
+        default="/Users/nli/docling/heron_debugging",
+        help="Work directory"
+    )
+    args = parser.parse_args()
+
+    pdf_path = args.work_dir / "2305.03393v1-pg9.pdf"
+    print(f"Name: {args.name}")
+    print(f"Input file: {pdf_path}")
+
+    # Enable debugging
+    settings.debug.visualize_cells = True
+    settings.debug.visualize_ocr = True
+    settings.debug.visualize_layout = True
+    settings.debug.visualize_raw_layout = True
+    settings.debug.visualize_tables = True
+
+    # Locally decide the device
+    if args.device.lower() == "cpu":
+        device = AcceleratorDevice.CPU
+    elif args.device.lower() == "mps":
+        device = AcceleratorDevice.MPS
+    else:
+        # `device` is unbound on this path; report the user-supplied value.
+        raise ValueError(f"Unsupported device: {args.device}")
+
+    # Setup conversion pipeline
+    pipeline_options = PdfPipelineOptions()
+    pipeline_options.do_ocr = True
+    pipeline_options.do_table_structure = True
+    pipeline_options.table_structure_options.do_cell_matching = True
+    pipeline_options.generate_parsed_pages = True
+    pipeline_options.generate_page_images = True
+    pipeline_options.accelerator_options.device = device
+
+    converter = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options,
+                backend=PdfFormatOption().backend,
+            )
+        }
+    )
+
+    # Convert
+    doc_result = converter.convert(pdf_path)
+    doc = doc_result.document
+
+    # Export and save as json
+    out_dir = args.work_dir / args.name
+    out_dir.mkdir(parents=True, exist_ok=True)
+    save_fn = out_dir / "2305.03393v1-pg9.json"
+    print(f"Out dir: {out_dir}")
+    with open(save_fn, "w", encoding="utf-8") as fd:
+        json.dump(doc.export_to_dict(), fd)
+
+    # Move the debug dir, replacing any existing copy in the output dir
+    debug_dir = Path("debug/")
+    if debug_dir.is_dir():
+        dest_debug = out_dir / "debug"
+        if dest_debug.is_dir():
+            shutil.rmtree(dest_debug)
+        shutil.move(str(debug_dir), str(dest_debug))
+        print(f"Debug dir: {dest_debug}")
+
+    # Visualize the document
+    viz_imgs = doc.get_visualization()
+    for page_no, img in viz_imgs.items():
+        if page_no is not None:
+            img.save(out_dir / f"docling_p{page_no}.png")
+
+
+if __name__ == "__main__":
+    main()