Files
docling/debug_heron.py
Nikos Livathinos 552a606b4e chore: TMP script to debug heron
Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
2025-08-27 16:07:49 +02:00

106 lines
3.0 KiB
Python

import argparse
import json
from pathlib import Path
import shutil
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.settings import settings
def main():
r""" """
parser = argparse.ArgumentParser()
parser.add_argument(
"-n", "--name",
type=str,
required=True,
help="Branch name"
)
parser.add_argument(
"-d", "--device",
type=str,
required=False,
default="cpu",
help="Device to run the conversion"
)
parser.add_argument(
"-w", "--work-dir",
type=Path,
required=False,
default="/Users/nli/docling/heron_debugging",
help="Work directory"
)
args = parser.parse_args()
pdf_path = args.work_dir / "2305.03393v1-pg9.pdf"
print(f"Name: {args.name}")
print(f"Input file: {pdf_path}")
# Enable debugging
settings.debug.visualize_cells = True
settings.debug.visualize_ocr = True
settings.debug.visualize_layout = True
settings.debug.visualize_raw_layout = True
settings.debug.visualize_tables = True
# Locally decide the device
if args.device.lower() == "cpu":
device = AcceleratorDevice.CPU
elif args.device.lower() == "mps":
device = AcceleratorDevice.MPS
else:
raise ValueError(f"Unsupported device: {device}")
# Setup conversion pipeline
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.generate_parsed_pages = True
pipeline_options.generate_page_images = True
pipeline_options.accelerator_options.device = device
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options,
backend=PdfFormatOption().backend,
)
}
)
# Convert
doc_result = converter.convert(pdf_path)
doc = doc_result.document
# Export and save as json
out_dir = args.work_dir / args.name
out_dir.mkdir(parents=True, exist_ok=True)
save_fn = out_dir / "2305.03393v1-pg9.json"
print(f"Out dir: {out_dir}")
with open(save_fn, "w") as fd:
dd = doc.export_to_dict()
json.dump(dd, fd)
# Move the debug dir
debug_dir = Path("debug/")
if debug_dir.is_dir():
dest_debug = out_dir / "debug"
if dest_debug.is_dir():
shutil.rmtree(dest_debug)
shutil.move(debug_dir, out_dir)
print(f"")
# Visualize the document
viz_imgs = doc.get_visualization()
for page_no, img in viz_imgs.items():
if page_no is not None:
img.save(out_dir / f"docling_p{page_no}.png")
if __name__ == "__main__":
main()