From 14ab351fdb05540b4b4f228747245a658ed4d8a8 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Thu, 12 Sep 2024 08:38:08 +0200
Subject: [PATCH] chore: add simple convert script

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 examples/convert.py | 169 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 169 insertions(+)
 create mode 100644 examples/convert.py

diff --git a/examples/convert.py b/examples/convert.py
new file mode 100644
index 00000000..4f8c985a
--- /dev/null
+++ b/examples/convert.py
@@ -0,0 +1,169 @@
+import json
+import logging
+import time
+from pathlib import Path
+from typing import Iterable
+
+import argparse
+
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.document import ConversionResult, DocumentConversionInput
+from docling.document_converter import DocumentConverter
+
+_log = logging.getLogger(__name__)
+
+from enum import Enum
+
+class Backend(Enum):
+    """PDF parsing backends selectable with the ``--backend`` option."""
+    PDFIUM = "pdfium"
+    DOCLING = "docling"
+
+
+def export_documents(
+    conv_results: Iterable[ConversionResult],
+    output_dir: Path,
+):
+    """Write each successfully converted document to ``output_dir``.
+
+    A success yields ``<stem>.json``, ``<stem>.txt``, ``<stem>.md`` and
+    ``<stem>.doctags``, where ``<stem>`` comes from the input file name.
+    Files are written as UTF-8 so the output does not depend on the
+    platform's default encoding. Failed conversions are logged and counted.
+    The directory is created (with parents) when it does not exist yet.
+
+    Returns:
+        A ``(success_count, failure_count)`` tuple.
+    """
+    output_dir.mkdir(parents=True, exist_ok=True)
+    success_count, failure_count = 0, 0
+
+    for conv_res in conv_results:
+        if conv_res.status != ConversionStatus.SUCCESS:
+            _log.info(f"Document {conv_res.input.file} failed to convert.")
+            failure_count += 1
+            continue
+
+        success_count += 1
+        stem = conv_res.input.file.stem
+        renderers = {
+            "json": lambda: json.dumps(conv_res.render_as_dict()),
+            "txt": conv_res.render_as_text,
+            "md": conv_res.render_as_markdown,
+            "doctags": conv_res.render_as_doctags,
+        }
+        for suffix, render in renderers.items():
+            target = output_dir / f"{stem}.{suffix}"
+            _log.info(f"writing {target}")
+            target.write_text(render(), encoding="utf-8")
+
+    _log.info(
+        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
+    )
+    return success_count, failure_count
+
+
+def _build_converter(backend, ocr):
+    """Create the ``DocumentConverter`` for a backend / OCR combination.
+
+    Table structure recognition is always enabled. The remaining options are:
+
+        pdfium,  no OCR: do_ocr off, cell matching off
+        pdfium,  OCR:    do_ocr on,  cell matching on
+        docling, no OCR: do_ocr off, cell matching on
+        docling, OCR:    do_ocr on,  cell matching on
+
+    Args:
+        backend: A ``Backend`` member or its string value.
+        ocr: Truthy to run OCR in the pipeline.
+
+    Returns:
+        The configured ``DocumentConverter``.
+
+    Raises:
+        ValueError: If ``backend`` does not name a ``Backend``.
+    """
+    backend = Backend(backend)  # accepts a member or its value, else raises
+    pdf_backends = {
+        Backend.PDFIUM: PyPdfiumDocumentBackend,
+        Backend.DOCLING: DoclingParseDocumentBackend,
+    }
+
+    pipeline_options = PipelineOptions()
+    # The OCR flag is honoured for every backend, PyPdfium included.
+    pipeline_options.do_ocr = bool(ocr)
+    pipeline_options.do_table_structure = True
+    # Cell matching is switched off only for PyPdfium without OCR.
+    pipeline_options.table_structure_options.do_cell_matching = bool(
+        ocr or backend is not Backend.PDFIUM
+    )
+
+    return DocumentConverter(
+        pipeline_options=pipeline_options,
+        pdf_backend=pdf_backends[backend],
+    )
+
+
+def main(pdf, ocr, backend):
+    """Convert a single PDF and export the results to ``./scratch``.
+
+    The input is converted with the selected backend, exported through
+    ``export_documents`` and the elapsed time is logged.
+
+    Args:
+        pdf: Path to the PDF file to convert.
+        ocr: Truthy to enable OCR in the conversion pipeline.
+        backend: ``"pdfium"`` or ``"docling"``; a ``Backend`` member also works.
+
+    Raises:
+        ValueError: If ``backend`` is not a known backend.
+        RuntimeError: If the document failed to convert.
+    """
+    logging.basicConfig(level=logging.INFO)
+
+    input_doc_paths = [Path(pdf)]
+
+    doc_converter = _build_converter(backend, ocr)
+
+    # Define input files
+    conv_input = DocumentConversionInput.from_paths(input_doc_paths)
+
+    start_time = time.time()
+
+    conv_results = doc_converter.convert(conv_input)
+    success_count, failure_count = export_documents(
+        conv_results, output_dir=Path("./scratch")
+    )
+
+    elapsed = time.time() - start_time
+
+    _log.info(f"All documents were converted in {elapsed:.2f} seconds.")
+
+    if failure_count > 0:
+        raise RuntimeError(
+            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
+        )
+
+
+def _parse_bool(text):
+    """Parse a boolean option value (``type=bool`` would read "False" as True)."""
+    value = text.strip().lower()
+    if value in {"true", "yes", "on", "1"}:
+        return True
+    if value in {"false", "no", "off", "0"}:
+        return False
+    raise argparse.ArgumentTypeError(f"expected a boolean, got {text!r}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Process PDF files with optional OCR.")
+    parser.add_argument("--pdf", type=str, required=True, help="Path to the PDF file.")
+    parser.add_argument("--ocr", type=_parse_bool, default=False, help="Enable OCR (True or False).")
+    # Offer the enum values as choices so a bad backend is a usage error, not a traceback.
+    parser.add_argument("--backend", type=str.lower, default=Backend.DOCLING.value,
+                        choices=[b.value for b in Backend],
+                        help="Select backend (pdfium or docling). Default is docling.")
+    args = parser.parse_args()
+    main(args.pdf, args.ocr, args.backend)