From 4338dea17b31c982155c80f515d8a3ca1b79dfcb Mon Sep 17 00:00:00 2001
From: Michele Dolfi
Date: Mon, 12 Aug 2024 11:29:27 +0200
Subject: [PATCH] Add assemble options and example saving pages and figures

Signed-off-by: Michele Dolfi
---
Reviewer notes (free-text area; not part of the commit message or the diff):

* This copy of the patch had lost its line breaks. The line structure below
  was rebuilt from the hunk headers and the diffstat; every hunk matches its
  "@@" line counts and the totals match the diffstat. Indentation of the
  context lines in docling/document_converter.py is inferred from the Python
  nesting -- confirm with `git apply --check` before applying.
* The From: and Signed-off-by: lines carry no e-mail address in this copy.
  `git am` will likely reject the mail until the address is restored;
  `git apply` does not read these headers.
* examples/export_figures.py imports `json` and `PipelineOptions` but never
  uses them.
* The inline comment "Comment this if you want to visualize page images" is
  superseded by AssembleOptions.remove_page_images, which this patch adds.

 docling/datamodel/base_models.py |  6 +++
 docling/document_converter.py    | 17 +++++--
 examples/export_figures.py       | 87 ++++++++++++++++++++++++++++++++
 3 files changed, 106 insertions(+), 4 deletions(-)
 create mode 100644 examples/export_figures.py

diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
index 8b6796d6..28207793 100644
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -265,3 +265,9 @@ class PipelineOptions(BaseModel):
     do_ocr: bool = False  # True: perform OCR, replace programmatic PDF text
 
     table_structure_options: TableStructureOptions = TableStructureOptions()
+
+
+class AssembleOptions(BaseModel):
+    remove_page_images: bool = (
+        True  # True: page images are removed in the assemble step
+    )
diff --git a/docling/document_converter.py b/docling/document_converter.py
index 95b30a06..fb6381f2 100644
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -14,6 +14,7 @@ from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
 from docling.backend.abstract_backend import PdfDocumentBackend
 from docling.datamodel.base_models import (
     AssembledUnit,
+    AssembleOptions,
     ConversionStatus,
     Page,
     PipelineOptions,
@@ -44,6 +45,7 @@ class DocumentConverter:
         pipeline_options: PipelineOptions = PipelineOptions(),
         pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
         pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
+        assemble_options: AssembleOptions = AssembleOptions(),
     ):
         if not artifacts_path:
             artifacts_path = self.download_models_hf()
@@ -57,6 +59,7 @@ class DocumentConverter:
         self.page_assemble_model = PageAssembleModel(config={})
         self.glm_model = GlmModel(config={})
         self.pdf_backend = pdf_backend
+        self.assemble_options = assemble_options
 
     @staticmethod
     def download_models_hf(
@@ -174,17 +177,23 @@ class DocumentConverter:
                     pages_with_images,
                 )
 
+                # 4. Run pipeline stages
                 pipeline_pages = self.model_pipeline.apply(pages_with_cells)
 
-                # 7. Assemble page elements (per page)
+                # 5. Assemble page elements (per page)
                 assembled_pages = self.page_assemble_model(pipeline_pages)
 
                 # exhaust assembled_pages
                 for assembled_page in assembled_pages:
                     # Free up mem resources before moving on with next batch
-                    assembled_page.image = (
-                        None  # Comment this if you want to visualize page images
-                    )
+
+                    # Remove page images (can be disabled)
+                    if self.assemble_options.remove_page_images:
+                        assembled_page.image = (
+                            None  # Comment this if you want to visualize page images
+                        )
+
+                    # Unload backend
                     assembled_page._backend.unload()
 
                     all_assembled_pages.append(assembled_page)
diff --git a/examples/export_figures.py b/examples/export_figures.py
new file mode 100644
index 00000000..80a567ec
--- /dev/null
+++ b/examples/export_figures.py
@@ -0,0 +1,87 @@
+import json
+import logging
+import time
+from pathlib import Path
+from typing import Iterable
+
+from docling.datamodel.base_models import (
+    AssembleOptions,
+    BoundingBox,
+    ConversionStatus,
+    CoordOrigin,
+    PipelineOptions,
+)
+from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
+from docling.document_converter import DocumentConverter
+
+_log = logging.getLogger(__name__)
+
+
+def export_figures(
+    converted_docs: Iterable[ConvertedDocument],
+    output_dir: Path,
+):
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    success_count = 0
+    failure_count = 0
+
+    for doc in converted_docs:
+        if doc.status == ConversionStatus.SUCCESS:
+            success_count += 1
+            doc_filename = doc.input.file.stem
+
+            for page in doc.pages:
+                page_no = page.page_no + 1
+                page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
+                with page_image_filename.open("wb") as fp:
+                    page.image.save(fp, format="PNG")
+
+            for fig_ix, fig in enumerate(doc.output.figures):
+                page_no = fig.prov[0].page
+                page_ix = page_no - 1
+                x0, y0, x1, y1 = fig.prov[0].bbox
+                crop_bbox = BoundingBox(
+                    l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
+                ).to_top_left_origin(page_height=doc.pages[page_ix].size.height)
+
+                cropped_im = doc.pages[page_ix].image.crop(crop_bbox.as_tuple())
+                fig_image_filename = output_dir / f"{doc_filename}-fig{fig_ix+1}.png"
+                with fig_image_filename.open("wb") as fp:
+                    cropped_im.save(fp, "PNG")
+
+        else:
+            _log.info(f"Document {doc.input.file} failed to convert.")
+            failure_count += 1
+
+    _log.info(
+        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
+    )
+
+
+def main():
+    logging.basicConfig(level=logging.INFO)
+
+    input_doc_paths = [
+        Path("./test/data/2206.01062.pdf"),
+    ]
+
+    input_files = DocumentConversionInput.from_paths(input_doc_paths)
+
+    assemble_options = AssembleOptions()
+    assemble_options.remove_page_images = False
+
+    doc_converter = DocumentConverter(assemble_options=assemble_options)
+
+    start_time = time.time()
+
+    converted_docs = doc_converter.convert(input_files)
+    export_figures(converted_docs, output_dir=Path("./scratch"))
+
+    end_time = time.time() - start_time
+
+    _log.info(f"All documents were converted in {end_time:.2f} seconds.")
+
+
+if __name__ == "__main__":
+    main()