diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 28207793..10086917 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -268,6 +268,6 @@ class PipelineOptions(BaseModel): class AssembleOptions(BaseModel): - remove_page_images: bool = ( - True # True: page images are removed in the assemble step + keep_page_images: bool = ( + False # False: page images are removed in the assemble step ) diff --git a/docling/document_converter.py b/docling/document_converter.py index fb6381f2..9954bc9b 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -188,7 +188,7 @@ class DocumentConverter: # Free up mem resources before moving on with next batch # Remove page images (can be disabled) - if self.assemble_options.remove_page_images: + if not self.assemble_options.keep_page_images: assembled_page.image = ( None # Comment this if you want to visualize page images ) diff --git a/examples/export_figures.py b/examples/export_figures.py index 80a567ec..6cd98430 100644 --- a/examples/export_figures.py +++ b/examples/export_figures.py @@ -1,15 +1,14 @@ -import json import logging import time from pathlib import Path -from typing import Iterable +from typing import Tuple from docling.datamodel.base_models import ( AssembleOptions, - BoundingBox, ConversionStatus, - CoordOrigin, - PipelineOptions, + FigureElement, + PageElement, + TableElement, ) from docling.datamodel.document import ConvertedDocument, DocumentConversionInput from docling.document_converter import DocumentConverter @@ -17,46 +16,43 @@ from docling.document_converter import DocumentConverter _log = logging.getLogger(__name__) -def export_figures( - converted_docs: Iterable[ConvertedDocument], +def export_page_images( + doc: ConvertedDocument, output_dir: Path, ): output_dir.mkdir(parents=True, exist_ok=True) - success_count = 0 - failure_count = 0 + doc_filename = doc.input.file.stem - for doc in converted_docs: - if doc.status == ConversionStatus.SUCCESS: - success_count += 1 - doc_filename = doc.input.file.stem + for page in doc.pages: + page_no = page.page_no + 1 + page_image_filename = output_dir / f"{doc_filename}-{page_no}.png" + with page_image_filename.open("wb") as fp: + page.image.save(fp, format="PNG") - for page in doc.pages: - page_no = page.page_no + 1 - page_image_filename = output_dir / f"{doc_filename}-{page_no}.png" - with page_image_filename.open("wb") as fp: - page.image.save(fp, format="PNG") - for fig_ix, fig in enumerate(doc.output.figures): - page_no = fig.prov[0].page - page_ix = page_no - 1 - x0, y0, x1, y1 = fig.prov[0].bbox - crop_bbox = BoundingBox( - l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT - ).to_top_left_origin(page_height=doc.pages[page_ix].size.height) +def export_element_images( + doc: ConvertedDocument, + output_dir: Path, + allowed_element_types: Tuple[PageElement] = (FigureElement,), +): + output_dir.mkdir(parents=True, exist_ok=True) - cropped_im = doc.pages[page_ix].image.crop(crop_bbox.as_tuple()) - fig_image_filename = output_dir / f"{doc_filename}-fig{fig_ix+1}.png" - with fig_image_filename.open("wb") as fp: - cropped_im.save(fp, "PNG") + doc_filename = doc.input.file.stem - else: - _log.info(f"Document {doc.input.file} failed to convert.") - failure_count += 1 + for element_ix, element in enumerate(doc.assembled.elements): + if isinstance(element, allowed_element_types): + page_ix = element.page_no + crop_bbox = element.cluster.bbox.to_top_left_origin( + page_height=doc.pages[page_ix].size.height + ) - _log.info( - f"Processed {success_count + failure_count} docs, of which {failure_count} failed" - ) + cropped_im = doc.pages[page_ix].image.crop(crop_bbox.as_tuple()) + element_image_filename = ( + output_dir / f"{doc_filename}-element-{element_ix}.png" + ) + with element_image_filename.open("wb") as fp: + cropped_im.save(fp, "PNG") def main(): @@ -68,15 +64,34 @@ def main(): input_files = DocumentConversionInput.from_paths(input_doc_paths) + # Important: For operating with page images, we must keep them, otherwise the DocumentConverter + # will destroy them for cleaning up memory. assemble_options = AssembleOptions() - assemble_options.remove_page_images = False + assemble_options.keep_page_images = True doc_converter = DocumentConverter(assemble_options=assemble_options) start_time = time.time() converted_docs = doc_converter.convert(input_files) - export_figures(converted_docs, output_dir=Path("./scratch")) + + for doc in converted_docs: + if doc.status != ConversionStatus.SUCCESS: + _log.info(f"Document {doc.input.file} failed to convert.") + continue + + # Export page images + export_page_images(doc, output_dir=Path("./scratch")) + + # Export figures + # export_element_images(doc, output_dir=Path("./scratch"), allowed_element_types=(FigureElement,)) + + # Export figures and tables + export_element_images( + doc, + output_dir=Path("./scratch"), + allowed_element_types=(FigureElement, TableElement), + ) end_time = time.time() - start_time