diff --git a/docling/backend/abstract_backend.py b/docling/backend/abstract_backend.py index 36f61191..91da4ef2 100644 --- a/docling/backend/abstract_backend.py +++ b/docling/backend/abstract_backend.py @@ -7,8 +7,6 @@ from PIL import Image class PdfPageBackend(ABC): - def __init__(self, page_obj: Any) -> object: - pass @abstractmethod def get_text_in_rect(self, bbox: "BoundingBox") -> str: diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py index 18f6c69e..6b15a2ba 100644 --- a/docling/backend/docling_parse_backend.py +++ b/docling/backend/docling_parse_backend.py @@ -19,7 +19,6 @@ class DoclingParsePageBackend(PdfPageBackend): def __init__( self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage ): - super().__init__(page_obj) self._ppage = page_obj parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no) diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py index 56758b1d..adf0b63c 100644 --- a/docling/backend/pypdfium2_backend.py +++ b/docling/backend/pypdfium2_backend.py @@ -13,9 +13,8 @@ from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSi class PyPdfiumPageBackend(PdfPageBackend): - def __init__(self, page_obj: PdfPage): - super().__init__(page_obj) - self._ppage = page_obj + def __init__(self, pdfium_doc: pdfium.PdfDocument, page_no: int): + self._ppage: pdfium.PdfPage = pdfium_doc[page_no] self.text_page = None def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]: @@ -223,7 +222,7 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend): return len(self._pdoc) def load_page(self, page_no: int) -> PyPdfiumPageBackend: - return PyPdfiumPageBackend(self._pdoc[page_no]) + return PyPdfiumPageBackend(self._pdoc, page_no) def is_valid(self) -> bool: return self.page_count() > 0 diff --git a/docling/document_converter.py b/docling/document_converter.py index 8b1b0e15..dce42cfa 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -157,47 +157,54 @@ class DocumentConverter: for page_batch in chunkify( converted_doc.pages, settings.perf.page_batch_size ): + try: - start_pb_time = time.time() - # Pipeline + start_pb_time = time.time() + # Pipeline - # 1. Initialise the page resources - init_pages = map( - functools.partial(self.initialize_page, in_doc), page_batch - ) + # 1. Initialise the page resources + init_pages = map( + functools.partial(self.initialize_page, in_doc), page_batch + ) - # 2. Populate page image - pages_with_images = map( - functools.partial(self.populate_page_images, in_doc), init_pages - ) + # 2. Populate page image + pages_with_images = map( + functools.partial(self.populate_page_images, in_doc), init_pages + ) - # 3. Populate programmatic page cells - pages_with_cells = map( - functools.partial(self.parse_page_cells, in_doc), - pages_with_images, - ) + # 3. Populate programmatic page cells + pages_with_cells = map( + functools.partial(self.parse_page_cells, in_doc), + pages_with_images, + ) - # 4. Run pipeline stages - pipeline_pages = self.model_pipeline.apply(pages_with_cells) + # 4. Run pipeline stages + pipeline_pages = self.model_pipeline.apply(pages_with_cells) - # 5. Assemble page elements (per page) - assembled_pages = self.page_assemble_model(pipeline_pages) + # 5. Assemble page elements (per page) + assembled_pages = self.page_assemble_model(pipeline_pages) - # exhaust assembled_pages - for assembled_page in assembled_pages: - # Free up mem resources before moving on with next batch + # exhaust assembled_pages + for assembled_page in assembled_pages: + # Free up mem resources before moving on with next batch - # Remove page images (can be disabled) - if self.assemble_options.images_scale is None: - assembled_page._image_cache = {} + # Remove page images (can be disabled) + if self.assemble_options.images_scale is None: + assembled_page._image_cache = {} - # Unload backend - assembled_page._backend.unload() + # Unload backend + assembled_page._backend.unload() - all_assembled_pages.append(assembled_page) + all_assembled_pages.append(assembled_page) - end_pb_time = time.time() - start_pb_time - _log.info(f"Finished converting page batch time={end_pb_time:.3f}") + end_pb_time = time.time() - start_pb_time + _log.info(f"Finished converting page batch time={end_pb_time:.3f}") + + except Exception as e: + trace = "\n".join(traceback.format_exception(e)) + _log.info( + f"Encountered an error during processing of page batch: {trace}" + ) # Free up mem resources of PDF backend in_doc._backend.unload() @@ -230,7 +237,9 @@ class DocumentConverter: # Generate the page image and store it in the page object def populate_page_images(self, doc: InputDocument, page: Page) -> Page: # default scale - page.get_image(scale=1.0) + page.get_image( + scale=1.0 + ) # puts the page image on the image cache at default scale # user requested scales if self.assemble_options.images_scale is not None: