diff --git a/CHANGELOG.md b/CHANGELOG.md index 0f15f057..773b1ac9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [v1.7.1](https://github.com/DS4SD/docling/releases/tag/v1.7.1) - 2024-08-23 + +### Fix + +* Better raise exception when a page fails to parse ([#46](https://github.com/DS4SD/docling/issues/46)) ([`8808463`](https://github.com/DS4SD/docling/commit/8808463cecd7ff3a92bd99d2e3d65fd248672c9e)) +* Upgrade docling-parse to 1.1.1, safety checks for failed parse on pages ([#45](https://github.com/DS4SD/docling/issues/45)) ([`7e84533`](https://github.com/DS4SD/docling/commit/7e845332992ab37386daee087573773051bfd065)) + ## [v1.7.0](https://github.com/DS4SD/docling/releases/tag/v1.7.0) - 2024-08-22 ### Feature diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py index 50c1ff76..aeaf4739 100644 --- a/docling/backend/docling_parse_backend.py +++ b/docling/backend/docling_parse_backend.py @@ -26,6 +26,10 @@ class DoclingParsePageBackend(PdfPageBackend): self.valid = "pages" in parsed_page if self.valid: self._dpage = parsed_page["pages"][0] + else: + _log.info( + f"An error occurred when loading page {page_no} of document {document_hash}." + ) def is_valid(self) -> bool: return self.valid @@ -198,7 +202,9 @@ class DoclingParseDocumentBackend(PdfDocumentBackend): success = self.parser.load_document(document_hash, str(path_or_stream)) if not success: - raise RuntimeError("docling-parse could not load this document.") + raise RuntimeError( + f"docling-parse could not load document {document_hash}." 
+ ) def page_count(self) -> int: return len(self._pdoc) # To be replaced with docling-parse API diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py index 6715339f..b7ec824a 100644 --- a/docling/backend/pypdfium2_backend.py +++ b/docling/backend/pypdfium2_backend.py @@ -1,3 +1,4 @@ +import logging import random from io import BytesIO from pathlib import Path @@ -7,16 +8,28 @@ import pypdfium2 as pdfium import pypdfium2.raw as pdfium_c from PIL import Image, ImageDraw from pypdfium2 import PdfPage +from pypdfium2._helpers.misc import PdfiumError from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize +_log = logging.getLogger(__name__) + class PyPdfiumPageBackend(PdfPageBackend): - def __init__(self, pdfium_doc: pdfium.PdfDocument, page_no: int): - self._ppage: pdfium.PdfPage = pdfium_doc[page_no] + def __init__( + self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int + ): + self.valid = True # No better way to tell from pypdfium. 
+ try: + self._ppage: pdfium.PdfPage = pdfium_doc[page_no] + except PdfiumError as e: + _log.info( + f"An exception occurred when loading page {page_no} of document {document_hash}.", + exc_info=True, + ) + self.valid = False self.text_page = None - self.valid = True def is_valid(self) -> bool: return self.valid @@ -220,13 +233,18 @@ class PyPdfiumPageBackend(PdfPageBackend): class PyPdfiumDocumentBackend(PdfDocumentBackend): def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): super().__init__(path_or_stream, document_hash) - self._pdoc = pdfium.PdfDocument(path_or_stream) + try: + self._pdoc = pdfium.PdfDocument(path_or_stream) + except PdfiumError as e: + raise RuntimeError( + f"pypdfium could not load document {document_hash}" + ) from e def page_count(self) -> int: return len(self._pdoc) def load_page(self, page_no: int) -> PyPdfiumPageBackend: - return PyPdfiumPageBackend(self._pdoc, page_no) + return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no) def is_valid(self) -> bool: return self.page_count() > 0 diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index c579cb50..2705c9df 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -16,7 +16,7 @@ class ConversionStatus(str, Enum): STARTED = auto() FAILURE = auto() SUCCESS = auto() - SUCCESS_WITH_ERRORS = auto() + PARTIAL_SUCCESS = auto() class DocInputType(str, Enum): @@ -29,6 +29,18 @@ class CoordOrigin(str, Enum): BOTTOMLEFT = auto() +class DoclingComponentType(str, Enum): + PDF_BACKEND = auto() + MODEL = auto() + DOC_ASSEMBLER = auto() + + +class ErrorItem(BaseModel): + component_type: DoclingComponentType + module_name: str + error_message: str + + class PageSize(BaseModel): width: float = 0.0 height: float = 0.0 diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 5726b76d..57d40c35 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -19,6 +19,7 @@ 
from docling.datamodel.base_models import ( AssembledUnit, ConversionStatus, DocumentStream, + ErrorItem, FigureElement, Page, PageElement, @@ -118,7 +119,7 @@ class ConvertedDocument(BaseModel): input: InputDocument status: ConversionStatus = ConversionStatus.PENDING # failure, success - errors: List[Dict] = [] # structure to keep errors + errors: List[ErrorItem] = [] # structure to keep errors pages: List[Page] = [] assembled: Optional[AssembledUnit] = None diff --git a/docling/document_converter.py b/docling/document_converter.py index dce42cfa..8a71a570 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -16,6 +16,8 @@ from docling.datamodel.base_models import ( AssembledUnit, AssembleOptions, ConversionStatus, + DoclingComponentType, + ErrorItem, Page, PipelineOptions, ) @@ -157,54 +159,46 @@ class DocumentConverter: for page_batch in chunkify( converted_doc.pages, settings.perf.page_batch_size ): - try: + start_pb_time = time.time() + # Pipeline - start_pb_time = time.time() - # Pipeline + # 1. Initialise the page resources + init_pages = map( + functools.partial(self.initialize_page, in_doc), page_batch + ) - # 1. Initialise the page resources - init_pages = map( - functools.partial(self.initialize_page, in_doc), page_batch - ) + # 2. Populate page image + pages_with_images = map( + functools.partial(self.populate_page_images, in_doc), init_pages + ) - # 2. Populate page image - pages_with_images = map( - functools.partial(self.populate_page_images, in_doc), init_pages - ) + # 3. Populate programmatic page cells + pages_with_cells = map( + functools.partial(self.parse_page_cells, in_doc), + pages_with_images, + ) - # 3. Populate programmatic page cells - pages_with_cells = map( - functools.partial(self.parse_page_cells, in_doc), - pages_with_images, - ) + # 4. Run pipeline stages + pipeline_pages = self.model_pipeline.apply(pages_with_cells) - # 4. 
Run pipeline stages - pipeline_pages = self.model_pipeline.apply(pages_with_cells) + # 5. Assemble page elements (per page) + assembled_pages = self.page_assemble_model(pipeline_pages) - # 5. Assemble page elements (per page) - assembled_pages = self.page_assemble_model(pipeline_pages) + # exhaust assembled_pages + for assembled_page in assembled_pages: + # Free up mem resources before moving on with next batch - # exhaust assembled_pages - for assembled_page in assembled_pages: - # Free up mem resources before moving on with next batch + # Remove page images (can be disabled) + if self.assemble_options.images_scale is None: + assembled_page._image_cache = {} - # Remove page images (can be disabled) - if self.assemble_options.images_scale is None: - assembled_page._image_cache = {} + # Unload backend + assembled_page._backend.unload() - # Unload backend - assembled_page._backend.unload() + all_assembled_pages.append(assembled_page) - all_assembled_pages.append(assembled_page) - - end_pb_time = time.time() - start_pb_time - _log.info(f"Finished converting page batch time={end_pb_time:.3f}") - - except Exception as e: - trace = "\n".join(traceback.format_exception(e)) - _log.info( - f"Encountered an error during processing of page batch: {trace}" - ) + end_pb_time = time.time() - start_pb_time + _log.info(f"Finished converting page batch time={end_pb_time:.3f}") # Free up mem resources of PDF backend in_doc._backend.unload() @@ -212,12 +206,27 @@ class DocumentConverter: converted_doc.pages = all_assembled_pages self.assemble_doc(converted_doc) - converted_doc.status = ConversionStatus.SUCCESS + status = ConversionStatus.SUCCESS + for page in converted_doc.pages: + if not page._backend.is_valid(): + converted_doc.errors.append( + ErrorItem( + component_type=DoclingComponentType.PDF_BACKEND, + module_name=type(page._backend).__name__, + error_message=f"Page {page.page_no} failed to parse.", + ) + ) + status = ConversionStatus.PARTIAL_SUCCESS + + converted_doc.status = 
status except Exception as e: converted_doc.status = ConversionStatus.FAILURE trace = "\n".join(traceback.format_exception(e)) - _log.info(f"Encountered an error during conversion: {trace}") + _log.info( + f"Encountered an error during conversion of document {in_doc.document_hash}:\n" + f"{trace}" + ) end_doc_time = time.time() - start_doc_time _log.info( diff --git a/examples/batch_convert.py b/examples/batch_convert.py index 76bbdcd4..f1a5c8b3 100644 --- a/examples/batch_convert.py +++ b/examples/batch_convert.py @@ -1,15 +1,10 @@ import json import logging import time -from io import BytesIO from pathlib import Path from typing import Iterable -from docling.datamodel.base_models import ( - ConversionStatus, - DocumentStream, - PipelineOptions, -) +from docling.datamodel.base_models import ConversionStatus, PipelineOptions from docling.datamodel.document import ConvertedDocument, DocumentConversionInput from docling.document_converter import DocumentConverter @@ -24,6 +19,7 @@ def export_documents( success_count = 0 failure_count = 0 + partial_success_count = 0 for doc in converted_docs: if doc.status == ConversionStatus.SUCCESS: @@ -37,12 +33,21 @@ def export_documents( # Export Markdown format: with (output_dir / f"{doc_filename}.md").open("w") as fp: fp.write(doc.render_as_markdown()) + elif doc.status == ConversionStatus.PARTIAL_SUCCESS: + _log.info( + f"Document {doc.input.file} was partially converted with the following errors:" + ) + for item in doc.errors: + _log.info(f"\t{item.error_message}") + partial_success_count += 1 else: _log.info(f"Document {doc.input.file} failed to convert.") failure_count += 1 _log.info( - f"Processed {success_count + failure_count} docs, of which {failure_count} failed" + f"Processed {success_count + partial_success_count + failure_count} docs, " + f"of which {failure_count} failed " + f"and {partial_success_count} were partially converted." 
) @@ -61,7 +66,7 @@ def main(): # docs = [DocumentStream(filename="my_doc.pdf", stream=buf)] # input = DocumentConversionInput.from_streams(docs) - doc_converter = DocumentConverter(pipeline_options=PipelineOptions(do_ocr=False)) + doc_converter = DocumentConverter() input = DocumentConversionInput.from_paths(input_doc_paths) diff --git a/pyproject.toml b/pyproject.toml index e24645a7..35abbd29 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "docling" -version = "1.7.0" # DO NOT EDIT, updated automatically +version = "1.7.1" # DO NOT EDIT, updated automatically description = "Docling PDF conversion package" authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Peter Staar "] license = "MIT"