Add ErrorItem and evaluate page valid status

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-08-23 15:46:03 +02:00
commit 6a8e4f565e
8 changed files with 115 additions and 57 deletions

View File

@ -1,3 +1,10 @@
## [v1.7.1](https://github.com/DS4SD/docling/releases/tag/v1.7.1) - 2024-08-23
### Fix
* Better raise exception when a page fails to parse ([#46](https://github.com/DS4SD/docling/issues/46)) ([`8808463`](https://github.com/DS4SD/docling/commit/8808463cecd7ff3a92bd99d2e3d65fd248672c9e))
* Upgrade docling-parse to 1.1.1, safety checks for failed parse on pages ([#45](https://github.com/DS4SD/docling/issues/45)) ([`7e84533`](https://github.com/DS4SD/docling/commit/7e845332992ab37386daee087573773051bfd065))
## [v1.7.0](https://github.com/DS4SD/docling/releases/tag/v1.7.0) - 2024-08-22 ## [v1.7.0](https://github.com/DS4SD/docling/releases/tag/v1.7.0) - 2024-08-22
### Feature ### Feature

View File

@ -26,6 +26,10 @@ class DoclingParsePageBackend(PdfPageBackend):
self.valid = "pages" in parsed_page self.valid = "pages" in parsed_page
if self.valid: if self.valid:
self._dpage = parsed_page["pages"][0] self._dpage = parsed_page["pages"][0]
else:
_log.info(
f"An error occurred when loading page {page_no} of document {document_hash}."
)
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.valid return self.valid
@ -198,7 +202,9 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
success = self.parser.load_document(document_hash, str(path_or_stream)) success = self.parser.load_document(document_hash, str(path_or_stream))
if not success: if not success:
raise RuntimeError("docling-parse could not load this document.") raise RuntimeError(
f"docling-parse could not load document {document_hash}."
)
def page_count(self) -> int: def page_count(self) -> int:
return len(self._pdoc) # To be replaced with docling-parse API return len(self._pdoc) # To be replaced with docling-parse API

View File

@ -1,3 +1,4 @@
import logging
import random import random
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
@ -7,16 +8,28 @@ import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c import pypdfium2.raw as pdfium_c
from PIL import Image, ImageDraw from PIL import Image, ImageDraw
from pypdfium2 import PdfPage from pypdfium2 import PdfPage
from pypdfium2._helpers.misc import PdfiumError
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
_log = logging.getLogger(__name__)
class PyPdfiumPageBackend(PdfPageBackend): class PyPdfiumPageBackend(PdfPageBackend):
def __init__(self, pdfium_doc: pdfium.PdfDocument, page_no: int): def __init__(
self._ppage: pdfium.PdfPage = pdfium_doc[page_no] self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
):
self.valid = True # No better way to tell from pypdfium.
try:
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
except PdfiumError as e:
_log.info(
f"An exception occurred when loading page {page_no} of document {document_hash}.",
exc_info=True,
)
self.valid = False
self.text_page = None self.text_page = None
self.valid = True
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.valid return self.valid
@ -220,13 +233,18 @@ class PyPdfiumPageBackend(PdfPageBackend):
class PyPdfiumDocumentBackend(PdfDocumentBackend): class PyPdfiumDocumentBackend(PdfDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream, document_hash) super().__init__(path_or_stream, document_hash)
self._pdoc = pdfium.PdfDocument(path_or_stream) try:
self._pdoc = pdfium.PdfDocument(path_or_stream)
except PdfiumError as e:
raise RuntimeError(
f"pypdfium could not load document {document_hash}"
) from e
def page_count(self) -> int: def page_count(self) -> int:
return len(self._pdoc) return len(self._pdoc)
def load_page(self, page_no: int) -> PyPdfiumPageBackend: def load_page(self, page_no: int) -> PyPdfiumPageBackend:
return PyPdfiumPageBackend(self._pdoc, page_no) return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.page_count() > 0 return self.page_count() > 0

View File

@ -16,7 +16,7 @@ class ConversionStatus(str, Enum):
STARTED = auto() STARTED = auto()
FAILURE = auto() FAILURE = auto()
SUCCESS = auto() SUCCESS = auto()
SUCCESS_WITH_ERRORS = auto() PARTIAL_SUCCESS = auto()
class DocInputType(str, Enum): class DocInputType(str, Enum):
@ -29,6 +29,18 @@ class CoordOrigin(str, Enum):
BOTTOMLEFT = auto() BOTTOMLEFT = auto()
# Names the part of docling that a recorded error is attributed to;
# it is the type of ErrorItem.component_type.
class DoclingComponentType(str, Enum):
PDF_BACKEND = auto()  # set by DocumentConverter when a page's backend reports is_valid() == False
MODEL = auto()  # presumably for errors from pipeline models; not used in the visible hunks (TODO confirm)
DOC_ASSEMBLER = auto()  # presumably for errors during document assembly; not used in the visible hunks (TODO confirm)
# One recorded conversion problem; ConvertedDocument.errors holds a list of these.
class ErrorItem(BaseModel):
component_type: DoclingComponentType  # which docling component the error is attributed to
module_name: str  # DocumentConverter fills this with the backend class name, type(page._backend).__name__
error_message: str  # human-readable description, e.g. "Page {page_no} failed to parse."
class PageSize(BaseModel): class PageSize(BaseModel):
width: float = 0.0 width: float = 0.0
height: float = 0.0 height: float = 0.0

View File

@ -19,6 +19,7 @@ from docling.datamodel.base_models import (
AssembledUnit, AssembledUnit,
ConversionStatus, ConversionStatus,
DocumentStream, DocumentStream,
ErrorItem,
FigureElement, FigureElement,
Page, Page,
PageElement, PageElement,
@ -118,7 +119,7 @@ class ConvertedDocument(BaseModel):
input: InputDocument input: InputDocument
status: ConversionStatus = ConversionStatus.PENDING # failure, success status: ConversionStatus = ConversionStatus.PENDING # failure, success
errors: List[Dict] = [] # structure to keep errors errors: List[ErrorItem] = [] # structure to keep errors
pages: List[Page] = [] pages: List[Page] = []
assembled: Optional[AssembledUnit] = None assembled: Optional[AssembledUnit] = None

View File

@ -16,6 +16,8 @@ from docling.datamodel.base_models import (
AssembledUnit, AssembledUnit,
AssembleOptions, AssembleOptions,
ConversionStatus, ConversionStatus,
DoclingComponentType,
ErrorItem,
Page, Page,
PipelineOptions, PipelineOptions,
) )
@ -157,54 +159,46 @@ class DocumentConverter:
for page_batch in chunkify( for page_batch in chunkify(
converted_doc.pages, settings.perf.page_batch_size converted_doc.pages, settings.perf.page_batch_size
): ):
try: start_pb_time = time.time()
# Pipeline
start_pb_time = time.time() # 1. Initialise the page resources
# Pipeline init_pages = map(
functools.partial(self.initialize_page, in_doc), page_batch
)
# 1. Initialise the page resources # 2. Populate page image
init_pages = map( pages_with_images = map(
functools.partial(self.initialize_page, in_doc), page_batch functools.partial(self.populate_page_images, in_doc), init_pages
) )
# 2. Populate page image # 3. Populate programmatic page cells
pages_with_images = map( pages_with_cells = map(
functools.partial(self.populate_page_images, in_doc), init_pages functools.partial(self.parse_page_cells, in_doc),
) pages_with_images,
)
# 3. Populate programmatic page cells # 4. Run pipeline stages
pages_with_cells = map( pipeline_pages = self.model_pipeline.apply(pages_with_cells)
functools.partial(self.parse_page_cells, in_doc),
pages_with_images,
)
# 4. Run pipeline stages # 5. Assemble page elements (per page)
pipeline_pages = self.model_pipeline.apply(pages_with_cells) assembled_pages = self.page_assemble_model(pipeline_pages)
# 5. Assemble page elements (per page) # exhaust assembled_pages
assembled_pages = self.page_assemble_model(pipeline_pages) for assembled_page in assembled_pages:
# Free up mem resources before moving on with next batch
# exhaust assembled_pages # Remove page images (can be disabled)
for assembled_page in assembled_pages: if self.assemble_options.images_scale is None:
# Free up mem resources before moving on with next batch assembled_page._image_cache = {}
# Remove page images (can be disabled) # Unload backend
if self.assemble_options.images_scale is None: assembled_page._backend.unload()
assembled_page._image_cache = {}
# Unload backend all_assembled_pages.append(assembled_page)
assembled_page._backend.unload()
all_assembled_pages.append(assembled_page) end_pb_time = time.time() - start_pb_time
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")
end_pb_time = time.time() - start_pb_time
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")
except Exception as e:
trace = "\n".join(traceback.format_exception(e))
_log.info(
f"Encountered an error during processing of page batch: {trace}"
)
# Free up mem resources of PDF backend # Free up mem resources of PDF backend
in_doc._backend.unload() in_doc._backend.unload()
@ -212,12 +206,27 @@ class DocumentConverter:
converted_doc.pages = all_assembled_pages converted_doc.pages = all_assembled_pages
self.assemble_doc(converted_doc) self.assemble_doc(converted_doc)
converted_doc.status = ConversionStatus.SUCCESS status = ConversionStatus.SUCCESS
for page in converted_doc.pages:
if not page._backend.is_valid():
converted_doc.errors.append(
ErrorItem(
component_type=DoclingComponentType.PDF_BACKEND,
module_name=type(page._backend).__name__,
error_message=f"Page {page.page_no} failed to parse.",
)
)
status = ConversionStatus.PARTIAL_SUCCESS
converted_doc.status = status
except Exception as e: except Exception as e:
converted_doc.status = ConversionStatus.FAILURE converted_doc.status = ConversionStatus.FAILURE
trace = "\n".join(traceback.format_exception(e)) trace = "\n".join(traceback.format_exception(e))
_log.info(f"Encountered an error during conversion: {trace}") _log.info(
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
f"{trace}"
)
end_doc_time = time.time() - start_doc_time end_doc_time = time.time() - start_doc_time
_log.info( _log.info(

View File

@ -1,15 +1,10 @@
import json import json
import logging import logging
import time import time
from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Iterable from typing import Iterable
from docling.datamodel.base_models import ( from docling.datamodel.base_models import ConversionStatus, PipelineOptions
ConversionStatus,
DocumentStream,
PipelineOptions,
)
from docling.datamodel.document import ConvertedDocument, DocumentConversionInput from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
@ -24,6 +19,7 @@ def export_documents(
success_count = 0 success_count = 0
failure_count = 0 failure_count = 0
partial_success_count = 0
for doc in converted_docs: for doc in converted_docs:
if doc.status == ConversionStatus.SUCCESS: if doc.status == ConversionStatus.SUCCESS:
@ -37,12 +33,21 @@ def export_documents(
# Export Markdown format: # Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w") as fp: with (output_dir / f"{doc_filename}.md").open("w") as fp:
fp.write(doc.render_as_markdown()) fp.write(doc.render_as_markdown())
elif doc.status == ConversionStatus.PARTIAL_SUCCESS:
_log.info(
f"Document {doc.input.file} was partially converted with the following errors:"
)
for item in doc.errors:
_log.info(f"\t{item.error_message}")
partial_success_count += 1
else: else:
_log.info(f"Document {doc.input.file} failed to convert.") _log.info(f"Document {doc.input.file} failed to convert.")
failure_count += 1 failure_count += 1
_log.info( _log.info(
f"Processed {success_count + failure_count} docs, of which {failure_count} failed" f"Processed {success_count + partial_success_count + failure_count} docs, "
f"of which {failure_count} failed "
f"and {partial_success_count} were partially converted."
) )
@ -61,7 +66,7 @@ def main():
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)] # docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
# input = DocumentConversionInput.from_streams(docs) # input = DocumentConversionInput.from_streams(docs)
doc_converter = DocumentConverter(pipeline_options=PipelineOptions(do_ocr=False)) doc_converter = DocumentConverter()
input = DocumentConversionInput.from_paths(input_doc_paths) input = DocumentConversionInput.from_paths(input_doc_paths)

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "docling" name = "docling"
version = "1.7.0" # DO NOT EDIT, updated automatically version = "1.7.1" # DO NOT EDIT, updated automatically
description = "Docling PDF conversion package" description = "Docling PDF conversion package"
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"] authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
license = "MIT" license = "MIT"