Add ErrorItem and evaluate page valid status

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-08-23 15:46:03 +02:00
commit 6a8e4f565e
8 changed files with 115 additions and 57 deletions

View File

@ -1,3 +1,10 @@
## [v1.7.1](https://github.com/DS4SD/docling/releases/tag/v1.7.1) - 2024-08-23
### Fix
* Better raise exception when a page fails to parse ([#46](https://github.com/DS4SD/docling/issues/46)) ([`8808463`](https://github.com/DS4SD/docling/commit/8808463cecd7ff3a92bd99d2e3d65fd248672c9e))
* Upgrade docling-parse to 1.1.1, safety checks for failed parse on pages ([#45](https://github.com/DS4SD/docling/issues/45)) ([`7e84533`](https://github.com/DS4SD/docling/commit/7e845332992ab37386daee087573773051bfd065))
## [v1.7.0](https://github.com/DS4SD/docling/releases/tag/v1.7.0) - 2024-08-22 ## [v1.7.0](https://github.com/DS4SD/docling/releases/tag/v1.7.0) - 2024-08-22
### Feature ### Feature

View File

@ -26,6 +26,10 @@ class DoclingParsePageBackend(PdfPageBackend):
self.valid = "pages" in parsed_page self.valid = "pages" in parsed_page
if self.valid: if self.valid:
self._dpage = parsed_page["pages"][0] self._dpage = parsed_page["pages"][0]
else:
_log.info(
f"An error occured when loading page {page_no} of document {document_hash}."
)
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.valid return self.valid
@ -198,7 +202,9 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
success = self.parser.load_document(document_hash, str(path_or_stream)) success = self.parser.load_document(document_hash, str(path_or_stream))
if not success: if not success:
raise RuntimeError("docling-parse could not load this document.") raise RuntimeError(
f"docling-parse could not load document {document_hash}."
)
def page_count(self) -> int: def page_count(self) -> int:
return len(self._pdoc) # To be replaced with docling-parse API return len(self._pdoc) # To be replaced with docling-parse API

View File

@ -1,3 +1,4 @@
import logging
import random import random
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
@ -7,16 +8,28 @@ import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c import pypdfium2.raw as pdfium_c
from PIL import Image, ImageDraw from PIL import Image, ImageDraw
from pypdfium2 import PdfPage from pypdfium2 import PdfPage
from pypdfium2._helpers.misc import PdfiumError
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
_log = logging.getLogger(__name__)
class PyPdfiumPageBackend(PdfPageBackend): class PyPdfiumPageBackend(PdfPageBackend):
def __init__(self, pdfium_doc: pdfium.PdfDocument, page_no: int): def __init__(
self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
):
self.valid = True # No better way to tell from pypdfium.
try:
self._ppage: pdfium.PdfPage = pdfium_doc[page_no] self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
except PdfiumError as e:
_log.info(
f"An exception occured when loading page {page_no} of document {document_hash}.",
exc_info=True,
)
self.valid = False
self.text_page = None self.text_page = None
self.valid = True
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.valid return self.valid
@ -220,13 +233,18 @@ class PyPdfiumPageBackend(PdfPageBackend):
class PyPdfiumDocumentBackend(PdfDocumentBackend): class PyPdfiumDocumentBackend(PdfDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream, document_hash) super().__init__(path_or_stream, document_hash)
try:
self._pdoc = pdfium.PdfDocument(path_or_stream) self._pdoc = pdfium.PdfDocument(path_or_stream)
except PdfiumError as e:
raise RuntimeError(
f"pypdfium could not load document {document_hash}"
) from e
def page_count(self) -> int: def page_count(self) -> int:
return len(self._pdoc) return len(self._pdoc)
def load_page(self, page_no: int) -> PyPdfiumPageBackend: def load_page(self, page_no: int) -> PyPdfiumPageBackend:
return PyPdfiumPageBackend(self._pdoc, page_no) return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.page_count() > 0 return self.page_count() > 0

View File

@ -16,7 +16,7 @@ class ConversionStatus(str, Enum):
STARTED = auto() STARTED = auto()
FAILURE = auto() FAILURE = auto()
SUCCESS = auto() SUCCESS = auto()
SUCCESS_WITH_ERRORS = auto() PARTIAL_SUCCESS = auto()
class DocInputType(str, Enum): class DocInputType(str, Enum):
@ -29,6 +29,18 @@ class CoordOrigin(str, Enum):
BOTTOMLEFT = auto() BOTTOMLEFT = auto()
class DoclingComponentType(str, Enum):
    """Category of docling component that a recorded error is attributed to.

    Used as the ``component_type`` field of ``ErrorItem``.
    """

    # Set by the document converter when a page backend reports
    # ``is_valid() == False`` after conversion.
    PDF_BACKEND = auto()
    # NOTE(review): not referenced in the visible code; presumably for errors
    # raised by pipeline models -- confirm against callers.
    MODEL = auto()
    # NOTE(review): not referenced in the visible code; presumably for errors
    # raised during document assembly -- confirm against callers.
    DOC_ASSEMBLER = auto()
class ErrorItem(BaseModel):
    """One error recorded while converting a document.

    Instances are collected in ``ConvertedDocument.errors``.
    """

    # Which kind of component the error is attributed to.
    component_type: DoclingComponentType
    # Name of the reporting module; the document converter passes the class
    # name of the page backend (``type(page._backend).__name__``).
    module_name: str
    # Human-readable description, e.g. "Page 3 failed to parse.".
    error_message: str
class PageSize(BaseModel): class PageSize(BaseModel):
width: float = 0.0 width: float = 0.0
height: float = 0.0 height: float = 0.0

View File

@ -19,6 +19,7 @@ from docling.datamodel.base_models import (
AssembledUnit, AssembledUnit,
ConversionStatus, ConversionStatus,
DocumentStream, DocumentStream,
ErrorItem,
FigureElement, FigureElement,
Page, Page,
PageElement, PageElement,
@ -118,7 +119,7 @@ class ConvertedDocument(BaseModel):
input: InputDocument input: InputDocument
status: ConversionStatus = ConversionStatus.PENDING # failure, success status: ConversionStatus = ConversionStatus.PENDING # failure, success
errors: List[Dict] = [] # structure to keep errors errors: List[ErrorItem] = [] # structure to keep errors
pages: List[Page] = [] pages: List[Page] = []
assembled: Optional[AssembledUnit] = None assembled: Optional[AssembledUnit] = None

View File

@ -16,6 +16,8 @@ from docling.datamodel.base_models import (
AssembledUnit, AssembledUnit,
AssembleOptions, AssembleOptions,
ConversionStatus, ConversionStatus,
DoclingComponentType,
ErrorItem,
Page, Page,
PipelineOptions, PipelineOptions,
) )
@ -157,8 +159,6 @@ class DocumentConverter:
for page_batch in chunkify( for page_batch in chunkify(
converted_doc.pages, settings.perf.page_batch_size converted_doc.pages, settings.perf.page_batch_size
): ):
try:
start_pb_time = time.time() start_pb_time = time.time()
# Pipeline # Pipeline
@ -200,24 +200,33 @@ class DocumentConverter:
end_pb_time = time.time() - start_pb_time end_pb_time = time.time() - start_pb_time
_log.info(f"Finished converting page batch time={end_pb_time:.3f}") _log.info(f"Finished converting page batch time={end_pb_time:.3f}")
except Exception as e:
trace = "\n".join(traceback.format_exception(e))
_log.info(
f"Encountered an error during processing of page batch: {trace}"
)
# Free up mem resources of PDF backend # Free up mem resources of PDF backend
in_doc._backend.unload() in_doc._backend.unload()
converted_doc.pages = all_assembled_pages converted_doc.pages = all_assembled_pages
self.assemble_doc(converted_doc) self.assemble_doc(converted_doc)
converted_doc.status = ConversionStatus.SUCCESS status = ConversionStatus.SUCCESS
for page in converted_doc.pages:
if not page._backend.is_valid():
converted_doc.errors.append(
ErrorItem(
component_type=DoclingComponentType.PDF_BACKEND,
module_name=type(page._backend).__name__,
error_message=f"Page {page.page_no} failed to parse.",
)
)
status = ConversionStatus.PARTIAL_SUCCESS
converted_doc.status = status
except Exception as e: except Exception as e:
converted_doc.status = ConversionStatus.FAILURE converted_doc.status = ConversionStatus.FAILURE
trace = "\n".join(traceback.format_exception(e)) trace = "\n".join(traceback.format_exception(e))
_log.info(f"Encountered an error during conversion: {trace}") _log.info(
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
f"{trace}"
)
end_doc_time = time.time() - start_doc_time end_doc_time = time.time() - start_doc_time
_log.info( _log.info(

View File

@ -1,15 +1,10 @@
import json import json
import logging import logging
import time import time
from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Iterable from typing import Iterable
from docling.datamodel.base_models import ( from docling.datamodel.base_models import ConversionStatus, PipelineOptions
ConversionStatus,
DocumentStream,
PipelineOptions,
)
from docling.datamodel.document import ConvertedDocument, DocumentConversionInput from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
@ -24,6 +19,7 @@ def export_documents(
success_count = 0 success_count = 0
failure_count = 0 failure_count = 0
partial_success_count = 0
for doc in converted_docs: for doc in converted_docs:
if doc.status == ConversionStatus.SUCCESS: if doc.status == ConversionStatus.SUCCESS:
@ -37,12 +33,21 @@ def export_documents(
# Export Markdown format: # Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w") as fp: with (output_dir / f"{doc_filename}.md").open("w") as fp:
fp.write(doc.render_as_markdown()) fp.write(doc.render_as_markdown())
elif doc.status == ConversionStatus.PARTIAL_SUCCESS:
_log.info(
f"Document {doc.input.file} was partially converted with the following errors:"
)
for item in doc.errors:
_log.info(f"\t{item.error_message}")
partial_success_count += 1
else: else:
_log.info(f"Document {doc.input.file} failed to convert.") _log.info(f"Document {doc.input.file} failed to convert.")
failure_count += 1 failure_count += 1
_log.info( _log.info(
f"Processed {success_count + failure_count} docs, of which {failure_count} failed" f"Processed {success_count + partial_success_count + failure_count} docs, "
f"of which {failure_count} failed "
f"and {partial_success_count} were partially converted."
) )
@ -61,7 +66,7 @@ def main():
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)] # docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
# input = DocumentConversionInput.from_streams(docs) # input = DocumentConversionInput.from_streams(docs)
doc_converter = DocumentConverter(pipeline_options=PipelineOptions(do_ocr=False)) doc_converter = DocumentConverter()
input = DocumentConversionInput.from_paths(input_doc_paths) input = DocumentConversionInput.from_paths(input_doc_paths)

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "docling" name = "docling"
version = "1.7.0" # DO NOT EDIT, updated automatically version = "1.7.1" # DO NOT EDIT, updated automatically
description = "Docling PDF conversion package" description = "Docling PDF conversion package"
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"] authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
license = "MIT" license = "MIT"