Propagate document_hash to PDF backends, use docling-parse 1.0.0

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-08-22 13:05:24 +02:00
parent 55b538fa1b
commit ebcc1e5524
7 changed files with 57 additions and 45 deletions

View File

@ -39,8 +39,9 @@ class PdfPageBackend(ABC):
class PdfDocumentBackend(ABC): class PdfDocumentBackend(ABC):
@abstractmethod @abstractmethod
def __init__(self, path_or_stream: Union[BytesIO, Path]): def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
pass self.path_or_stream = path_or_stream
self.document_hash = document_hash
@abstractmethod @abstractmethod
def load_page(self, page_no: int) -> PdfPageBackend: def load_page(self, page_no: int) -> PdfPageBackend:
@ -56,4 +57,7 @@ class PdfDocumentBackend(ABC):
@abstractmethod @abstractmethod
def unload(self): def unload(self):
pass if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
self.path_or_stream = None

View File

@ -17,16 +17,14 @@ _log = logging.getLogger(__name__)
class DoclingParsePageBackend(PdfPageBackend): class DoclingParsePageBackend(PdfPageBackend):
def __init__( def __init__(
self, parser: pdf_parser, pdf_bytes: BytesIO, page_no: int, page_obj: PdfPage self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
): ):
super().__init__(page_obj) super().__init__(page_obj)
self._ppage = page_obj self._ppage = page_obj
parsed_page = parser.find_cells_from_bytesio_on_page(pdf_bytes, page_no) parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
self._dpage = parsed_page["pages"][0] self._dpage = parsed_page["pages"][0]
print(f"Parsed page {page_no} of doc.")
def get_text_in_rect(self, bbox: BoundingBox) -> str: def get_text_in_rect(self, bbox: BoundingBox) -> str:
# Find intersecting cells on the page # Find intersecting cells on the page
text_piece = "" text_piece = ""
@ -175,28 +173,36 @@ class DoclingParsePageBackend(PdfPageBackend):
class DoclingParseDocumentBackend(PdfDocumentBackend): class DoclingParseDocumentBackend(PdfDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path]): def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream) super().__init__(path_or_stream, document_hash)
with open(path_or_stream, "rb") as fh:
self.pdf_bytes = BytesIO(fh.read())
self._pdoc = pdfium.PdfDocument(path_or_stream) self._pdoc = pdfium.PdfDocument(path_or_stream)
self.parser = pdf_parser() self.parser = pdf_parser()
success = False
if isinstance(path_or_stream, BytesIO):
success = self.parser.load_document_from_bytesio(
document_hash, path_or_stream
)
elif isinstance(path_or_stream, Path):
success = self.parser.load_document(document_hash, str(path_or_stream))
if not success:
raise RuntimeError("docling-parse could not load this document.")
def page_count(self) -> int: def page_count(self) -> int:
return len(self._pdoc) # To be replaced with docling-parse API return len(self._pdoc) # To be replaced with docling-parse API
def load_page(self, page_no: int) -> DoclingParsePageBackend: def load_page(self, page_no: int) -> DoclingParsePageBackend:
return DoclingParsePageBackend( return DoclingParsePageBackend(
self.parser, self.pdf_bytes, page_no, self._pdoc[page_no] self.parser, self.document_hash, page_no, self._pdoc[page_no]
) )
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.page_count() > 0 return self.page_count() > 0
def unload(self): def unload(self):
super().unload()
self.parser.unload_document(self.document_hash)
self._pdoc.close() self._pdoc.close()
self._pdoc = None self._pdoc = None
self._parser_doc = None

View File

@ -215,8 +215,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
class PyPdfiumDocumentBackend(PdfDocumentBackend): class PyPdfiumDocumentBackend(PdfDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path]): def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream) super().__init__(path_or_stream, document_hash)
self._pdoc = pdfium.PdfDocument(path_or_stream) self._pdoc = pdfium.PdfDocument(path_or_stream)
def page_count(self) -> int: def page_count(self) -> int:
@ -229,5 +229,6 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
return self.page_count() > 0 return self.page_count() > 0
def unload(self): def unload(self):
super().unload()
self._pdoc.close() self._pdoc.close()
self._pdoc = None self._pdoc = None

View File

@ -79,7 +79,9 @@ class InputDocument(BaseModel):
self.valid = False self.valid = False
else: else:
self.document_hash = create_file_hash(path_or_stream) self.document_hash = create_file_hash(path_or_stream)
self._backend = pdf_backend(path_or_stream=path_or_stream) self._backend = pdf_backend(
path_or_stream=path_or_stream, document_hash=self.document_hash
)
elif isinstance(path_or_stream, BytesIO): elif isinstance(path_or_stream, BytesIO):
self.file = PurePath(filename) self.file = PurePath(filename)
@ -89,7 +91,9 @@ class InputDocument(BaseModel):
self.valid = False self.valid = False
else: else:
self.document_hash = create_file_hash(path_or_stream) self.document_hash = create_file_hash(path_or_stream)
self._backend = pdf_backend(path_or_stream=path_or_stream) self._backend = pdf_backend(
path_or_stream=path_or_stream, document_hash=self.document_hash
)
if self.document_hash and self._backend.page_count() > 0: if self.document_hash and self._backend.page_count() > 0:
self.page_count = self._backend.page_count() self.page_count = self._backend.page_count()

View File

@ -1,10 +1,15 @@
import json import json
import logging import logging
import time import time
from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Iterable from typing import Iterable
from docling.datamodel.base_models import ConversionStatus, PipelineOptions from docling.datamodel.base_models import (
ConversionStatus,
DocumentStream,
PipelineOptions,
)
from docling.datamodel.document import ConvertedDocument, DocumentConversionInput from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
@ -52,6 +57,10 @@ def main():
Path("./test/data/redp5695.pdf"), Path("./test/data/redp5695.pdf"),
] ]
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
# input = DocumentConversionInput.from_streams(docs)
doc_converter = DocumentConverter(pipeline_options=PipelineOptions(do_ocr=False)) doc_converter = DocumentConverter(pipeline_options=PipelineOptions(do_ocr=False))
input = DocumentConversionInput.from_paths(input_doc_paths) input = DocumentConversionInput.from_paths(input_doc_paths)

poetry.lock (generated file, 36 changed lines)
View File

@ -822,35 +822,23 @@ tqdm = ">=4.64.0,<5.0.0"
[[package]] [[package]]
name = "docling-parse" name = "docling-parse"
version = "0.2.0" version = "1.0.0"
description = "Simple package to extract text with coordinates from programmatic PDFs" description = "Simple package to extract text with coordinates from programmatic PDFs"
optional = false optional = false
python-versions = "<4.0,>=3.9" python-versions = "<4.0,>=3.9"
files = [ files = [
{file = "docling_parse-0.2.0-cp310-cp310-macosx_13_6_arm64.whl", hash = "sha256:3ec6458d36bd33862ae1ca38accbcd2ddc8a881fb5a3ab0aeb9e023bc20d8e04"}, {file = "docling_parse-1.0.0-cp310-cp310-macosx_13_6_arm64.whl", hash = "sha256:068db83a192b21783cc7bc66e9d3efb9072a57edeb8c07ef1a83a93353efcc36"},
{file = "docling_parse-0.2.0-cp310-cp310-macosx_13_6_x86_64.whl", hash = "sha256:898ee83f1e6f97dd34362948fcc70753fa95c83f77eddf48de5e352db10402f7"}, {file = "docling_parse-1.0.0-cp310-cp310-macosx_13_6_x86_64.whl", hash = "sha256:f57f9bba3ac6a81fc30c34bb08261d7308b0a780d90cbee903821aec2f5fbd88"},
{file = "docling_parse-0.2.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:9247e6902f979d23860e4b819b0145a9f55be78b14cf2906ac98f8fb0e9627cd"}, {file = "docling_parse-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fdf142dea82f0a5f5e1bcaa74cc9feeda12899077589e3eb6c728d334b43cdda"},
{file = "docling_parse-0.2.0-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:ebd0f091bdb106f1c3f72448aedfee52a904cb01e4de73827446e30fc3ac3b54"}, {file = "docling_parse-1.0.0-cp311-cp311-macosx_13_6_arm64.whl", hash = "sha256:8834a8387a55b4082c20da184e7d09f705c17558c465da9a5f35974b19013fe5"},
{file = "docling_parse-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9846bd3347a41337d6e83d7fbfbc636274ed3863ac375f4ca5eac1ea0eb88b8f"}, {file = "docling_parse-1.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5c4b80a8d5e8f832910f32188501a9a6718a0223fb9921ee7cc5cfe62adb857"},
{file = "docling_parse-0.2.0-cp311-cp311-macosx_13_6_arm64.whl", hash = "sha256:b71b0f9bfe033f9c872eb8298cd1cf5420b5cad74708ae2008257202fe1218a6"}, {file = "docling_parse-1.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0358eb13822ce2120362d6e7d63eb80a50d819b5bed5a2ccb7bd9beee4d83a61"},
{file = "docling_parse-0.2.0-cp311-cp311-macosx_13_6_x86_64.whl", hash = "sha256:aa0e840a9007c673f9fededf04e2372b3d1bde7c6360ac7d1b49a78ad58145f8"}, {file = "docling_parse-1.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d4a67df4699b4ffc2b01e77395ef35843ab23f40ac62bcdf593b6cc1f443eca6"},
{file = "docling_parse-0.2.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:66e622564073fe5dce4b104b5c80cafea2ae1114efa886ef0bc0f1b1488163a9"},
{file = "docling_parse-0.2.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:96e5c6b1d4f7df936b2461908e99eb5fe756486d6414de71bced8324f4ce2108"},
{file = "docling_parse-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5aeaec873f8f3f8549a2511a321cfb3dc9958d9731f538e2c619fba41eea98c5"},
{file = "docling_parse-0.2.0-cp312-cp312-macosx_13_6_arm64.whl", hash = "sha256:f3e917407a6eb4e71ce4b82ca7aefb9366e750d526011554f9aeae33fdfd53d5"},
{file = "docling_parse-0.2.0-cp312-cp312-macosx_13_6_x86_64.whl", hash = "sha256:0e4dde0bcffe59c7e1b9f2146eac2789f6a350571f66de5f4c58e8bf031ad5f6"},
{file = "docling_parse-0.2.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:12f393a0cba357016e8704e6836e553506b893d5ba16f19e47b0d201c8f6dc6d"},
{file = "docling_parse-0.2.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:e07f6439fbb53c3898cd24d7d6628dcc514097314eac4832b095291dbd9c23e0"},
{file = "docling_parse-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cea14f84e196d01f5ae77f59bc9640c488fde9a4eaf25433a7372794ca9433fc"},
{file = "docling_parse-0.2.0-cp39-cp39-macosx_13_6_arm64.whl", hash = "sha256:1d7b7dc072d029869387c2ec8f2d816d066a62d79f18d5c6d037b19b1cda07c6"},
{file = "docling_parse-0.2.0-cp39-cp39-macosx_13_6_x86_64.whl", hash = "sha256:acff58ac3ae9c1198956e9dd566949e4ea06c130f9e0050b2a88c7150716fd4f"},
{file = "docling_parse-0.2.0-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:06c688993087b763e7aaa10a8282b2cbe615b6c68540f3538998a6bc85f944f0"},
{file = "docling_parse-0.2.0-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:179595753f74d121ad21e4d422e4360a5e54a36c48def130d7d93886807fcdac"},
{file = "docling_parse-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:08be7f229bbf4b89d2dba77a80939f6dbdc3a434a26342a6380dc40e25e69fcb"},
] ]
[package.dependencies] [package.dependencies]
cibuildwheel = ">=2.20.0,<3.0.0" cibuildwheel = ">=2.20.0,<3.0.0"
tabulate = ">=0.9.0,<1.0.0"
[[package]] [[package]]
name = "docutils" name = "docutils"
@ -2695,8 +2683,8 @@ files = [
numpy = [ numpy = [
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
] ]
[[package]] [[package]]
@ -2751,8 +2739,8 @@ files = [
[package.dependencies] [package.dependencies]
numpy = [ numpy = [
{version = ">=1.22.4", markers = "python_version < \"3.11\""}, {version = ">=1.22.4", markers = "python_version < \"3.11\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.2", markers = "python_version == \"3.11\""}, {version = ">=1.23.2", markers = "python_version == \"3.11\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
] ]
python-dateutil = ">=2.8.2" python-dateutil = ">=2.8.2"
pytz = ">=2020.1" pytz = ">=2020.1"
@ -5142,4 +5130,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "4b0af4695af17ce1cdbcd04b4c29360cacd866acc77b5a0529749651ee633323" content-hash = "98d40c4d763018d5aa79b8c0ec00adac2fc06a036a9850b60f8ecce14db7cbcc"

View File

@ -32,7 +32,7 @@ pydantic-settings = "^2.3.0"
huggingface_hub = ">=0.23,<1" huggingface_hub = ">=0.23,<1"
requests = "^2.32.3" requests = "^2.32.3"
easyocr = "^1.7" easyocr = "^1.7"
docling-parse = "^0.2.0" docling-parse = "^1.0.0"
certifi = ">=2024.7.4" certifi = ">=2024.7.4"
rtree = "^1.3.0" rtree = "^1.3.0"
scipy = "^1.14.1" scipy = "^1.14.1"