diff --git a/docling/backend/abstract_backend.py b/docling/backend/abstract_backend.py index 2074b94c..36f61191 100644 --- a/docling/backend/abstract_backend.py +++ b/docling/backend/abstract_backend.py @@ -39,8 +39,9 @@ class PdfPageBackend(ABC): class PdfDocumentBackend(ABC): @abstractmethod - def __init__(self, path_or_stream: Union[BytesIO, Path]): - pass + def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): + self.path_or_stream = path_or_stream + self.document_hash = document_hash @abstractmethod def load_page(self, page_no: int) -> PdfPageBackend: @@ -56,4 +57,7 @@ class PdfDocumentBackend(ABC): @abstractmethod def unload(self): - pass + if isinstance(self.path_or_stream, BytesIO): + self.path_or_stream.close() + + self.path_or_stream = None diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py index 4af3e1b7..18f6c69e 100644 --- a/docling/backend/docling_parse_backend.py +++ b/docling/backend/docling_parse_backend.py @@ -17,16 +17,14 @@ _log = logging.getLogger(__name__) class DoclingParsePageBackend(PdfPageBackend): def __init__( - self, parser: pdf_parser, pdf_bytes: BytesIO, page_no: int, page_obj: PdfPage + self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage ): super().__init__(page_obj) self._ppage = page_obj - parsed_page = parser.find_cells_from_bytesio_on_page(pdf_bytes, page_no) + parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no) self._dpage = parsed_page["pages"][0] - print(f"Parsed page {page_no} of doc.") - def get_text_in_rect(self, bbox: BoundingBox) -> str: # Find intersecting cells on the page text_piece = "" @@ -175,28 +173,36 @@ class DoclingParsePageBackend(PdfPageBackend): class DoclingParseDocumentBackend(PdfDocumentBackend): - def __init__(self, path_or_stream: Union[BytesIO, Path]): - super().__init__(path_or_stream) - - with open(path_or_stream, "rb") as fh: - self.pdf_bytes = BytesIO(fh.read()) + def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): + super().__init__(path_or_stream, document_hash) self._pdoc = pdfium.PdfDocument(path_or_stream) self.parser = pdf_parser() + success = False + if isinstance(path_or_stream, BytesIO): + success = self.parser.load_document_from_bytesio( + document_hash, path_or_stream + ) + elif isinstance(path_or_stream, Path): + success = self.parser.load_document(document_hash, str(path_or_stream)) + + if not success: + raise RuntimeError("docling-parse could not load this document.") + def page_count(self) -> int: return len(self._pdoc) # To be replaced with docling-parse API def load_page(self, page_no: int) -> DoclingParsePageBackend: - return DoclingParsePageBackend( - self.parser, self.pdf_bytes, page_no, self._pdoc[page_no] + self.parser, self.document_hash, page_no, self._pdoc[page_no] ) def is_valid(self) -> bool: return self.page_count() > 0 def unload(self): + super().unload() + self.parser.unload_document(self.document_hash) self._pdoc.close() self._pdoc = None - self._parser_doc = None diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py index 2d0621bb..56758b1d 100644 --- a/docling/backend/pypdfium2_backend.py +++ b/docling/backend/pypdfium2_backend.py @@ -215,8 +215,8 @@ class PyPdfiumPageBackend(PdfPageBackend): class PyPdfiumDocumentBackend(PdfDocumentBackend): - def __init__(self, path_or_stream: Union[BytesIO, Path]): - super().__init__(path_or_stream) + def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): + super().__init__(path_or_stream, document_hash) self._pdoc = pdfium.PdfDocument(path_or_stream) def page_count(self) -> int: @@ -229,5 +229,6 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend): return self.page_count() > 0 def unload(self): + super().unload() self._pdoc.close() self._pdoc = None diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index fe19afbc..5726b76d 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -79,7 +79,9 @@ class InputDocument(BaseModel): self.valid = False else: self.document_hash = create_file_hash(path_or_stream) - self._backend = pdf_backend(path_or_stream=path_or_stream) + self._backend = pdf_backend( + path_or_stream=path_or_stream, document_hash=self.document_hash + ) elif isinstance(path_or_stream, BytesIO): self.file = PurePath(filename) @@ -89,7 +91,9 @@ class InputDocument(BaseModel): self.valid = False else: self.document_hash = create_file_hash(path_or_stream) - self._backend = pdf_backend(path_or_stream=path_or_stream) + self._backend = pdf_backend( + path_or_stream=path_or_stream, document_hash=self.document_hash + ) if self.document_hash and self._backend.page_count() > 0: self.page_count = self._backend.page_count() diff --git a/examples/batch_convert.py b/examples/batch_convert.py index 29566a43..76bbdcd4 100644 --- a/examples/batch_convert.py +++ b/examples/batch_convert.py @@ -1,10 +1,15 @@ import json import logging import time +from io import BytesIO from pathlib import Path from typing import Iterable -from docling.datamodel.base_models import ConversionStatus, PipelineOptions +from docling.datamodel.base_models import ( + ConversionStatus, + DocumentStream, + PipelineOptions, +) from docling.datamodel.document import ConvertedDocument, DocumentConversionInput from docling.document_converter import DocumentConverter @@ -52,6 +57,10 @@ def main(): Path("./test/data/redp5695.pdf"), ] + # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read()) + # docs = [DocumentStream(filename="my_doc.pdf", stream=buf)] + # input = DocumentConversionInput.from_streams(docs) + doc_converter = DocumentConverter(pipeline_options=PipelineOptions(do_ocr=False)) input = DocumentConversionInput.from_paths(input_doc_paths) diff --git a/poetry.lock b/poetry.lock index 4b24f128..68140cb0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -822,35 +822,23 @@ tqdm = ">=4.64.0,<5.0.0" [[package]] name = "docling-parse" -version = "0.2.0" +version = "1.0.0" description = "Simple package to extract text with coordinates from programmatic PDFs" optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "docling_parse-0.2.0-cp310-cp310-macosx_13_6_arm64.whl", hash = "sha256:3ec6458d36bd33862ae1ca38accbcd2ddc8a881fb5a3ab0aeb9e023bc20d8e04"}, - {file = "docling_parse-0.2.0-cp310-cp310-macosx_13_6_x86_64.whl", hash = "sha256:898ee83f1e6f97dd34362948fcc70753fa95c83f77eddf48de5e352db10402f7"}, - {file = "docling_parse-0.2.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:9247e6902f979d23860e4b819b0145a9f55be78b14cf2906ac98f8fb0e9627cd"}, - {file = "docling_parse-0.2.0-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:ebd0f091bdb106f1c3f72448aedfee52a904cb01e4de73827446e30fc3ac3b54"}, - {file = "docling_parse-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9846bd3347a41337d6e83d7fbfbc636274ed3863ac375f4ca5eac1ea0eb88b8f"}, - {file = "docling_parse-0.2.0-cp311-cp311-macosx_13_6_arm64.whl", hash = "sha256:b71b0f9bfe033f9c872eb8298cd1cf5420b5cad74708ae2008257202fe1218a6"}, - {file = "docling_parse-0.2.0-cp311-cp311-macosx_13_6_x86_64.whl", hash = "sha256:aa0e840a9007c673f9fededf04e2372b3d1bde7c6360ac7d1b49a78ad58145f8"}, - {file = "docling_parse-0.2.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:66e622564073fe5dce4b104b5c80cafea2ae1114efa886ef0bc0f1b1488163a9"}, - {file = "docling_parse-0.2.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:96e5c6b1d4f7df936b2461908e99eb5fe756486d6414de71bced8324f4ce2108"}, - {file = "docling_parse-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5aeaec873f8f3f8549a2511a321cfb3dc9958d9731f538e2c619fba41eea98c5"}, - {file = "docling_parse-0.2.0-cp312-cp312-macosx_13_6_arm64.whl", hash = "sha256:f3e917407a6eb4e71ce4b82ca7aefb9366e750d526011554f9aeae33fdfd53d5"}, - {file = "docling_parse-0.2.0-cp312-cp312-macosx_13_6_x86_64.whl", hash = "sha256:0e4dde0bcffe59c7e1b9f2146eac2789f6a350571f66de5f4c58e8bf031ad5f6"}, - {file = "docling_parse-0.2.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:12f393a0cba357016e8704e6836e553506b893d5ba16f19e47b0d201c8f6dc6d"}, - {file = "docling_parse-0.2.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:e07f6439fbb53c3898cd24d7d6628dcc514097314eac4832b095291dbd9c23e0"}, - {file = "docling_parse-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cea14f84e196d01f5ae77f59bc9640c488fde9a4eaf25433a7372794ca9433fc"}, - {file = "docling_parse-0.2.0-cp39-cp39-macosx_13_6_arm64.whl", hash = "sha256:1d7b7dc072d029869387c2ec8f2d816d066a62d79f18d5c6d037b19b1cda07c6"}, - {file = "docling_parse-0.2.0-cp39-cp39-macosx_13_6_x86_64.whl", hash = "sha256:acff58ac3ae9c1198956e9dd566949e4ea06c130f9e0050b2a88c7150716fd4f"}, - {file = "docling_parse-0.2.0-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:06c688993087b763e7aaa10a8282b2cbe615b6c68540f3538998a6bc85f944f0"}, - {file = "docling_parse-0.2.0-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:179595753f74d121ad21e4d422e4360a5e54a36c48def130d7d93886807fcdac"}, - {file = "docling_parse-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:08be7f229bbf4b89d2dba77a80939f6dbdc3a434a26342a6380dc40e25e69fcb"}, + {file = "docling_parse-1.0.0-cp310-cp310-macosx_13_6_arm64.whl", hash = "sha256:068db83a192b21783cc7bc66e9d3efb9072a57edeb8c07ef1a83a93353efcc36"}, + {file = "docling_parse-1.0.0-cp310-cp310-macosx_13_6_x86_64.whl", hash = "sha256:f57f9bba3ac6a81fc30c34bb08261d7308b0a780d90cbee903821aec2f5fbd88"}, + {file = "docling_parse-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fdf142dea82f0a5f5e1bcaa74cc9feeda12899077589e3eb6c728d334b43cdda"}, + {file = "docling_parse-1.0.0-cp311-cp311-macosx_13_6_arm64.whl", hash = "sha256:8834a8387a55b4082c20da184e7d09f705c17558c465da9a5f35974b19013fe5"}, + {file = "docling_parse-1.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5c4b80a8d5e8f832910f32188501a9a6718a0223fb9921ee7cc5cfe62adb857"}, + {file = "docling_parse-1.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0358eb13822ce2120362d6e7d63eb80a50d819b5bed5a2ccb7bd9beee4d83a61"}, + {file = "docling_parse-1.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d4a67df4699b4ffc2b01e77395ef35843ab23f40ac62bcdf593b6cc1f443eca6"}, ] [package.dependencies] cibuildwheel = ">=2.20.0,<3.0.0" +tabulate = ">=0.9.0,<1.0.0" [[package]] name = "docutils" @@ -2695,8 +2683,8 @@ files = [ numpy = [ {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] [[package]] @@ -2751,8 +2739,8 @@ files = [ [package.dependencies] numpy = [ {version = ">=1.22.4", markers = "python_version < \"3.11\""}, - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -5142,4 +5130,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "4b0af4695af17ce1cdbcd04b4c29360cacd866acc77b5a0529749651ee633323" +content-hash = "98d40c4d763018d5aa79b8c0ec00adac2fc06a036a9850b60f8ecce14db7cbcc" diff --git a/pyproject.toml b/pyproject.toml index 63d48d0f..b42f6156 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ pydantic-settings = "^2.3.0" huggingface_hub = ">=0.23,<1" requests = "^2.32.3" easyocr = "^1.7" -docling-parse = "^0.2.0" +docling-parse = "^1.0.0" certifi = ">=2024.7.4" rtree = "^1.3.0" scipy = "^1.14.1"