Use docling-parse page-by-page

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-08-21 16:21:32 +02:00
parent 22a5c29c63
commit 9c1ee6c7e0
2 changed files with 18 additions and 23 deletions

View File

@ -1,6 +1,5 @@
import logging import logging
import random import random
import time
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional, Union from typing import Iterable, Optional, Union
@ -17,11 +16,16 @@ _log = logging.getLogger(__name__)
class DoclingParsePageBackend(PdfPageBackend): class DoclingParsePageBackend(PdfPageBackend):
def __init__(
    self, parser: pdf_parser, pdf_bytes: BytesIO, page_no: int, page_obj: PdfPage
):
    """Parse a single page with docling-parse and keep both page handles.

    Args:
        parser: docling-parse parser instance used to extract the cells.
        pdf_bytes: In-memory PDF document handed to the parser.
        page_no: Index of the page to parse; passed through unchanged to
            the parser (NOTE(review): presumably 0-based like the pdfium
            index used by the caller -- confirm).
        page_obj: Page object for the same page, forwarded to the base
            class and stored as ``_ppage``.
    """
    super().__init__(page_obj)
    self._ppage = page_obj

    # Only the requested page is parsed, so the result holds exactly one
    # entry under "pages".
    parsed_page = parser.find_cells_from_bytesio_on_page(pdf_bytes, page_no)
    self._dpage = parsed_page["pages"][0]
    # Library code must not write to stdout; report through the module logger.
    _log.debug("Parsed page %s of doc.", page_no)
def get_text_in_rect(self, bbox: BoundingBox) -> str: def get_text_in_rect(self, bbox: BoundingBox) -> str:
# Find intersecting cells on the page # Find intersecting cells on the page
@ -168,34 +172,25 @@ class DoclingParsePageBackend(PdfPageBackend):
def unload(self):
    """Drop the references to the page object and the parsed page data."""
    self._ppage = self._dpage = None
class DoclingParseDocumentBackend(PdfDocumentBackend): class DoclingParseDocumentBackend(PdfDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path]):
    """Open the document with pdfium and prepare docling-parse for per-page use.

    Args:
        path_or_stream: Filesystem path of the PDF, or an in-memory stream
            holding its bytes.
    """
    super().__init__(path_or_stream)

    # Keep a private in-memory copy of the document for the parser.
    # ``open()`` only accepts path-like input, so a stream must be handled
    # separately instead of being passed to ``open()`` (which raises TypeError).
    if isinstance(path_or_stream, BytesIO):
        # getvalue() returns the whole buffer regardless of the current
        # stream position and leaves that position untouched.
        self.pdf_bytes = BytesIO(path_or_stream.getvalue())
    else:
        with open(path_or_stream, "rb") as fh:
            self.pdf_bytes = BytesIO(fh.read())

    self._pdoc = pdfium.PdfDocument(path_or_stream)
    # Cells are extracted lazily, one page at a time, in load_page().
    self.parser = pdf_parser()
def page_count(self) -> int:
    """Return the number of pages, taken from the length of the pdfium document."""
    # To be replaced with docling-parse API
    num_pages = len(self._pdoc)
    return num_pages
def load_page(self, page_no: int) -> DoclingParsePageBackend:
    """Create the page backend for page ``page_no`` of this document."""
    pdfium_page = self._pdoc[page_no]
    return DoclingParsePageBackend(
        parser=self.parser,
        pdf_bytes=self.pdf_bytes,
        page_no=page_no,
        page_obj=pdfium_page,
    )
def is_valid(self) -> bool: def is_valid(self) -> bool:

View File

@ -4,7 +4,7 @@ import time
from pathlib import Path from pathlib import Path
from typing import Iterable from typing import Iterable
from docling.datamodel.base_models import ConversionStatus from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.document import ConvertedDocument, DocumentConversionInput from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
@ -52,7 +52,7 @@ def main():
Path("./test/data/redp5695.pdf"), Path("./test/data/redp5695.pdf"),
] ]
doc_converter = DocumentConverter() doc_converter = DocumentConverter(pipeline_options=PipelineOptions(do_ocr=False))
input = DocumentConversionInput.from_paths(input_doc_paths) input = DocumentConversionInput.from_paths(input_doc_paths)