mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Fixes for DPv4 backend init, better test coverage
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
e34c0750a7
commit
fe45d30942
@ -13,6 +13,7 @@ from pypdfium2 import PdfPage
|
|||||||
|
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||||
from docling.datamodel.base_models import Size
|
from docling.datamodel.base_models import Size
|
||||||
|
from docling.utils.locks import pypdfium2_lock
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from docling.datamodel.document import InputDocument
|
from docling.datamodel.document import InputDocument
|
||||||
@ -138,9 +139,10 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
|
|||||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||||
super().__init__(in_doc, path_or_stream)
|
super().__init__(in_doc, path_or_stream)
|
||||||
|
|
||||||
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
with pypdfium2_lock:
|
||||||
|
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
||||||
self.parser = DoclingPdfParser(loglevel="fatal")
|
self.parser = DoclingPdfParser(loglevel="fatal")
|
||||||
self.dp_doc: PdfDocument = self.parser.load(path_or_stream=path_or_stream)
|
self.dp_doc: PdfDocument = self.parser.load(path_or_stream=self.path_or_stream)
|
||||||
success = self.dp_doc is not None
|
success = self.dp_doc is not None
|
||||||
|
|
||||||
if not success:
|
if not success:
|
||||||
@ -162,14 +164,15 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
|
|||||||
def load_page(
|
def load_page(
|
||||||
self, page_no: int, create_words: bool = True, create_textlines: bool = True
|
self, page_no: int, create_words: bool = True, create_textlines: bool = True
|
||||||
) -> DoclingParseV4PageBackend:
|
) -> DoclingParseV4PageBackend:
|
||||||
return DoclingParseV4PageBackend(
|
with pypdfium2_lock:
|
||||||
self.dp_doc.get_page(
|
return DoclingParseV4PageBackend(
|
||||||
page_no + 1,
|
self.dp_doc.get_page(
|
||||||
create_words=create_words,
|
page_no + 1,
|
||||||
create_textlines=create_textlines,
|
create_words=create_words,
|
||||||
),
|
create_textlines=create_textlines,
|
||||||
self._pdoc[page_no],
|
),
|
||||||
)
|
self._pdoc[page_no],
|
||||||
|
)
|
||||||
|
|
||||||
def is_valid(self) -> bool:
|
def is_valid(self) -> bool:
|
||||||
return self.page_count() > 0
|
return self.page_count() > 0
|
||||||
@ -177,5 +180,6 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
|
|||||||
def unload(self):
|
def unload(self):
|
||||||
super().unload()
|
super().unload()
|
||||||
self.dp_doc.unload()
|
self.dp_doc.unload()
|
||||||
self._pdoc.close()
|
with pypdfium2_lock:
|
||||||
|
self._pdoc.close()
|
||||||
self._pdoc = None
|
self._pdoc = None
|
||||||
|
@ -1,6 +1,9 @@
|
|||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||||
|
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||||
from docling.datamodel.document import InputDocument, _DocumentConversionInput
|
from docling.datamodel.document import InputDocument, _DocumentConversionInput
|
||||||
@ -40,7 +43,39 @@ def test_in_doc_from_invalid_buf():
|
|||||||
assert doc.valid == False
|
assert doc.valid == False
|
||||||
|
|
||||||
|
|
||||||
|
def test_image_in_pdf_backend():
    """Check that each PDF backend yields a valid document for a PNG input.

    The same sample image is wrapped in an ``InputDocument`` with
    ``InputFormat.IMAGE`` once per backend class, in the order listed
    below, and every resulting document must report ``valid``.
    """
    image_path = Path("tests/data/2305.03393v1-pg9-img.png")

    # Order matters only for reproducing the original sequence of checks:
    # the first backend that produces an invalid document fails the test.
    pdf_backends = (
        PyPdfiumDocumentBackend,
        DoclingParseDocumentBackend,
        DoclingParseV2DocumentBackend,
        DoclingParseV4DocumentBackend,
    )

    for backend_cls in pdf_backends:
        in_doc = InputDocument(
            path_or_stream=image_path,
            format=InputFormat.IMAGE,
            backend=backend_cls,
        )
        assert in_doc.valid
|
||||||
|
|
||||||
|
|
||||||
def test_in_doc_with_page_range():
|
def test_in_doc_with_page_range():
|
||||||
|
|
||||||
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||||
limits = DocumentLimits()
|
limits = DocumentLimits()
|
||||||
limits.page_range = (1, 10)
|
limits.page_range = (1, 10)
|
||||||
@ -192,7 +227,7 @@ def _make_input_doc(path):
|
|||||||
in_doc = InputDocument(
|
in_doc = InputDocument(
|
||||||
path_or_stream=path,
|
path_or_stream=path,
|
||||||
format=InputFormat.PDF,
|
format=InputFormat.PDF,
|
||||||
backend=PyPdfiumDocumentBackend,
|
backend=DoclingParseV4DocumentBackend,
|
||||||
)
|
)
|
||||||
return in_doc
|
return in_doc
|
||||||
|
|
||||||
@ -202,6 +237,6 @@ def _make_input_doc_from_stream(doc_stream):
|
|||||||
path_or_stream=doc_stream.stream,
|
path_or_stream=doc_stream.stream,
|
||||||
format=InputFormat.PDF,
|
format=InputFormat.PDF,
|
||||||
filename=doc_stream.name,
|
filename=doc_stream.name,
|
||||||
backend=PyPdfiumDocumentBackend,
|
backend=DoclingParseV4DocumentBackend,
|
||||||
)
|
)
|
||||||
return in_doc
|
return in_doc
|
||||||
|
@ -4,6 +4,7 @@ from pathlib import Path
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
|
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
@ -30,7 +31,7 @@ def get_converter():
|
|||||||
format_options={
|
format_options={
|
||||||
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
|
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
|
||||||
InputFormat.IMAGE: PdfFormatOption(
|
InputFormat.IMAGE: PdfFormatOption(
|
||||||
pipeline_options=pipeline_options, backend=DoclingParseDocumentBackend
|
pipeline_options=pipeline_options,
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
Loading…
Reference in New Issue
Block a user