mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
feat: add the Image backend (#2627)
* feat: add the Image backend
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* fixed the pre-commit
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* Fixed single- versus multi-frame image formats
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* fix: Proper usage of ImageDocumentBackend in the pipeline, deprecate old code.
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* fix: Adapt tests
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* fix: correct mets_gbs backend test
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* fix: Make ImagePageBackend.get_bitmap_rects() yield
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---------
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
committed by
GitHub
parent
ae30373ee7
commit
3495b73de8
@@ -4,9 +4,6 @@ from pathlib import Path
|
||||
import pytest
|
||||
from pydantic import ValidationError
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.backend_options import (
|
||||
@@ -17,7 +14,7 @@ from docling.datamodel.backend_options import (
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.datamodel.document import InputDocument, _DocumentConversionInput
|
||||
from docling.datamodel.settings import DocumentLimits
|
||||
from docling.document_converter import PdfFormatOption
|
||||
from docling.document_converter import ImageFormatOption, PdfFormatOption
|
||||
|
||||
|
||||
def test_in_doc_from_valid_path():
|
||||
@@ -51,36 +48,6 @@ def test_in_doc_from_invalid_buf():
|
||||
assert doc.valid is False
|
||||
|
||||
|
||||
def test_image_in_pdf_backend():
|
||||
in_doc = InputDocument(
|
||||
path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||
format=InputFormat.IMAGE,
|
||||
backend=PyPdfiumDocumentBackend,
|
||||
)
|
||||
|
||||
assert in_doc.valid
|
||||
in_doc = InputDocument(
|
||||
path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||
format=InputFormat.IMAGE,
|
||||
backend=DoclingParseDocumentBackend,
|
||||
)
|
||||
assert in_doc.valid
|
||||
|
||||
in_doc = InputDocument(
|
||||
path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||
format=InputFormat.IMAGE,
|
||||
backend=DoclingParseV2DocumentBackend,
|
||||
)
|
||||
assert in_doc.valid
|
||||
|
||||
in_doc = InputDocument(
|
||||
path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||
format=InputFormat.IMAGE,
|
||||
backend=DoclingParseV4DocumentBackend,
|
||||
)
|
||||
assert in_doc.valid
|
||||
|
||||
|
||||
def test_in_doc_with_page_range():
|
||||
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
limits = DocumentLimits()
|
||||
@@ -297,7 +264,7 @@ def test_tiff_two_pages():
|
||||
doc = InputDocument(
|
||||
path_or_stream=tiff_path,
|
||||
format=InputFormat.IMAGE,
|
||||
backend=PdfFormatOption().backend, # use default backend
|
||||
backend=ImageFormatOption().backend, # use default backend
|
||||
)
|
||||
assert doc.valid is True
|
||||
assert doc.page_count == 2
|
||||
|
||||
Reference in New Issue
Block a user