mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
feat(backend): add generic options support and HTML image handling modes (#2011)
* feat: add backend options support to document backends Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * feat: enhance document backends with generic backend options and improve HTML image handling Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * Refactor tests for declarativebackend Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix(HTML): improve image caption handling and ensure backend options are set correctly Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix: enhance HTML backend image handling and add support for local file paths Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore: Add ground truth data for test data Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix(HTML): skip loading SVG files in image data handling Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * refactor(html): simplify backend options and address gaps Backend options for DeclarativeDocumentBackend classes and only when necessary. Refactor caption parsing in 'img' elements and remove dummy text. Replace deprecated annotations from Typing library with native types. Replace typing annotations according to pydantic guidelines. Some documentation with pydantic annotations. Fix diff issue with test files. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * tests(html): add tests and fix bugs Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * refactor(html): refactor backend options Move backend option classes to its own module within datamodel package. Rename 'source_location' with 'source_uri' in HTMLBackendOptions. Rename 'image_fetch' with 'fetch_images' in HTMLBackendOptions. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * refactor(markdown): create a class for the markdown backend options Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> --------- Signed-off-by: Leg0shii <dragonsaremyfavourite@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
@@ -1,10 +1,19 @@
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from pydantic import ValidationError
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.backend_options import (
|
||||
BaseBackendOptions,
|
||||
DeclarativeBackendOptions,
|
||||
HTMLBackendOptions,
|
||||
)
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.datamodel.document import InputDocument, _DocumentConversionInput
|
||||
from docling.datamodel.settings import DocumentLimits
|
||||
@@ -15,6 +24,7 @@ def test_in_doc_from_valid_path():
|
||||
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
doc = _make_input_doc(test_doc_path)
|
||||
assert doc.valid is True
|
||||
assert doc.backend_options is None
|
||||
|
||||
|
||||
def test_in_doc_from_invalid_path():
|
||||
@@ -105,6 +115,38 @@ def test_in_doc_with_page_range():
|
||||
assert doc.valid is False
|
||||
|
||||
|
||||
def test_in_doc_with_backend_options():
|
||||
test_doc_path = Path("./tests/data/html/example_01.html")
|
||||
doc = InputDocument(
|
||||
path_or_stream=test_doc_path,
|
||||
format=InputFormat.HTML,
|
||||
backend=HTMLDocumentBackend,
|
||||
backend_options=HTMLBackendOptions(),
|
||||
)
|
||||
assert doc.valid
|
||||
assert doc.backend_options
|
||||
assert isinstance(doc.backend_options, HTMLBackendOptions)
|
||||
assert not doc.backend_options.fetch_images
|
||||
assert not doc.backend_options.enable_local_fetch
|
||||
assert not doc.backend_options.enable_remote_fetch
|
||||
|
||||
with pytest.raises(ValueError, match="Incompatible types"):
|
||||
doc = InputDocument(
|
||||
path_or_stream=test_doc_path,
|
||||
format=InputFormat.HTML,
|
||||
backend=HTMLDocumentBackend,
|
||||
backend_options=DeclarativeBackendOptions(),
|
||||
)
|
||||
|
||||
with pytest.raises(ValidationError):
|
||||
doc = InputDocument(
|
||||
path_or_stream=test_doc_path,
|
||||
format=InputFormat.HTML,
|
||||
backend=HTMLDocumentBackend,
|
||||
backend_options=BaseBackendOptions(),
|
||||
)
|
||||
|
||||
|
||||
def test_guess_format(tmp_path):
|
||||
"""Test docling.datamodel.document._DocumentConversionInput.__guess_format"""
|
||||
dci = _DocumentConversionInput(path_or_stream_iterator=[])
|
||||
|
||||
Reference in New Issue
Block a user