feat: docling-parse v2 as default PDF backend (#549)

* Move to_docling_document from ds-glm to this repo

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Upgrade to ds-glm 1.0 and docling-parse 3.0

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update lock

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fix DP2 backend code, change CLI default backend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2024-12-09 13:26:17 +01:00
committed by GitHub
parent 9fd2cf847a
commit aca57f0527
8 changed files with 500 additions and 177 deletions

View File

@@ -9,7 +9,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.asciidoc_backend import AsciiDocBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend
@@ -84,12 +84,12 @@ class HTMLFormatOption(FormatOption):
class PdfFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
class ImageFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
def _get_default_option(format: InputFormat) -> FormatOption:
@@ -113,10 +113,10 @@ def _get_default_option(format: InputFormat) -> FormatOption:
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
),
InputFormat.IMAGE: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
),
InputFormat.PDF: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
),
}
if (options := format_to_default_options.get(format)) is not None: