feat: docling-parse v2 as default PDF backend (#549)

* Move to_docling_document from ds-glm to this repo

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Upgrade to ds-glm 1.0 and docling-parse 3.0

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update lock file

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fix docling-parse v2 (DP2) backend code, change CLI default backend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2024-12-09 13:26:17 +01:00
committed by GitHub
parent 9fd2cf847a
commit aca57f0527
8 changed files with 500 additions and 177 deletions

View File

@@ -4,7 +4,6 @@ from pathlib import Path
from typing import List, Union
from deepsearch_glm.nlp_utils import init_nlp_model
from deepsearch_glm.utils.doc_utils import to_docling_document
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
@@ -29,6 +28,7 @@ from pydantic import BaseModel, ConfigDict
from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
from docling.datamodel.settings import settings
from docling.utils.glm_utils import to_docling_document
from docling.utils.profiling import ProfilingScope, TimeRecorder
from docling.utils.utils import create_hash
@@ -232,7 +232,7 @@ class GlmModel:
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
ds_doc = self._to_legacy_document(conv_res)
ds_doc_dict = ds_doc.model_dump(by_alias=True)
ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True)
glm_doc = self.model.apply_on_doc(ds_doc_dict)