mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-16 08:38:14 +00:00
Put stub for experimental format export
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -3,7 +3,7 @@ from io import BytesIO
|
||||
from pathlib import Path, PurePath
|
||||
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
|
||||
|
||||
from docling_core.types import BaseCell, BaseText
|
||||
from docling_core.types import BaseCell, BaseText, DoclingDocument
|
||||
from docling_core.types import Document as DsDocument
|
||||
from docling_core.types import DocumentDescription as DsDocumentDescription
|
||||
from docling_core.types import FileInfoObject as DsFileInfoObject
|
||||
@@ -60,6 +60,8 @@ _EMPTY_DOC = DsDocument(
|
||||
),
|
||||
)
|
||||
|
||||
_EMPTY_DOCLING_DOC = DoclingDocument(description={}, file_info={}) # TODO: Stub
|
||||
|
||||
|
||||
class InputDocument(BaseModel):
|
||||
file: PurePath = None
|
||||
@@ -137,6 +139,7 @@ class ConvertedDocument(BaseModel):
|
||||
assembled: AssembledUnit = AssembledUnit()
|
||||
|
||||
output: DsDocument = _EMPTY_DOC
|
||||
experimental: DoclingDocument = _EMPTY_DOCLING_DOC
|
||||
|
||||
def _to_ds_document(self) -> DsDocument:
|
||||
title = ""
|
||||
|
||||
@@ -289,4 +289,4 @@ class DocumentConverter:
|
||||
elements=all_elements, headers=all_headers, body=all_body
|
||||
)
|
||||
|
||||
conv_res.output = self.glm_model(conv_res)
|
||||
conv_res.output, conv_res.experimental = self.glm_model(conv_res)
|
||||
|
||||
@@ -1,10 +1,14 @@
|
||||
import copy
|
||||
import random
|
||||
from typing import Tuple
|
||||
|
||||
from deepsearch_glm.nlp_utils import init_nlp_model
|
||||
from deepsearch_glm.utils.doc_utils import to_legacy_document_format
|
||||
from deepsearch_glm.utils.doc_utils import (
|
||||
to_docling_document,
|
||||
to_legacy_document_format,
|
||||
)
|
||||
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
|
||||
from docling_core.types import BaseText
|
||||
from docling_core.types import BaseText, DoclingDocument
|
||||
from docling_core.types import Document as DsDocument
|
||||
from docling_core.types import Ref
|
||||
from docling_core.types.experimental.base import BoundingBox, CoordOrigin
|
||||
@@ -25,7 +29,9 @@ class GlmModel:
|
||||
model = init_nlp_model(model_names=self.model_names)
|
||||
self.model = model
|
||||
|
||||
def __call__(self, conv_res: ConversionResult) -> DsDocument:
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult
|
||||
) -> Tuple[DsDocument, DoclingDocument]:
|
||||
ds_doc = conv_res._to_ds_document()
|
||||
ds_doc_dict = ds_doc.model_dump(by_alias=True)
|
||||
|
||||
@@ -34,6 +40,7 @@ class GlmModel:
|
||||
glm_doc, ds_doc_dict, update_name_label=True
|
||||
)
|
||||
|
||||
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
||||
exported_doc = DsDocument.model_validate(ds_doc_dict)
|
||||
|
||||
# DEBUG code:
|
||||
@@ -84,4 +91,4 @@ class GlmModel:
|
||||
# draw_clusters_and_cells(ds_doc, 0)
|
||||
# draw_clusters_and_cells(exported_doc, 0)
|
||||
|
||||
return exported_doc
|
||||
return (exported_doc, docling_doc)
|
||||
|
||||
Reference in New Issue
Block a user