Add stub for experimental format export

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2024-09-20 11:05:26 +02:00
parent abb6dddea8
commit ac51a09065
5 changed files with 90 additions and 71 deletions

View File

@@ -3,7 +3,7 @@ from io import BytesIO
from pathlib import Path, PurePath
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
from docling_core.types import BaseCell, BaseText
from docling_core.types import BaseCell, BaseText, DoclingDocument
from docling_core.types import Document as DsDocument
from docling_core.types import DocumentDescription as DsDocumentDescription
from docling_core.types import FileInfoObject as DsFileInfoObject
@@ -60,6 +60,8 @@ _EMPTY_DOC = DsDocument(
),
)
_EMPTY_DOCLING_DOC = DoclingDocument(description={}, file_info={}) # TODO: Stub
class InputDocument(BaseModel):
file: PurePath = None
@@ -137,6 +139,7 @@ class ConvertedDocument(BaseModel):
assembled: AssembledUnit = AssembledUnit()
output: DsDocument = _EMPTY_DOC
experimental: DoclingDocument = _EMPTY_DOCLING_DOC
def _to_ds_document(self) -> DsDocument:
title = ""

View File

@@ -289,4 +289,4 @@ class DocumentConverter:
elements=all_elements, headers=all_headers, body=all_body
)
conv_res.output = self.glm_model(conv_res)
conv_res.output, conv_res.experimental = self.glm_model(conv_res)

View File

@@ -1,10 +1,14 @@
import copy
import random
from typing import Tuple
from deepsearch_glm.nlp_utils import init_nlp_model
from deepsearch_glm.utils.doc_utils import to_legacy_document_format
from deepsearch_glm.utils.doc_utils import (
to_docling_document,
to_legacy_document_format,
)
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
from docling_core.types import BaseText
from docling_core.types import BaseText, DoclingDocument
from docling_core.types import Document as DsDocument
from docling_core.types import Ref
from docling_core.types.experimental.base import BoundingBox, CoordOrigin
@@ -25,7 +29,9 @@ class GlmModel:
model = init_nlp_model(model_names=self.model_names)
self.model = model
def __call__(self, conv_res: ConversionResult) -> DsDocument:
def __call__(
self, conv_res: ConversionResult
) -> Tuple[DsDocument, DoclingDocument]:
ds_doc = conv_res._to_ds_document()
ds_doc_dict = ds_doc.model_dump(by_alias=True)
@@ -34,6 +40,7 @@ class GlmModel:
glm_doc, ds_doc_dict, update_name_label=True
)
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
exported_doc = DsDocument.model_validate(ds_doc_dict)
# DEBUG code:
@@ -84,4 +91,4 @@ class GlmModel:
# draw_clusters_and_cells(ds_doc, 0)
# draw_clusters_and_cells(exported_doc, 0)
return exported_doc
return (exported_doc, docling_doc)