mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-17 00:58:25 +00:00
Refactor test data, legacy usage and more
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -94,21 +94,21 @@ def export_documents(
|
||||
fname = output_dir / f"{doc_filename}.txt"
|
||||
with fname.open("w") as fp:
|
||||
_log.info(f"writing Text output to {fname}")
|
||||
fp.write(conv_res.render_as_text())
|
||||
fp.write(conv_res.render_as_text_v1())
|
||||
|
||||
# Export Markdown format:
|
||||
if export_md:
|
||||
fname = output_dir / f"{doc_filename}.md"
|
||||
with fname.open("w") as fp:
|
||||
_log.info(f"writing Markdown output to {fname}")
|
||||
fp.write(conv_res.render_as_markdown())
|
||||
fp.write(conv_res.render_as_markdown_v1())
|
||||
|
||||
# Export Document Tags format:
|
||||
if export_doctags:
|
||||
fname = output_dir / f"{doc_filename}.doctags"
|
||||
with fname.open("w") as fp:
|
||||
_log.info(f"writing Doc Tags output to {fname}")
|
||||
fp.write(conv_res.render_as_doctags())
|
||||
fp.write(conv_res.render_as_doctags_v1())
|
||||
|
||||
else:
|
||||
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import logging
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
from pathlib import Path, PurePath
|
||||
from typing import Dict, Iterable, List, Optional, Tuple, Type, Union
|
||||
@@ -61,7 +62,7 @@ layout_label_to_ds_type = {
|
||||
DocItemLabel.TEXT: "paragraph",
|
||||
}
|
||||
|
||||
_EMPTY_DOC = DsDocument(
|
||||
_EMPTY_LEGACY_DOC = DsDocument(
|
||||
_name="",
|
||||
description=DsDocumentDescription(logs=[]),
|
||||
file_info=DsFileInfoObject(
|
||||
@@ -155,6 +156,11 @@ class InputDocument(BaseModel):
|
||||
)
|
||||
|
||||
|
||||
class DocumentFormat(str, Enum):
|
||||
V2 = "v2"
|
||||
V1 = "v1"
|
||||
|
||||
|
||||
@deprecated("Use `ConversionResult` instead.")
|
||||
class ConvertedDocument(BaseModel):
|
||||
input: InputDocument
|
||||
@@ -165,10 +171,10 @@ class ConvertedDocument(BaseModel):
|
||||
pages: List[Page] = []
|
||||
assembled: AssembledUnit = AssembledUnit()
|
||||
|
||||
output: DsDocument = _EMPTY_DOC
|
||||
experimental: DoclingDocument = _EMPTY_DOCLING_DOC
|
||||
legacy_output: DsDocument = _EMPTY_LEGACY_DOC
|
||||
output: DoclingDocument = _EMPTY_DOCLING_DOC
|
||||
|
||||
def _to_ds_document(self) -> DsDocument:
|
||||
def _to_legacy_document(self) -> DsDocument:
|
||||
title = ""
|
||||
desc = DsDocumentDescription(logs=[])
|
||||
|
||||
@@ -344,10 +350,12 @@ class ConvertedDocument(BaseModel):
|
||||
|
||||
return ds_doc
|
||||
|
||||
def render_as_dict(self):
|
||||
return self.output.model_dump(by_alias=True, exclude_none=True)
|
||||
@deprecated("Use output.export_to_dict() instead.")
|
||||
def render_as_dict_v1(self):
|
||||
return self.legacy_output.model_dump(by_alias=True, exclude_none=True)
|
||||
|
||||
def render_as_markdown(
|
||||
@deprecated("Use output.export_to_markdown() instead.")
|
||||
def render_as_markdown_v1(
|
||||
self,
|
||||
delim: str = "\n\n",
|
||||
main_text_start: int = 0,
|
||||
@@ -362,8 +370,8 @@ class ConvertedDocument(BaseModel):
|
||||
],
|
||||
strict_text: bool = False,
|
||||
image_placeholder: str = "<!-- image -->",
|
||||
):
|
||||
return self.output.export_to_markdown(
|
||||
) -> str:
|
||||
return self.legacy_output.export_to_markdown(
|
||||
delim=delim,
|
||||
main_text_start=main_text_start,
|
||||
main_text_stop=main_text_stop,
|
||||
@@ -372,7 +380,8 @@ class ConvertedDocument(BaseModel):
|
||||
image_placeholder=image_placeholder,
|
||||
)
|
||||
|
||||
def render_as_text(
|
||||
@deprecated("Use output.export_to_text() instead.")
|
||||
def render_as_text_v1(
|
||||
self,
|
||||
delim: str = "\n\n",
|
||||
main_text_start: int = 0,
|
||||
@@ -383,8 +392,8 @@ class ConvertedDocument(BaseModel):
|
||||
"paragraph",
|
||||
"caption",
|
||||
],
|
||||
):
|
||||
return self.output.export_to_markdown(
|
||||
) -> str:
|
||||
return self.legacy_output.export_to_markdown(
|
||||
delim=delim,
|
||||
main_text_start=main_text_start,
|
||||
main_text_stop=main_text_stop,
|
||||
@@ -392,7 +401,8 @@ class ConvertedDocument(BaseModel):
|
||||
strict_text=True,
|
||||
)
|
||||
|
||||
def render_as_doctags(
|
||||
@deprecated("Use output.export_to_document_tokens() instead.")
|
||||
def render_as_doctags_v1(
|
||||
self,
|
||||
delim: str = "\n\n",
|
||||
main_text_start: int = 0,
|
||||
@@ -415,7 +425,7 @@ class ConvertedDocument(BaseModel):
|
||||
add_table_cell_label: bool = True,
|
||||
add_table_cell_text: bool = True,
|
||||
) -> str:
|
||||
return self.output.export_to_document_tokens(
|
||||
return self.legacy_output.export_to_document_tokens(
|
||||
delim=delim,
|
||||
main_text_start=main_text_start,
|
||||
main_text_stop=main_text_stop,
|
||||
|
||||
@@ -9,7 +9,7 @@ from deepsearch_glm.utils.doc_utils import (
|
||||
)
|
||||
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
|
||||
from docling_core.types import BaseText
|
||||
from docling_core.types import Document as DsDocument
|
||||
from docling_core.types import Document as DsLegacyDocument
|
||||
from docling_core.types import Ref
|
||||
from docling_core.types.experimental import BoundingBox, CoordOrigin
|
||||
from docling_core.types.experimental.document import DoclingDocument
|
||||
@@ -32,8 +32,8 @@ class GlmModel:
|
||||
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult
|
||||
) -> Tuple[DsDocument, DoclingDocument]:
|
||||
ds_doc = conv_res._to_ds_document()
|
||||
) -> Tuple[DsLegacyDocument, DoclingDocument]:
|
||||
ds_doc = conv_res._to_legacy_document()
|
||||
ds_doc_dict = ds_doc.model_dump(by_alias=True)
|
||||
|
||||
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
||||
@@ -42,7 +42,7 @@ class GlmModel:
|
||||
)
|
||||
|
||||
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
||||
exported_doc = DsDocument.model_validate(ds_doc_dict)
|
||||
legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict)
|
||||
|
||||
# DEBUG code:
|
||||
def draw_clusters_and_cells(ds_document, page_no):
|
||||
@@ -92,4 +92,4 @@ class GlmModel:
|
||||
# draw_clusters_and_cells(ds_doc, 0)
|
||||
# draw_clusters_and_cells(exported_doc, 0)
|
||||
|
||||
return (exported_doc, docling_doc)
|
||||
return (legacy_doc, docling_doc)
|
||||
|
||||
@@ -44,7 +44,7 @@ class SimpleModelPipeline(BaseModelPipeline):
|
||||
# the backend is expected to be of type DeclarativeDocumentBackend, which can output
|
||||
# a DoclingDocument straight.
|
||||
|
||||
conv_res.experimental = in_doc._backend.convert()
|
||||
conv_res.output = in_doc._backend.convert()
|
||||
|
||||
# Do other stuff with conv_res.experimental
|
||||
|
||||
|
||||
@@ -124,7 +124,7 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
|
||||
elements=all_elements, headers=all_headers, body=all_body
|
||||
)
|
||||
|
||||
conv_res.output, conv_res.experimental = self.glm_model(conv_res)
|
||||
conv_res.legacy_output, conv_res.output = self.glm_model(conv_res)
|
||||
|
||||
return conv_res
|
||||
|
||||
|
||||
@@ -41,7 +41,7 @@ def generate_multimodal_pages(
|
||||
end_ix = 0
|
||||
doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []
|
||||
|
||||
doc = doc_result.output
|
||||
doc = doc_result.legacy_output
|
||||
|
||||
def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):
|
||||
segments = []
|
||||
|
||||
Reference in New Issue
Block a user