From 12477c8cac443470170508cb5843f9697976a7fd Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Mon, 23 Sep 2024 12:22:49 +0200 Subject: [PATCH] Lots of import refactoring Signed-off-by: Christoph Auer --- docling/datamodel/base_models.py | 43 +++---------------------- docling/datamodel/document.py | 7 ++-- docling/models/page_assemble_model.py | 4 +-- docling/models/table_structure_model.py | 12 +++---- docling/utils/export.py | 9 +----- examples/batch_convert.py | 10 +++++- examples/export_figures.py | 4 +-- 7 files changed, 26 insertions(+), 63 deletions(-) diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 5a04cb12..73477a75 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -5,6 +5,7 @@ from io import BytesIO from typing import Annotated, Any, Dict, List, Optional, Tuple, Union from docling_core.types.experimental.base import BoundingBox, Size +from docling_core.types.experimental.document import BaseFigureData, TableCell from PIL.Image import Image from pydantic import BaseModel, ConfigDict, Field, model_validator from typing_extensions import Self @@ -67,37 +68,7 @@ class LayoutPrediction(BaseModel): clusters: List[Cluster] = [] -class TableCell(BaseModel): - bbox: BoundingBox - row_span: int - col_span: int - start_row_offset_idx: int - end_row_offset_idx: int - start_col_offset_idx: int - end_col_offset_idx: int - text: str - column_header: bool = False - row_header: bool = False - row_section: bool = False - - @model_validator(mode="before") - @classmethod - def from_dict_format(cls, data: Any) -> Any: - if isinstance(data, Dict): - text = data["bbox"].get("token", "") - if not len(text): - text_cells = data.pop("text_cell_bboxes", None) - if text_cells: - for el in text_cells: - text += el["token"] + " " - - text = text.strip() - data["text"] = text - - return data - - -class TableElement(BasePageElement): +class Table(BasePageElement): otsl_seq: List[str] num_rows: int = 0 num_cols: int = 0 @@ -105,18 +76,14 @@ class TableElement(BasePageElement): class TableStructurePrediction(BaseModel): - table_map: Dict[int, TableElement] = {} + table_map: Dict[int, Table] = {} class TextElement(BasePageElement): ... -class FigureData(BaseModel): - pass - - class FigureElement(BasePageElement): - data: Optional[FigureData] = None + data: Optional[BaseFigureData] = None provenance: Optional[str] = None predicted_class: Optional[str] = None confidence: Optional[float] = None @@ -139,7 +106,7 @@ class PagePredictions(BaseModel): equations_prediction: Optional[EquationPrediction] = None -PageElement = Union[TextElement, TableElement, FigureElement] +PageElement = Union[TextElement, Table, FigureElement] class AssembledUnit(BaseModel): diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 025f25ea..183af07e 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -9,9 +9,8 @@ from docling_core.types import DocumentDescription as DsDocumentDescription from docling_core.types import FileInfoObject as DsFileInfoObject from docling_core.types import PageDimensions, PageReference, Prov, Ref from docling_core.types import Table as DsSchemaTable -from docling_core.types import TableCell from docling_core.types.doc.base import BoundingBox as DsBoundingBox -from docling_core.types.doc.base import Figure +from docling_core.types.doc.base import Figure, TableCell from pydantic import BaseModel from typing_extensions import deprecated @@ -25,7 +24,7 @@ from docling.datamodel.base_models import ( FigureElement, Page, PageElement, - TableElement, + Table, TextElement, ) from docling.datamodel.settings import DocumentLimits @@ -186,7 +185,7 @@ class ConvertedDocument(BaseModel): ], ) ) - elif isinstance(element, TableElement): + elif isinstance(element, Table): index = len(tables) ref_str = f"#/tables/{index}" main_text.append( diff --git a/docling/models/page_assemble_model.py b/docling/models/page_assemble_model.py index 2b9db544..b3444d1d 100644 --- a/docling/models/page_assemble_model.py +++ b/docling/models/page_assemble_model.py @@ -7,7 +7,7 @@ from docling.datamodel.base_models import ( FigureElement, Page, PageElement, - TableElement, + Table, TextElement, ) from docling.models.layout_model import LayoutModel @@ -84,7 +84,7 @@ class PageAssembleModel: if ( not tbl ): # fallback: add table without structure, if it isn't present - tbl = TableElement( + tbl = Table( label=cluster.label, id=cluster.id, text="", diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index 78b727fc..99972144 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -3,15 +3,11 @@ from typing import Iterable, List import numpy from docling_core.types.experimental.base import BoundingBox +from docling_core.types.experimental.document import TableCell from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor from PIL import ImageDraw -from docling.datamodel.base_models import ( - Page, - TableCell, - TableElement, - TableStructurePrediction, -) +from docling.datamodel.base_models import Page, Table, TableStructurePrediction class TableStructureModel: @@ -32,7 +28,7 @@ class TableStructureModel: self.tf_predictor = TFPredictor(self.tm_config) self.scale = 2.0 # Scale up table input images to 144 dpi - def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]): + def draw_table_and_cells(self, page: Page, tbl_list: List[Table]): image = ( page._backend.get_page_image() ) # make new image to avoid drawing on the saved ones @@ -134,7 +130,7 @@ class TableStructureModel: num_cols = table_out["predict_details"]["num_cols"] otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"] - tbl = TableElement( + tbl = Table( otsl_seq=otsl_seq, table_cells=table_cells, num_rows=num_rows, diff --git a/docling/utils/export.py b/docling/utils/export.py index 3daa856b..3535abc8 100644 --- a/docling/utils/export.py +++ b/docling/utils/export.py @@ -1,14 +1,7 @@ import logging from typing import Any, Dict, Iterable, List, Tuple, Union -from docling_core.types.doc.base import ( - BaseCell, - BaseText, - BoundingBox, - Ref, - Table, - TableCell, -) +from docling_core.types.doc.base import BaseCell, BaseText, BoundingBox, Ref, Table from docling_core.types.experimental.base import CoordOrigin from docling.datamodel.base_models import OcrCell diff --git a/examples/batch_convert.py b/examples/batch_convert.py index 1fe81830..94abf716 100644 --- a/examples/batch_convert.py +++ b/examples/batch_convert.py @@ -32,8 +32,16 @@ def export_documents( with (output_dir / f"{doc_filename}.json").open("w") as fp: fp.write(json.dumps(conv_res.render_as_dict())) + # Export Docling document format to JSON (experimental): + with (output_dir / f"{doc_filename}.experimental.json").open("w") as fp: + fp.write( + json.dumps( + conv_res.experimental.model_dump(mode="json", by_alias=True) + ) + ) + # Export Docling document format to YAML (experimental): - with (output_dir / f"{doc_filename}.yaml").open("w") as fp: + with (output_dir / f"{doc_filename}.experimental.yaml").open("w") as fp: fp.write( yaml.safe_dump( conv_res.experimental.model_dump(mode="json", by_alias=True) diff --git a/examples/export_figures.py b/examples/export_figures.py index bdffbec1..60d70156 100644 --- a/examples/export_figures.py +++ b/examples/export_figures.py @@ -8,7 +8,7 @@ from docling.datamodel.base_models import ( ConversionStatus, FigureElement, PageElement, - TableElement, + Table, ) from docling.datamodel.document import DocumentConversionInput from docling.document_converter import DocumentConverter @@ -61,7 +61,7 @@ def main(): # Export figures and tables for element, image in conv_res.render_element_images( - element_types=(FigureElement, TableElement) + element_types=(FigureElement, Table) ): element_image_filename = ( output_dir / f"{doc_filename}-element-{element.id}.png"