mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-09 13:18:24 +00:00
feat: Support tableformer model choice (#90)
* Support tableformer model choice
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Update datamodel structure
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Update docs
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Cleanup
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add test unit for table options
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Ensure import backwards-compatibility for PipelineOptions
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Update README
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Adjust parameters on custom_convert
  Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>
* Update Dockerfile
  Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>
---------
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>
This commit is contained in:
@@ -12,8 +12,9 @@ from docling_core.utils.file import resolve_file_source
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
||||
from docling.datamodel.base_models import ConversionStatus
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
||||
|
||||
@@ -9,6 +9,10 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator
|
||||
from typing_extensions import Self
|
||||
|
||||
from docling.backend.abstract_backend import PdfPageBackend
|
||||
from docling.datamodel.pipeline_options import ( # Must be imported here for backward compatibility.
|
||||
PipelineOptions,
|
||||
TableStructureOptions,
|
||||
)
|
||||
|
||||
|
||||
class ConversionStatus(str, Enum):
|
||||
@@ -298,22 +302,6 @@ class DocumentStream(BaseModel):
|
||||
stream: BytesIO
|
||||
|
||||
|
||||
class TableStructureOptions(BaseModel):
|
||||
do_cell_matching: bool = (
|
||||
True
|
||||
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
|
||||
# are merged across table columns.
|
||||
# False: Let table structure model define the text cells, ignore PDF cells.
|
||||
)
|
||||
|
||||
|
||||
class PipelineOptions(BaseModel):
|
||||
do_table_structure: bool = True # True: perform table structure extraction
|
||||
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
||||
|
||||
table_structure_options: TableStructureOptions = TableStructureOptions()
|
||||
|
||||
|
||||
class AssembleOptions(BaseModel):
|
||||
keep_page_images: Annotated[
|
||||
bool,
|
||||
|
||||
@@ -4,13 +4,13 @@ from pathlib import Path, PurePath
|
||||
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
|
||||
|
||||
from docling_core.types import BaseCell, BaseText
|
||||
from docling_core.types import BoundingBox as DsBoundingBox
|
||||
from docling_core.types import Document as DsDocument
|
||||
from docling_core.types import DocumentDescription as DsDocumentDescription
|
||||
from docling_core.types import FileInfoObject as DsFileInfoObject
|
||||
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
||||
from docling_core.types import Table as DsSchemaTable
|
||||
from docling_core.types import TableCell
|
||||
from docling_core.types.doc.base import BoundingBox as DsBoundingBox
|
||||
from docling_core.types.doc.base import Figure
|
||||
from pydantic import BaseModel
|
||||
from typing_extensions import deprecated
|
||||
|
||||
New file: docling/datamodel/pipeline_options.py (25 lines, normal file)
@@ -0,0 +1,25 @@
|
||||
from enum import Enum, auto
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class TableFormerMode(str, Enum):
|
||||
FAST = auto()
|
||||
ACCURATE = auto()
|
||||
|
||||
|
||||
class TableStructureOptions(BaseModel):
|
||||
do_cell_matching: bool = (
|
||||
True
|
||||
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
|
||||
# are merged across table columns.
|
||||
# False: Let table structure model define the text cells, ignore PDF cells.
|
||||
)
|
||||
mode: TableFormerMode = TableFormerMode.FAST
|
||||
|
||||
|
||||
class PipelineOptions(BaseModel):
|
||||
do_table_structure: bool = True # True: perform table structure extraction
|
||||
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
||||
|
||||
table_structure_options: TableStructureOptions = TableStructureOptions()
|
||||
@@ -18,13 +18,13 @@ from docling.datamodel.base_models import (
|
||||
DoclingComponentType,
|
||||
ErrorItem,
|
||||
Page,
|
||||
PipelineOptions,
|
||||
)
|
||||
from docling.datamodel.document import (
|
||||
ConversionResult,
|
||||
DocumentConversionInput,
|
||||
InputDocument,
|
||||
)
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.ds_glm_model import GlmModel
|
||||
from docling.models.page_assemble_model import PageAssembleModel
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import copy
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List
|
||||
|
||||
import numpy
|
||||
@@ -12,16 +13,22 @@ from docling.datamodel.base_models import (
|
||||
TableElement,
|
||||
TableStructurePrediction,
|
||||
)
|
||||
from docling.datamodel.pipeline_options import TableFormerMode
|
||||
|
||||
|
||||
class TableStructureModel:
|
||||
def __init__(self, config):
|
||||
self.config = config
|
||||
self.do_cell_matching = config["do_cell_matching"]
|
||||
self.mode = config["mode"]
|
||||
|
||||
self.enabled = config["enabled"]
|
||||
if self.enabled:
|
||||
artifacts_path = config["artifacts_path"]
|
||||
artifacts_path: Path = config["artifacts_path"]
|
||||
|
||||
if self.mode == TableFormerMode.ACCURATE:
|
||||
artifacts_path = artifacts_path / "fat"
|
||||
|
||||
# Third Party
|
||||
import docling_ibm_models.tableformer.common as c
|
||||
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
from pathlib import Path
|
||||
from typing import Callable, Iterable, List
|
||||
|
||||
from docling.datamodel.base_models import Page, PipelineOptions
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
|
||||
|
||||
class BaseModelPipeline:
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from pathlib import Path
|
||||
|
||||
from docling.datamodel.base_models import PipelineOptions
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.models.easyocr_model import EasyOcrModel
|
||||
from docling.models.layout_model import LayoutModel
|
||||
from docling.models.table_structure_model import TableStructureModel
|
||||
@@ -32,6 +32,7 @@ class StandardModelPipeline(BaseModelPipeline):
|
||||
"artifacts_path": artifacts_path
|
||||
/ StandardModelPipeline._table_model_path,
|
||||
"enabled": pipeline_options.do_table_structure,
|
||||
"mode": pipeline_options.table_structure_options.mode,
|
||||
"do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
|
||||
}
|
||||
),
|
||||
|
||||
Reference in New Issue
Block a user