Update datamodel structure

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-09-19 16:51:52 +02:00
parent 14dcba11c0
commit d8163b0865
10 changed files with 37 additions and 32 deletions

View File

@ -3,7 +3,7 @@ FROM python:3.11-slim-bookworm
ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no"
RUN apt-get update \
&& apt-get install -y libgl1 libglib2.0-0 curl wget git \
&& apt-get install -y libgl1 libglib2.0-0 curl wget git vim procps \
&& apt-get clean
# This will install torch with *only* cpu support
@ -16,7 +16,7 @@ ENV TORCH_HOME=/tmp/
COPY examples/minimal.py /root/minimal.py
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
#RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
# On container environments, always set a thread budget to avoid undesired thread congestion.

View File

@ -24,11 +24,6 @@ class DocInputType(str, Enum):
STREAM = auto()
class TableFormerMode(str, Enum):
FAST = auto()
ACCURATE = auto()
class CoordOrigin(str, Enum):
TOPLEFT = auto()
BOTTOMLEFT = auto()
@ -303,23 +298,6 @@ class DocumentStream(BaseModel):
stream: BytesIO
class TableStructureOptions(BaseModel):
do_cell_matching: bool = (
True
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
# are merged across table columns.
# False: Let table structure model define the text cells, ignore PDF cells.
)
mode: TableFormerMode = TableFormerMode.FAST
class PipelineOptions(BaseModel):
do_table_structure: bool = True # True: perform table structure extraction
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
table_structure_options: TableStructureOptions = TableStructureOptions()
class AssembleOptions(BaseModel):
keep_page_images: Annotated[
bool,

View File

@ -4,14 +4,14 @@ from pathlib import Path, PurePath
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
from docling_core.types import BaseCell, BaseText
from docling_core.types import BoundingBox as DsBoundingBox
from docling_core.types import Document as DsDocument
from docling_core.types import DocumentDescription as DsDocumentDescription
from docling_core.types import FileInfoObject as DsFileInfoObject
from docling_core.types import PageDimensions, PageReference, Prov, Ref
from docling_core.types import Table as DsSchemaTable
from docling_core.types import TableCell
from docling_core.types.doc.base import Figure
from docling_core.types.legacy.base import BoundingBox as DsBoundingBox
from docling_core.types.legacy.base import Figure
from pydantic import BaseModel
from typing_extensions import deprecated

View File

@ -0,0 +1,25 @@
from enum import Enum, auto
from pydantic import BaseModel
class TableFormerMode(str, Enum):
    """Modes accepted by ``TableStructureOptions.mode``.

    Members are ``str`` instances because of the ``str`` mixin.

    NOTE: with a plain ``str`` mixin, ``auto()`` produces the stringified
    counter ("1", "2"), not the member name.
    """

    FAST = auto()
    ACCURATE = auto()
class TableStructureOptions(BaseModel):
    """Options for table structure extraction: cell matching and model mode."""

    # True: Matches predictions back to PDF cells. Can break table output if PDF
    # cells are merged across table columns.
    # False: Let table structure model define the text cells, ignore PDF cells.
    do_cell_matching: bool = True

    # Defaults to the FAST mode.
    mode: TableFormerMode = TableFormerMode.FAST
class PipelineOptions(BaseModel):
    """Pipeline switches for table structure extraction and OCR."""

    # True: perform table structure extraction
    do_table_structure: bool = True

    # True: perform OCR, replace programmatic PDF text
    do_ocr: bool = True

    # Nested options for table structure extraction; defaults to a
    # TableStructureOptions built with its own defaults.
    table_structure_options: TableStructureOptions = TableStructureOptions()

View File

@ -18,13 +18,13 @@ from docling.datamodel.base_models import (
DoclingComponentType,
ErrorItem,
Page,
PipelineOptions,
)
from docling.datamodel.document import (
ConversionResult,
DocumentConversionInput,
InputDocument,
)
from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import settings
from docling.models.ds_glm_model import GlmModel
from docling.models.page_assemble_model import PageAssembleModel

View File

@ -11,9 +11,9 @@ from docling.datamodel.base_models import (
Page,
TableCell,
TableElement,
TableFormerMode,
TableStructurePrediction,
)
from docling.datamodel.pipeline_options import TableFormerMode
class TableStructureModel:

View File

@ -1,7 +1,8 @@
from pathlib import Path
from typing import Callable, Iterable, List
from docling.datamodel.base_models import Page, PipelineOptions
from docling.datamodel.base_models import Page
from docling.datamodel.pipeline_options import PipelineOptions
class BaseModelPipeline:

View File

@ -1,6 +1,6 @@
from pathlib import Path
from docling.datamodel.base_models import PipelineOptions
from docling.datamodel.pipeline_options import PipelineOptions
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.table_structure_model import TableStructureModel

View File

@ -4,7 +4,7 @@ import time
from pathlib import Path
from typing import Iterable
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter

View File

@ -6,8 +6,9 @@ from typing import Iterable
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import PipelineOptions
from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__)