Update datamodel structure

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-09-19 16:51:52 +02:00
parent 14dcba11c0
commit d8163b0865
10 changed files with 37 additions and 32 deletions

View File

@ -3,7 +3,7 @@ FROM python:3.11-slim-bookworm
ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no" ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no"
RUN apt-get update \ RUN apt-get update \
&& apt-get install -y libgl1 libglib2.0-0 curl wget git \ && apt-get install -y libgl1 libglib2.0-0 curl wget git vim procps \
&& apt-get clean && apt-get clean
# This will install torch with *only* cpu support # This will install torch with *only* cpu support
@ -16,7 +16,7 @@ ENV TORCH_HOME=/tmp/
COPY examples/minimal.py /root/minimal.py COPY examples/minimal.py /root/minimal.py
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);' #RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);' RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
# On container environments, always set a thread budget to avoid undesired thread congestion. # On container environments, always set a thread budget to avoid undesired thread congestion.

View File

@ -24,11 +24,6 @@ class DocInputType(str, Enum):
STREAM = auto() STREAM = auto()
class TableFormerMode(str, Enum):
    """Modes selectable through ``TableStructureOptions.mode``.

    Members also subclass ``str``; their values are produced by ``auto()``.
    """

    # Used as the default for TableStructureOptions.mode.
    FAST = auto()
    ACCURATE = auto()
class CoordOrigin(str, Enum): class CoordOrigin(str, Enum):
TOPLEFT = auto() TOPLEFT = auto()
BOTTOMLEFT = auto() BOTTOMLEFT = auto()
@ -303,23 +298,6 @@ class DocumentStream(BaseModel):
stream: BytesIO stream: BytesIO
class TableStructureOptions(BaseModel):
    """Settings for table structure extraction."""

    # True: match predictions back to PDF cells. Can break table output if PDF
    # cells are merged across table columns.
    # False: let the table structure model define the text cells and ignore
    # the PDF cells.
    do_cell_matching: bool = True

    # Table structure mode; defaults to TableFormerMode.FAST.
    mode: TableFormerMode = TableFormerMode.FAST
class PipelineOptions(BaseModel):
    """Top-level pipeline switches plus nested table structure settings."""

    # True: perform table structure extraction.
    do_table_structure: bool = True

    # True: perform OCR, replace programmatic PDF text.
    do_ocr: bool = True

    # Nested table structure settings; the default is a TableStructureOptions
    # instance built with its own field defaults.
    table_structure_options: TableStructureOptions = TableStructureOptions()
class AssembleOptions(BaseModel): class AssembleOptions(BaseModel):
keep_page_images: Annotated[ keep_page_images: Annotated[
bool, bool,

View File

@ -4,14 +4,14 @@ from pathlib import Path, PurePath
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
from docling_core.types import BaseCell, BaseText from docling_core.types import BaseCell, BaseText
from docling_core.types import BoundingBox as DsBoundingBox
from docling_core.types import Document as DsDocument from docling_core.types import Document as DsDocument
from docling_core.types import DocumentDescription as DsDocumentDescription from docling_core.types import DocumentDescription as DsDocumentDescription
from docling_core.types import FileInfoObject as DsFileInfoObject from docling_core.types import FileInfoObject as DsFileInfoObject
from docling_core.types import PageDimensions, PageReference, Prov, Ref from docling_core.types import PageDimensions, PageReference, Prov, Ref
from docling_core.types import Table as DsSchemaTable from docling_core.types import Table as DsSchemaTable
from docling_core.types import TableCell from docling_core.types import TableCell
from docling_core.types.doc.base import Figure from docling_core.types.legacy.base import BoundingBox as DsBoundingBox
from docling_core.types.legacy.base import Figure
from pydantic import BaseModel from pydantic import BaseModel
from typing_extensions import deprecated from typing_extensions import deprecated

View File

@ -0,0 +1,25 @@
from enum import Enum, auto
from pydantic import BaseModel
class TableFormerMode(str, Enum):
    """Modes selectable through ``TableStructureOptions.mode``.

    Members also subclass ``str``; their values are produced by ``auto()``.
    """

    # Used as the default for TableStructureOptions.mode.
    FAST = auto()
    ACCURATE = auto()
class TableStructureOptions(BaseModel):
    """Settings for table structure extraction."""

    # True: match predictions back to PDF cells. Can break table output if PDF
    # cells are merged across table columns.
    # False: let the table structure model define the text cells and ignore
    # the PDF cells.
    do_cell_matching: bool = True

    # Table structure mode; defaults to TableFormerMode.FAST.
    mode: TableFormerMode = TableFormerMode.FAST
class PipelineOptions(BaseModel):
    """Top-level pipeline switches plus nested table structure settings."""

    # True: perform table structure extraction.
    do_table_structure: bool = True

    # True: perform OCR, replace programmatic PDF text.
    do_ocr: bool = True

    # Nested table structure settings; the default is a TableStructureOptions
    # instance built with its own field defaults.
    table_structure_options: TableStructureOptions = TableStructureOptions()

View File

@ -18,13 +18,13 @@ from docling.datamodel.base_models import (
DoclingComponentType, DoclingComponentType,
ErrorItem, ErrorItem,
Page, Page,
PipelineOptions,
) )
from docling.datamodel.document import ( from docling.datamodel.document import (
ConversionResult, ConversionResult,
DocumentConversionInput, DocumentConversionInput,
InputDocument, InputDocument,
) )
from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.models.ds_glm_model import GlmModel from docling.models.ds_glm_model import GlmModel
from docling.models.page_assemble_model import PageAssembleModel from docling.models.page_assemble_model import PageAssembleModel

View File

@ -11,9 +11,9 @@ from docling.datamodel.base_models import (
Page, Page,
TableCell, TableCell,
TableElement, TableElement,
TableFormerMode,
TableStructurePrediction, TableStructurePrediction,
) )
from docling.datamodel.pipeline_options import TableFormerMode
class TableStructureModel: class TableStructureModel:

View File

@ -1,7 +1,8 @@
from pathlib import Path from pathlib import Path
from typing import Callable, Iterable, List from typing import Callable, Iterable, List
from docling.datamodel.base_models import Page, PipelineOptions from docling.datamodel.base_models import Page
from docling.datamodel.pipeline_options import PipelineOptions
class BaseModelPipeline: class BaseModelPipeline:

View File

@ -1,6 +1,6 @@
from pathlib import Path from pathlib import Path
from docling.datamodel.base_models import PipelineOptions from docling.datamodel.pipeline_options import PipelineOptions
from docling.models.easyocr_model import EasyOcrModel from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel from docling.models.layout_model import LayoutModel
from docling.models.table_structure_model import TableStructureModel from docling.models.table_structure_model import TableStructureModel

View File

@ -4,7 +4,7 @@ import time
from pathlib import Path from pathlib import Path
from typing import Iterable from typing import Iterable
from docling.datamodel.base_models import ConversionStatus, PipelineOptions from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, DocumentConversionInput from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter

View File

@ -6,8 +6,9 @@ from typing import Iterable
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, DocumentConversionInput from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import PipelineOptions
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)