mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Update datamodel structure
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
14dcba11c0
commit
d8163b0865
@ -3,7 +3,7 @@ FROM python:3.11-slim-bookworm
|
|||||||
ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no"
|
ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no"
|
||||||
|
|
||||||
RUN apt-get update \
|
RUN apt-get update \
|
||||||
&& apt-get install -y libgl1 libglib2.0-0 curl wget git \
|
&& apt-get install -y libgl1 libglib2.0-0 curl wget git vim procps \
|
||||||
&& apt-get clean
|
&& apt-get clean
|
||||||
|
|
||||||
# This will install torch with *only* cpu support
|
# This will install torch with *only* cpu support
|
||||||
@ -16,7 +16,7 @@ ENV TORCH_HOME=/tmp/
|
|||||||
|
|
||||||
COPY examples/minimal.py /root/minimal.py
|
COPY examples/minimal.py /root/minimal.py
|
||||||
|
|
||||||
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
|
#RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
|
||||||
RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
|
RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
|
||||||
|
|
||||||
# On container environments, always set a thread budget to avoid undesired thread congestion.
|
# On container environments, always set a thread budget to avoid undesired thread congestion.
|
||||||
|
@ -24,11 +24,6 @@ class DocInputType(str, Enum):
|
|||||||
STREAM = auto()
|
STREAM = auto()
|
||||||
|
|
||||||
|
|
||||||
class TableFormerMode(str, Enum):
|
|
||||||
FAST = auto()
|
|
||||||
ACCURATE = auto()
|
|
||||||
|
|
||||||
|
|
||||||
class CoordOrigin(str, Enum):
|
class CoordOrigin(str, Enum):
|
||||||
TOPLEFT = auto()
|
TOPLEFT = auto()
|
||||||
BOTTOMLEFT = auto()
|
BOTTOMLEFT = auto()
|
||||||
@ -303,23 +298,6 @@ class DocumentStream(BaseModel):
|
|||||||
stream: BytesIO
|
stream: BytesIO
|
||||||
|
|
||||||
|
|
||||||
class TableStructureOptions(BaseModel):
|
|
||||||
do_cell_matching: bool = (
|
|
||||||
True
|
|
||||||
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
|
|
||||||
# are merged across table columns.
|
|
||||||
# False: Let table structure model define the text cells, ignore PDF cells.
|
|
||||||
)
|
|
||||||
mode: TableFormerMode = TableFormerMode.FAST
|
|
||||||
|
|
||||||
|
|
||||||
class PipelineOptions(BaseModel):
|
|
||||||
do_table_structure: bool = True # True: perform table structure extraction
|
|
||||||
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
|
||||||
|
|
||||||
table_structure_options: TableStructureOptions = TableStructureOptions()
|
|
||||||
|
|
||||||
|
|
||||||
class AssembleOptions(BaseModel):
|
class AssembleOptions(BaseModel):
|
||||||
keep_page_images: Annotated[
|
keep_page_images: Annotated[
|
||||||
bool,
|
bool,
|
||||||
|
@ -4,14 +4,14 @@ from pathlib import Path, PurePath
|
|||||||
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
|
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
|
||||||
|
|
||||||
from docling_core.types import BaseCell, BaseText
|
from docling_core.types import BaseCell, BaseText
|
||||||
from docling_core.types import BoundingBox as DsBoundingBox
|
|
||||||
from docling_core.types import Document as DsDocument
|
from docling_core.types import Document as DsDocument
|
||||||
from docling_core.types import DocumentDescription as DsDocumentDescription
|
from docling_core.types import DocumentDescription as DsDocumentDescription
|
||||||
from docling_core.types import FileInfoObject as DsFileInfoObject
|
from docling_core.types import FileInfoObject as DsFileInfoObject
|
||||||
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
||||||
from docling_core.types import Table as DsSchemaTable
|
from docling_core.types import Table as DsSchemaTable
|
||||||
from docling_core.types import TableCell
|
from docling_core.types import TableCell
|
||||||
from docling_core.types.doc.base import Figure
|
from docling_core.types.legacy.base import BoundingBox as DsBoundingBox
|
||||||
|
from docling_core.types.legacy.base import Figure
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from typing_extensions import deprecated
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
|
25
docling/datamodel/pipeline_options.py
Normal file
25
docling/datamodel/pipeline_options.py
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
from enum import Enum, auto
|
||||||
|
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
class TableFormerMode(str, Enum):
|
||||||
|
FAST = auto()
|
||||||
|
ACCURATE = auto()
|
||||||
|
|
||||||
|
|
||||||
|
class TableStructureOptions(BaseModel):
|
||||||
|
do_cell_matching: bool = (
|
||||||
|
True
|
||||||
|
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
|
||||||
|
# are merged across table columns.
|
||||||
|
# False: Let table structure model define the text cells, ignore PDF cells.
|
||||||
|
)
|
||||||
|
mode: TableFormerMode = TableFormerMode.FAST
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineOptions(BaseModel):
|
||||||
|
do_table_structure: bool = True # True: perform table structure extraction
|
||||||
|
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
||||||
|
|
||||||
|
table_structure_options: TableStructureOptions = TableStructureOptions()
|
@ -18,13 +18,13 @@ from docling.datamodel.base_models import (
|
|||||||
DoclingComponentType,
|
DoclingComponentType,
|
||||||
ErrorItem,
|
ErrorItem,
|
||||||
Page,
|
Page,
|
||||||
PipelineOptions,
|
|
||||||
)
|
)
|
||||||
from docling.datamodel.document import (
|
from docling.datamodel.document import (
|
||||||
ConversionResult,
|
ConversionResult,
|
||||||
DocumentConversionInput,
|
DocumentConversionInput,
|
||||||
InputDocument,
|
InputDocument,
|
||||||
)
|
)
|
||||||
|
from docling.datamodel.pipeline_options import PipelineOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.ds_glm_model import GlmModel
|
from docling.models.ds_glm_model import GlmModel
|
||||||
from docling.models.page_assemble_model import PageAssembleModel
|
from docling.models.page_assemble_model import PageAssembleModel
|
||||||
|
@ -11,9 +11,9 @@ from docling.datamodel.base_models import (
|
|||||||
Page,
|
Page,
|
||||||
TableCell,
|
TableCell,
|
||||||
TableElement,
|
TableElement,
|
||||||
TableFormerMode,
|
|
||||||
TableStructurePrediction,
|
TableStructurePrediction,
|
||||||
)
|
)
|
||||||
|
from docling.datamodel.pipeline_options import TableFormerMode
|
||||||
|
|
||||||
|
|
||||||
class TableStructureModel:
|
class TableStructureModel:
|
||||||
|
@ -1,7 +1,8 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Callable, Iterable, List
|
from typing import Callable, Iterable, List
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page, PipelineOptions
|
from docling.datamodel.base_models import Page
|
||||||
|
from docling.datamodel.pipeline_options import PipelineOptions
|
||||||
|
|
||||||
|
|
||||||
class BaseModelPipeline:
|
class BaseModelPipeline:
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.datamodel.base_models import PipelineOptions
|
from docling.datamodel.pipeline_options import PipelineOptions
|
||||||
from docling.models.easyocr_model import EasyOcrModel
|
from docling.models.easyocr_model import EasyOcrModel
|
||||||
from docling.models.layout_model import LayoutModel
|
from docling.models.layout_model import LayoutModel
|
||||||
from docling.models.table_structure_model import TableStructureModel
|
from docling.models.table_structure_model import TableStructureModel
|
||||||
|
@ -4,7 +4,7 @@ import time
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable
|
from typing import Iterable
|
||||||
|
|
||||||
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
from docling.datamodel.base_models import ConversionStatus
|
||||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
|
@ -6,8 +6,9 @@ from typing import Iterable
|
|||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
from docling.datamodel.base_models import ConversionStatus
|
||||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||||
|
from docling.datamodel.pipeline_options import PipelineOptions
|
||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
Loading…
Reference in New Issue
Block a user