mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Update datamodel structure
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
14dcba11c0
commit
d8163b0865
@ -3,7 +3,7 @@ FROM python:3.11-slim-bookworm
|
||||
ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no"
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgl1 libglib2.0-0 curl wget git \
|
||||
&& apt-get install -y libgl1 libglib2.0-0 curl wget git vim procps \
|
||||
&& apt-get clean
|
||||
|
||||
# This will install torch with *only* cpu support
|
||||
@ -16,7 +16,7 @@ ENV TORCH_HOME=/tmp/
|
||||
|
||||
COPY examples/minimal.py /root/minimal.py
|
||||
|
||||
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
|
||||
#RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
|
||||
RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
|
||||
|
||||
# On container environments, always set a thread budget to avoid undesired thread congestion.
|
||||
|
@ -24,11 +24,6 @@ class DocInputType(str, Enum):
|
||||
STREAM = auto()
|
||||
|
||||
|
||||
class TableFormerMode(str, Enum):
|
||||
FAST = auto()
|
||||
ACCURATE = auto()
|
||||
|
||||
|
||||
class CoordOrigin(str, Enum):
|
||||
TOPLEFT = auto()
|
||||
BOTTOMLEFT = auto()
|
||||
@ -303,23 +298,6 @@ class DocumentStream(BaseModel):
|
||||
stream: BytesIO
|
||||
|
||||
|
||||
class TableStructureOptions(BaseModel):
|
||||
do_cell_matching: bool = (
|
||||
True
|
||||
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
|
||||
# are merged across table columns.
|
||||
# False: Let table structure model define the text cells, ignore PDF cells.
|
||||
)
|
||||
mode: TableFormerMode = TableFormerMode.FAST
|
||||
|
||||
|
||||
class PipelineOptions(BaseModel):
|
||||
do_table_structure: bool = True # True: perform table structure extraction
|
||||
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
||||
|
||||
table_structure_options: TableStructureOptions = TableStructureOptions()
|
||||
|
||||
|
||||
class AssembleOptions(BaseModel):
|
||||
keep_page_images: Annotated[
|
||||
bool,
|
||||
|
@ -4,14 +4,14 @@ from pathlib import Path, PurePath
|
||||
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
|
||||
|
||||
from docling_core.types import BaseCell, BaseText
|
||||
from docling_core.types import BoundingBox as DsBoundingBox
|
||||
from docling_core.types import Document as DsDocument
|
||||
from docling_core.types import DocumentDescription as DsDocumentDescription
|
||||
from docling_core.types import FileInfoObject as DsFileInfoObject
|
||||
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
||||
from docling_core.types import Table as DsSchemaTable
|
||||
from docling_core.types import TableCell
|
||||
from docling_core.types.doc.base import Figure
|
||||
from docling_core.types.legacy.base import BoundingBox as DsBoundingBox
|
||||
from docling_core.types.legacy.base import Figure
|
||||
from pydantic import BaseModel
|
||||
from typing_extensions import deprecated
|
||||
|
||||
|
25
docling/datamodel/pipeline_options.py
Normal file
25
docling/datamodel/pipeline_options.py
Normal file
@ -0,0 +1,25 @@
|
||||
from enum import Enum, auto
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class TableFormerMode(str, Enum):
|
||||
FAST = auto()
|
||||
ACCURATE = auto()
|
||||
|
||||
|
||||
class TableStructureOptions(BaseModel):
|
||||
do_cell_matching: bool = (
|
||||
True
|
||||
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
|
||||
# are merged across table columns.
|
||||
# False: Let table structure model define the text cells, ignore PDF cells.
|
||||
)
|
||||
mode: TableFormerMode = TableFormerMode.FAST
|
||||
|
||||
|
||||
class PipelineOptions(BaseModel):
|
||||
do_table_structure: bool = True # True: perform table structure extraction
|
||||
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
||||
|
||||
table_structure_options: TableStructureOptions = TableStructureOptions()
|
@ -18,13 +18,13 @@ from docling.datamodel.base_models import (
|
||||
DoclingComponentType,
|
||||
ErrorItem,
|
||||
Page,
|
||||
PipelineOptions,
|
||||
)
|
||||
from docling.datamodel.document import (
|
||||
ConversionResult,
|
||||
DocumentConversionInput,
|
||||
InputDocument,
|
||||
)
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.ds_glm_model import GlmModel
|
||||
from docling.models.page_assemble_model import PageAssembleModel
|
||||
|
@ -11,9 +11,9 @@ from docling.datamodel.base_models import (
|
||||
Page,
|
||||
TableCell,
|
||||
TableElement,
|
||||
TableFormerMode,
|
||||
TableStructurePrediction,
|
||||
)
|
||||
from docling.datamodel.pipeline_options import TableFormerMode
|
||||
|
||||
|
||||
class TableStructureModel:
|
||||
|
@ -1,7 +1,8 @@
|
||||
from pathlib import Path
|
||||
from typing import Callable, Iterable, List
|
||||
|
||||
from docling.datamodel.base_models import Page, PipelineOptions
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
|
||||
|
||||
class BaseModelPipeline:
|
||||
|
@ -1,6 +1,6 @@
|
||||
from pathlib import Path
|
||||
|
||||
from docling.datamodel.base_models import PipelineOptions
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.models.easyocr_model import EasyOcrModel
|
||||
from docling.models.layout_model import LayoutModel
|
||||
from docling.models.table_structure_model import TableStructureModel
|
||||
|
@ -4,7 +4,7 @@ import time
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
||||
from docling.datamodel.base_models import ConversionStatus
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
|
@ -6,8 +6,9 @@ from typing import Iterable
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
||||
from docling.datamodel.base_models import ConversionStatus
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
Loading…
Reference in New Issue
Block a user