diff --git a/Dockerfile b/Dockerfile index 0c05fda5..a7e9bc6f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ FROM python:3.11-slim-bookworm ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no" RUN apt-get update \ - && apt-get install -y libgl1 libglib2.0-0 curl wget git \ + && apt-get install -y libgl1 libglib2.0-0 curl wget git vim procps \ && apt-get clean # This will install torch with *only* cpu support @@ -16,7 +16,7 @@ ENV TORCH_HOME=/tmp/ COPY examples/minimal.py /root/minimal.py -RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);' +#RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);' RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);' # On container environments, always set a thread budget to avoid undesired thread congestion. diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index d828beee..8e84ca4c 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -24,11 +24,6 @@ class DocInputType(str, Enum): STREAM = auto() -class TableFormerMode(str, Enum): - FAST = auto() - ACCURATE = auto() - - class CoordOrigin(str, Enum): TOPLEFT = auto() BOTTOMLEFT = auto() @@ -303,23 +298,6 @@ class DocumentStream(BaseModel): stream: BytesIO -class TableStructureOptions(BaseModel): - do_cell_matching: bool = ( - True - # True: Matches predictions back to PDF cells. Can break table output if PDF cells - # are merged across table columns. - # False: Let table structure model define the text cells, ignore PDF cells. - ) - mode: TableFormerMode = TableFormerMode.FAST - - -class PipelineOptions(BaseModel): - do_table_structure: bool = True # True: perform table structure extraction - do_ocr: bool = True # True: perform OCR, replace programmatic PDF text - - table_structure_options: TableStructureOptions = TableStructureOptions() - - class AssembleOptions(BaseModel): keep_page_images: Annotated[ bool, diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index b8177730..6669b5b6 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -4,14 +4,14 @@ from pathlib import Path, PurePath from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union from docling_core.types import BaseCell, BaseText -from docling_core.types import BoundingBox as DsBoundingBox from docling_core.types import Document as DsDocument from docling_core.types import DocumentDescription as DsDocumentDescription from docling_core.types import FileInfoObject as DsFileInfoObject from docling_core.types import PageDimensions, PageReference, Prov, Ref from docling_core.types import Table as DsSchemaTable from docling_core.types import TableCell -from docling_core.types.doc.base import Figure +from docling_core.types.legacy.base import BoundingBox as DsBoundingBox +from docling_core.types.legacy.base import Figure from pydantic import BaseModel from typing_extensions import deprecated diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py new file mode 100644 index 00000000..9ea7a77f --- /dev/null +++ b/docling/datamodel/pipeline_options.py @@ -0,0 +1,25 @@ +from enum import Enum, auto + +from pydantic import BaseModel + + +class TableFormerMode(str, Enum): + FAST = auto() + ACCURATE = auto() + + +class TableStructureOptions(BaseModel): + do_cell_matching: bool = ( + True + # True: Matches predictions back to PDF cells. Can break table output if PDF cells + # are merged across table columns. + # False: Let table structure model define the text cells, ignore PDF cells. + ) + mode: TableFormerMode = TableFormerMode.FAST + + +class PipelineOptions(BaseModel): + do_table_structure: bool = True # True: perform table structure extraction + do_ocr: bool = True # True: perform OCR, replace programmatic PDF text + + table_structure_options: TableStructureOptions = TableStructureOptions() diff --git a/docling/document_converter.py b/docling/document_converter.py index 542e174d..556a04a0 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -18,13 +18,13 @@ from docling.datamodel.base_models import ( DoclingComponentType, ErrorItem, Page, - PipelineOptions, ) from docling.datamodel.document import ( ConversionResult, DocumentConversionInput, InputDocument, ) +from docling.datamodel.pipeline_options import PipelineOptions from docling.datamodel.settings import settings from docling.models.ds_glm_model import GlmModel from docling.models.page_assemble_model import PageAssembleModel diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index c2af5814..f722c13a 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -11,9 +11,9 @@ from docling.datamodel.base_models import ( Page, TableCell, TableElement, - TableFormerMode, TableStructurePrediction, ) +from docling.datamodel.pipeline_options import TableFormerMode class TableStructureModel: diff --git a/docling/pipeline/base_model_pipeline.py b/docling/pipeline/base_model_pipeline.py index 4fdde951..3dcf6546 100644 --- a/docling/pipeline/base_model_pipeline.py +++ b/docling/pipeline/base_model_pipeline.py @@ -1,7 +1,8 @@ from pathlib import Path from typing import Callable, Iterable, List -from docling.datamodel.base_models import Page, PipelineOptions +from docling.datamodel.base_models import Page +from docling.datamodel.pipeline_options import PipelineOptions class BaseModelPipeline: diff --git a/docling/pipeline/standard_model_pipeline.py b/docling/pipeline/standard_model_pipeline.py index b43dc421..3532fea6 100644 --- a/docling/pipeline/standard_model_pipeline.py +++ b/docling/pipeline/standard_model_pipeline.py @@ -1,6 +1,6 @@ from pathlib import Path -from docling.datamodel.base_models import PipelineOptions +from docling.datamodel.pipeline_options import PipelineOptions from docling.models.easyocr_model import EasyOcrModel from docling.models.layout_model import LayoutModel from docling.models.table_structure_model import TableStructureModel diff --git a/examples/batch_convert.py b/examples/batch_convert.py index 4491b364..eb943a7a 100644 --- a/examples/batch_convert.py +++ b/examples/batch_convert.py @@ -4,7 +4,7 @@ import time from pathlib import Path from typing import Iterable -from docling.datamodel.base_models import ConversionStatus, PipelineOptions +from docling.datamodel.base_models import ConversionStatus from docling.datamodel.document import ConversionResult, DocumentConversionInput from docling.document_converter import DocumentConverter diff --git a/examples/custom_convert.py b/examples/custom_convert.py index 6f0b8f8f..2c0fac7a 100644 --- a/examples/custom_convert.py +++ b/examples/custom_convert.py @@ -6,8 +6,9 @@ from typing import Iterable from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend -from docling.datamodel.base_models import ConversionStatus, PipelineOptions +from docling.datamodel.base_models import ConversionStatus from docling.datamodel.document import ConversionResult, DocumentConversionInput +from docling.datamodel.pipeline_options import PipelineOptions from docling.document_converter import DocumentConverter _log = logging.getLogger(__name__)