mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-17 00:58:25 +00:00
@@ -8,14 +8,8 @@ from docling_core.types.experimental import (
|
||||
BasePictureData,
|
||||
BaseTableData,
|
||||
DescriptionItem,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
ImageRef,
|
||||
PictureItem,
|
||||
SectionHeaderItem,
|
||||
TableCell,
|
||||
TableItem,
|
||||
)
|
||||
from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
|
||||
|
||||
|
||||
@@ -11,22 +11,18 @@ from docling_core.types.experimental import (
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
GroupLabel,
|
||||
ImageRef,
|
||||
PictureItem,
|
||||
ProvenanceItem,
|
||||
TableCell,
|
||||
TableItem,
|
||||
)
|
||||
from docling_core.types.experimental.base import BoundingBox, CoordOrigin, Size
|
||||
from pptx import Presentation
|
||||
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
||||
from pptx.util import Inches
|
||||
|
||||
from docling.backend.abstract_backend import (
|
||||
DeclarativeDocumentBackend,
|
||||
PaginatedDocumentBackend,
|
||||
)
|
||||
from docling.datamodel.base_models import FormatToMimeType, InputFormat
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -11,11 +11,7 @@ from docling_core.types.experimental import (
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
GroupLabel,
|
||||
ImageRef,
|
||||
PictureItem,
|
||||
SectionHeaderItem,
|
||||
TableCell,
|
||||
TableItem,
|
||||
)
|
||||
from lxml import etree
|
||||
|
||||
|
||||
@@ -1,15 +1,12 @@
|
||||
import warnings
|
||||
from enum import Enum, auto
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Dict, List, Optional, Union
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
from docling_core.types.experimental import BoundingBox, Size
|
||||
from docling_core.types.experimental.document import BasePictureData, TableCell
|
||||
from docling_core.types.experimental.labels import DocItemLabel
|
||||
from PIL.Image import Image
|
||||
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
||||
from typing_extensions import Self
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
|
||||
|
||||
class ConversionStatus(str, Enum):
|
||||
@@ -173,40 +170,3 @@ class DocumentStream(BaseModel):
|
||||
|
||||
filename: str
|
||||
stream: BytesIO
|
||||
|
||||
|
||||
# Options for the table structure step.
class TableStructureOptions(BaseModel):
    # True: Matches predictions back to PDF cells. Can break table output if
    #       PDF cells are merged across table columns.
    # False: Let table structure model define the text cells, ignore PDF cells.
    do_cell_matching: bool = True
|
||||
|
||||
|
||||
# Base model for pipeline option sets; it declares no fields of its own.
class PipelineOptions(BaseModel):
    pass
|
||||
|
||||
|
||||
# Options for the PDF pipeline: artifact location, which steps to run, and
# page-image handling.
class PdfPipelineOptions(PipelineOptions):
    artifacts_path: Optional[Union[Path, str]] = None
    do_table_structure: bool = True  # True: perform table structure extraction
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text

    table_structure_options: TableStructureOptions = TableStructureOptions()

    # Deprecated switch, superseded by `images_scale`; the validator below
    # maps it onto `images_scale` when only the old flag was set.
    keep_page_images: Annotated[
        bool,
        Field(
            deprecated="`keep_page_images` is deprecated, set the value of `images_scale` instead"
        ),
    ] = False  # False: page images are removed in the assemble step
    images_scale: Optional[float] = None  # if set, the scale for generated images

    @model_validator(mode="after")
    def set_page_images_from_deprecated(self) -> Self:
        # Translate the deprecated `keep_page_images` flag into `images_scale`.
        # DeprecationWarning is silenced while the deprecated field is read
        # inside the validator.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            default_scale = 1.0
            # An explicitly provided images_scale always wins over the old flag.
            if self.keep_page_images and self.images_scale is None:
                self.images_scale = default_scale
        return self
|
||||
|
||||
49
docling/datamodel/pipeline_options.py
Normal file
49
docling/datamodel/pipeline_options.py
Normal file
@@ -0,0 +1,49 @@
|
||||
import warnings
|
||||
from enum import Enum, auto
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Optional, Self, Union
|
||||
|
||||
from pydantic import BaseModel, Field, model_validator
|
||||
|
||||
|
||||
# Selectable TableFormer variants.
# NOTE: with the `str` mixin, `auto()` still produces 1, 2, ... which are then
# converted to str, so the member values are "1" and "2" (not the names).
class TableFormerMode(str, Enum):
    FAST = auto()
    ACCURATE = auto()
|
||||
|
||||
|
||||
# Options for the table structure step.
class TableStructureOptions(BaseModel):
    # True: Matches predictions back to PDF cells. Can break table output if
    #       PDF cells are merged across table columns.
    # False: Let table structure model define the text cells, ignore PDF cells.
    do_cell_matching: bool = True

    # Which TableFormer variant to use; FAST unless overridden.
    mode: TableFormerMode = TableFormerMode.FAST
|
||||
|
||||
|
||||
# Base model for pipeline option sets; it declares no fields of its own.
class PipelineOptions(BaseModel):
    pass
|
||||
|
||||
|
||||
# Options for the PDF pipeline: artifact location, which steps to run, and
# page-image handling.
class PdfPipelineOptions(PipelineOptions):
    artifacts_path: Optional[Union[Path, str]] = None
    do_table_structure: bool = True  # True: perform table structure extraction
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text

    table_structure_options: TableStructureOptions = TableStructureOptions()

    # Deprecated switch, superseded by `images_scale`; the validator below
    # maps it onto `images_scale` when only the old flag was set.
    keep_page_images: Annotated[
        bool,
        Field(
            deprecated="`keep_page_images` is deprecated, set the value of `images_scale` instead"
        ),
    ] = False  # False: page images are removed in the assemble step
    images_scale: Optional[float] = None  # if set, the scale for generated images

    @model_validator(mode="after")
    def set_page_images_from_deprecated(self) -> Self:
        # Translate the deprecated `keep_page_images` flag into `images_scale`.
        # DeprecationWarning is silenced while the deprecated field is read
        # inside the validator.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            default_scale = 1.0
            # An explicitly provided images_scale always wins over the old flag.
            if self.keep_page_images and self.images_scale is None:
                self.images_scale = default_scale
        return self
|
||||
@@ -8,12 +8,13 @@ import requests
|
||||
from pydantic import AnyHttpUrl, BaseModel, ConfigDict, TypeAdapter, ValidationError
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat, PipelineOptions
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import (
|
||||
ConversionResult,
|
||||
DocumentConversionInput,
|
||||
InputDocument,
|
||||
)
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.pipeline.base_model_pipeline import BaseModelPipeline
|
||||
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
|
||||
|
||||
@@ -73,6 +73,7 @@ class LayoutModel(AbstractPageModel):
|
||||
|
||||
CLASS_REMAPPINGS = {
|
||||
DocItemLabel.DOCUMENT_INDEX: DocItemLabel.TABLE,
|
||||
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
|
||||
}
|
||||
|
||||
_log.debug("================= Start postprocess function ====================")
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import copy
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List
|
||||
|
||||
import numpy
|
||||
@@ -9,6 +10,7 @@ from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredic
|
||||
from PIL import ImageDraw
|
||||
|
||||
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
|
||||
from docling.datamodel.pipeline_options import TableFormerMode
|
||||
from docling.models.abstract_model import AbstractPageModel
|
||||
|
||||
|
||||
@@ -16,10 +18,15 @@ class TableStructureModel(AbstractPageModel):
|
||||
def __init__(self, config):
|
||||
self.config = config
|
||||
self.do_cell_matching = config["do_cell_matching"]
|
||||
self.mode = config["mode"]
|
||||
|
||||
self.enabled = config["enabled"]
|
||||
if self.enabled:
|
||||
artifacts_path = config["artifacts_path"]
|
||||
artifacts_path: Path = config["artifacts_path"]
|
||||
|
||||
if self.mode == TableFormerMode.ACCURATE:
|
||||
artifacts_path = artifacts_path / "fat"
|
||||
|
||||
# Third Party
|
||||
import docling_ibm_models.tableformer.common as c
|
||||
|
||||
|
||||
@@ -12,9 +12,9 @@ from docling.datamodel.base_models import (
|
||||
DoclingComponentType,
|
||||
ErrorItem,
|
||||
Page,
|
||||
PipelineOptions,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult, InputDocument
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.utils.utils import chunkify
|
||||
|
||||
@@ -86,9 +86,6 @@ class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
|
||||
end_pb_time = time.time() - start_pb_time
|
||||
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")
|
||||
|
||||
# Free up mem resources of PDF backend
|
||||
in_doc._backend.unload()
|
||||
|
||||
conv_res = self.assemble_document(in_doc, conv_res)
|
||||
|
||||
status = ConversionStatus.SUCCESS
|
||||
@@ -113,6 +110,10 @@ class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
|
||||
f"{trace}"
|
||||
)
|
||||
raise e
|
||||
finally:
|
||||
# Always unload the PDF backend, even in case of failure
|
||||
if in_doc._backend:
|
||||
in_doc._backend.unload()
|
||||
|
||||
return conv_res
|
||||
|
||||
|
||||
@@ -4,12 +4,9 @@ from docling.backend.abstract_backend import (
|
||||
AbstractDocumentBackend,
|
||||
DeclarativeDocumentBackend,
|
||||
)
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
PdfPipelineOptions,
|
||||
PipelineOptions,
|
||||
)
|
||||
from docling.datamodel.base_models import ConversionStatus
|
||||
from docling.datamodel.document import ConversionResult, InputDocument
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
|
||||
from docling.pipeline.base_model_pipeline import BaseModelPipeline
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -4,8 +4,9 @@ from typing import Optional
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.datamodel.base_models import AssembledUnit, Page, PdfPipelineOptions
|
||||
from docling.datamodel.base_models import AssembledUnit, Page
|
||||
from docling.datamodel.document import ConversionResult, InputDocument
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.models.ds_glm_model import GlmModel
|
||||
from docling.models.easyocr_model import EasyOcrModel
|
||||
from docling.models.layout_model import LayoutModel
|
||||
@@ -18,7 +19,7 @@ _log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class StandardPdfModelPipeline(PaginatedModelPipeline):
|
||||
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
|
||||
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
|
||||
_table_model_path = "model_artifacts/tableformer"
|
||||
|
||||
def __init__(self, pipeline_options: PdfPipelineOptions):
|
||||
@@ -52,6 +53,7 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
|
||||
/ StandardPdfModelPipeline._table_model_path,
|
||||
"enabled": pipeline_options.do_table_structure,
|
||||
"do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
|
||||
"mode": pipeline_options.table_structure_options.mode,
|
||||
}
|
||||
),
|
||||
PageAssembleModel(config={"images_scale": pipeline_options.images_scale}),
|
||||
@@ -64,7 +66,10 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
download_path = snapshot_download(
|
||||
repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
|
||||
repo_id="ds4sd/docling-models",
|
||||
force_download=force,
|
||||
local_dir=local_dir,
|
||||
revision="v2.0.0",
|
||||
)
|
||||
|
||||
return Path(download_path)
|
||||
|
||||
Reference in New Issue
Block a user