Merge from main

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2024-10-08 14:42:33 +02:00
49 changed files with 1621 additions and 1432 deletions

View File

@@ -8,14 +8,8 @@ from docling_core.types.experimental import (
BasePictureData,
BaseTableData,
DescriptionItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
ImageRef,
PictureItem,
SectionHeaderItem,
TableCell,
TableItem,
)
from docling_core.types.experimental.labels import DocItemLabel, GroupLabel

View File

@@ -11,22 +11,18 @@ from docling_core.types.experimental import (
DoclingDocument,
DocumentOrigin,
GroupLabel,
ImageRef,
PictureItem,
ProvenanceItem,
TableCell,
TableItem,
)
from docling_core.types.experimental.base import BoundingBox, CoordOrigin, Size
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
from pptx.util import Inches
from docling.backend.abstract_backend import (
DeclarativeDocumentBackend,
PaginatedDocumentBackend,
)
from docling.datamodel.base_models import FormatToMimeType, InputFormat
from docling.datamodel.base_models import InputFormat
_log = logging.getLogger(__name__)

View File

@@ -11,11 +11,7 @@ from docling_core.types.experimental import (
DocItemLabel,
DoclingDocument,
GroupLabel,
ImageRef,
PictureItem,
SectionHeaderItem,
TableCell,
TableItem,
)
from lxml import etree

View File

@@ -1,15 +1,12 @@
import warnings
from enum import Enum, auto
from io import BytesIO
from pathlib import Path
from typing import Annotated, Dict, List, Optional, Union
from typing import Dict, List, Optional, Union
from docling_core.types.experimental import BoundingBox, Size
from docling_core.types.experimental.document import BasePictureData, TableCell
from docling_core.types.experimental.labels import DocItemLabel
from PIL.Image import Image
from pydantic import BaseModel, ConfigDict, Field, model_validator
from typing_extensions import Self
from pydantic import BaseModel, ConfigDict
class ConversionStatus(str, Enum):
@@ -173,40 +170,3 @@ class DocumentStream(BaseModel):
filename: str
stream: BytesIO
class TableStructureOptions(BaseModel):
    """Options for the table structure model."""

    # True: Matches predictions back to PDF cells. Can break table output if PDF cells
    #       are merged across table columns.
    # False: Let table structure model define the text cells, ignore PDF cells.
    do_cell_matching: bool = True
class PipelineOptions(BaseModel):
    """Common base class for pipeline option models; declares no fields itself."""
class PdfPipelineOptions(PipelineOptions):
    """Options for the PDF conversion pipeline.

    ``keep_page_images`` is deprecated in favour of ``images_scale``; the
    ``set_page_images_from_deprecated`` validator keeps the old flag working
    by translating it into a default scale.
    """

    # Path to the model artifacts (unset by default).
    artifacts_path: Optional[Union[Path, str]] = None
    do_table_structure: bool = True  # True: perform table structure extraction
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text

    table_structure_options: TableStructureOptions = TableStructureOptions()

    keep_page_images: Annotated[
        bool,
        Field(
            deprecated="`keep_page_images` is deprecated, set the value of `images_scale` instead"
        ),
    ] = False  # False: page images are removed in the assemble step
    images_scale: Optional[float] = None  # if set, the scale for generated images

    @model_validator(mode="after")
    def set_page_images_from_deprecated(self) -> Self:
        """Derive ``images_scale`` from the deprecated ``keep_page_images`` flag.

        When ``keep_page_images`` is true and no explicit ``images_scale`` was
        given, ``images_scale`` is set to 1.0. An explicit ``images_scale``
        always wins.

        Returns:
            The validated model instance itself.
        """
        # Reading the field marked as deprecated is expected to raise a
        # DeprecationWarning; silence it for this internal access only.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            default_scale = 1.0
            if self.keep_page_images and self.images_scale is None:
                self.images_scale = default_scale
        return self

View File

@@ -0,0 +1,49 @@
import warnings
from enum import Enum, auto
from pathlib import Path
from typing import Annotated, Optional, Self, Union
from pydantic import BaseModel, Field, model_validator
class TableFormerMode(str, Enum):
    """Selectable variants of the table structure model."""

    # Default variant (see TableStructureOptions.mode).
    FAST = auto()
    # Alternative variant, selected explicitly via the options.
    ACCURATE = auto()
class TableStructureOptions(BaseModel):
    """Options for the table structure model."""

    # True: Matches predictions back to PDF cells. Can break table output if PDF cells
    #       are merged across table columns.
    # False: Let table structure model define the text cells, ignore PDF cells.
    do_cell_matching: bool = True

    # Model variant to use; FAST unless overridden.
    mode: TableFormerMode = TableFormerMode.FAST
class PipelineOptions(BaseModel):
    """Common base class for pipeline option models; declares no fields itself."""
class PdfPipelineOptions(PipelineOptions):
    """Options for the PDF conversion pipeline.

    ``keep_page_images`` is deprecated in favour of ``images_scale``; the
    ``set_page_images_from_deprecated`` validator keeps the old flag working
    by translating it into a default scale.
    """

    # Path to the model artifacts (unset by default).
    artifacts_path: Optional[Union[Path, str]] = None
    do_table_structure: bool = True  # True: perform table structure extraction
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text

    table_structure_options: TableStructureOptions = TableStructureOptions()

    keep_page_images: Annotated[
        bool,
        Field(
            deprecated="`keep_page_images` is deprecated, set the value of `images_scale` instead"
        ),
    ] = False  # False: page images are removed in the assemble step
    images_scale: Optional[float] = None  # if set, the scale for generated images

    @model_validator(mode="after")
    def set_page_images_from_deprecated(self) -> Self:
        """Derive ``images_scale`` from the deprecated ``keep_page_images`` flag.

        When ``keep_page_images`` is true and no explicit ``images_scale`` was
        given, ``images_scale`` is set to 1.0. An explicit ``images_scale``
        always wins.

        Returns:
            The validated model instance itself.
        """
        # Reading the field marked as deprecated is expected to raise a
        # DeprecationWarning; silence it for this internal access only.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            default_scale = 1.0
            if self.keep_page_images and self.images_scale is None:
                self.images_scale = default_scale
        return self

View File

@@ -8,12 +8,13 @@ import requests
from pydantic import AnyHttpUrl, BaseModel, ConfigDict, TypeAdapter, ValidationError
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.datamodel.base_models import ConversionStatus, InputFormat, PipelineOptions
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import (
ConversionResult,
DocumentConversionInput,
InputDocument,
)
from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import settings
from docling.pipeline.base_model_pipeline import BaseModelPipeline
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline

View File

@@ -73,6 +73,7 @@ class LayoutModel(AbstractPageModel):
CLASS_REMAPPINGS = {
DocItemLabel.DOCUMENT_INDEX: DocItemLabel.TABLE,
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
}
_log.debug("================= Start postprocess function ====================")

View File

@@ -1,4 +1,5 @@
import copy
from pathlib import Path
from typing import Iterable, List
import numpy
@@ -9,6 +10,7 @@ from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredic
from PIL import ImageDraw
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
from docling.datamodel.pipeline_options import TableFormerMode
from docling.models.abstract_model import AbstractPageModel
@@ -16,10 +18,15 @@ class TableStructureModel(AbstractPageModel):
def __init__(self, config):
self.config = config
self.do_cell_matching = config["do_cell_matching"]
self.mode = config["mode"]
self.enabled = config["enabled"]
if self.enabled:
artifacts_path = config["artifacts_path"]
artifacts_path: Path = config["artifacts_path"]
if self.mode == TableFormerMode.ACCURATE:
artifacts_path = artifacts_path / "fat"
# Third Party
import docling_ibm_models.tableformer.common as c

View File

@@ -12,9 +12,9 @@ from docling.datamodel.base_models import (
DoclingComponentType,
ErrorItem,
Page,
PipelineOptions,
)
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import settings
from docling.utils.utils import chunkify
@@ -86,9 +86,6 @@ class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
end_pb_time = time.time() - start_pb_time
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")
# Free up mem resources of PDF backend
in_doc._backend.unload()
conv_res = self.assemble_document(in_doc, conv_res)
status = ConversionStatus.SUCCESS
@@ -113,6 +110,10 @@ class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
f"{trace}"
)
raise e
finally:
# Always unload the PDF backend, even in case of failure
if in_doc._backend:
in_doc._backend.unload()
return conv_res

View File

@@ -4,12 +4,9 @@ from docling.backend.abstract_backend import (
AbstractDocumentBackend,
DeclarativeDocumentBackend,
)
from docling.datamodel.base_models import (
ConversionStatus,
PdfPipelineOptions,
PipelineOptions,
)
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
from docling.pipeline.base_model_pipeline import BaseModelPipeline
_log = logging.getLogger(__name__)

View File

@@ -4,8 +4,9 @@ from typing import Optional
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import AssembledUnit, Page, PdfPipelineOptions
from docling.datamodel.base_models import AssembledUnit, Page
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.models.ds_glm_model import GlmModel
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
@@ -18,7 +19,7 @@ _log = logging.getLogger(__name__)
class StandardPdfModelPipeline(PaginatedModelPipeline):
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
_table_model_path = "model_artifacts/tableformer"
def __init__(self, pipeline_options: PdfPipelineOptions):
@@ -52,6 +53,7 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
/ StandardPdfModelPipeline._table_model_path,
"enabled": pipeline_options.do_table_structure,
"do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
"mode": pipeline_options.table_structure_options.mode,
}
),
PageAssembleModel(config={"images_scale": pipeline_options.images_scale}),
@@ -64,7 +66,10 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
from huggingface_hub import snapshot_download
download_path = snapshot_download(
repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
repo_id="ds4sd/docling-models",
force_download=force,
local_dir=local_dir,
revision="v2.0.0",
)
return Path(download_path)