feat: Support tableformer model choice (#90)

* Support tableformer model choice

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update datamodel structure

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update docs

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Cleanup

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add test unit for table options

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Ensure import backwards-compatibility for PipelineOptions

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update README

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Adjust parameters on custom_convert

Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>

* Update Dockerfile

Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>
This commit is contained in:
Christoph Auer
2024-09-26 21:37:08 +02:00
committed by GitHub
parent 39977b5631
commit d6df76f90b
16 changed files with 711 additions and 592 deletions

View File

@@ -9,6 +9,10 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator
from typing_extensions import Self
from docling.backend.abstract_backend import PdfPageBackend
from docling.datamodel.pipeline_options import ( # Must be imported here for backward compatibility.
PipelineOptions,
TableStructureOptions,
)
class ConversionStatus(str, Enum):
@@ -298,22 +302,6 @@ class DocumentStream(BaseModel):
stream: BytesIO
class TableStructureOptions(BaseModel):
do_cell_matching: bool = (
True
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
# are merged across table columns.
# False: Let table structure model define the text cells, ignore PDF cells.
)
class PipelineOptions(BaseModel):
do_table_structure: bool = True # True: perform table structure extraction
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
table_structure_options: TableStructureOptions = TableStructureOptions()
class AssembleOptions(BaseModel):
keep_page_images: Annotated[
bool,