mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
feat: Optimize table extraction quality, add configuration options (#11)
Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Co-authored-by: Christoph Auer <cau@zurich.ibm.com> Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
import copy
|
||||
from enum import Enum, auto
|
||||
from io import BytesIO
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
@@ -47,6 +48,15 @@ class BoundingBox(BaseModel):
|
||||
def height(self):
|
||||
return abs(self.t - self.b)
|
||||
|
||||
def scaled(self, scale: float) -> "BoundingBox":
|
||||
out_bbox = copy.deepcopy(self)
|
||||
out_bbox.l *= scale
|
||||
out_bbox.r *= scale
|
||||
out_bbox.t *= scale
|
||||
out_bbox.b *= scale
|
||||
|
||||
return out_bbox
|
||||
|
||||
def as_tuple(self):
|
||||
if self.coord_origin == CoordOrigin.TOPLEFT:
|
||||
return (self.l, self.t, self.r, self.b)
|
||||
@@ -241,6 +251,17 @@ class DocumentStream(BaseModel):
|
||||
stream: BytesIO
|
||||
|
||||
|
||||
class TableStructureOptions(BaseModel):
|
||||
do_cell_matching: bool = (
|
||||
True
|
||||
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
|
||||
# are merged across table columns.
|
||||
# False: Let table structure model define the text cells, ignore PDF cells.
|
||||
)
|
||||
|
||||
|
||||
class PipelineOptions(BaseModel):
|
||||
do_table_structure: bool = True
|
||||
do_ocr: bool = False
|
||||
do_table_structure: bool = True # True: perform table structure extraction
|
||||
do_ocr: bool = False # True: perform OCR, replace programmatic PDF text
|
||||
|
||||
table_structure_options: TableStructureOptions = TableStructureOptions()
|
||||
|
||||
Reference in New Issue
Block a user