test: improve typing definitions (part 1) (#72)

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2024-09-12 15:56:29 +02:00
committed by GitHub
parent 53569a1023
commit 8aa476ccd3
9 changed files with 91 additions and 29 deletions

View File

@@ -1,10 +1,13 @@
from abc import ABC, abstractmethod
from io import BytesIO
from pathlib import Path
from typing import Any, Iterable, Optional, Union
from typing import TYPE_CHECKING, Any, Iterable, Optional, Union
from PIL import Image
if TYPE_CHECKING:
from docling.datamodel.base_models import BoundingBox, Cell, PageSize
class PdfPageBackend(ABC):
@@ -17,12 +20,12 @@ class PdfPageBackend(ABC):
pass
@abstractmethod
def get_bitmap_rects(self, scale: int = 1) -> Iterable["BoundingBox"]:
def get_bitmap_rects(self, scale: float = 1) -> Iterable["BoundingBox"]:
    """Return the bounding boxes of the bitmap regions found on this page.

    Abstract hook: concrete page backends provide the implementation.

    Args:
        scale: Scaling factor, defaulting to 1. The parameter is named
            ``scale`` and typed ``float`` so that it matches the concrete
            backends and the sibling ``get_page_image`` signature; the
            previous spelling ``float: int = 1`` put the type name in the
            parameter-name slot and shadowed the builtin ``float``.

    Returns:
        An iterable of ``BoundingBox`` objects.
    """
    pass
@abstractmethod
def get_page_image(
self, scale: int = 1, cropbox: Optional["BoundingBox"] = None
self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
) -> Image.Image:
pass

View File

@@ -2,7 +2,7 @@ import logging
import random
from io import BytesIO
from pathlib import Path
from typing import Iterable, Optional, Union
from typing import Iterable, List, Optional, Union
import pypdfium2 as pdfium
from docling_parse.docling_parse import pdf_parser
@@ -22,7 +22,6 @@ class DoclingParsePageBackend(PdfPageBackend):
self._ppage = page_obj
parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
self._dpage = None
self.valid = "pages" in parsed_page
if self.valid:
self._dpage = parsed_page["pages"][0]
@@ -68,7 +67,7 @@ class DoclingParsePageBackend(PdfPageBackend):
return text_piece
def get_text_cells(self) -> Iterable[Cell]:
cells = []
cells: List[Cell] = []
cell_counter = 0
if not self.valid:
@@ -130,7 +129,7 @@ class DoclingParsePageBackend(PdfPageBackend):
return cells
def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 32 * 32
for i in range(len(self._dpage["images"])):
@@ -145,7 +144,7 @@ class DoclingParsePageBackend(PdfPageBackend):
yield cropbox
def get_page_image(
self, scale: int = 1, cropbox: Optional[BoundingBox] = None
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
) -> Image.Image:
page_size = self.get_size()

View File

@@ -7,7 +7,7 @@ from typing import Iterable, List, Optional, Union
import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
from pypdfium2 import PdfPage, PdfTextPage
from pypdfium2._helpers.misc import PdfiumError
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
@@ -29,12 +29,12 @@ class PyPdfiumPageBackend(PdfPageBackend):
exc_info=True,
)
self.valid = False
self.text_page = None
self.text_page: Optional[PdfTextPage] = None
def is_valid(self) -> bool:
return self.valid
def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 32 * 32
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
pos = obj.get_pos()
@@ -189,7 +189,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
return cells
def get_page_image(
self, scale: int = 1, cropbox: Optional[BoundingBox] = None
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
) -> Image.Image:
page_size = self.get_size()

View File

@@ -87,7 +87,7 @@ class BoundingBox(BaseModel):
return (self.l, self.b, self.r, self.t)
@classmethod
def from_tuple(cls, coord: Tuple[float], origin: CoordOrigin):
def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
if origin == CoordOrigin.TOPLEFT:
l, t, r, b = coord[0], coord[1], coord[2], coord[3]
if r < l:
@@ -246,7 +246,7 @@ class EquationPrediction(BaseModel):
class PagePredictions(BaseModel):
layout: LayoutPrediction = None
layout: Optional[LayoutPrediction] = None
tablestructure: Optional[TableStructurePrediction] = None
figures_classification: Optional[FigureClassificationPrediction] = None
equations_prediction: Optional[EquationPrediction] = None
@@ -267,7 +267,7 @@ class Page(BaseModel):
page_no: int
page_hash: Optional[str] = None
size: Optional[PageSize] = None
cells: List[Cell] = None
cells: List[Cell] = []
predictions: PagePredictions = PagePredictions()
assembled: Optional[AssembledUnit] = None

View File

@@ -1,12 +1,12 @@
from pathlib import Path
from typing import Iterable
from typing import Callable, Iterable, List
from docling.datamodel.base_models import Page, PipelineOptions
class BaseModelPipeline:
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
self.model_pipe = []
self.model_pipe: List[Callable] = []
self.artifacts_path = artifacts_path
self.pipeline_options = pipeline_options

View File

@@ -1,10 +1,10 @@
import logging
from typing import Any, Dict, Iterable, List, Tuple
from typing import Any, Dict, Iterable, List, Tuple, Union
from docling_core.types.doc.base import BaseCell, Ref, Table, TableCell
from docling_core.types.doc.base import BaseCell, BaseText, Ref, Table, TableCell
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
from docling.datamodel.document import ConvertedDocument, Page
from docling.datamodel.document import ConversionResult, Page
_log = logging.getLogger(__name__)
@@ -15,7 +15,10 @@ def _export_table_to_html(table: Table):
# to the docling-core package.
def _get_tablecell_span(cell: TableCell, ix):
span = set([s[ix] for s in cell.spans])
if cell.spans is None:
span = set()
else:
span = set([s[ix] for s in cell.spans])
if len(span) == 0:
return 1, None, None
return len(span), min(span), max(span)
@@ -24,6 +27,8 @@ def _export_table_to_html(table: Table):
nrows = table.num_rows
ncols = table.num_cols
if table.data is None:
return ""
for i in range(nrows):
body += "<tr>"
for j in range(ncols):
@@ -66,7 +71,7 @@ def _export_table_to_html(table: Table):
def generate_multimodal_pages(
doc_result: ConvertedDocument,
doc_result: ConversionResult,
) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
label_to_doclaynet = {
@@ -94,7 +99,7 @@ def generate_multimodal_pages(
page_no = 0
start_ix = 0
end_ix = 0
doc_items = []
doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []
doc = doc_result.output
@@ -105,11 +110,11 @@ def generate_multimodal_pages(
item_type = item.obj_type
label = label_to_doclaynet.get(item_type, None)
if label is None:
if label is None or item.prov is None or page.size is None:
continue
bbox = BoundingBox.from_tuple(
item.prov[0].bbox, origin=CoordOrigin.BOTTOMLEFT
tuple(item.prov[0].bbox), origin=CoordOrigin.BOTTOMLEFT
)
new_bbox = bbox.to_top_left_origin(page_height=page.size.height).normalized(
page_size=page.size
@@ -137,13 +142,15 @@ def generate_multimodal_pages(
return segments
def _process_page_cells(page: Page):
cells = []
cells: List[dict] = []
if page.size is None:
return cells
for cell in page.cells:
new_bbox = cell.bbox.to_top_left_origin(
page_height=page.size.height
).normalized(page_size=page.size)
is_ocr = isinstance(cell, OcrCell)
ocr_confidence = cell.confidence if is_ocr else 1.0
ocr_confidence = cell.confidence if isinstance(cell, OcrCell) else 1.0
cells.append(
{
"text": cell.text,
@@ -170,6 +177,8 @@ def generate_multimodal_pages(
return content_text, content_md, content_dt, page_cells, page_segments, page
if doc.main_text is None:
return
for ix, orig_item in enumerate(doc.main_text):
item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item