diff --git a/docling/backend/abstract_backend.py b/docling/backend/abstract_backend.py index 7bb53fce..66df2869 100644 --- a/docling/backend/abstract_backend.py +++ b/docling/backend/abstract_backend.py @@ -1,10 +1,13 @@ from abc import ABC, abstractmethod from io import BytesIO from pathlib import Path -from typing import Any, Iterable, Optional, Union +from typing import TYPE_CHECKING, Any, Iterable, Optional, Union from PIL import Image +if TYPE_CHECKING: + from docling.datamodel.base_models import BoundingBox, Cell, PageSize + class PdfPageBackend(ABC): @@ -17,12 +20,12 @@ class PdfPageBackend(ABC): pass @abstractmethod - def get_bitmap_rects(self, scale: int = 1) -> Iterable["BoundingBox"]: + def get_bitmap_rects(self, scale: float = 1) -> Iterable["BoundingBox"]: pass @abstractmethod def get_page_image( - self, scale: int = 1, cropbox: Optional["BoundingBox"] = None + self, scale: float = 1, cropbox: Optional["BoundingBox"] = None ) -> Image.Image: pass diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py index aeaf4739..d7a116d4 100644 --- a/docling/backend/docling_parse_backend.py +++ b/docling/backend/docling_parse_backend.py @@ -2,7 +2,7 @@ import logging import random from io import BytesIO from pathlib import Path -from typing import Iterable, Optional, Union +from typing import Iterable, List, Optional, Union import pypdfium2 as pdfium from docling_parse.docling_parse import pdf_parser @@ -22,7 +22,6 @@ class DoclingParsePageBackend(PdfPageBackend): self._ppage = page_obj parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no) - self._dpage = None self.valid = "pages" in parsed_page if self.valid: self._dpage = parsed_page["pages"][0] @@ -68,7 +67,7 @@ class DoclingParsePageBackend(PdfPageBackend): return text_piece def get_text_cells(self) -> Iterable[Cell]: - cells = [] + cells: List[Cell] = [] cell_counter = 0 if not self.valid: @@ -130,7 +129,7 @@ class 
DoclingParsePageBackend(PdfPageBackend): return cells - def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]: + def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: AREA_THRESHOLD = 32 * 32 for i in range(len(self._dpage["images"])): @@ -145,7 +144,7 @@ class DoclingParsePageBackend(PdfPageBackend): yield cropbox def get_page_image( - self, scale: int = 1, cropbox: Optional[BoundingBox] = None + self, scale: float = 1, cropbox: Optional[BoundingBox] = None ) -> Image.Image: page_size = self.get_size() diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py index b7ec824a..81ab8488 100644 --- a/docling/backend/pypdfium2_backend.py +++ b/docling/backend/pypdfium2_backend.py @@ -7,7 +7,7 @@ from typing import Iterable, List, Optional, Union import pypdfium2 as pdfium import pypdfium2.raw as pdfium_c from PIL import Image, ImageDraw -from pypdfium2 import PdfPage +from pypdfium2 import PdfPage, PdfTextPage from pypdfium2._helpers.misc import PdfiumError from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend @@ -29,12 +29,12 @@ class PyPdfiumPageBackend(PdfPageBackend): exc_info=True, ) self.valid = False - self.text_page = None + self.text_page: Optional[PdfTextPage] = None def is_valid(self) -> bool: return self.valid - def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]: + def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: AREA_THRESHOLD = 32 * 32 for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]): pos = obj.get_pos() @@ -189,7 +189,7 @@ class PyPdfiumPageBackend(PdfPageBackend): return cells def get_page_image( - self, scale: int = 1, cropbox: Optional[BoundingBox] = None + self, scale: float = 1, cropbox: Optional[BoundingBox] = None ) -> Image.Image: page_size = self.get_size() diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 71238b8d..e9c51d69 100644 --- 
a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -87,7 +87,7 @@ class BoundingBox(BaseModel): return (self.l, self.b, self.r, self.t) @classmethod - def from_tuple(cls, coord: Tuple[float], origin: CoordOrigin): + def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin): if origin == CoordOrigin.TOPLEFT: l, t, r, b = coord[0], coord[1], coord[2], coord[3] if r < l: @@ -246,7 +246,7 @@ class EquationPrediction(BaseModel): class PagePredictions(BaseModel): - layout: LayoutPrediction = None + layout: Optional[LayoutPrediction] = None tablestructure: Optional[TableStructurePrediction] = None figures_classification: Optional[FigureClassificationPrediction] = None equations_prediction: Optional[EquationPrediction] = None @@ -267,7 +267,7 @@ class Page(BaseModel): page_no: int page_hash: Optional[str] = None size: Optional[PageSize] = None - cells: List[Cell] = None + cells: List[Cell] = [] predictions: PagePredictions = PagePredictions() assembled: Optional[AssembledUnit] = None diff --git a/docling/pipeline/base_model_pipeline.py b/docling/pipeline/base_model_pipeline.py index 680a1140..4fdde951 100644 --- a/docling/pipeline/base_model_pipeline.py +++ b/docling/pipeline/base_model_pipeline.py @@ -1,12 +1,12 @@ from pathlib import Path -from typing import Iterable +from typing import Callable, Iterable, List from docling.datamodel.base_models import Page, PipelineOptions class BaseModelPipeline: def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions): - self.model_pipe = [] + self.model_pipe: List[Callable] = [] self.artifacts_path = artifacts_path self.pipeline_options = pipeline_options diff --git a/docling/utils/export.py b/docling/utils/export.py index f438ed1d..115f7646 100644 --- a/docling/utils/export.py +++ b/docling/utils/export.py @@ -1,10 +1,10 @@ import logging -from typing import Any, Dict, Iterable, List, Tuple +from typing import Any, Dict, Iterable, List, Tuple, Union -from 
docling_core.types.doc.base import BaseCell, Ref, Table, TableCell +from docling_core.types.doc.base import BaseCell, BaseText, Ref, Table, TableCell from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell -from docling.datamodel.document import ConvertedDocument, Page +from docling.datamodel.document import ConversionResult, Page _log = logging.getLogger(__name__) @@ -15,7 +15,10 @@ def _export_table_to_html(table: Table): # to the docling-core package. def _get_tablecell_span(cell: TableCell, ix): - span = set([s[ix] for s in cell.spans]) + if cell.spans is None: + span = set() + else: + span = set([s[ix] for s in cell.spans]) if len(span) == 0: return 1, None, None return len(span), min(span), max(span) @@ -24,6 +27,8 @@ def _export_table_to_html(table: Table): nrows = table.num_rows ncols = table.num_cols + if table.data is None: + return "" for i in range(nrows): body += "