Reorganize imports from docling-core

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-09-20 10:53:52 +02:00
parent 6dd1e91c4a
commit abb6dddea8
15 changed files with 89 additions and 184 deletions

View File

@ -3,10 +3,11 @@ from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Any, Iterable, Optional, Union
from docling_core.types.experimental.base import BoundingBox, Size
from PIL import Image
if TYPE_CHECKING:
from docling.datamodel.base_models import BoundingBox, Cell, PageSize
from docling.datamodel.base_models import Cell
class PdfPageBackend(ABC):
@ -30,7 +31,7 @@ class PdfPageBackend(ABC):
pass
@abstractmethod
def get_size(self) -> "PageSize":
def get_size(self) -> "Size":
pass
@abstractmethod

View File

@ -5,12 +5,13 @@ from pathlib import Path
from typing import Iterable, List, Optional, Union
import pypdfium2 as pdfium
from docling_core.types.experimental.base import BoundingBox, CoordOrigin, Size
from docling_parse.docling_parse import pdf_parser
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
from docling.datamodel.base_models import Cell
_log = logging.getLogger(__name__)
@ -177,8 +178,8 @@ class DoclingParsePageBackend(PdfPageBackend):
return image
def get_size(self) -> PageSize:
return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
def get_size(self) -> Size:
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
def unload(self):
self._ppage = None

View File

@ -6,12 +6,13 @@ from typing import Iterable, List, Optional, Union
import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c
from docling_core.types.experimental.base import BoundingBox, CoordOrigin, Size
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage, PdfTextPage
from pypdfium2._helpers.misc import PdfiumError
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
from docling.datamodel.base_models import Cell
_log = logging.getLogger(__name__)
@ -222,8 +223,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
return image
def get_size(self) -> PageSize:
return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
def get_size(self) -> Size:
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
def unload(self):
self._ppage = None

View File

@ -4,6 +4,7 @@ from enum import Enum, auto
from io import BytesIO
from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
from docling_core.types.experimental.base import BoundingBox, Size
from PIL.Image import Image
from pydantic import BaseModel, ConfigDict, Field, model_validator
from typing_extensions import Self
@ -24,11 +25,6 @@ class DocInputType(str, Enum):
STREAM = auto()
class CoordOrigin(str, Enum):
    """Names the page corner that bounding-box coordinates are measured from.

    Members are also ``str`` instances because of the ``str`` mix-in; their
    values are generated by ``auto()``.
    """

    TOPLEFT = auto()
    BOTTOMLEFT = auto()
class DoclingComponentType(str, Enum):
PDF_BACKEND = auto()
MODEL = auto()
@ -41,115 +37,6 @@ class ErrorItem(BaseModel):
error_message: str
class PageSize(BaseModel):
    """Width and height of a page.

    Both dimensions default to ``0.0`` when not supplied.
    """

    width: float = 0.0
    height: float = 0.0
class BoundingBox(BaseModel):
    """Axis-aligned rectangle given by its four edges plus a coordinate origin.

    With ``CoordOrigin.TOPLEFT`` a well-formed box has ``t <= b``; with
    ``CoordOrigin.BOTTOMLEFT`` it has ``b <= t`` (see ``from_tuple``, which
    enforces exactly this ordering).
    """

    l: float  # left
    t: float  # top
    r: float  # right
    b: float  # bottom

    # Corner the coordinates are measured from; top-left unless stated.
    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT

    @property
    def width(self):
        """Horizontal extent, ``r - l``."""
        return self.r - self.l

    @property
    def height(self):
        """Vertical extent as an absolute value, so it is origin-independent."""
        return abs(self.t - self.b)

    def scaled(self, scale: float) -> "BoundingBox":
        """Return a deep copy with all four edges multiplied by ``scale``."""
        out_bbox = copy.deepcopy(self)
        out_bbox.l *= scale
        out_bbox.r *= scale
        out_bbox.t *= scale
        out_bbox.b *= scale

        return out_bbox

    def normalized(self, page_size: PageSize) -> "BoundingBox":
        """Return a deep copy with edges divided by the page dimensions.

        ``l``/``r`` are divided by ``page_size.width`` and ``t``/``b`` by
        ``page_size.height``.

        Raises:
            ZeroDivisionError: if the page width or height is zero, which is
                the case for a default-constructed ``PageSize``.
        """
        out_bbox = copy.deepcopy(self)
        out_bbox.l /= page_size.width
        out_bbox.r /= page_size.width
        out_bbox.t /= page_size.height
        out_bbox.b /= page_size.height

        return out_bbox

    def as_tuple(self):
        """Return the edges as a 4-tuple ordered for the box's origin.

        ``(l, t, r, b)`` for TOPLEFT and ``(l, b, r, t)`` for BOTTOMLEFT.
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return (self.l, self.t, self.r, self.b)
        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return (self.l, self.b, self.r, self.t)

    @classmethod
    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
        """Build a box from the first four values of ``coord``.

        The tuple is read as ``(l, t, r, b)`` for TOPLEFT and ``(l, b, r, t)``
        for BOTTOMLEFT. Swapped edges are put back in order so that
        ``l <= r`` and the top/bottom ordering matches the origin.
        """
        if origin == CoordOrigin.TOPLEFT:
            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
            if r < l:
                l, r = r, l
            if b < t:
                b, t = t, b

            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
        elif origin == CoordOrigin.BOTTOMLEFT:
            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
            if r < l:
                l, r = r, l
            if b > t:
                b, t = t, b

            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)

    def area(self) -> float:
        """Return the area of the box.

        The raw product ``(r - l) * (b - t)`` is negative for a well-formed
        BOTTOMLEFT box (where ``t > b``), so its sign is flipped for that
        origin. TOPLEFT results are unchanged.
        """
        area = (self.r - self.l) * (self.b - self.t)
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            area = -area
        return area

    def intersection_area_with(self, other: "BoundingBox") -> float:
        """Return the overlap area with ``other``, or ``0.0`` if disjoint.

        Both boxes are expected to share the same ``coord_origin``; this
        box's origin decides how the vertical edges are interpreted.
        """
        # Horizontal overlap does not depend on the origin.
        left = max(self.l, other.l)
        right = min(self.r, other.r)
        width = right - left

        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            # y grows upwards: ``t`` is the larger edge, ``b`` the smaller one.
            height = min(self.t, other.t) - max(self.b, other.b)
        else:
            # y grows downwards: ``t`` is the smaller edge, ``b`` the larger one.
            height = min(self.b, other.b) - max(self.t, other.t)

        # If the bounding boxes do not overlap, width or height is not positive.
        if width <= 0 or height <= 0:
            return 0.0

        return width * height

    def to_bottom_left_origin(self, page_height) -> "BoundingBox":
        """Return the box expressed with a bottom-left origin.

        A box that is already BOTTOMLEFT is returned as-is (same object);
        otherwise a new box is built with ``t`` and ``b`` mirrored against
        ``page_height``.
        """
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return self
        elif self.coord_origin == CoordOrigin.TOPLEFT:
            return BoundingBox(
                l=self.l,
                r=self.r,
                t=page_height - self.t,
                b=page_height - self.b,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            )

    def to_top_left_origin(self, page_height) -> "BoundingBox":
        """Return the box expressed with a top-left origin.

        A box that is already TOPLEFT is returned as-is (same object);
        otherwise a new box is built with ``t`` and ``b`` mirrored against
        ``page_height``.
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return self
        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return BoundingBox(
                l=self.l,
                r=self.r,
                t=page_height - self.t,
                b=page_height - self.b,
                coord_origin=CoordOrigin.TOPLEFT,
            )
class Cell(BaseModel):
id: int
text: str
@ -266,7 +153,7 @@ class Page(BaseModel):
page_no: int
page_hash: Optional[str] = None
size: Optional[PageSize] = None
size: Optional[Size] = None
cells: List[Cell] = []
predictions: PagePredictions = PagePredictions()
assembled: Optional[AssembledUnit] = None

View File

@ -4,13 +4,13 @@ from pathlib import Path, PurePath
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
from docling_core.types import BaseCell, BaseText
from docling_core.types import BoundingBox as DsBoundingBox
from docling_core.types import Document as DsDocument
from docling_core.types import DocumentDescription as DsDocumentDescription
from docling_core.types import FileInfoObject as DsFileInfoObject
from docling_core.types import PageDimensions, PageReference, Prov, Ref
from docling_core.types import Table as DsSchemaTable
from docling_core.types import TableCell
from docling_core.types.doc.base import BoundingBox as DsBoundingBox
from docling_core.types.doc.base import Figure
from pydantic import BaseModel
from typing_extensions import deprecated

View File

@ -5,11 +5,12 @@ from typing import Iterable, List, Tuple
import numpy
import numpy as np
from docling_core.types.experimental.base import BoundingBox, CoordOrigin
from PIL import Image, ImageDraw
from rtree import index
from scipy.ndimage import find_objects, label
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
from docling.datamodel.base_models import OcrCell, Page
_log = logging.getLogger(__name__)

View File

@ -7,9 +7,10 @@ from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_mode
from docling_core.types import BaseText
from docling_core.types import Document as DsDocument
from docling_core.types import Ref
from docling_core.types.experimental.base import BoundingBox, CoordOrigin
from PIL import ImageDraw
from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
from docling.datamodel.base_models import Cluster
from docling.datamodel.document import ConversionResult

View File

@ -2,8 +2,9 @@ import logging
from typing import Iterable
import numpy
from docling_core.types.experimental.base import BoundingBox, CoordOrigin
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
from docling.datamodel.base_models import OcrCell, Page
from docling.models.base_ocr_model import BaseOcrModel
_log = logging.getLogger(__name__)

View File

@ -4,6 +4,7 @@ import random
import time
from typing import Iterable, List
from docling_core.types.experimental.base import CoordOrigin
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
from PIL import ImageDraw
@ -11,7 +12,6 @@ from docling.datamodel.base_models import (
BoundingBox,
Cell,
Cluster,
CoordOrigin,
LayoutPrediction,
Page,
)

View File

@ -2,11 +2,11 @@ import copy
from typing import Iterable, List
import numpy
from docling_core.types.experimental.base import BoundingBox
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
from PIL import ImageDraw
from docling.datamodel.base_models import (
BoundingBox,
Page,
TableCell,
TableElement,

View File

@ -1,9 +1,17 @@
import logging
from typing import Any, Dict, Iterable, List, Tuple, Union
from docling_core.types.doc.base import BaseCell, BaseText, Ref, Table, TableCell
from docling_core.types.doc.base import (
BaseCell,
BaseText,
BoundingBox,
Ref,
Table,
TableCell,
)
from docling_core.types.experimental.base import CoordOrigin
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
from docling.datamodel.base_models import OcrCell
from docling.datamodel.document import ConversionResult, Page
_log = logging.getLogger(__name__)

100
poetry.lock generated
View File

@ -857,50 +857,33 @@ name = "deepsearch-glm"
version = "0.21.1"
description = "Graph Language Models"
optional = false
python-versions = "<4.0,>=3.8"
files = [
{file = "deepsearch_glm-0.21.1-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:b765d371ab0a4f57dd2532c651d7dc1b4a187395153e619a77b6f0d0f6aefb32"},
{file = "deepsearch_glm-0.21.1-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:c69e055b98d0a22267a1d0b6139801aecc5b7386289b89f53f976ab723352728"},
{file = "deepsearch_glm-0.21.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:3eaa245e5ac4ab3e9d0c95a93e23f58d61d70f11431b76b6705fae358eb31c62"},
{file = "deepsearch_glm-0.21.1-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:63d195f6c5b30f4f908436589cffd4a5b9e18553c44c57fb635068a2afbd7fab"},
{file = "deepsearch_glm-0.21.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91c9296a2e417a30bf030de0c7c2e2cce4773c58bead039d5e6fccbf7deb2269"},
{file = "deepsearch_glm-0.21.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:166b9958d3a8a98d0671a1e3fdf8083ded9ccf12c2ab80fb9709908a2cf81784"},
{file = "deepsearch_glm-0.21.1-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:13bea2b4e8c04647ec743c3feb1ee66c784db542ab9dbed8dad7eb66fca74b70"},
{file = "deepsearch_glm-0.21.1-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:c5b8b8e2207615ff99e535f00548c7b0b8e4ca4593e59edd83fcad98fc318284"},
{file = "deepsearch_glm-0.21.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:ba74868243caf5ac850fff7c45c8a372c1cac0193431e22eb41888d45ac79719"},
{file = "deepsearch_glm-0.21.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:7815b06aa1c3953488496f191ce0265d0ee7bed5a6b96454a5f9d6f1add28f69"},
{file = "deepsearch_glm-0.21.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1a7dd2a1e63cee47f6090ebfebc15f68d24f61d5f4f45a21f22120b2267798d"},
{file = "deepsearch_glm-0.21.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d52bd2934a27fdc9db5f2d0713dbeec0c94e5c5843d29996e85d641a11498ad0"},
{file = "deepsearch_glm-0.21.1-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:fd4d0d4ff853e566b05769c704a4ea3c050c0cfc5721e4e2035e550fb2a8fe91"},
{file = "deepsearch_glm-0.21.1-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:802a59a8a3bea1801bce848d58d19fcdbbcea27d9e2c23f163419d13cdec2345"},
{file = "deepsearch_glm-0.21.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:1ead7958bc044000a8d43cce53c9b82be0d341b0ca5cf7b39a0c09f9c4fd8ceb"},
{file = "deepsearch_glm-0.21.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:312cf2b0b6560c8dfe5331a5a80a0ed5cb409d29ee6cc999a81696774d50f5e7"},
{file = "deepsearch_glm-0.21.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc3d6f6ca2cffbe5e112818c8aba9a783af8ab7cffff04624bfb5bf8d185b707"},
{file = "deepsearch_glm-0.21.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8bc537d5e9d108233b7e7249c6739292dc9c36a0f39c11e7f430700df35ff884"},
{file = "deepsearch_glm-0.21.1-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:4db0a700c08ff2d6285461dc5f4a68ccd36876a59b62131f847dc4be76a85989"},
{file = "deepsearch_glm-0.21.1-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:f1041c44d1a4d1a43a324781795b03edfdfd8076c49a610c4dd384c86f2a6236"},
{file = "deepsearch_glm-0.21.1-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:efb0e9678fe07640bd9b6dc07651eaf1f8e5d5602e379b4cf78dbcddc62b50e9"},
{file = "deepsearch_glm-0.21.1-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:f8d46922d74339ec7fd7a6933220ebc36b2ff39738ad9bb74ea55a198dd31b2f"},
{file = "deepsearch_glm-0.21.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2872de101ce6d262f57afd3f4d68452064c214c5ab001b7ac698a948e0725314"},
{file = "deepsearch_glm-0.21.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:187da7dabc11317badbf6983ee508c367299eb39ed78938623206be6b21e41bd"},
]
python-versions = "^3.9"
files = []
develop = false
[package.dependencies]
docling-core = {git = "ssh://git@github.com/DS4SD/docling-core.git", branch = "cau/new-format-dev"}
docutils = "!=0.21"
matplotlib = ">=3.7.1,<4.0.0"
networkx = ">=3.1,<4.0"
netwulf = ">=0.1.5,<0.2.0"
numerize = ">=0.12,<0.13"
numpy = {version = ">=1.26.4,<2.0.0", markers = "python_version >= \"3.9\""}
matplotlib = "^3.7.1"
networkx = "^3.1"
netwulf = "^0.1.5"
numerize = "^0.12"
numpy = {version = "^1.26.4", markers = "python_version >= \"3.9\""}
pandas = ">=1.5.1"
python-dotenv = ">=1.0.0,<2.0.0"
rich = ">=13.7.0,<14.0.0"
python-dotenv = "^1.0.0"
rich = "^13.7.0"
tabulate = ">=0.8.9"
tqdm = ">=4.64.0,<5.0.0"
tqdm = "^4.64.0"
[package.extras]
toolkit = ["deepsearch-toolkit (>=0.31.0)"]
[package.source]
type = "git"
url = "ssh://git@github.com/DS4SD/deepsearch-glm.git"
reference = "cau/new-format-dev"
resolved_reference = "6d86b7ddaa8911ec57df9bbabf981a42166e53d2"
[[package]]
name = "deprecated"
version = "1.2.14"
@ -957,23 +940,27 @@ files = [
[[package]]
name = "docling-core"
version = "1.4.0"
version = "1.4.1"
description = "A python library to define and validate data types in Docling."
optional = false
python-versions = "<4.0,>=3.9"
files = [
{file = "docling_core-1.4.0-py3-none-any.whl", hash = "sha256:11cd6228d5f321fd11427cf61f40148afd544170e82236228794300f14f8a15a"},
{file = "docling_core-1.4.0.tar.gz", hash = "sha256:6ea151974172a87a9bca0d63787dc16bdb4170ecb73f18e61e3c2e95eb3fe3d8"},
]
python-versions = "^3.9"
files = []
develop = false
[package.dependencies]
json-schema-for-humans = ">=1.0.0,<2.0.0"
jsonref = ">=1.1.0,<2.0.0"
jsonschema = ">=4.16.0,<5.0.0"
pandas = ">=2.2.2,<3.0.0"
pydantic = ">=2.6.0,<3.0.0"
pyproject-toml = ">=0.0.10,<0.0.11"
tabulate = ">=0.9.0,<0.10.0"
json-schema-for-humans = "^1.0.0"
jsonref = "^1.1.0"
jsonschema = "^4.16.0"
pandas = "^2.2.2"
pydantic = "^2.6.0"
pyproject-toml = "^0.0.10"
tabulate = "^0.9.0"
[package.source]
type = "git"
url = "ssh://git@github.com/DS4SD/docling-core.git"
reference = "cau/new-format-dev"
resolved_reference = "ed087646ec9ad86c5b54eb37d7b99322d03487f0"
[[package]]
name = "docling-ibm-models"
@ -4697,6 +4684,21 @@ files = [
[package.dependencies]
six = ">=1.5"
[[package]]
name = "python-docx"
version = "1.1.2"
description = "Create, read, and update Microsoft Word .docx files."
optional = false
python-versions = ">=3.7"
files = [
{file = "python_docx-1.1.2-py3-none-any.whl", hash = "sha256:08c20d6058916fb19853fcf080f7f42b6270d89eac9fa5f8c15f691c0017fabe"},
{file = "python_docx-1.1.2.tar.gz", hash = "sha256:0cf1f22e95b9002addca7948e16f2cd7acdfd498047f1941ca5d293db7762efd"},
]
[package.dependencies]
lxml = ">=3.1.0"
typing-extensions = ">=4.9.0"
[[package]]
name = "python-dotenv"
version = "1.0.1"
@ -7257,4 +7259,4 @@ examples = ["langchain-huggingface", "langchain-milvus", "langchain-text-splitte
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
content-hash = "7dc789b3c981898fdabec03f85ebb92273f2bb55b2bf1e18dad1d4c361c6b97b"
content-hash = "1b908180d822d74ae8033e8b6c650b8d00b4365fc7dd36cea6505305651b79b6"

View File

@ -23,9 +23,10 @@ packages = [{include = "docling"}]
[tool.poetry.dependencies]
python = "^3.10"
pydantic = "^2.0.0"
docling-core = "^1.4.0"
docling-core = {git = "ssh://git@github.com/DS4SD/docling-core.git", branch = "cau/new-format-dev"}
docling-ibm-models = "^1.2.0"
deepsearch-glm = "^0.21.1"
deepsearch-glm = {git = "ssh://git@github.com/DS4SD/deepsearch-glm.git", branch = "cau/new-format-dev"}
filetype = "^1.2.0"
pypdfium2 = "^4.30.0"
pydantic-settings = "^2.3.0"
@ -61,6 +62,7 @@ torchvision = [
{version = "~0.17.2", optional = true, markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'"}
]
typer = "^0.12.5"
python-docx = "^1.1.2"
[tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "^24.4.2"}

View File

@ -1,12 +1,12 @@
from pathlib import Path
import pytest
from docling_core.types.experimental.base import BoundingBox
from docling.backend.docling_parse_backend import (
DoclingParseDocumentBackend,
DoclingParsePageBackend,
)
from docling.datamodel.base_models import BoundingBox
@pytest.fixture

View File

@ -1,12 +1,12 @@
from pathlib import Path
import pytest
from docling_core.types.experimental.base import BoundingBox
from docling.backend.pypdfium2_backend import (
PyPdfiumDocumentBackend,
PyPdfiumPageBackend,
)
from docling.datamodel.base_models import BoundingBox
@pytest.fixture