mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
use new PictureData
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
c1ed447c21
commit
7c8d7e222e
@ -5,10 +5,10 @@ from typing import Set, Union
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from docling_core.types.experimental import (
|
||||
BasePictureData,
|
||||
BaseTableData,
|
||||
DescriptionItem,
|
||||
DoclingDocument,
|
||||
PictureData,
|
||||
TableCell,
|
||||
)
|
||||
from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
|
||||
@ -400,7 +400,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
contains_captions = element.find(["figcaption"])
|
||||
if contains_captions is None:
|
||||
doc.add_picture(
|
||||
data=BasePictureData(), parent=self.parents[self.level], caption=None
|
||||
data=PictureData(), parent=self.parents[self.level], caption=None
|
||||
)
|
||||
|
||||
else:
|
||||
@ -412,7 +412,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
label=DocItemLabel.CAPTION, text=("".join(texts)).strip()
|
||||
)
|
||||
doc.add_picture(
|
||||
data=BasePictureData(),
|
||||
data=PictureData(),
|
||||
parent=self.parents[self.level],
|
||||
caption=fig_caption,
|
||||
)
|
||||
@ -420,5 +420,5 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
def handle_image(self, element, idx, doc):
|
||||
"""Handles image tags (img)."""
|
||||
doc.add_picture(
|
||||
data=BasePictureData(), parent=self.parents[self.level], caption=None
|
||||
data=PictureData(), parent=self.parents[self.level], caption=None
|
||||
)
|
||||
|
@ -4,13 +4,13 @@ from pathlib import Path
|
||||
from typing import Set, Union
|
||||
|
||||
from docling_core.types.experimental import (
|
||||
BasePictureData,
|
||||
BaseTableData,
|
||||
DescriptionItem,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
GroupLabel,
|
||||
PictureData,
|
||||
ProvenanceItem,
|
||||
TableCell,
|
||||
)
|
||||
@ -204,7 +204,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
# shape has picture
|
||||
prov = self.generate_prov(shape, slide_ind, "")
|
||||
doc.add_picture(
|
||||
data=BasePictureData(), parent=parent_slide, caption=None, prov=prov
|
||||
data=PictureData(), parent=parent_slide, caption=None, prov=prov
|
||||
)
|
||||
return
|
||||
|
||||
|
@ -5,12 +5,12 @@ from typing import Set, Union
|
||||
|
||||
import docx
|
||||
from docling_core.types.experimental import (
|
||||
BasePictureData,
|
||||
BaseTableData,
|
||||
DescriptionItem,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
GroupLabel,
|
||||
PictureData,
|
||||
TableCell,
|
||||
)
|
||||
from lxml import etree
|
||||
@ -419,6 +419,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
def handle_pictures(self, element, docx_obj, doc):
|
||||
doc.add_picture(
|
||||
data=BasePictureData(), parent=self.parents[self.level], caption=None
|
||||
data=PictureData(), parent=self.parents[self.level], caption=None
|
||||
)
|
||||
return
|
||||
|
@ -3,7 +3,7 @@ from io import BytesIO
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
from docling_core.types.experimental import BoundingBox, Size
|
||||
from docling_core.types.experimental.document import BasePictureData, TableCell
|
||||
from docling_core.types.experimental.document import PictureData, TableCell
|
||||
from docling_core.types.experimental.labels import DocItemLabel
|
||||
from PIL.Image import Image
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
@ -109,7 +109,7 @@ class TextElement(BasePageElement): ...
|
||||
|
||||
|
||||
class FigureElement(BasePageElement):
|
||||
data: Optional[BasePictureData] = None
|
||||
data: Optional[PictureData] = None
|
||||
provenance: Optional[str] = None
|
||||
predicted_class: Optional[str] = None
|
||||
confidence: Optional[float] = None
|
||||
|
@ -1,15 +1,14 @@
|
||||
from typing import Any, Iterable
|
||||
|
||||
from docling_core.types.experimental import DoclingDocument, NodeItem
|
||||
from docling_core.types.experimental.document import BasePictureData, PictureItem
|
||||
from docling_core.types.experimental.document import (
|
||||
PictureClassificationData,
|
||||
PictureItem,
|
||||
)
|
||||
|
||||
from docling.models.base_model import BaseEnrichmentModel
|
||||
|
||||
|
||||
class DummyPictureData(BasePictureData):
|
||||
hello: str
|
||||
|
||||
|
||||
class DummyPictureClassifierEnrichmentModel(BaseEnrichmentModel):
|
||||
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
|
||||
return isinstance(element, PictureItem)
|
||||
@ -19,6 +18,10 @@ class DummyPictureClassifierEnrichmentModel(BaseEnrichmentModel):
|
||||
) -> Iterable[Any]:
|
||||
for element in element_batch:
|
||||
assert isinstance(element, PictureItem)
|
||||
element.data = DummyPictureData(hello="world")
|
||||
element.data.classification = PictureClassificationData(
|
||||
provenance="dummy_classifier-0.0.1",
|
||||
predicted_class="dummy",
|
||||
confidence=0.42,
|
||||
)
|
||||
|
||||
yield element
|
||||
|
16
poetry.lock
generated
16
poetry.lock
generated
@ -885,7 +885,7 @@ files = []
|
||||
develop = false
|
||||
|
||||
[package.dependencies]
|
||||
docling-core = {git = "https://github.com/DS4SD/docling-core.git", rev = "baceeaeaa690a12f717918d17336fcbfe414cbb8"}
|
||||
docling-core = {git = "https://github.com/DS4SD/docling-core.git", rev = "8223654d87631ec61b9ec3570728e878d85d2ecf"}
|
||||
docutils = "!=0.21"
|
||||
matplotlib = "^3.7.1"
|
||||
networkx = "^3.1"
|
||||
@ -909,8 +909,8 @@ toolkit = ["deepsearch-toolkit (>=0.31.0)"]
|
||||
[package.source]
|
||||
type = "git"
|
||||
url = "https://github.com/DS4SD/deepsearch-glm.git"
|
||||
reference = "af4557df1500d15f82a0e0c9d2a3b64afc3e6ac1"
|
||||
resolved_reference = "af4557df1500d15f82a0e0c9d2a3b64afc3e6ac1"
|
||||
reference = "53874bd5c39bb3fe389663992b3efd3fedaf5697"
|
||||
resolved_reference = "53874bd5c39bb3fe389663992b3efd3fedaf5697"
|
||||
|
||||
[[package]]
|
||||
name = "dill"
|
||||
@ -958,8 +958,8 @@ tabulate = "^0.9.0"
|
||||
[package.source]
|
||||
type = "git"
|
||||
url = "https://github.com/DS4SD/docling-core.git"
|
||||
reference = "baceeaeaa690a12f717918d17336fcbfe414cbb8"
|
||||
resolved_reference = "baceeaeaa690a12f717918d17336fcbfe414cbb8"
|
||||
reference = "8223654d87631ec61b9ec3570728e878d85d2ecf"
|
||||
resolved_reference = "8223654d87631ec61b9ec3570728e878d85d2ecf"
|
||||
|
||||
[[package]]
|
||||
name = "docling-ibm-models"
|
||||
@ -3440,9 +3440,9 @@ files = [
|
||||
[package.dependencies]
|
||||
numpy = [
|
||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
||||
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
|
||||
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
|
||||
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -3576,8 +3576,8 @@ files = [
|
||||
[package.dependencies]
|
||||
numpy = [
|
||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
|
||||
{version = ">=1.22.4", markers = "python_version < \"3.11\""},
|
||||
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
|
||||
]
|
||||
python-dateutil = ">=2.8.2"
|
||||
pytz = ">=2020.1"
|
||||
@ -7107,4 +7107,4 @@ tesserocr = ["tesserocr"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.10"
|
||||
content-hash = "76695cfbcb87589dc2d8bc05b42969d558962122a9375e62ce68eed39cb0e634"
|
||||
content-hash = "d09e865ced8e4de077898f499cfd6e487b655e25ac2fe34b2159d91cb85b5238"
|
||||
|
@ -37,9 +37,9 @@ torchvision = [
|
||||
######################
|
||||
python = "^3.10"
|
||||
pydantic = "^2.0.0"
|
||||
docling-core = {git = "https://github.com/DS4SD/docling-core.git", rev = "baceeaeaa690a12f717918d17336fcbfe414cbb8"}
|
||||
docling-core = {git = "https://github.com/DS4SD/docling-core.git", rev = "8223654d87631ec61b9ec3570728e878d85d2ecf"}
|
||||
docling-ibm-models = {git = "https://github.com/DS4SD/docling-ibm-models.git", rev = "1d2e2a2e6eb152c237f1383cdba20cf85db80b97"}
|
||||
deepsearch-glm = {git = "https://github.com/DS4SD/deepsearch-glm.git", rev = "af4557df1500d15f82a0e0c9d2a3b64afc3e6ac1"}
|
||||
deepsearch-glm = {git = "https://github.com/DS4SD/deepsearch-glm.git", rev = "53874bd5c39bb3fe389663992b3efd3fedaf5697"}
|
||||
docling-parse = "^1.5.1"
|
||||
|
||||
filetype = "^1.2.0"
|
||||
|
Loading…
Reference in New Issue
Block a user