mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
feat: export document pages as multimodal output
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
48f4d1ba52
commit
3e789dfbdd
@ -71,6 +71,15 @@ class BoundingBox(BaseModel):
|
|||||||
|
|
||||||
return out_bbox
|
return out_bbox
|
||||||
|
|
||||||
|
def normalized(self, page_size: PageSize) -> "BoundingBox":
    """Return a copy of this box scaled by the page dimensions.

    The horizontal edges (``l``, ``r``) are divided by ``page_size.width``
    and the vertical edges (``t``, ``b``) by ``page_size.height``. The
    receiver is left untouched; every other field is carried over by the
    deep copy.
    """
    scaled = copy.deepcopy(self)

    # Horizontal edges are expressed relative to the page width ...
    scaled.l = scaled.l / page_size.width
    scaled.r = scaled.r / page_size.width
    # ... and vertical edges relative to the page height.
    scaled.t = scaled.t / page_size.height
    scaled.b = scaled.b / page_size.height

    return scaled
|
||||||
|
|
||||||
def as_tuple(self):
|
def as_tuple(self):
|
||||||
if self.coord_origin == CoordOrigin.TOPLEFT:
|
if self.coord_origin == CoordOrigin.TOPLEFT:
|
||||||
return (self.l, self.t, self.r, self.b)
|
return (self.l, self.t, self.r, self.b)
|
||||||
|
193
docling/utils/export.py
Normal file
193
docling/utils/export.py
Normal file
@ -0,0 +1,193 @@
|
|||||||
|
import logging
|
||||||
|
from typing import Any, Dict, Iterable, List, Tuple
|
||||||
|
|
||||||
|
from docling_core.types.doc.base import BaseCell, Ref, Table, TableCell
|
||||||
|
|
||||||
|
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
|
||||||
|
from docling.datamodel.document import ConvertedDocument, Page
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _export_table_to_html(table: Table):
|
||||||
|
|
||||||
|
# TODO: this is flagged as internal, because we will move it
|
||||||
|
# to the docling-core package.
|
||||||
|
|
||||||
|
def _get_tablecell_span(cell: TableCell, ix):
|
||||||
|
span = set([s[ix] for s in cell.spans])
|
||||||
|
if len(span) == 0:
|
||||||
|
return 1, None, None
|
||||||
|
return len(span), min(span), max(span)
|
||||||
|
|
||||||
|
body = ""
|
||||||
|
nrows = table.num_rows
|
||||||
|
ncols = table.num_cols
|
||||||
|
|
||||||
|
for i in range(nrows):
|
||||||
|
body += "<tr>"
|
||||||
|
for j in range(ncols):
|
||||||
|
cell: TableCell = table.data[i][j]
|
||||||
|
|
||||||
|
rowspan, rowstart, rowend = _get_tablecell_span(cell, 0)
|
||||||
|
colspan, colstart, colend = _get_tablecell_span(cell, 1)
|
||||||
|
|
||||||
|
if rowstart is not None and rowstart != i:
|
||||||
|
continue
|
||||||
|
if colstart is not None and colstart != j:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if rowstart is None:
|
||||||
|
rowstart = i
|
||||||
|
if colstart is None:
|
||||||
|
colstart = j
|
||||||
|
|
||||||
|
content = cell.text.strip()
|
||||||
|
label = cell.obj_type
|
||||||
|
label_class = "body"
|
||||||
|
celltag = "td"
|
||||||
|
if label in ["row_header", "row_multi_header", "row_title"]:
|
||||||
|
label_class = "header"
|
||||||
|
elif label in ["col_header", "col_multi_header"]:
|
||||||
|
label_class = "header"
|
||||||
|
celltag = "th"
|
||||||
|
|
||||||
|
opening_tag = f"{celltag}"
|
||||||
|
if rowspan > 1:
|
||||||
|
opening_tag += f' rowspan="{rowspan}"'
|
||||||
|
if colspan > 1:
|
||||||
|
opening_tag += f' colspan="{colspan}"'
|
||||||
|
|
||||||
|
body += f"<{opening_tag}>{content}</{celltag}>"
|
||||||
|
body += "</tr>"
|
||||||
|
body = f"<table>{body}</table>"
|
||||||
|
|
||||||
|
return body
|
||||||
|
|
||||||
|
|
||||||
|
def generate_multimodal_pages(
    doc_result: ConvertedDocument,
) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
    """Yield one multimodal record per page of a converted document.

    The items of ``doc_result.output.main_text`` are walked in order and
    grouped by the page of their first provenance entry. Whenever an item on
    a later page shows up, the page collected so far is emitted; the last
    collected page is emitted after the loop. Items without provenance are
    skipped (with a debug log), so a page that contributes no items is never
    yielded.

    Each yielded tuple is
    ``(content_text, content_md, page_cells, page_segments, page)``:

    * ``content_text`` -- the non-empty item texts of the page, each followed
      by a single space.
    * ``content_md`` -- markdown exported for the page's main-text range.
    * ``page_cells`` -- one dict per cell in ``page.cells`` with ``text``,
      ``bbox``, ``ocr`` and ``ocr_confidence``.
    * ``page_segments`` -- one dict per item whose type has a mapped label,
      with ``index_in_doc``, ``label``, ``text``, ``bbox`` and ``data``.
    * ``page`` -- the matching entry of ``doc_result.pages``.

    All bounding boxes are converted to a top-left origin and normalized by
    the page size.
    """
    label_to_doclaynet = {
        "title": "title",
        "table-of-contents": "document_index",
        "subtitle-level-1": "section_header",
        "checkbox-selected": "checkbox_selected",
        "checkbox-unselected": "checkbox_unselected",
        "caption": "caption",
        "page-header": "page_header",
        "page-footer": "page_footer",
        "footnote": "footnote",
        "table": "table",
        "formula": "formula",
        "list-item": "list_item",
        "code": "code",
        "figure": "picture",
        "picture": "picture",
        "reference": "text",
        "paragraph": "text",
        "text": "text",
    }

    doc = doc_result.output

    def _process_page_segments(entries: list[Tuple[int, BaseCell]], page: Page):
        """Build the segment dicts for the items collected on ``page``."""
        segments = []
        for doc_ix, entry in entries:
            label = label_to_doclaynet.get(entry.obj_type)
            if label is None:
                # Item types without a mapped label are not exported.
                continue

            # Provenance boxes are read as bottom-left-origin tuples.
            source_bbox = BoundingBox.from_tuple(
                entry.prov[0].bbox, origin=CoordOrigin.BOTTOMLEFT
            )
            top_left_bbox = source_bbox.to_top_left_origin(
                page_height=page.size.height
            )
            scaled_bbox = top_left_bbox.normalized(page_size=page.size)

            segment = {
                "index_in_doc": doc_ix,
                "label": label,
                "text": "" if entry.text is None else entry.text,
                "bbox": scaled_bbox.as_tuple(),
                "data": [],
            }
            if isinstance(entry, Table):
                segment["data"].append(
                    {
                        "html_seq": _export_table_to_html(entry),
                        "otsl_seq": "",
                    }
                )
            segments.append(segment)
        return segments

    def _process_page_cells(page: Page):
        """Build the cell dicts for every cell on ``page``."""

        def _cell_record(cell):
            scaled_bbox = cell.bbox.to_top_left_origin(
                page_height=page.size.height
            ).normalized(page_size=page.size)
            from_ocr = isinstance(cell, OcrCell)
            # Non-OCR cells are reported with full confidence.
            confidence = cell.confidence if from_ocr else 1.0
            return {
                "text": cell.text,
                "bbox": scaled_bbox.as_tuple(),
                "ocr": from_ocr,
                "ocr_confidence": confidence,
            }

        return [_cell_record(cell) for cell in page.cells]

    def _process_page(current_page_no, first_ix, last_ix, entries, text_parts):
        """Assemble the output tuple for the page collected so far."""
        # Provenance page numbers are turned into an index into
        # doc_result.pages by subtracting one.
        page = doc_result.pages[current_page_no - 1]

        page_cells = _process_page_cells(page=page)
        page_segments = _process_page_segments(entries, page=page)
        # NOTE(review): last_ix is the index of the page's last item. If
        # export_to_markdown treats main_text_stop as exclusive, that item is
        # missing from the markdown -- confirm against docling-core.
        content_md = doc.export_to_markdown(
            main_text_start=first_ix, main_text_stop=last_ix
        )
        return "".join(text_parts), content_md, page_cells, page_segments, page

    page_no = 0
    start_ix = 0
    end_ix = 0
    doc_items = []
    text_parts = []

    for ix, orig_item in enumerate(doc.main_text):
        item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
        if item is None or not item.prov:
            _log.debug(f"Skipping item {orig_item}")
            continue

        item_page = item.prov[0].page

        # Page is complete
        if page_no > 0 and item_page > page_no:
            yield _process_page(page_no, start_ix, end_ix, doc_items, text_parts)

            start_ix = ix
            doc_items = []
            text_parts = []

        page_no = item_page
        end_ix = ix
        doc_items.append((ix, item))
        if item.text:
            text_parts.append(item.text + " ")

    if doc_items:
        yield _process_page(page_no, start_ix, end_ix, doc_items, text_parts)
|
87
examples/export_multimodal.py
Normal file
87
examples/export_multimodal.py
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from docling.datamodel.base_models import AssembleOptions, ConversionStatus
|
||||||
|
from docling.datamodel.document import DocumentConversionInput
|
||||||
|
from docling.document_converter import DocumentConverter
|
||||||
|
from docling.utils.export import generate_multimodal_pages
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Scale assigned to AssembleOptions.images_scale in main(); a scale of 1
# corresponds to a standard 72 DPI image.
IMAGE_RESOLUTION_SCALE = 2.0
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Convert the sample PDF and write one parquet file per document.

    Every successfully converted document is expanded into one row per page
    (page image, cells, plain-text and markdown content, layout segments and
    some page metadata) and stored as ``./scratch/<document stem>.parquet``.
    """
    logging.basicConfig(level=logging.INFO)

    output_dir = Path("./scratch")
    input_doc_paths = [
        Path("./test/data/2206.01062.pdf"),
    ]
    input_files = DocumentConversionInput.from_paths(input_doc_paths)

    # Important: to work with page images we must keep them, otherwise the
    # DocumentConverter destroys them to free memory.
    # This is done by setting AssembleOptions.images_scale, which also defines
    # the scale of the images.
    # scale=1 corresponds to a standard 72 DPI image.
    assemble_options = AssembleOptions()
    assemble_options.images_scale = IMAGE_RESOLUTION_SCALE

    doc_converter = DocumentConverter(assemble_options=assemble_options)

    def _page_row(doc, pack):
        """Turn one generate_multimodal_pages() tuple into a flat row dict."""
        content_text, content_md, page_cells, page_segments, page = pack

        dpi = page._default_image_scale * 72

        image = page.image
        return {
            "document": doc.input.file.name,
            "hash": doc.input.document_hash,
            "page_hash": page.page_hash,
            "image": {
                "width": image.width,
                "height": image.height,
                "bytes": image.tobytes(),
            },
            "cells": page_cells,
            "contents": content_text,
            "contents_md": content_md,
            "segments": page_segments,
            "extra": {
                "page_num": page.page_no + 1,
                "width_in_points": page.size.width,
                "height_in_points": page.size.height,
                "dpi": dpi,
            },
        }

    start_time = time.time()

    converted_docs = doc_converter.convert(input_files)

    output_dir.mkdir(parents=True, exist_ok=True)
    for doc in converted_docs:
        if doc.status != ConversionStatus.SUCCESS:
            _log.info(f"Document {doc.input.file} failed to convert.")
            continue

        doc_filename = doc.input.file.stem

        rows = [_page_row(doc, pack) for pack in generate_multimodal_pages(doc)]
        df = pd.json_normalize(rows)

        output_filename = output_dir / f"{doc_filename}.parquet"
        df.to_parquet(output_filename)

    end_time = time.time() - start_time

    _log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the example only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()
|
68
poetry.lock
generated
68
poetry.lock
generated
@ -26,8 +26,8 @@ files = [
|
|||||||
lazy-object-proxy = ">=1.4.0"
|
lazy-object-proxy = ">=1.4.0"
|
||||||
typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""}
|
typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""}
|
||||||
wrapt = [
|
wrapt = [
|
||||||
{version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
|
|
||||||
{version = ">=1.11,<2", markers = "python_version < \"3.11\""},
|
{version = ">=1.11,<2", markers = "python_version < \"3.11\""},
|
||||||
|
{version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -2670,10 +2670,10 @@ files = [
|
|||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
numpy = [
|
numpy = [
|
||||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
|
||||||
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
|
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
|
||||||
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
|
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
|
||||||
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
||||||
|
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -2727,9 +2727,9 @@ files = [
|
|||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
numpy = [
|
numpy = [
|
||||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
|
||||||
{version = ">=1.22.4", markers = "python_version < \"3.11\""},
|
{version = ">=1.22.4", markers = "python_version < \"3.11\""},
|
||||||
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
|
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
|
||||||
|
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||||
]
|
]
|
||||||
python-dateutil = ">=2.8.2"
|
python-dateutil = ">=2.8.2"
|
||||||
pytz = ">=2020.1"
|
pytz = ">=2020.1"
|
||||||
@ -3080,6 +3080,57 @@ files = [
|
|||||||
[package.extras]
|
[package.extras]
|
||||||
tests = ["pytest"]
|
tests = ["pytest"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pyarrow"
|
||||||
|
version = "17.0.0"
|
||||||
|
description = "Python library for Apache Arrow"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
files = [
|
||||||
|
{file = "pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07"},
|
||||||
|
{file = "pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655"},
|
||||||
|
{file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545"},
|
||||||
|
{file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2"},
|
||||||
|
{file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8"},
|
||||||
|
{file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047"},
|
||||||
|
{file = "pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087"},
|
||||||
|
{file = "pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977"},
|
||||||
|
{file = "pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3"},
|
||||||
|
{file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15"},
|
||||||
|
{file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597"},
|
||||||
|
{file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420"},
|
||||||
|
{file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4"},
|
||||||
|
{file = "pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03"},
|
||||||
|
{file = "pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22"},
|
||||||
|
{file = "pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053"},
|
||||||
|
{file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a"},
|
||||||
|
{file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc"},
|
||||||
|
{file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a"},
|
||||||
|
{file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b"},
|
||||||
|
{file = "pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7"},
|
||||||
|
{file = "pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204"},
|
||||||
|
{file = "pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8"},
|
||||||
|
{file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155"},
|
||||||
|
{file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145"},
|
||||||
|
{file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c"},
|
||||||
|
{file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c"},
|
||||||
|
{file = "pyarrow-17.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca"},
|
||||||
|
{file = "pyarrow-17.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb"},
|
||||||
|
{file = "pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df"},
|
||||||
|
{file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687"},
|
||||||
|
{file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b"},
|
||||||
|
{file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5"},
|
||||||
|
{file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda"},
|
||||||
|
{file = "pyarrow-17.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204"},
|
||||||
|
{file = "pyarrow-17.0.0.tar.gz", hash = "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
numpy = ">=1.16.6"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pybind11"
|
name = "pybind11"
|
||||||
version = "2.13.5"
|
version = "2.13.5"
|
||||||
@ -3184,8 +3235,8 @@ files = [
|
|||||||
annotated-types = ">=0.4.0"
|
annotated-types = ">=0.4.0"
|
||||||
pydantic-core = "2.20.1"
|
pydantic-core = "2.20.1"
|
||||||
typing-extensions = [
|
typing-extensions = [
|
||||||
{version = ">=4.12.2", markers = "python_version >= \"3.13\""},
|
|
||||||
{version = ">=4.6.1", markers = "python_version < \"3.13\""},
|
{version = ">=4.6.1", markers = "python_version < \"3.13\""},
|
||||||
|
{version = ">=4.12.2", markers = "python_version >= \"3.13\""},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.extras]
|
[package.extras]
|
||||||
@ -3352,8 +3403,8 @@ files = [
|
|||||||
astroid = ">=2.15.8,<=2.17.0-dev0"
|
astroid = ">=2.15.8,<=2.17.0-dev0"
|
||||||
colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
|
colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
|
||||||
dill = [
|
dill = [
|
||||||
{version = ">=0.3.6", markers = "python_version >= \"3.11\""},
|
|
||||||
{version = ">=0.2", markers = "python_version < \"3.11\""},
|
{version = ">=0.2", markers = "python_version < \"3.11\""},
|
||||||
|
{version = ">=0.3.6", markers = "python_version >= \"3.11\""},
|
||||||
]
|
]
|
||||||
isort = ">=4.2.5,<6"
|
isort = ">=4.2.5,<6"
|
||||||
mccabe = ">=0.6,<0.8"
|
mccabe = ">=0.6,<0.8"
|
||||||
@ -4887,6 +4938,11 @@ files = [
|
|||||||
{file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"},
|
{file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"},
|
||||||
{file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"},
|
{file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"},
|
||||||
{file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"},
|
{file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"},
|
||||||
|
{file = "triton-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39b052da883351fdf6be3d93cedae6db3b8e3988d3b09ed221bccecfa9612230"},
|
||||||
|
{file = "triton-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd34f19a8582af96e6291d4afce25dac08cb2a5d218c599163761e8e0827208e"},
|
||||||
|
{file = "triton-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d5e10de8c011adeb7c878c6ce0dd6073b14367749e34467f1cff2bde1b78253"},
|
||||||
|
{file = "triton-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8903767951bf86ec960b4fe4e21bc970055afc65e9d57e916d79ae3c93665e3"},
|
||||||
|
{file = "triton-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41004fb1ae9a53fcb3e970745feb87f0e3c94c6ce1ba86e95fa3b8537894bef7"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
@ -5245,4 +5301,4 @@ type = ["pytest-mypy"]
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.10"
|
python-versions = "^3.10"
|
||||||
content-hash = "9f876a0fe3d1f350eb9279f6e8ec8c481d2ce196d1bdd9834f4b89c881658074"
|
content-hash = "03159c4cb3c5e1fa19df5e67a54932836eb88c78494467dd385f88fa99866071"
|
||||||
|
@ -23,7 +23,7 @@ packages = [{include = "docling"}]
|
|||||||
[tool.poetry.dependencies]
|
[tool.poetry.dependencies]
|
||||||
python = "^3.10"
|
python = "^3.10"
|
||||||
pydantic = "^2.0.0"
|
pydantic = "^2.0.0"
|
||||||
docling-core = "^1.1.2"
|
docling-core = "^1.1.3"
|
||||||
docling-ibm-models = "^1.1.3"
|
docling-ibm-models = "^1.1.3"
|
||||||
deepsearch-glm = "^0.19.1"
|
deepsearch-glm = "^0.19.1"
|
||||||
filetype = "^1.2.0"
|
filetype = "^1.2.0"
|
||||||
@ -36,6 +36,7 @@ docling-parse = "^1.1.3"
|
|||||||
certifi = ">=2024.7.4"
|
certifi = ">=2024.7.4"
|
||||||
rtree = "^1.3.0"
|
rtree = "^1.3.0"
|
||||||
scipy = "^1.14.1"
|
scipy = "^1.14.1"
|
||||||
|
pyarrow = "^17.0.0"
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
black = {extras = ["jupyter"], version = "^24.4.2"}
|
black = {extras = ["jupyter"], version = "^24.4.2"}
|
||||||
|
Loading…
Reference in New Issue
Block a user