Upgrading docling core and adding groups

This commit is contained in:
Rafael Teixeira de Lima 2025-03-04 17:18:40 +01:00
parent 5630c6b8fd
commit 655e95dd72
19 changed files with 500 additions and 238 deletions

View File

View File

@ -268,4 +268,4 @@ LIM_TO = ("\\rightarrow", "\\to")
LIM_UPP = "\\overset{{{lim}}}{{{text}}}" LIM_UPP = "\\overset{{{lim}}}{{{text}}}"
M = "\\begin{{matrix}}{text}\end{{matrix}}" M = "\\begin{{matrix}}{text}\\end{{matrix}}"

View File

@ -8,7 +8,7 @@ On 23/01/2025
import lxml.etree as ET import lxml.etree as ET
from pylatexenc.latexencode import UnicodeToLatexEncoder from pylatexenc.latexencode import UnicodeToLatexEncoder
from docling.backend.docx_latex.latex_dict import ( from docling.backend.docx.latex.latex_dict import (
ALN, ALN,
ARR, ARR,
BACKSLASH, BACKSLASH,

View File

@ -26,7 +26,7 @@ from PIL import Image, UnidentifiedImageError
from typing_extensions import override from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.docx_latex.omml import oMath2Latex from docling.backend.docx.latex.omml import oMath2Latex
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docling.datamodel.document import InputDocument
@ -164,7 +164,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
) -> DoclingDocument: ) -> DoclingDocument:
for element in body: for element in body:
tag_name = etree.QName(element).localname tag_name = etree.QName(element).localname
# Check for Inline Images (blip elements) # Check for Inline Images (blip elements)
namespaces = { namespaces = {
"a": "http://schemas.openxmlformats.org/drawingml/2006/main", "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
@ -262,6 +261,24 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
else: else:
return label, None return label, None
def handle_equations_in_text(self, element, text):
    """Rebuild a paragraph's text with its equations rendered as LaTeX.

    Walks every descendant of ``element`` and collects, in document order,
    the plain text runs (``t`` elements whose tag does not contain "math")
    and the equations (``oMath`` elements that are not ``oMathPara``
    wrappers), the latter converted with ``oMath2Latex``.

    Args:
        element: XML element of the paragraph being processed.
        text: Raw paragraph text, used to check that the collected text
            runs reproduce the paragraph faithfully.

    Returns:
        A ``(text, equations)`` tuple. When the collected text runs match
        ``text``, the first item is the text interleaved with the LaTeX
        equations and the second is the list of LaTeX equations in order.
        Otherwise the original ``text`` is returned with an empty list.
    """
    only_texts = []
    only_equations = []
    texts_and_equations = []
    for subt in element.iter():
        tag_name = etree.QName(subt).localname
        if tag_name == "t" and "math" not in subt.tag:
            # A text run without content has ``text`` set to None, which
            # would make the ``"".join`` calls below raise TypeError.
            if isinstance(subt.text, str):
                only_texts.append(subt.text)
                texts_and_equations.append(subt.text)
        elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
            latex_equation = str(oMath2Latex(subt))
            only_equations.append(latex_equation)
            texts_and_equations.append(latex_equation)

    if "".join(only_texts) != text:
        # The raw text could not be reconstructed from the runs, so do not
        # attempt equation handling. Keep the two-item return shape: the
        # caller unpacks the result into ``text, equations``.
        return text, []

    return "".join(texts_and_equations), only_equations
def handle_text_elements( def handle_text_elements(
self, self,
@ -272,7 +289,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
paragraph = Paragraph(element, docx_obj) paragraph = Paragraph(element, docx_obj)
raw_text = paragraph.text raw_text = paragraph.text
text = self.handle_equations_in_text(element=element, text=raw_text) text, equations = self.handle_equations_in_text(element=element, text=raw_text)
if text is None: if text is None:
return return
@ -326,36 +343,57 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.parents[0] = doc.add_text( self.parents[0] = doc.add_text(
parent=None, label=DocItemLabel.TITLE, text=text parent=None, label=DocItemLabel.TITLE, text=text
) )
elif "Heading" in p_style_id: elif "Heading" in p_style_id:
self.add_header(doc, p_level, text) self.add_header(doc, p_level, text)
elif p_style_id in [ elif len(equations) > 0:
"Subtitle", if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
"Author",
"ListParagraph",
"ListBullet",
"Quote",
]:
level = self.get_level()
doc.add_text(
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
)
elif (raw_text is None or len(raw_text) == 0) and len(text) > 0:
# Standalone equation # Standalone equation
# Entities in which all text comes from equations
level = self.get_level() level = self.get_level()
if text.strip().startswith("$") and text.strip().endswith("$"):
text = text.strip()[1:-1]
doc.add_text( doc.add_text(
label=DocItemLabel.FORMULA, parent=self.parents[level - 1], text=text label=DocItemLabel.FORMULA,
parent=self.parents[level - 1],
text=text,
)
else:
# Inline equation
level = self.get_level()
inline_equation = doc.add_group(
label=GroupLabel.INLINE, parent=self.parents[level - 1]
)
text_tmp = text
for eq in equations:
if len(text_tmp) == 0:
break
pre_eq_text = text_tmp.split(eq, maxsplit=1)[0]
text_tmp = text_tmp.split(eq, maxsplit=1)[1]
if len(pre_eq_text) > 0:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
parent=inline_equation,
text=pre_eq_text,
)
doc.add_text(
label=DocItemLabel.FORMULA,
parent=inline_equation,
text=eq,
)
if len(text_tmp) > 0:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
parent=inline_equation,
text=text_tmp,
) )
elif p_style_id in [ elif p_style_id in [
"Paragraph", "Paragraph",
"Normal", "Normal",
"Subtitle",
"Author",
"DefaultText", "DefaultText",
"ListParagraph",
"ListBullet",
"Quote",
]: ]:
level = self.get_level() level = self.get_level()
doc.add_text( doc.add_text(
@ -367,8 +405,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# hence we treat all other labels as pure text # hence we treat all other labels as pure text
level = self.get_level() level = self.get_level()
doc.add_text( doc.add_text(
label=DocItemLabel.TEXT, parent=self.parents[level - 1], text=text label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
) )
self.update_history(p_style_id, p_level, numid, ilevel) self.update_history(p_style_id, p_level, numid, ilevel)
return return

122
poetry.lock generated
View File

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. # This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.
[[package]] [[package]]
name = "accelerate" name = "accelerate"
@ -33,13 +33,13 @@ testing = ["bitsandbytes", "datasets", "diffusers", "evaluate", "parameterized",
[[package]] [[package]]
name = "aiohappyeyeballs" name = "aiohappyeyeballs"
version = "2.4.6" version = "2.4.8"
description = "Happy Eyeballs for asyncio" description = "Happy Eyeballs for asyncio"
optional = false optional = false
python-versions = ">=3.9" python-versions = ">=3.9"
files = [ files = [
{file = "aiohappyeyeballs-2.4.6-py3-none-any.whl", hash = "sha256:147ec992cf873d74f5062644332c539fcd42956dc69453fe5204195e560517e1"}, {file = "aiohappyeyeballs-2.4.8-py3-none-any.whl", hash = "sha256:6cac4f5dd6e34a9644e69cf9021ef679e4394f54e58a183056d12009e42ea9e3"},
{file = "aiohappyeyeballs-2.4.6.tar.gz", hash = "sha256:9b05052f9042985d32ecbe4b59a77ae19c006a78f1344d7fdad69d28ded3d0b0"}, {file = "aiohappyeyeballs-2.4.8.tar.gz", hash = "sha256:19728772cb12263077982d2f55453babd8bec6a052a926cd5c0c42796da8bf62"},
] ]
[[package]] [[package]]
@ -311,6 +311,24 @@ files = [
docs = ["furo", "jaraco.packaging (>=9.3)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] docs = ["furo", "jaraco.packaging (>=9.3)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
testing = ["jaraco.test", "pytest (!=8.0.*)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)"] testing = ["jaraco.test", "pytest (!=8.0.*)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)"]
[[package]]
name = "backrefs"
version = "5.8"
description = "A wrapper around re and regex that adds additional back references."
optional = false
python-versions = ">=3.9"
files = [
{file = "backrefs-5.8-py310-none-any.whl", hash = "sha256:c67f6638a34a5b8730812f5101376f9d41dc38c43f1fdc35cb54700f6ed4465d"},
{file = "backrefs-5.8-py311-none-any.whl", hash = "sha256:2e1c15e4af0e12e45c8701bd5da0902d326b2e200cafcd25e49d9f06d44bb61b"},
{file = "backrefs-5.8-py312-none-any.whl", hash = "sha256:bbef7169a33811080d67cdf1538c8289f76f0942ff971222a16034da88a73486"},
{file = "backrefs-5.8-py313-none-any.whl", hash = "sha256:e3a63b073867dbefd0536425f43db618578528e3896fb77be7141328642a1585"},
{file = "backrefs-5.8-py39-none-any.whl", hash = "sha256:a66851e4533fb5b371aa0628e1fee1af05135616b86140c9d787a2ffdf4b8fdc"},
{file = "backrefs-5.8.tar.gz", hash = "sha256:2cab642a205ce966af3dd4b38ee36009b31fa9502a35fd61d59ccc116e40a6bd"},
]
[package.extras]
extras = ["regex"]
[[package]] [[package]]
name = "beautifulsoup4" name = "beautifulsoup4"
version = "4.13.3" version = "4.13.3"
@ -852,13 +870,13 @@ files = [
[[package]] [[package]]
name = "docling-core" name = "docling-core"
version = "2.20.0" version = "2.21.1"
description = "A python library to define and validate data types in Docling." description = "A python library to define and validate data types in Docling."
optional = false optional = false
python-versions = "<4.0,>=3.9" python-versions = "<4.0,>=3.9"
files = [ files = [
{file = "docling_core-2.20.0-py3-none-any.whl", hash = "sha256:72f50fce277b7bb51f4134f443240c041582184305c3bcaabdea13fc5550f160"}, {file = "docling_core-2.21.1-py3-none-any.whl", hash = "sha256:b8112915728cdc14f328f636f6c0ed36e6bbcc02ff940cc0bf85e303738671c3"},
{file = "docling_core-2.20.0.tar.gz", hash = "sha256:9733581c15f5a9b5e3a6cb74fa995cc4078ff16668007f86c5f75d1ea9180d7f"}, {file = "docling_core-2.21.1.tar.gz", hash = "sha256:3ccc50197d24a3156cfc6c22c8404c58757749646d876a1c1c69fd800f664a4f"},
] ]
[package.dependencies] [package.dependencies]
@ -880,13 +898,13 @@ chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"]
[[package]] [[package]]
name = "docling-ibm-models" name = "docling-ibm-models"
version = "3.4.0" version = "3.4.1"
description = "This package contains the AI models used by the Docling PDF conversion package" description = "This package contains the AI models used by the Docling PDF conversion package"
optional = false optional = false
python-versions = "<4.0,>=3.9" python-versions = "<4.0,>=3.9"
files = [ files = [
{file = "docling_ibm_models-3.4.0-py3-none-any.whl", hash = "sha256:186517ff1f76e76113600fa1e5a699927325081a8013fdd5d0551121c2e34190"}, {file = "docling_ibm_models-3.4.1-py3-none-any.whl", hash = "sha256:c3582c99dddfa3f0eafcf80cf1267fd8efa39c4a74cc7a88f9dd49684fac2986"},
{file = "docling_ibm_models-3.4.0.tar.gz", hash = "sha256:fb79beeb07d1bb9bc8acf9d0a44643cd7ce1910aa418cd685e2e477b13eeafee"}, {file = "docling_ibm_models-3.4.1.tar.gz", hash = "sha256:093b4dff2ea284a4953c3aa009e29945208b8d389b94fb14940a03a93f673e96"},
] ]
[package.dependencies] [package.dependencies]
@ -1331,13 +1349,13 @@ test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit",
[[package]] [[package]]
name = "griffe" name = "griffe"
version = "1.5.7" version = "1.6.0"
description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API." description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API."
optional = false optional = false
python-versions = ">=3.9" python-versions = ">=3.9"
files = [ files = [
{file = "griffe-1.5.7-py3-none-any.whl", hash = "sha256:4af8ec834b64de954d447c7b6672426bb145e71605c74a4e22d510cc79fe7d8b"}, {file = "griffe-1.6.0-py3-none-any.whl", hash = "sha256:9f1dfe035d4715a244ed2050dfbceb05b1f470809ed4f6bb10ece5a7302f8dd1"},
{file = "griffe-1.5.7.tar.gz", hash = "sha256:465238c86deaf1137761f700fb343edd8ffc846d72f6de43c3c345ccdfbebe92"}, {file = "griffe-1.6.0.tar.gz", hash = "sha256:eb5758088b9c73ad61c7ac014f3cdfb4c57b5c2fcbfca69996584b702aefa354"},
] ]
[package.dependencies] [package.dependencies]
@ -1818,18 +1836,18 @@ testing = ["Django", "attrs", "colorama", "docopt", "pytest (<9.0.0)"]
[[package]] [[package]]
name = "jeepney" name = "jeepney"
version = "0.8.0" version = "0.9.0"
description = "Low-level, pure Python DBus protocol wrapper." description = "Low-level, pure Python DBus protocol wrapper."
optional = false optional = false
python-versions = ">=3.7" python-versions = ">=3.7"
files = [ files = [
{file = "jeepney-0.8.0-py3-none-any.whl", hash = "sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755"}, {file = "jeepney-0.9.0-py3-none-any.whl", hash = "sha256:97e5714520c16fc0a45695e5365a2e11b81ea79bba796e26f9f1d178cb182683"},
{file = "jeepney-0.8.0.tar.gz", hash = "sha256:5efe48d255973902f6badc3ce55e2aa6c5c3b3bc642059ef3a91247bcfcc5806"}, {file = "jeepney-0.9.0.tar.gz", hash = "sha256:cf0e9e845622b81e4a28df94c40345400256ec608d0e55bb8a3feaa9163f5732"},
] ]
[package.extras] [package.extras]
test = ["async-timeout", "pytest", "pytest-asyncio (>=0.17)", "pytest-trio", "testpath", "trio"] test = ["async-timeout", "pytest", "pytest-asyncio (>=0.17)", "pytest-trio", "testpath", "trio"]
trio = ["async_generator", "trio"] trio = ["trio"]
[[package]] [[package]]
name = "jinja2" name = "jinja2"
@ -2715,17 +2733,18 @@ pygments = ">2.12.0"
[[package]] [[package]]
name = "mkdocs-material" name = "mkdocs-material"
version = "9.6.5" version = "9.6.7"
description = "Documentation that simply works" description = "Documentation that simply works"
optional = false optional = false
python-versions = ">=3.8" python-versions = ">=3.8"
files = [ files = [
{file = "mkdocs_material-9.6.5-py3-none-any.whl", hash = "sha256:aad3e6fb860c20870f75fb2a69ef901f1be727891e41adb60b753efcae19453b"}, {file = "mkdocs_material-9.6.7-py3-none-any.whl", hash = "sha256:8a159e45e80fcaadd9fbeef62cbf928569b93df954d4dc5ba76d46820caf7b47"},
{file = "mkdocs_material-9.6.5.tar.gz", hash = "sha256:b714679a8c91b0ffe2188e11ed58c44d2523e9c2ae26a29cc652fa7478faa21f"}, {file = "mkdocs_material-9.6.7.tar.gz", hash = "sha256:3e2c1fceb9410056c2d91f334a00cdea3215c28750e00c691c1e46b2a33309b4"},
] ]
[package.dependencies] [package.dependencies]
babel = ">=2.10,<3.0" babel = ">=2.10,<3.0"
backrefs = ">=5.7.post1,<6.0"
colorama = ">=0.4,<1.0" colorama = ">=0.4,<1.0"
jinja2 = ">=3.0,<4.0" jinja2 = ">=3.0,<4.0"
markdown = ">=3.2,<4.0" markdown = ">=3.2,<4.0"
@ -2734,7 +2753,6 @@ mkdocs-material-extensions = ">=1.3,<2.0"
paginate = ">=0.5,<1.0" paginate = ">=0.5,<1.0"
pygments = ">=2.16,<3.0" pygments = ">=2.16,<3.0"
pymdown-extensions = ">=10.2,<11.0" pymdown-extensions = ">=10.2,<11.0"
regex = ">=2022.4"
requests = ">=2.26,<3.0" requests = ">=2.26,<3.0"
[package.extras] [package.extras]
@ -4755,13 +4773,13 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
[[package]] [[package]]
name = "pydantic-settings" name = "pydantic-settings"
version = "2.8.0" version = "2.8.1"
description = "Settings management using Pydantic" description = "Settings management using Pydantic"
optional = false optional = false
python-versions = ">=3.8" python-versions = ">=3.8"
files = [ files = [
{file = "pydantic_settings-2.8.0-py3-none-any.whl", hash = "sha256:c782c7dc3fb40e97b238e713c25d26f64314aece2e91abcff592fcac15f71820"}, {file = "pydantic_settings-2.8.1-py3-none-any.whl", hash = "sha256:81942d5ac3d905f7f3ee1a70df5dfb62d5569c12f51a5a647defc1c3d9ee2e9c"},
{file = "pydantic_settings-2.8.0.tar.gz", hash = "sha256:88e2ca28f6e68ea102c99c3c401d6c9078e68a5df600e97b43891c34e089500a"}, {file = "pydantic_settings-2.8.1.tar.gz", hash = "sha256:d5c663dfbe9db9d5e1c646b2e161da12f0d734d422ee56f567d0ea2cee4e8585"},
] ]
[package.dependencies] [package.dependencies]
@ -5907,26 +5925,26 @@ files = [
[[package]] [[package]]
name = "safetensors" name = "safetensors"
version = "0.5.2" version = "0.5.3"
description = "" description = ""
optional = false optional = false
python-versions = ">=3.7" python-versions = ">=3.7"
files = [ files = [
{file = "safetensors-0.5.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:45b6092997ceb8aa3801693781a71a99909ab9cc776fbc3fa9322d29b1d3bef2"}, {file = "safetensors-0.5.3-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:bd20eb133db8ed15b40110b7c00c6df51655a2998132193de2f75f72d99c7073"},
{file = "safetensors-0.5.2-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:6d0d6a8ee2215a440e1296b843edf44fd377b055ba350eaba74655a2fe2c4bae"}, {file = "safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:21d01c14ff6c415c485616b8b0bf961c46b3b343ca59110d38d744e577f9cce7"},
{file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86016d40bcaa3bcc9a56cd74d97e654b5f4f4abe42b038c71e4f00a089c4526c"}, {file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:11bce6164887cd491ca75c2326a113ba934be596e22b28b1742ce27b1d076467"},
{file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:990833f70a5f9c7d3fc82c94507f03179930ff7d00941c287f73b6fcbf67f19e"}, {file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4a243be3590bc3301c821da7a18d87224ef35cbd3e5f5727e4e0728b8172411e"},
{file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3dfa7c2f3fe55db34eba90c29df94bcdac4821043fc391cb5d082d9922013869"}, {file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8bd84b12b1670a6f8e50f01e28156422a2bc07fb16fc4e98bded13039d688a0d"},
{file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:46ff2116150ae70a4e9c490d2ab6b6e1b1b93f25e520e540abe1b81b48560c3a"}, {file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:391ac8cab7c829452175f871fcaf414aa1e292b5448bd02620f675a7f3e7abb9"},
{file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ab696dfdc060caffb61dbe4066b86419107a24c804a4e373ba59be699ebd8d5"}, {file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cead1fa41fc54b1e61089fa57452e8834f798cb1dc7a09ba3524f1eb08e0317a"},
{file = "safetensors-0.5.2-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:03c937100f38c9ff4c1507abea9928a6a9b02c9c1c9c3609ed4fb2bf413d4975"}, {file = "safetensors-0.5.3-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1077f3e94182d72618357b04b5ced540ceb71c8a813d3319f1aba448e68a770d"},
{file = "safetensors-0.5.2-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:a00e737948791b94dad83cf0eafc09a02c4d8c2171a239e8c8572fe04e25960e"}, {file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:799021e78287bac619c7b3f3606730a22da4cda27759ddf55d37c8db7511c74b"},
{file = "safetensors-0.5.2-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:d3a06fae62418ec8e5c635b61a8086032c9e281f16c63c3af46a6efbab33156f"}, {file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:df26da01aaac504334644e1b7642fa000bfec820e7cef83aeac4e355e03195ff"},
{file = "safetensors-0.5.2-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:1506e4c2eda1431099cebe9abf6c76853e95d0b7a95addceaa74c6019c65d8cf"}, {file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:32c3ef2d7af8b9f52ff685ed0bc43913cdcde135089ae322ee576de93eae5135"},
{file = "safetensors-0.5.2-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5c5b5d9da594f638a259fca766046f44c97244cc7ab8bef161b3e80d04becc76"}, {file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:37f1521be045e56fc2b54c606d4455573e717b2d887c579ee1dbba5f868ece04"},
{file = "safetensors-0.5.2-cp38-abi3-win32.whl", hash = "sha256:fe55c039d97090d1f85277d402954dd6ad27f63034fa81985a9cc59655ac3ee2"}, {file = "safetensors-0.5.3-cp38-abi3-win32.whl", hash = "sha256:cfc0ec0846dcf6763b0ed3d1846ff36008c6e7290683b61616c4b040f6a54ace"},
{file = "safetensors-0.5.2-cp38-abi3-win_amd64.whl", hash = "sha256:78abdddd03a406646107f973c7843276e7b64e5e32623529dc17f3d94a20f589"}, {file = "safetensors-0.5.3-cp38-abi3-win_amd64.whl", hash = "sha256:836cbbc320b47e80acd40e44c8682db0e8ad7123209f69b093def21ec7cafd11"},
{file = "safetensors-0.5.2.tar.gz", hash = "sha256:cb4a8d98ba12fa016f4241932b1fc5e702e5143f5374bba0bbcf7ddc1c4cf2b8"}, {file = "safetensors-0.5.3.tar.gz", hash = "sha256:b6b0d6ecacec39a4fdd99cc19f4576f5219ce858e6fd8dbe7609df0b8dc56965"},
] ]
[package.dependencies] [package.dependencies]
@ -6223,13 +6241,13 @@ train = ["accelerate (>=0.20.3)", "datasets"]
[[package]] [[package]]
name = "setuptools" name = "setuptools"
version = "75.8.1" version = "75.8.2"
description = "Easily download, build, install, upgrade, and uninstall Python packages" description = "Easily download, build, install, upgrade, and uninstall Python packages"
optional = false optional = false
python-versions = ">=3.9" python-versions = ">=3.9"
files = [ files = [
{file = "setuptools-75.8.1-py3-none-any.whl", hash = "sha256:3bc32c0b84c643299ca94e77f834730f126efd621de0cc1de64119e0e17dab1f"}, {file = "setuptools-75.8.2-py3-none-any.whl", hash = "sha256:558e47c15f1811c1fa7adbd0096669bf76c1d3f433f58324df69f3f5ecac4e8f"},
{file = "setuptools-75.8.1.tar.gz", hash = "sha256:65fb779a8f28895242923582eadca2337285f0891c2c9e160754df917c3d2530"}, {file = "setuptools-75.8.2.tar.gz", hash = "sha256:4880473a969e5f23f2a2be3646b2dfd84af9028716d398e46192f84bc36900d2"},
] ]
[package.extras] [package.extras]
@ -7227,13 +7245,13 @@ files = [
[[package]] [[package]]
name = "types-requests" name = "types-requests"
version = "2.32.0.20241016" version = "2.32.0.20250301"
description = "Typing stubs for requests" description = "Typing stubs for requests"
optional = false optional = false
python-versions = ">=3.8" python-versions = ">=3.9"
files = [ files = [
{file = "types-requests-2.32.0.20241016.tar.gz", hash = "sha256:0d9cad2f27515d0e3e3da7134a1b6f28fb97129d86b867f24d9c726452634d95"}, {file = "types_requests-2.32.0.20250301-py3-none-any.whl", hash = "sha256:0003e0124e2cbefefb88222ff822b48616af40c74df83350f599a650c8de483b"},
{file = "types_requests-2.32.0.20241016-py3-none-any.whl", hash = "sha256:4195d62d6d3e043a4eaaf08ff8a62184584d2e8684e9d2aa178c7915a7da3747"}, {file = "types_requests-2.32.0.20250301.tar.gz", hash = "sha256:3d909dc4eaab159c0d964ebe8bfa326a7afb4578d8706408d417e17d61b0c500"},
] ]
[package.dependencies] [package.dependencies]
@ -7241,13 +7259,13 @@ urllib3 = ">=2"
[[package]] [[package]]
name = "types-tqdm" name = "types-tqdm"
version = "4.67.0.20241221" version = "4.67.0.20250301"
description = "Typing stubs for tqdm" description = "Typing stubs for tqdm"
optional = false optional = false
python-versions = ">=3.8" python-versions = ">=3.9"
files = [ files = [
{file = "types_tqdm-4.67.0.20241221-py3-none-any.whl", hash = "sha256:a1f1c9cda5c2d8482d2c73957a5398bfdedda10f6bc7b3b4e812d5c910486d29"}, {file = "types_tqdm-4.67.0.20250301-py3-none-any.whl", hash = "sha256:8af97deb8e6874af833555dc1fe0fcd456b1a789470bf6cd8813d4e7ee4f6c5b"},
{file = "types_tqdm-4.67.0.20241221.tar.gz", hash = "sha256:e56046631056922385abe89aeb18af5611f471eadd7918a0ad7f34d84cd4c8cc"}, {file = "types_tqdm-4.67.0.20250301.tar.gz", hash = "sha256:5e89a38ad89b867823368eb97d9f90d2fc69806bb055dde62716a05da62b5e0d"},
] ]
[package.dependencies] [package.dependencies]
@ -7843,4 +7861,4 @@ vlm = ["accelerate", "transformers", "transformers"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.9" python-versions = "^3.9"
content-hash = "1d4718b694098b0676f1ad1606d769887e51fc29f604e5f4c83dd5e1c90557e7" content-hash = "a340b1230bc83cdcff125a84eee457b1d8786abc112f2c0553391a4ab9f092ea"

View File

@ -2,12 +2,32 @@
name = "docling" name = "docling"
version = "2.25.1" # DO NOT EDIT, updated automatically version = "2.25.1" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications." description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"] authors = [
"Christoph Auer <cau@zurich.ibm.com>",
"Michele Dolfi <dol@zurich.ibm.com>",
"Maxim Lysak <mly@zurich.ibm.com>",
"Nikos Livathinos <nli@zurich.ibm.com>",
"Ahmed Nassar <ahn@zurich.ibm.com>",
"Panos Vagenas <pva@zurich.ibm.com>",
"Peter Staar <taa@zurich.ibm.com>",
]
license = "MIT" license = "MIT"
readme = "README.md" readme = "README.md"
repository = "https://github.com/DS4SD/docling" repository = "https://github.com/DS4SD/docling"
homepage = "https://github.com/DS4SD/docling" homepage = "https://github.com/DS4SD/docling"
keywords= ["docling", "convert", "document", "pdf", "docx", "html", "markdown", "layout model", "segmentation", "table structure", "table former"] keywords = [
"docling",
"convert",
"document",
"pdf",
"docx",
"html",
"markdown",
"layout model",
"segmentation",
"table structure",
"table former",
]
classifiers = [ classifiers = [
"License :: OSI Approved :: MIT License", "License :: OSI Approved :: MIT License",
"Operating System :: MacOS :: MacOS X", "Operating System :: MacOS :: MacOS X",
@ -16,7 +36,7 @@ keywords= ["docling", "convert", "document", "pdf", "docx", "html", "markdown",
"Intended Audience :: Developers", "Intended Audience :: Developers",
"Intended Audience :: Science/Research", "Intended Audience :: Science/Research",
"Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Scientific/Engineering :: Artificial Intelligence",
"Programming Language :: Python :: 3" "Programming Language :: Python :: 3",
] ]
packages = [{ include = "docling" }] packages = [{ include = "docling" }]
@ -26,7 +46,7 @@ packages = [{include = "docling"}]
###################### ######################
python = "^3.9" python = "^3.9"
pydantic = "^2.0.0" pydantic = "^2.0.0"
docling-core = {extras = ["chunking"], version = "^2.19.0"} docling-core = { extras = ["chunking"], version = "^2.21.1" }
docling-ibm-models = "^3.4.0" docling-ibm-models = "^3.4.0"
docling-parse = "^3.3.0" docling-parse = "^3.3.0"
filetype = "^1.2.0" filetype = "^1.2.0"
@ -40,7 +60,7 @@ certifi = ">=2024.7.4"
rtree = "^1.3.0" rtree = "^1.3.0"
scipy = [ scipy = [
{ version = "^1.6.0", markers = "python_version >= '3.10'" }, { version = "^1.6.0", markers = "python_version >= '3.10'" },
{ version = ">=1.6.0,<1.14.0", markers = "python_version < '3.10'" } { version = ">=1.6.0,<1.14.0", markers = "python_version < '3.10'" },
] ]
typer = "^0.12.5" typer = "^0.12.5"
python-docx = "^1.1.2" python-docx = "^1.1.2"
@ -56,18 +76,19 @@ onnxruntime = [
# 1.19.2 is the last version with python3.9 support, # 1.19.2 is the last version with python3.9 support,
# see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0 # see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
{ version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" }, { version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" } { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" },
] ]
transformers = [ transformers = [
{ markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true }, { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true },
{markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true } { markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true },
] ]
accelerate = [ accelerate = [
{ markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^1.2.1", optional = true }, { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^1.2.1", optional = true },
] ]
pillow = ">=10.0.0,<12.0.0" pillow = ">=10.0.0,<12.0.0"
tqdm = "^4.65.0" tqdm = "^4.65.0"
pylatexenc = "^2.10"
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]
black = { extras = ["jupyter"], version = "^24.4.2" } black = { extras = ["jupyter"], version = "^24.4.2" }
@ -118,11 +139,11 @@ optional = true
[tool.poetry.group.mac_intel.dependencies] [tool.poetry.group.mac_intel.dependencies]
torch = [ torch = [
{ markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^2.2.2" }, { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^2.2.2" },
{markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~2.2.2"} { markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~2.2.2" },
] ]
torchvision = [ torchvision = [
{ markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^0" }, { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^0" },
{markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~0.17.2"} { markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~0.17.2" },
] ]
[tool.poetry.extras] [tool.poetry.extras]
@ -170,6 +191,7 @@ module = [
"lxml.*", "lxml.*",
"huggingface_hub.*", "huggingface_hub.*",
"transformers.*", "transformers.*",
"pylatexenc.*",
] ]
ignore_missing_imports = true ignore_missing_imports = true

View File

@ -1,31 +1,40 @@
item-0 at level 0: unspecified: group _root_ item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: This is a word document and this ... nt an equation by line, I can do this: item-1 at level 1: inline: group group
item-2 at level 1: paragraph: item-2 at level 2: paragraph: This is a word document and this is an inline equation:
item-3 at level 1: formula: a^{2}+b^{2}=c^{2} \text{ \texttimes } 23 item-3 at level 2: formula: A= \pi r^{2}
item-4 at level 1: paragraph: And that is an equation by itself. Cheers! item-4 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
item-5 at level 1: paragraph: item-5 at level 1: paragraph:
item-6 at level 1: paragraph: This is another equation: item-6 at level 1: formula: a^{2}+b^{2}=c^{2} \text{ \texttimes } 23
item-7 at level 1: formula: f\left(x\right)=a_{0}+\sum_{n=1} ... })+b_{n}\sin(\frac{n \pi x}{L})\right) item-7 at level 1: paragraph: And that is an equation by itself. Cheers!
item-8 at level 1: paragraph: item-8 at level 1: paragraph:
item-9 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text. item-9 at level 1: paragraph: This is another equation:
item-10 at level 1: paragraph: item-10 at level 1: formula: f\left(x\right)=a_{0}+\sum_{n=1} ... })+b_{n}\sin(\frac{n \pi x}{L})\right)
item-11 at level 1: paragraph: item-11 at level 1: paragraph:
item-12 at level 1: paragraph: This is a word document and this ... nt an equation by line, I can do this: item-12 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text.
item-13 at level 1: paragraph: item-13 at level 1: paragraph:
item-14 at level 1: formula: \left(x+a\right)^{n}=\sum_{k=0}^ ... ac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k} item-14 at level 1: paragraph:
item-15 at level 1: paragraph: item-15 at level 1: inline: group group
item-16 at level 1: paragraph: And that is an equation by itself. Cheers! item-16 at level 2: paragraph: This is a word document and this is an inline equation:
item-17 at level 1: paragraph: item-17 at level 2: formula: A= \pi r^{2}
item-18 at level 1: paragraph: This is another equation: item-18 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
item-19 at level 1: paragraph: item-19 at level 1: paragraph:
item-20 at level 1: formula: \left(1+x\right)^{n}=1+\frac{nx} ... ht)x^{2}}{2!}+ \text{ \textellipsis } item-20 at level 1: formula: \left(x+a\right)^{n}=\sum_{k=0}^ ... ac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}
item-21 at level 1: paragraph: item-21 at level 1: paragraph:
item-22 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text. item-22 at level 1: paragraph: And that is an equation by itself. Cheers!
item-23 at level 1: paragraph: item-23 at level 1: paragraph:
item-24 at level 1: paragraph: item-24 at level 1: paragraph: This is another equation:
item-25 at level 1: paragraph: This is a word document and this ... nt an equation by line, I can do this: item-25 at level 1: paragraph:
item-26 at level 1: paragraph: item-26 at level 1: formula: \left(1+x\right)^{n}=1+\frac{nx} ... ght)x^{2}}{2!}+ \text{ \textellipsis }
item-27 at level 1: formula: e^{x}=1+\frac{x}{1!}+\frac{x^{2} ... tellipsis } , - \infty < x < \infty item-27 at level 1: paragraph:
item-28 at level 1: paragraph: item-28 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text.
item-29 at level 1: paragraph: And that is an equation by itself. Cheers! item-29 at level 1: paragraph:
item-30 at level 1: paragraph: item-30 at level 1: paragraph:
item-31 at level 1: inline: group group
item-32 at level 2: paragraph: This is a word document and this is an inline equation:
item-33 at level 2: formula: A= \pi r^{2}
item-34 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
item-35 at level 1: paragraph:
item-36 at level 1: formula: e^{x}=1+\frac{x}{1!}+\frac{x^{2} ... xtellipsis } , - \infty < x < \infty
item-37 at level 1: paragraph:
item-38 at level 1: paragraph: And that is an equation by itself. Cheers!
item-39 at level 1: paragraph:

View File

@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.0.0", "version": "1.2.0",
"name": "equations", "name": "equations",
"origin": { "origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -10,6 +10,7 @@
"furniture": { "furniture": {
"self_ref": "#/furniture", "self_ref": "#/furniture",
"children": [], "children": [],
"content_layer": "furniture",
"name": "_root_", "name": "_root_",
"label": "unspecified" "label": "unspecified"
}, },
@ -17,13 +18,7 @@
"self_ref": "#/body", "self_ref": "#/body",
"children": [ "children": [
{ {
"$ref": "#/texts/0" "$ref": "#/groups/0"
},
{
"$ref": "#/texts/1"
},
{
"$ref": "#/texts/2"
}, },
{ {
"$ref": "#/texts/3" "$ref": "#/texts/3"
@ -56,13 +51,7 @@
"$ref": "#/texts/12" "$ref": "#/texts/12"
}, },
{ {
"$ref": "#/texts/13" "$ref": "#/groups/1"
},
{
"$ref": "#/texts/14"
},
{
"$ref": "#/texts/15"
}, },
{ {
"$ref": "#/texts/16" "$ref": "#/texts/16"
@ -100,50 +89,127 @@
{ {
"$ref": "#/texts/27" "$ref": "#/texts/27"
}, },
{
"$ref": "#/groups/2"
},
{
"$ref": "#/texts/31"
},
{
"$ref": "#/texts/32"
},
{
"$ref": "#/texts/33"
},
{
"$ref": "#/texts/34"
},
{
"$ref": "#/texts/35"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/1"
},
{
"$ref": "#/texts/2"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/13"
},
{
"$ref": "#/texts/14"
},
{
"$ref": "#/texts/15"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/2",
"parent": {
"$ref": "#/body"
},
"children": [
{ {
"$ref": "#/texts/28" "$ref": "#/texts/28"
}, },
{ {
"$ref": "#/texts/29" "$ref": "#/texts/29"
},
{
"$ref": "#/texts/30"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
} }
], ],
"name": "_root_",
"label": "unspecified"
},
"groups": [],
"texts": [ "texts": [
{ {
"self_ref": "#/texts/0", "self_ref": "#/texts/0",
"parent": { "parent": {
"$ref": "#/body" "$ref": "#/groups/0"
}, },
"children": [], "children": [],
"content_layer": "body",
"label": "paragraph", "label": "paragraph",
"prov": [], "prov": [],
"orig": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:", "orig": "This is a word document and this is an inline equation: ",
"text": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:" "text": "This is a word document and this is an inline equation: "
}, },
{ {
"self_ref": "#/texts/1", "self_ref": "#/texts/1",
"parent": { "parent": {
"$ref": "#/body" "$ref": "#/groups/0"
}, },
"children": [], "children": [],
"label": "paragraph", "content_layer": "body",
"label": "formula",
"prov": [], "prov": [],
"orig": "", "orig": "A= \\pi r^{2} ",
"text": "" "text": "A= \\pi r^{2} "
}, },
{ {
"self_ref": "#/texts/2", "self_ref": "#/texts/2",
"parent": { "parent": {
"$ref": "#/body" "$ref": "#/groups/0"
}, },
"children": [], "children": [],
"label": "formula", "content_layer": "body",
"label": "paragraph",
"prov": [], "prov": [],
"orig": "a^{2}+b^{2}=c^{2} \\text{ \\texttimes } 23", "orig": ". If instead, I want an equation by line, I can do this:",
"text": "a^{2}+b^{2}=c^{2} \\text{ \\texttimes } 23" "text": ". If instead, I want an equation by line, I can do this:"
}, },
{ {
"self_ref": "#/texts/3", "self_ref": "#/texts/3",
@ -151,10 +217,11 @@
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"content_layer": "body",
"label": "paragraph", "label": "paragraph",
"prov": [], "prov": [],
"orig": "And that is an equation by itself. Cheers!", "orig": "",
"text": "And that is an equation by itself. Cheers!" "text": ""
}, },
{ {
"self_ref": "#/texts/4", "self_ref": "#/texts/4",
@ -162,10 +229,11 @@
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"label": "paragraph", "content_layer": "body",
"label": "formula",
"prov": [], "prov": [],
"orig": "", "orig": "a^{2}+b^{2}=c^{2} \\text{ \\texttimes } 23",
"text": "" "text": "a^{2}+b^{2}=c^{2} \\text{ \\texttimes } 23"
}, },
{ {
"self_ref": "#/texts/5", "self_ref": "#/texts/5",
@ -173,10 +241,11 @@
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"content_layer": "body",
"label": "paragraph", "label": "paragraph",
"prov": [], "prov": [],
"orig": "This is another equation:", "orig": "And that is an equation by itself. Cheers!",
"text": "This is another equation:" "text": "And that is an equation by itself. Cheers!"
}, },
{ {
"self_ref": "#/texts/6", "self_ref": "#/texts/6",
@ -184,10 +253,11 @@
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"label": "formula", "content_layer": "body",
"label": "paragraph",
"prov": [], "prov": [],
"orig": "f\\left(x\\right)=a_{0}+\\sum_{n=1}^{ \\infty }\\left(a_{n}\\cos(\\frac{n \\pi x}{L})+b_{n}\\sin(\\frac{n \\pi x}{L})\\right)", "orig": "",
"text": "f\\left(x\\right)=a_{0}+\\sum_{n=1}^{ \\infty }\\left(a_{n}\\cos(\\frac{n \\pi x}{L})+b_{n}\\sin(\\frac{n \\pi x}{L})\\right)" "text": ""
}, },
{ {
"self_ref": "#/texts/7", "self_ref": "#/texts/7",
@ -195,10 +265,11 @@
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"content_layer": "body",
"label": "paragraph", "label": "paragraph",
"prov": [], "prov": [],
"orig": "", "orig": "This is another equation:",
"text": "" "text": "This is another equation:"
}, },
{ {
"self_ref": "#/texts/8", "self_ref": "#/texts/8",
@ -206,10 +277,11 @@
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"label": "paragraph", "content_layer": "body",
"label": "formula",
"prov": [], "prov": [],
"orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.", "orig": "f\\left(x\\right)=a_{0}+\\sum_{n=1}^{ \\infty }\\left(a_{n}\\cos(\\frac{n \\pi x}{L})+b_{n}\\sin(\\frac{n \\pi x}{L})\\right)",
"text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text." "text": "f\\left(x\\right)=a_{0}+\\sum_{n=1}^{ \\infty }\\left(a_{n}\\cos(\\frac{n \\pi x}{L})+b_{n}\\sin(\\frac{n \\pi x}{L})\\right)"
}, },
{ {
"self_ref": "#/texts/9", "self_ref": "#/texts/9",
@ -217,6 +289,7 @@
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"content_layer": "body",
"label": "paragraph", "label": "paragraph",
"prov": [], "prov": [],
"orig": "", "orig": "",
@ -228,10 +301,11 @@
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"content_layer": "body",
"label": "paragraph", "label": "paragraph",
"prov": [], "prov": [],
"orig": "", "orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
"text": "" "text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text."
}, },
{ {
"self_ref": "#/texts/11", "self_ref": "#/texts/11",
@ -239,10 +313,11 @@
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"content_layer": "body",
"label": "paragraph", "label": "paragraph",
"prov": [], "prov": [],
"orig": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:", "orig": "",
"text": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:" "text": ""
}, },
{ {
"self_ref": "#/texts/12", "self_ref": "#/texts/12",
@ -250,6 +325,7 @@
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"content_layer": "body",
"label": "paragraph", "label": "paragraph",
"prov": [], "prov": [],
"orig": "", "orig": "",
@ -258,35 +334,38 @@
{ {
"self_ref": "#/texts/13", "self_ref": "#/texts/13",
"parent": { "parent": {
"$ref": "#/body" "$ref": "#/groups/1"
}, },
"children": [], "children": [],
"label": "formula", "content_layer": "body",
"label": "paragraph",
"prov": [], "prov": [],
"orig": "\\left(x+a\\right)^{n}=\\sum_{k=0}^{n}\\left(\\genfrac{}{}{0pt}{}{n}{k}\\right)x^{k}a^{n-k}", "orig": "This is a word document and this is an inline equation: ",
"text": "\\left(x+a\\right)^{n}=\\sum_{k=0}^{n}\\left(\\genfrac{}{}{0pt}{}{n}{k}\\right)x^{k}a^{n-k}" "text": "This is a word document and this is an inline equation: "
}, },
{ {
"self_ref": "#/texts/14", "self_ref": "#/texts/14",
"parent": { "parent": {
"$ref": "#/body" "$ref": "#/groups/1"
}, },
"children": [], "children": [],
"label": "paragraph", "content_layer": "body",
"label": "formula",
"prov": [], "prov": [],
"orig": "", "orig": "A= \\pi r^{2} ",
"text": "" "text": "A= \\pi r^{2} "
}, },
{ {
"self_ref": "#/texts/15", "self_ref": "#/texts/15",
"parent": { "parent": {
"$ref": "#/body" "$ref": "#/groups/1"
}, },
"children": [], "children": [],
"content_layer": "body",
"label": "paragraph", "label": "paragraph",
"prov": [], "prov": [],
"orig": "And that is an equation by itself. Cheers!", "orig": ". If instead, I want an equation by line, I can do this:",
"text": "And that is an equation by itself. Cheers!" "text": ". If instead, I want an equation by line, I can do this:"
}, },
{ {
"self_ref": "#/texts/16", "self_ref": "#/texts/16",
@ -294,6 +373,7 @@
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"content_layer": "body",
"label": "paragraph", "label": "paragraph",
"prov": [], "prov": [],
"orig": "", "orig": "",
@ -305,10 +385,11 @@
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"label": "paragraph", "content_layer": "body",
"label": "formula",
"prov": [], "prov": [],
"orig": "This is another equation:", "orig": "\\left(x+a\\right)^{n}=\\sum_{k=0}^{n}\\left(\\genfrac{}{}{0pt}{}{n}{k}\\right)x^{k}a^{n-k}",
"text": "This is another equation:" "text": "\\left(x+a\\right)^{n}=\\sum_{k=0}^{n}\\left(\\genfrac{}{}{0pt}{}{n}{k}\\right)x^{k}a^{n-k}"
}, },
{ {
"self_ref": "#/texts/18", "self_ref": "#/texts/18",
@ -316,6 +397,7 @@
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"content_layer": "body",
"label": "paragraph", "label": "paragraph",
"prov": [], "prov": [],
"orig": "", "orig": "",
@ -327,10 +409,11 @@
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"label": "formula", "content_layer": "body",
"label": "paragraph",
"prov": [], "prov": [],
"orig": "\\left(1+x\\right)^{n}=1+\\frac{nx}{1!}+\\frac{n\\left(n-1\\right)x^{2}}{2!}+ \\text{ \\textellipsis } ", "orig": "And that is an equation by itself. Cheers!",
"text": "\\left(1+x\\right)^{n}=1+\\frac{nx}{1!}+\\frac{n\\left(n-1\\right)x^{2}}{2!}+ \\text{ \\textellipsis } " "text": "And that is an equation by itself. Cheers!"
}, },
{ {
"self_ref": "#/texts/20", "self_ref": "#/texts/20",
@ -338,6 +421,7 @@
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"content_layer": "body",
"label": "paragraph", "label": "paragraph",
"prov": [], "prov": [],
"orig": "", "orig": "",
@ -349,10 +433,11 @@
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"content_layer": "body",
"label": "paragraph", "label": "paragraph",
"prov": [], "prov": [],
"orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.", "orig": "This is another equation:",
"text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text." "text": "This is another equation:"
}, },
{ {
"self_ref": "#/texts/22", "self_ref": "#/texts/22",
@ -360,6 +445,7 @@
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"content_layer": "body",
"label": "paragraph", "label": "paragraph",
"prov": [], "prov": [],
"orig": "", "orig": "",
@ -371,10 +457,11 @@
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"label": "paragraph", "content_layer": "body",
"label": "formula",
"prov": [], "prov": [],
"orig": "", "orig": "\\left(1+x\\right)^{n}=1+\\frac{nx}{1!}+\\frac{n\\left(n-1\\right)x^{2}}{2!}+ \\text{ \\textellipsis }",
"text": "" "text": "\\left(1+x\\right)^{n}=1+\\frac{nx}{1!}+\\frac{n\\left(n-1\\right)x^{2}}{2!}+ \\text{ \\textellipsis }"
}, },
{ {
"self_ref": "#/texts/24", "self_ref": "#/texts/24",
@ -382,10 +469,11 @@
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"content_layer": "body",
"label": "paragraph", "label": "paragraph",
"prov": [], "prov": [],
"orig": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:", "orig": "",
"text": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:" "text": ""
}, },
{ {
"self_ref": "#/texts/25", "self_ref": "#/texts/25",
@ -393,10 +481,11 @@
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"content_layer": "body",
"label": "paragraph", "label": "paragraph",
"prov": [], "prov": [],
"orig": "", "orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
"text": "" "text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text."
}, },
{ {
"self_ref": "#/texts/26", "self_ref": "#/texts/26",
@ -404,10 +493,11 @@
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"label": "formula", "content_layer": "body",
"label": "paragraph",
"prov": [], "prov": [],
"orig": "e^{x}=1+\\frac{x}{1!}+\\frac{x^{2}}{2!}+\\frac{x^{3}}{3!}+ \\text{ \\textellipsis } , - \\infty < x < \\infty ", "orig": "",
"text": "e^{x}=1+\\frac{x}{1!}+\\frac{x^{2}}{2!}+\\frac{x^{3}}{3!}+ \\text{ \\textellipsis } , - \\infty < x < \\infty " "text": ""
}, },
{ {
"self_ref": "#/texts/27", "self_ref": "#/texts/27",
@ -415,6 +505,7 @@
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"content_layer": "body",
"label": "paragraph", "label": "paragraph",
"prov": [], "prov": [],
"orig": "", "orig": "",
@ -422,21 +513,95 @@
}, },
{ {
"self_ref": "#/texts/28", "self_ref": "#/texts/28",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "This is a word document and this is an inline equation: ",
"text": "This is a word document and this is an inline equation: "
},
{
"self_ref": "#/texts/29",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "formula",
"prov": [],
"orig": "A= \\pi r^{2} ",
"text": "A= \\pi r^{2} "
},
{
"self_ref": "#/texts/30",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": ". If instead, I want an equation by line, I can do this:",
"text": ". If instead, I want an equation by line, I can do this:"
},
{
"self_ref": "#/texts/31",
"parent": { "parent": {
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/32",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "formula",
"prov": [],
"orig": "e^{x}=1+\\frac{x}{1!}+\\frac{x^{2}}{2!}+\\frac{x^{3}}{3!}+ \\text{ \\textellipsis } , - \\infty < x < \\infty",
"text": "e^{x}=1+\\frac{x}{1!}+\\frac{x^{2}}{2!}+\\frac{x^{3}}{3!}+ \\text{ \\textellipsis } , - \\infty < x < \\infty"
},
{
"self_ref": "#/texts/33",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/34",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph", "label": "paragraph",
"prov": [], "prov": [],
"orig": "And that is an equation by itself. Cheers!", "orig": "And that is an equation by itself. Cheers!",
"text": "And that is an equation by itself. Cheers!" "text": "And that is an equation by itself. Cheers!"
}, },
{ {
"self_ref": "#/texts/29", "self_ref": "#/texts/35",
"parent": { "parent": {
"$ref": "#/body" "$ref": "#/body"
}, },
"children": [], "children": [],
"content_layer": "body",
"label": "paragraph", "label": "paragraph",
"prov": [], "prov": [],
"orig": "", "orig": "",
@ -446,5 +611,6 @@
"pictures": [], "pictures": [],
"tables": [], "tables": [],
"key_value_items": [], "key_value_items": [],
"form_items": [],
"pages": {} "pages": {}
} }

View File

@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.1.0", "version": "1.2.0",
"name": "lorem_ipsum", "name": "lorem_ipsum",
"origin": { "origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -163,5 +163,6 @@
"pictures": [], "pictures": [],
"tables": [], "tables": [],
"key_value_items": [], "key_value_items": [],
"form_items": [],
"pages": {} "pages": {}
} }

View File

@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.1.0", "version": "1.2.0",
"name": "tablecell", "name": "tablecell",
"origin": { "origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -399,5 +399,6 @@
} }
], ],
"key_value_items": [], "key_value_items": [],
"form_items": [],
"pages": {} "pages": {}
} }

File diff suppressed because one or more lines are too long

View File

@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.1.0", "version": "1.2.0",
"name": "unit_test_headers", "name": "unit_test_headers",
"origin": { "origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -748,5 +748,6 @@
"pictures": [], "pictures": [],
"tables": [], "tables": [],
"key_value_items": [], "key_value_items": [],
"form_items": [],
"pages": {} "pages": {}
} }

View File

@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.1.0", "version": "1.2.0",
"name": "unit_test_headers_numbered", "name": "unit_test_headers_numbered",
"origin": { "origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -802,5 +802,6 @@
"pictures": [], "pictures": [],
"tables": [], "tables": [],
"key_value_items": [], "key_value_items": [],
"form_items": [],
"pages": {} "pages": {}
} }

View File

@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.1.0", "version": "1.2.0",
"name": "unit_test_lists", "name": "unit_test_lists",
"origin": { "origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -979,5 +979,6 @@
"pictures": [], "pictures": [],
"tables": [], "tables": [],
"key_value_items": [], "key_value_items": [],
"form_items": [],
"pages": {} "pages": {}
} }

View File

@ -3,7 +3,7 @@ item-0 at level 0: unspecified: group _root_
item-2 at level 1: title: Swimming in the lake item-2 at level 1: title: Swimming in the lake
item-3 at level 2: paragraph: Duck item-3 at level 2: paragraph: Duck
item-4 at level 2: picture item-4 at level 2: picture
item-5 at level 2: text: Figure 1: This is a cute duckling item-5 at level 2: paragraph: Figure 1: This is a cute duckling
item-6 at level 2: section_header: Lets swim! item-6 at level 2: section_header: Lets swim!
item-7 at level 3: paragraph: To get started with swimming, fi ... down in a water and try not to drown: item-7 at level 3: paragraph: To get started with swimming, fi ... down in a water and try not to drown:
item-8 at level 3: list: group list item-8 at level 3: list: group list

File diff suppressed because one or more lines are too long

View File

@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.1.0", "version": "1.2.0",
"name": "word_tables", "name": "word_tables",
"origin": { "origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -2372,5 +2372,6 @@
} }
], ],
"key_value_items": [], "key_value_items": [],
"form_items": [],
"pages": {} "pages": {}
} }