Upgrading docling core and adding groups

This commit is contained in:
Rafael Teixeira de Lima 2025-03-04 17:18:40 +01:00
parent 5630c6b8fd
commit 655e95dd72
19 changed files with 500 additions and 238 deletions

View File

View File

@ -268,4 +268,4 @@ LIM_TO = ("\\rightarrow", "\\to")
LIM_UPP = "\\overset{{{lim}}}{{{text}}}"
M = "\\begin{{matrix}}{text}\end{{matrix}}"
M = "\\begin{{matrix}}{text}\\end{{matrix}}"

View File

@ -8,7 +8,7 @@ On 23/01/2025
import lxml.etree as ET
from pylatexenc.latexencode import UnicodeToLatexEncoder
from docling.backend.docx_latex.latex_dict import (
from docling.backend.docx.latex.latex_dict import (
ALN,
ARR,
BACKSLASH,

View File

@ -26,7 +26,7 @@ from PIL import Image, UnidentifiedImageError
from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.docx_latex.omml import oMath2Latex
from docling.backend.docx.latex.omml import oMath2Latex
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
@ -164,7 +164,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
) -> DoclingDocument:
for element in body:
tag_name = etree.QName(element).localname
# Check for Inline Images (blip elements)
namespaces = {
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
@ -262,6 +261,24 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
else:
return label, None
def handle_equations_in_text(self, element, text):
    """Rebuild a paragraph's text with its equations rendered as LaTeX.

    Every descendant of ``element`` is visited in document order. Non-math
    ``t`` elements contribute their text; ``oMath`` elements (but not their
    ``oMathPara`` wrappers) are converted with ``oMath2Latex``.

    Args:
        element: XML element whose subtree is scanned via ``element.iter()``.
        text: Raw paragraph text, used to verify that the collected text
            runs reproduce the paragraph exactly.

    Returns:
        A ``(text, equations)`` tuple. When the collected text runs match
        ``text``, the first item is the text with the LaTeX equations
        interleaved and the second is the list of those equations in order.
        Otherwise the original ``text`` is returned with an empty list.
    """
    only_texts = []
    only_equations = []
    texts_and_equations = []

    for subt in element.iter():
        tag_name = etree.QName(subt).localname
        if tag_name == "t" and "math" not in subt.tag:
            # An element without character data reports ``text`` as None,
            # which would make the ``"".join`` calls below raise TypeError.
            if subt.text is None:
                continue
            only_texts.append(subt.text)
            texts_and_equations.append(subt.text)
        elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
            latex_equation = str(oMath2Latex(subt))
            # Skip empty conversions: the caller splits the text on each
            # equation, and splitting on an empty separator raises ValueError.
            if not latex_equation:
                continue
            only_equations.append(latex_equation)
            texts_and_equations.append(latex_equation)

    if "".join(only_texts) != text:
        # The raw text could not be reconstructed, so do not attempt to
        # place equations. Still return a 2-tuple, because the caller
        # unpacks ``text, equations``.
        return text, []

    return "".join(texts_and_equations), only_equations
def handle_text_elements(
self,
@ -272,7 +289,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
paragraph = Paragraph(element, docx_obj)
raw_text = paragraph.text
text = self.handle_equations_in_text(element=element, text=raw_text)
text, equations = self.handle_equations_in_text(element=element, text=raw_text)
if text is None:
return
@ -326,36 +343,57 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.parents[0] = doc.add_text(
parent=None, label=DocItemLabel.TITLE, text=text
)
elif "Heading" in p_style_id:
self.add_header(doc, p_level, text)
elif p_style_id in [
"Subtitle",
"Author",
"ListParagraph",
"ListBullet",
"Quote",
]:
level = self.get_level()
doc.add_text(
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
)
elif (raw_text is None or len(raw_text) == 0) and len(text) > 0:
# Standalone equation
# Entities in which all text comes from equations
level = self.get_level()
if text.strip().startswith("$") and text.strip().endswith("$"):
text = text.strip()[1:-1]
doc.add_text(
label=DocItemLabel.FORMULA, parent=self.parents[level - 1], text=text
)
elif len(equations) > 0:
if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
# Standalone equation
level = self.get_level()
doc.add_text(
label=DocItemLabel.FORMULA,
parent=self.parents[level - 1],
text=text,
)
else:
# Inline equation
level = self.get_level()
inline_equation = doc.add_group(
label=GroupLabel.INLINE, parent=self.parents[level - 1]
)
text_tmp = text
for eq in equations:
if len(text_tmp) == 0:
break
pre_eq_text = text_tmp.split(eq, maxsplit=1)[0]
text_tmp = text_tmp.split(eq, maxsplit=1)[1]
if len(pre_eq_text) > 0:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
parent=inline_equation,
text=pre_eq_text,
)
doc.add_text(
label=DocItemLabel.FORMULA,
parent=inline_equation,
text=eq,
)
if len(text_tmp) > 0:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
parent=inline_equation,
text=text_tmp,
)
elif p_style_id in [
"Paragraph",
"Normal",
"Subtitle",
"Author",
"DefaultText",
"ListParagraph",
"ListBullet",
"Quote",
]:
level = self.get_level()
doc.add_text(
@ -367,8 +405,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# hence we treat all other labels as pure text
level = self.get_level()
doc.add_text(
label=DocItemLabel.TEXT, parent=self.parents[level - 1], text=text
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
)
self.update_history(p_style_id, p_level, numid, ilevel)
return

122
poetry.lock generated
View File

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.
[[package]]
name = "accelerate"
@ -33,13 +33,13 @@ testing = ["bitsandbytes", "datasets", "diffusers", "evaluate", "parameterized",
[[package]]
name = "aiohappyeyeballs"
version = "2.4.6"
version = "2.4.8"
description = "Happy Eyeballs for asyncio"
optional = false
python-versions = ">=3.9"
files = [
{file = "aiohappyeyeballs-2.4.6-py3-none-any.whl", hash = "sha256:147ec992cf873d74f5062644332c539fcd42956dc69453fe5204195e560517e1"},
{file = "aiohappyeyeballs-2.4.6.tar.gz", hash = "sha256:9b05052f9042985d32ecbe4b59a77ae19c006a78f1344d7fdad69d28ded3d0b0"},
{file = "aiohappyeyeballs-2.4.8-py3-none-any.whl", hash = "sha256:6cac4f5dd6e34a9644e69cf9021ef679e4394f54e58a183056d12009e42ea9e3"},
{file = "aiohappyeyeballs-2.4.8.tar.gz", hash = "sha256:19728772cb12263077982d2f55453babd8bec6a052a926cd5c0c42796da8bf62"},
]
[[package]]
@ -311,6 +311,24 @@ files = [
docs = ["furo", "jaraco.packaging (>=9.3)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
testing = ["jaraco.test", "pytest (!=8.0.*)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)"]
[[package]]
name = "backrefs"
version = "5.8"
description = "A wrapper around re and regex that adds additional back references."
optional = false
python-versions = ">=3.9"
files = [
{file = "backrefs-5.8-py310-none-any.whl", hash = "sha256:c67f6638a34a5b8730812f5101376f9d41dc38c43f1fdc35cb54700f6ed4465d"},
{file = "backrefs-5.8-py311-none-any.whl", hash = "sha256:2e1c15e4af0e12e45c8701bd5da0902d326b2e200cafcd25e49d9f06d44bb61b"},
{file = "backrefs-5.8-py312-none-any.whl", hash = "sha256:bbef7169a33811080d67cdf1538c8289f76f0942ff971222a16034da88a73486"},
{file = "backrefs-5.8-py313-none-any.whl", hash = "sha256:e3a63b073867dbefd0536425f43db618578528e3896fb77be7141328642a1585"},
{file = "backrefs-5.8-py39-none-any.whl", hash = "sha256:a66851e4533fb5b371aa0628e1fee1af05135616b86140c9d787a2ffdf4b8fdc"},
{file = "backrefs-5.8.tar.gz", hash = "sha256:2cab642a205ce966af3dd4b38ee36009b31fa9502a35fd61d59ccc116e40a6bd"},
]
[package.extras]
extras = ["regex"]
[[package]]
name = "beautifulsoup4"
version = "4.13.3"
@ -852,13 +870,13 @@ files = [
[[package]]
name = "docling-core"
version = "2.20.0"
version = "2.21.1"
description = "A python library to define and validate data types in Docling."
optional = false
python-versions = "<4.0,>=3.9"
files = [
{file = "docling_core-2.20.0-py3-none-any.whl", hash = "sha256:72f50fce277b7bb51f4134f443240c041582184305c3bcaabdea13fc5550f160"},
{file = "docling_core-2.20.0.tar.gz", hash = "sha256:9733581c15f5a9b5e3a6cb74fa995cc4078ff16668007f86c5f75d1ea9180d7f"},
{file = "docling_core-2.21.1-py3-none-any.whl", hash = "sha256:b8112915728cdc14f328f636f6c0ed36e6bbcc02ff940cc0bf85e303738671c3"},
{file = "docling_core-2.21.1.tar.gz", hash = "sha256:3ccc50197d24a3156cfc6c22c8404c58757749646d876a1c1c69fd800f664a4f"},
]
[package.dependencies]
@ -880,13 +898,13 @@ chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"]
[[package]]
name = "docling-ibm-models"
version = "3.4.0"
version = "3.4.1"
description = "This package contains the AI models used by the Docling PDF conversion package"
optional = false
python-versions = "<4.0,>=3.9"
files = [
{file = "docling_ibm_models-3.4.0-py3-none-any.whl", hash = "sha256:186517ff1f76e76113600fa1e5a699927325081a8013fdd5d0551121c2e34190"},
{file = "docling_ibm_models-3.4.0.tar.gz", hash = "sha256:fb79beeb07d1bb9bc8acf9d0a44643cd7ce1910aa418cd685e2e477b13eeafee"},
{file = "docling_ibm_models-3.4.1-py3-none-any.whl", hash = "sha256:c3582c99dddfa3f0eafcf80cf1267fd8efa39c4a74cc7a88f9dd49684fac2986"},
{file = "docling_ibm_models-3.4.1.tar.gz", hash = "sha256:093b4dff2ea284a4953c3aa009e29945208b8d389b94fb14940a03a93f673e96"},
]
[package.dependencies]
@ -1331,13 +1349,13 @@ test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit",
[[package]]
name = "griffe"
version = "1.5.7"
version = "1.6.0"
description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API."
optional = false
python-versions = ">=3.9"
files = [
{file = "griffe-1.5.7-py3-none-any.whl", hash = "sha256:4af8ec834b64de954d447c7b6672426bb145e71605c74a4e22d510cc79fe7d8b"},
{file = "griffe-1.5.7.tar.gz", hash = "sha256:465238c86deaf1137761f700fb343edd8ffc846d72f6de43c3c345ccdfbebe92"},
{file = "griffe-1.6.0-py3-none-any.whl", hash = "sha256:9f1dfe035d4715a244ed2050dfbceb05b1f470809ed4f6bb10ece5a7302f8dd1"},
{file = "griffe-1.6.0.tar.gz", hash = "sha256:eb5758088b9c73ad61c7ac014f3cdfb4c57b5c2fcbfca69996584b702aefa354"},
]
[package.dependencies]
@ -1818,18 +1836,18 @@ testing = ["Django", "attrs", "colorama", "docopt", "pytest (<9.0.0)"]
[[package]]
name = "jeepney"
version = "0.8.0"
version = "0.9.0"
description = "Low-level, pure Python DBus protocol wrapper."
optional = false
python-versions = ">=3.7"
files = [
{file = "jeepney-0.8.0-py3-none-any.whl", hash = "sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755"},
{file = "jeepney-0.8.0.tar.gz", hash = "sha256:5efe48d255973902f6badc3ce55e2aa6c5c3b3bc642059ef3a91247bcfcc5806"},
{file = "jeepney-0.9.0-py3-none-any.whl", hash = "sha256:97e5714520c16fc0a45695e5365a2e11b81ea79bba796e26f9f1d178cb182683"},
{file = "jeepney-0.9.0.tar.gz", hash = "sha256:cf0e9e845622b81e4a28df94c40345400256ec608d0e55bb8a3feaa9163f5732"},
]
[package.extras]
test = ["async-timeout", "pytest", "pytest-asyncio (>=0.17)", "pytest-trio", "testpath", "trio"]
trio = ["async_generator", "trio"]
trio = ["trio"]
[[package]]
name = "jinja2"
@ -2715,17 +2733,18 @@ pygments = ">2.12.0"
[[package]]
name = "mkdocs-material"
version = "9.6.5"
version = "9.6.7"
description = "Documentation that simply works"
optional = false
python-versions = ">=3.8"
files = [
{file = "mkdocs_material-9.6.5-py3-none-any.whl", hash = "sha256:aad3e6fb860c20870f75fb2a69ef901f1be727891e41adb60b753efcae19453b"},
{file = "mkdocs_material-9.6.5.tar.gz", hash = "sha256:b714679a8c91b0ffe2188e11ed58c44d2523e9c2ae26a29cc652fa7478faa21f"},
{file = "mkdocs_material-9.6.7-py3-none-any.whl", hash = "sha256:8a159e45e80fcaadd9fbeef62cbf928569b93df954d4dc5ba76d46820caf7b47"},
{file = "mkdocs_material-9.6.7.tar.gz", hash = "sha256:3e2c1fceb9410056c2d91f334a00cdea3215c28750e00c691c1e46b2a33309b4"},
]
[package.dependencies]
babel = ">=2.10,<3.0"
backrefs = ">=5.7.post1,<6.0"
colorama = ">=0.4,<1.0"
jinja2 = ">=3.0,<4.0"
markdown = ">=3.2,<4.0"
@ -2734,7 +2753,6 @@ mkdocs-material-extensions = ">=1.3,<2.0"
paginate = ">=0.5,<1.0"
pygments = ">=2.16,<3.0"
pymdown-extensions = ">=10.2,<11.0"
regex = ">=2022.4"
requests = ">=2.26,<3.0"
[package.extras]
@ -4755,13 +4773,13 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
[[package]]
name = "pydantic-settings"
version = "2.8.0"
version = "2.8.1"
description = "Settings management using Pydantic"
optional = false
python-versions = ">=3.8"
files = [
{file = "pydantic_settings-2.8.0-py3-none-any.whl", hash = "sha256:c782c7dc3fb40e97b238e713c25d26f64314aece2e91abcff592fcac15f71820"},
{file = "pydantic_settings-2.8.0.tar.gz", hash = "sha256:88e2ca28f6e68ea102c99c3c401d6c9078e68a5df600e97b43891c34e089500a"},
{file = "pydantic_settings-2.8.1-py3-none-any.whl", hash = "sha256:81942d5ac3d905f7f3ee1a70df5dfb62d5569c12f51a5a647defc1c3d9ee2e9c"},
{file = "pydantic_settings-2.8.1.tar.gz", hash = "sha256:d5c663dfbe9db9d5e1c646b2e161da12f0d734d422ee56f567d0ea2cee4e8585"},
]
[package.dependencies]
@ -5907,26 +5925,26 @@ files = [
[[package]]
name = "safetensors"
version = "0.5.2"
version = "0.5.3"
description = ""
optional = false
python-versions = ">=3.7"
files = [
{file = "safetensors-0.5.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:45b6092997ceb8aa3801693781a71a99909ab9cc776fbc3fa9322d29b1d3bef2"},
{file = "safetensors-0.5.2-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:6d0d6a8ee2215a440e1296b843edf44fd377b055ba350eaba74655a2fe2c4bae"},
{file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86016d40bcaa3bcc9a56cd74d97e654b5f4f4abe42b038c71e4f00a089c4526c"},
{file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:990833f70a5f9c7d3fc82c94507f03179930ff7d00941c287f73b6fcbf67f19e"},
{file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3dfa7c2f3fe55db34eba90c29df94bcdac4821043fc391cb5d082d9922013869"},
{file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:46ff2116150ae70a4e9c490d2ab6b6e1b1b93f25e520e540abe1b81b48560c3a"},
{file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ab696dfdc060caffb61dbe4066b86419107a24c804a4e373ba59be699ebd8d5"},
{file = "safetensors-0.5.2-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:03c937100f38c9ff4c1507abea9928a6a9b02c9c1c9c3609ed4fb2bf413d4975"},
{file = "safetensors-0.5.2-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:a00e737948791b94dad83cf0eafc09a02c4d8c2171a239e8c8572fe04e25960e"},
{file = "safetensors-0.5.2-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:d3a06fae62418ec8e5c635b61a8086032c9e281f16c63c3af46a6efbab33156f"},
{file = "safetensors-0.5.2-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:1506e4c2eda1431099cebe9abf6c76853e95d0b7a95addceaa74c6019c65d8cf"},
{file = "safetensors-0.5.2-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5c5b5d9da594f638a259fca766046f44c97244cc7ab8bef161b3e80d04becc76"},
{file = "safetensors-0.5.2-cp38-abi3-win32.whl", hash = "sha256:fe55c039d97090d1f85277d402954dd6ad27f63034fa81985a9cc59655ac3ee2"},
{file = "safetensors-0.5.2-cp38-abi3-win_amd64.whl", hash = "sha256:78abdddd03a406646107f973c7843276e7b64e5e32623529dc17f3d94a20f589"},
{file = "safetensors-0.5.2.tar.gz", hash = "sha256:cb4a8d98ba12fa016f4241932b1fc5e702e5143f5374bba0bbcf7ddc1c4cf2b8"},
{file = "safetensors-0.5.3-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:bd20eb133db8ed15b40110b7c00c6df51655a2998132193de2f75f72d99c7073"},
{file = "safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:21d01c14ff6c415c485616b8b0bf961c46b3b343ca59110d38d744e577f9cce7"},
{file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:11bce6164887cd491ca75c2326a113ba934be596e22b28b1742ce27b1d076467"},
{file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4a243be3590bc3301c821da7a18d87224ef35cbd3e5f5727e4e0728b8172411e"},
{file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8bd84b12b1670a6f8e50f01e28156422a2bc07fb16fc4e98bded13039d688a0d"},
{file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:391ac8cab7c829452175f871fcaf414aa1e292b5448bd02620f675a7f3e7abb9"},
{file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cead1fa41fc54b1e61089fa57452e8834f798cb1dc7a09ba3524f1eb08e0317a"},
{file = "safetensors-0.5.3-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1077f3e94182d72618357b04b5ced540ceb71c8a813d3319f1aba448e68a770d"},
{file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:799021e78287bac619c7b3f3606730a22da4cda27759ddf55d37c8db7511c74b"},
{file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:df26da01aaac504334644e1b7642fa000bfec820e7cef83aeac4e355e03195ff"},
{file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:32c3ef2d7af8b9f52ff685ed0bc43913cdcde135089ae322ee576de93eae5135"},
{file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:37f1521be045e56fc2b54c606d4455573e717b2d887c579ee1dbba5f868ece04"},
{file = "safetensors-0.5.3-cp38-abi3-win32.whl", hash = "sha256:cfc0ec0846dcf6763b0ed3d1846ff36008c6e7290683b61616c4b040f6a54ace"},
{file = "safetensors-0.5.3-cp38-abi3-win_amd64.whl", hash = "sha256:836cbbc320b47e80acd40e44c8682db0e8ad7123209f69b093def21ec7cafd11"},
{file = "safetensors-0.5.3.tar.gz", hash = "sha256:b6b0d6ecacec39a4fdd99cc19f4576f5219ce858e6fd8dbe7609df0b8dc56965"},
]
[package.dependencies]
@ -6223,13 +6241,13 @@ train = ["accelerate (>=0.20.3)", "datasets"]
[[package]]
name = "setuptools"
version = "75.8.1"
version = "75.8.2"
description = "Easily download, build, install, upgrade, and uninstall Python packages"
optional = false
python-versions = ">=3.9"
files = [
{file = "setuptools-75.8.1-py3-none-any.whl", hash = "sha256:3bc32c0b84c643299ca94e77f834730f126efd621de0cc1de64119e0e17dab1f"},
{file = "setuptools-75.8.1.tar.gz", hash = "sha256:65fb779a8f28895242923582eadca2337285f0891c2c9e160754df917c3d2530"},
{file = "setuptools-75.8.2-py3-none-any.whl", hash = "sha256:558e47c15f1811c1fa7adbd0096669bf76c1d3f433f58324df69f3f5ecac4e8f"},
{file = "setuptools-75.8.2.tar.gz", hash = "sha256:4880473a969e5f23f2a2be3646b2dfd84af9028716d398e46192f84bc36900d2"},
]
[package.extras]
@ -7227,13 +7245,13 @@ files = [
[[package]]
name = "types-requests"
version = "2.32.0.20241016"
version = "2.32.0.20250301"
description = "Typing stubs for requests"
optional = false
python-versions = ">=3.8"
python-versions = ">=3.9"
files = [
{file = "types-requests-2.32.0.20241016.tar.gz", hash = "sha256:0d9cad2f27515d0e3e3da7134a1b6f28fb97129d86b867f24d9c726452634d95"},
{file = "types_requests-2.32.0.20241016-py3-none-any.whl", hash = "sha256:4195d62d6d3e043a4eaaf08ff8a62184584d2e8684e9d2aa178c7915a7da3747"},
{file = "types_requests-2.32.0.20250301-py3-none-any.whl", hash = "sha256:0003e0124e2cbefefb88222ff822b48616af40c74df83350f599a650c8de483b"},
{file = "types_requests-2.32.0.20250301.tar.gz", hash = "sha256:3d909dc4eaab159c0d964ebe8bfa326a7afb4578d8706408d417e17d61b0c500"},
]
[package.dependencies]
@ -7241,13 +7259,13 @@ urllib3 = ">=2"
[[package]]
name = "types-tqdm"
version = "4.67.0.20241221"
version = "4.67.0.20250301"
description = "Typing stubs for tqdm"
optional = false
python-versions = ">=3.8"
python-versions = ">=3.9"
files = [
{file = "types_tqdm-4.67.0.20241221-py3-none-any.whl", hash = "sha256:a1f1c9cda5c2d8482d2c73957a5398bfdedda10f6bc7b3b4e812d5c910486d29"},
{file = "types_tqdm-4.67.0.20241221.tar.gz", hash = "sha256:e56046631056922385abe89aeb18af5611f471eadd7918a0ad7f34d84cd4c8cc"},
{file = "types_tqdm-4.67.0.20250301-py3-none-any.whl", hash = "sha256:8af97deb8e6874af833555dc1fe0fcd456b1a789470bf6cd8813d4e7ee4f6c5b"},
{file = "types_tqdm-4.67.0.20250301.tar.gz", hash = "sha256:5e89a38ad89b867823368eb97d9f90d2fc69806bb055dde62716a05da62b5e0d"},
]
[package.dependencies]
@ -7843,4 +7861,4 @@ vlm = ["accelerate", "transformers", "transformers"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "1d4718b694098b0676f1ad1606d769887e51fc29f604e5f4c83dd5e1c90557e7"
content-hash = "a340b1230bc83cdcff125a84eee457b1d8786abc112f2c0553391a4ab9f092ea"

View File

@ -1,24 +1,44 @@
[tool.poetry]
name = "docling"
version = "2.25.1" # DO NOT EDIT, updated automatically
version = "2.25.1" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
authors = [
"Christoph Auer <cau@zurich.ibm.com>",
"Michele Dolfi <dol@zurich.ibm.com>",
"Maxim Lysak <mly@zurich.ibm.com>",
"Nikos Livathinos <nli@zurich.ibm.com>",
"Ahmed Nassar <ahn@zurich.ibm.com>",
"Panos Vagenas <pva@zurich.ibm.com>",
"Peter Staar <taa@zurich.ibm.com>",
]
license = "MIT"
readme = "README.md"
repository = "https://github.com/DS4SD/docling"
homepage = "https://github.com/DS4SD/docling"
keywords= ["docling", "convert", "document", "pdf", "docx", "html", "markdown", "layout model", "segmentation", "table structure", "table former"]
classifiers = [
"License :: OSI Approved :: MIT License",
"Operating System :: MacOS :: MacOS X",
"Operating System :: POSIX :: Linux",
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Programming Language :: Python :: 3"
]
packages = [{include = "docling"}]
keywords = [
"docling",
"convert",
"document",
"pdf",
"docx",
"html",
"markdown",
"layout model",
"segmentation",
"table structure",
"table former",
]
classifiers = [
"License :: OSI Approved :: MIT License",
"Operating System :: MacOS :: MacOS X",
"Operating System :: POSIX :: Linux",
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Programming Language :: Python :: 3",
]
packages = [{ include = "docling" }]
[tool.poetry.dependencies]
######################
@ -26,7 +46,7 @@ packages = [{include = "docling"}]
######################
python = "^3.9"
pydantic = "^2.0.0"
docling-core = {extras = ["chunking"], version = "^2.19.0"}
docling-core = { extras = ["chunking"], version = "^2.21.1" }
docling-ibm-models = "^3.4.0"
docling-parse = "^3.3.0"
filetype = "^1.2.0"
@ -40,7 +60,7 @@ certifi = ">=2024.7.4"
rtree = "^1.3.0"
scipy = [
{ version = "^1.6.0", markers = "python_version >= '3.10'" },
{ version = ">=1.6.0,<1.14.0", markers = "python_version < '3.10'" }
{ version = ">=1.6.0,<1.14.0", markers = "python_version < '3.10'" },
]
typer = "^0.12.5"
python-docx = "^1.1.2"
@ -56,21 +76,22 @@ onnxruntime = [
# 1.19.2 is the last version with python3.9 support,
# see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
{ version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" },
]
transformers = [
{markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true },
{markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true }
{ markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true },
{ markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true },
]
accelerate = [
{markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^1.2.1", optional = true },
{ markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^1.2.1", optional = true },
]
pillow = ">=10.0.0,<12.0.0"
tqdm = "^4.65.0"
pylatexenc = "^2.10"
[tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "^24.4.2"}
black = { extras = ["jupyter"], version = "^24.4.2" }
pytest = "^7.2.2"
pre-commit = "^3.7.1"
mypy = "^1.10.1"
@ -93,7 +114,7 @@ types-tqdm = "^4.67.0.20241221"
mkdocs-material = "^9.5.40"
mkdocs-jupyter = "^0.25.0"
mkdocs-click = "^0.8.1"
mkdocstrings = {extras = ["python"], version = "^0.27.0"}
mkdocstrings = { extras = ["python"], version = "^0.27.0" }
griffe-pydantic = "^1.1.0"
[tool.poetry.group.examples.dependencies]
@ -108,8 +129,8 @@ optional = true
[tool.poetry.group.constraints.dependencies]
numpy = [
{ version = ">=1.24.4,<3.0.0", markers = 'python_version >= "3.10"' },
{ version = ">=1.24.4,<2.1.0", markers = 'python_version < "3.10"' },
{ version = ">=1.24.4,<3.0.0", markers = 'python_version >= "3.10"' },
{ version = ">=1.24.4,<2.1.0", markers = 'python_version < "3.10"' },
]
[tool.poetry.group.mac_intel]
@ -117,12 +138,12 @@ optional = true
[tool.poetry.group.mac_intel.dependencies]
torch = [
{markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^2.2.2"},
{markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~2.2.2"}
{ markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^2.2.2" },
{ markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~2.2.2" },
]
torchvision = [
{markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^0"},
{markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~0.17.2"}
{ markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^0" },
{ markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~0.17.2" },
]
[tool.poetry.extras]
@ -147,7 +168,7 @@ include = '\.pyi?$'
[tool.isort]
profile = "black"
line_length = 88
py_version=39
py_version = 39
[tool.mypy]
pretty = true
@ -158,18 +179,19 @@ python_version = "3.10"
[[tool.mypy.overrides]]
module = [
"docling_parse.*",
"pypdfium2.*",
"networkx.*",
"scipy.*",
"filetype.*",
"tesserocr.*",
"docling_ibm_models.*",
"easyocr.*",
"ocrmac.*",
"lxml.*",
"huggingface_hub.*",
"transformers.*",
"docling_parse.*",
"pypdfium2.*",
"networkx.*",
"scipy.*",
"filetype.*",
"tesserocr.*",
"docling_ibm_models.*",
"easyocr.*",
"ocrmac.*",
"lxml.*",
"huggingface_hub.*",
"transformers.*",
"pylatexenc.*",
]
ignore_missing_imports = true

View File

@ -1,31 +1,40 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: This is a word document and this ... nt an equation by line, I can do this:
item-2 at level 1: paragraph:
item-3 at level 1: formula: a^{2}+b^{2}=c^{2} \text{ \texttimes } 23
item-4 at level 1: paragraph: And that is an equation by itself. Cheers!
item-1 at level 1: inline: group group
item-2 at level 2: paragraph: This is a word document and this is an inline equation:
item-3 at level 2: formula: A= \pi r^{2}
item-4 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
item-5 at level 1: paragraph:
item-6 at level 1: paragraph: This is another equation:
item-7 at level 1: formula: f\left(x\right)=a_{0}+\sum_{n=1} ... })+b_{n}\sin(\frac{n \pi x}{L})\right)
item-6 at level 1: formula: a^{2}+b^{2}=c^{2} \text{ \texttimes } 23
item-7 at level 1: paragraph: And that is an equation by itself. Cheers!
item-8 at level 1: paragraph:
item-9 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text.
item-10 at level 1: paragraph:
item-9 at level 1: paragraph: This is another equation:
item-10 at level 1: formula: f\left(x\right)=a_{0}+\sum_{n=1} ... })+b_{n}\sin(\frac{n \pi x}{L})\right)
item-11 at level 1: paragraph:
item-12 at level 1: paragraph: This is a word document and this ... nt an equation by line, I can do this:
item-12 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text.
item-13 at level 1: paragraph:
item-14 at level 1: formula: \left(x+a\right)^{n}=\sum_{k=0}^ ... ac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}
item-15 at level 1: paragraph:
item-16 at level 1: paragraph: And that is an equation by itself. Cheers!
item-17 at level 1: paragraph:
item-18 at level 1: paragraph: This is another equation:
item-14 at level 1: paragraph:
item-15 at level 1: inline: group group
item-16 at level 2: paragraph: This is a word document and this is an inline equation:
item-17 at level 2: formula: A= \pi r^{2}
item-18 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
item-19 at level 1: paragraph:
item-20 at level 1: formula: \left(1+x\right)^{n}=1+\frac{nx} ... ht)x^{2}}{2!}+ \text{ \textellipsis }
item-20 at level 1: formula: \left(x+a\right)^{n}=\sum_{k=0}^ ... ac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}
item-21 at level 1: paragraph:
item-22 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text.
item-22 at level 1: paragraph: And that is an equation by itself. Cheers!
item-23 at level 1: paragraph:
item-24 at level 1: paragraph:
item-25 at level 1: paragraph: This is a word document and this ... nt an equation by line, I can do this:
item-26 at level 1: paragraph:
item-27 at level 1: formula: e^{x}=1+\frac{x}{1!}+\frac{x^{2} ... tellipsis } , - \infty < x < \infty
item-28 at level 1: paragraph:
item-29 at level 1: paragraph: And that is an equation by itself. Cheers!
item-30 at level 1: paragraph:
item-24 at level 1: paragraph: This is another equation:
item-25 at level 1: paragraph:
item-26 at level 1: formula: \left(1+x\right)^{n}=1+\frac{nx} ... ght)x^{2}}{2!}+ \text{ \textellipsis }
item-27 at level 1: paragraph:
item-28 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text.
item-29 at level 1: paragraph:
item-30 at level 1: paragraph:
item-31 at level 1: inline: group group
item-32 at level 2: paragraph: This is a word document and this is an inline equation:
item-33 at level 2: formula: A= \pi r^{2}
item-34 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
item-35 at level 1: paragraph:
item-36 at level 1: formula: e^{x}=1+\frac{x}{1!}+\frac{x^{2} ... xtellipsis } , - \infty < x < \infty
item-37 at level 1: paragraph:
item-38 at level 1: paragraph: And that is an equation by itself. Cheers!
item-39 at level 1: paragraph:

View File

@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.0.0",
"version": "1.2.0",
"name": "equations",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -10,6 +10,7 @@
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
@ -17,13 +18,7 @@
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/1"
},
{
"$ref": "#/texts/2"
"$ref": "#/groups/0"
},
{
"$ref": "#/texts/3"
@ -56,13 +51,7 @@
"$ref": "#/texts/12"
},
{
"$ref": "#/texts/13"
},
{
"$ref": "#/texts/14"
},
{
"$ref": "#/texts/15"
"$ref": "#/groups/1"
},
{
"$ref": "#/texts/16"
@ -101,49 +90,126 @@
"$ref": "#/texts/27"
},
{
"$ref": "#/texts/28"
"$ref": "#/groups/2"
},
{
"$ref": "#/texts/29"
"$ref": "#/texts/31"
},
{
"$ref": "#/texts/32"
},
{
"$ref": "#/texts/33"
},
{
"$ref": "#/texts/34"
},
{
"$ref": "#/texts/35"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [],
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/1"
},
{
"$ref": "#/texts/2"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/13"
},
{
"$ref": "#/texts/14"
},
{
"$ref": "#/texts/15"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/2",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/28"
},
{
"$ref": "#/texts/29"
},
{
"$ref": "#/texts/30"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:",
"text": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:"
"orig": "This is a word document and this is an inline equation: ",
"text": "This is a word document and this is an inline equation: "
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/body"
"$ref": "#/groups/0"
},
"children": [],
"label": "paragraph",
"content_layer": "body",
"label": "formula",
"prov": [],
"orig": "",
"text": ""
"orig": "A= \\pi r^{2} ",
"text": "A= \\pi r^{2} "
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/body"
"$ref": "#/groups/0"
},
"children": [],
"label": "formula",
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "a^{2}+b^{2}=c^{2} \\text{ \\texttimes } 23",
"text": "a^{2}+b^{2}=c^{2} \\text{ \\texttimes } 23"
"orig": ". If instead, I want an equation by line, I can do this:",
"text": ". If instead, I want an equation by line, I can do this:"
},
{
"self_ref": "#/texts/3",
@ -151,10 +217,11 @@
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "And that is an equation by itself. Cheers!",
"text": "And that is an equation by itself. Cheers!"
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/4",
@ -162,10 +229,11 @@
"$ref": "#/body"
},
"children": [],
"label": "paragraph",
"content_layer": "body",
"label": "formula",
"prov": [],
"orig": "",
"text": ""
"orig": "a^{2}+b^{2}=c^{2} \\text{ \\texttimes } 23",
"text": "a^{2}+b^{2}=c^{2} \\text{ \\texttimes } 23"
},
{
"self_ref": "#/texts/5",
@ -173,10 +241,11 @@
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "This is another equation:",
"text": "This is another equation:"
"orig": "And that is an equation by itself. Cheers!",
"text": "And that is an equation by itself. Cheers!"
},
{
"self_ref": "#/texts/6",
@ -184,10 +253,11 @@
"$ref": "#/body"
},
"children": [],
"label": "formula",
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "f\\left(x\\right)=a_{0}+\\sum_{n=1}^{ \\infty }\\left(a_{n}\\cos(\\frac{n \\pi x}{L})+b_{n}\\sin(\\frac{n \\pi x}{L})\\right)",
"text": "f\\left(x\\right)=a_{0}+\\sum_{n=1}^{ \\infty }\\left(a_{n}\\cos(\\frac{n \\pi x}{L})+b_{n}\\sin(\\frac{n \\pi x}{L})\\right)"
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/7",
@ -195,10 +265,11 @@
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
"orig": "This is another equation:",
"text": "This is another equation:"
},
{
"self_ref": "#/texts/8",
@ -206,10 +277,11 @@
"$ref": "#/body"
},
"children": [],
"label": "paragraph",
"content_layer": "body",
"label": "formula",
"prov": [],
"orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
"text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text."
"orig": "f\\left(x\\right)=a_{0}+\\sum_{n=1}^{ \\infty }\\left(a_{n}\\cos(\\frac{n \\pi x}{L})+b_{n}\\sin(\\frac{n \\pi x}{L})\\right)",
"text": "f\\left(x\\right)=a_{0}+\\sum_{n=1}^{ \\infty }\\left(a_{n}\\cos(\\frac{n \\pi x}{L})+b_{n}\\sin(\\frac{n \\pi x}{L})\\right)"
},
{
"self_ref": "#/texts/9",
@ -217,6 +289,7 @@
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
@ -228,10 +301,11 @@
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
"orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
"text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text."
},
{
"self_ref": "#/texts/11",
@ -239,10 +313,11 @@
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:",
"text": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:"
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/12",
@ -250,6 +325,7 @@
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
@ -258,35 +334,38 @@
{
"self_ref": "#/texts/13",
"parent": {
"$ref": "#/body"
"$ref": "#/groups/1"
},
"children": [],
"label": "formula",
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "\\left(x+a\\right)^{n}=\\sum_{k=0}^{n}\\left(\\genfrac{}{}{0pt}{}{n}{k}\\right)x^{k}a^{n-k}",
"text": "\\left(x+a\\right)^{n}=\\sum_{k=0}^{n}\\left(\\genfrac{}{}{0pt}{}{n}{k}\\right)x^{k}a^{n-k}"
"orig": "This is a word document and this is an inline equation: ",
"text": "This is a word document and this is an inline equation: "
},
{
"self_ref": "#/texts/14",
"parent": {
"$ref": "#/body"
"$ref": "#/groups/1"
},
"children": [],
"label": "paragraph",
"content_layer": "body",
"label": "formula",
"prov": [],
"orig": "",
"text": ""
"orig": "A= \\pi r^{2} ",
"text": "A= \\pi r^{2} "
},
{
"self_ref": "#/texts/15",
"parent": {
"$ref": "#/body"
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "And that is an equation by itself. Cheers!",
"text": "And that is an equation by itself. Cheers!"
"orig": ". If instead, I want an equation by line, I can do this:",
"text": ". If instead, I want an equation by line, I can do this:"
},
{
"self_ref": "#/texts/16",
@ -294,6 +373,7 @@
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
@ -305,10 +385,11 @@
"$ref": "#/body"
},
"children": [],
"label": "paragraph",
"content_layer": "body",
"label": "formula",
"prov": [],
"orig": "This is another equation:",
"text": "This is another equation:"
"orig": "\\left(x+a\\right)^{n}=\\sum_{k=0}^{n}\\left(\\genfrac{}{}{0pt}{}{n}{k}\\right)x^{k}a^{n-k}",
"text": "\\left(x+a\\right)^{n}=\\sum_{k=0}^{n}\\left(\\genfrac{}{}{0pt}{}{n}{k}\\right)x^{k}a^{n-k}"
},
{
"self_ref": "#/texts/18",
@ -316,6 +397,7 @@
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
@ -327,10 +409,11 @@
"$ref": "#/body"
},
"children": [],
"label": "formula",
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "\\left(1+x\\right)^{n}=1+\\frac{nx}{1!}+\\frac{n\\left(n-1\\right)x^{2}}{2!}+ \\text{ \\textellipsis } ",
"text": "\\left(1+x\\right)^{n}=1+\\frac{nx}{1!}+\\frac{n\\left(n-1\\right)x^{2}}{2!}+ \\text{ \\textellipsis } "
"orig": "And that is an equation by itself. Cheers!",
"text": "And that is an equation by itself. Cheers!"
},
{
"self_ref": "#/texts/20",
@ -338,6 +421,7 @@
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
@ -349,10 +433,11 @@
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
"text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text."
"orig": "This is another equation:",
"text": "This is another equation:"
},
{
"self_ref": "#/texts/22",
@ -360,6 +445,7 @@
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
@ -371,10 +457,11 @@
"$ref": "#/body"
},
"children": [],
"label": "paragraph",
"content_layer": "body",
"label": "formula",
"prov": [],
"orig": "",
"text": ""
"orig": "\\left(1+x\\right)^{n}=1+\\frac{nx}{1!}+\\frac{n\\left(n-1\\right)x^{2}}{2!}+ \\text{ \\textellipsis }",
"text": "\\left(1+x\\right)^{n}=1+\\frac{nx}{1!}+\\frac{n\\left(n-1\\right)x^{2}}{2!}+ \\text{ \\textellipsis }"
},
{
"self_ref": "#/texts/24",
@ -382,10 +469,11 @@
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:",
"text": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:"
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/25",
@ -393,10 +481,11 @@
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
"orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
"text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text."
},
{
"self_ref": "#/texts/26",
@ -404,10 +493,11 @@
"$ref": "#/body"
},
"children": [],
"label": "formula",
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "e^{x}=1+\\frac{x}{1!}+\\frac{x^{2}}{2!}+\\frac{x^{3}}{3!}+ \\text{ \\textellipsis } , - \\infty < x < \\infty ",
"text": "e^{x}=1+\\frac{x}{1!}+\\frac{x^{2}}{2!}+\\frac{x^{3}}{3!}+ \\text{ \\textellipsis } , - \\infty < x < \\infty "
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/27",
@ -415,6 +505,7 @@
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
@ -422,21 +513,95 @@
},
{
"self_ref": "#/texts/28",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "This is a word document and this is an inline equation: ",
"text": "This is a word document and this is an inline equation: "
},
{
"self_ref": "#/texts/29",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "formula",
"prov": [],
"orig": "A= \\pi r^{2} ",
"text": "A= \\pi r^{2} "
},
{
"self_ref": "#/texts/30",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": ". If instead, I want an equation by line, I can do this:",
"text": ". If instead, I want an equation by line, I can do this:"
},
{
"self_ref": "#/texts/31",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/32",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "formula",
"prov": [],
"orig": "e^{x}=1+\\frac{x}{1!}+\\frac{x^{2}}{2!}+\\frac{x^{3}}{3!}+ \\text{ \\textellipsis } , - \\infty < x < \\infty",
"text": "e^{x}=1+\\frac{x}{1!}+\\frac{x^{2}}{2!}+\\frac{x^{3}}{3!}+ \\text{ \\textellipsis } , - \\infty < x < \\infty"
},
{
"self_ref": "#/texts/33",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/34",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "And that is an equation by itself. Cheers!",
"text": "And that is an equation by itself. Cheers!"
},
{
"self_ref": "#/texts/29",
"self_ref": "#/texts/35",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
@ -446,5 +611,6 @@
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@ -1,4 +1,4 @@
This is a word document and this is an inline equation: $A= \pi r^{2} $. If instead, I want an equation by line, I can do this:
This is a word document and this is an inline equation: $A= \pi r^{2} $ . If instead, I want an equation by line, I can do this:
$$a^{2}+b^{2}=c^{2} \text{ \texttimes } 23$$
@ -10,7 +10,7 @@ $$f\left(x\right)=a_{0}+\sum_{n=1}^{ \infty }\left(a_{n}\cos(\frac{n \pi x}{L})+
This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.
This is a word document and this is an inline equation: $A= \pi r^{2} $. If instead, I want an equation by line, I can do this:
This is a word document and this is an inline equation: $A= \pi r^{2} $ . If instead, I want an equation by line, I can do this:
$$\left(x+a\right)^{n}=\sum_{k=0}^{n}\left(\genfrac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}$$
@ -18,12 +18,12 @@ And that is an equation by itself. Cheers!
This is another equation:
$$\left(1+x\right)^{n}=1+\frac{nx}{1!}+\frac{n\left(n-1\right)x^{2}}{2!}+ \text{ \textellipsis } $$
$$\left(1+x\right)^{n}=1+\frac{nx}{1!}+\frac{n\left(n-1\right)x^{2}}{2!}+ \text{ \textellipsis }$$
This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.
This is a word document and this is an inline equation: $A= \pi r^{2} $. If instead, I want an equation by line, I can do this:
This is a word document and this is an inline equation: $A= \pi r^{2} $ . If instead, I want an equation by line, I can do this:
$$e^{x}=1+\frac{x}{1!}+\frac{x^{2}}{2!}+\frac{x^{3}}{3!}+ \text{ \textellipsis } , - \infty < x < \infty $$
$$e^{x}=1+\frac{x}{1!}+\frac{x^{2}}{2!}+\frac{x^{3}}{3!}+ \text{ \textellipsis } , - \infty < x < \infty$$
And that is an equation by itself. Cheers!

View File

@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.1.0",
"version": "1.2.0",
"name": "lorem_ipsum",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -163,5 +163,6 @@
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.1.0",
"version": "1.2.0",
"name": "tablecell",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -399,5 +399,6 @@
}
],
"key_value_items": [],
"form_items": [],
"pages": {}
}

File diff suppressed because one or more lines are too long

View File

@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.1.0",
"version": "1.2.0",
"name": "unit_test_headers",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -748,5 +748,6 @@
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.1.0",
"version": "1.2.0",
"name": "unit_test_headers_numbered",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -802,5 +802,6 @@
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.1.0",
"version": "1.2.0",
"name": "unit_test_lists",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -979,5 +979,6 @@
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@ -3,7 +3,7 @@ item-0 at level 0: unspecified: group _root_
item-2 at level 1: title: Swimming in the lake
item-3 at level 2: paragraph: Duck
item-4 at level 2: picture
item-5 at level 2: text: Figure 1: This is a cute duckling
item-5 at level 2: paragraph: Figure 1: This is a cute duckling
item-6 at level 2: section_header: Lets swim!
item-7 at level 3: paragraph: To get started with swimming, fi ... down in a water and try not to drown:
item-8 at level 3: list: group list

File diff suppressed because one or more lines are too long

View File

@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.1.0",
"version": "1.2.0",
"name": "word_tables",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -2372,5 +2372,6 @@
}
],
"key_value_items": [],
"form_items": [],
"pages": {}
}