mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Adding test files
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
This commit is contained in:
parent
7996dcbf3e
commit
f5034944b8
BIN
tests/data/docx/equations.docx
Normal file
BIN
tests/data/docx/equations.docx
Normal file
Binary file not shown.
31
tests/data/groundtruth/docling_v2/equations.docx.itxt
Normal file
31
tests/data/groundtruth/docling_v2/equations.docx.itxt
Normal file
@ -0,0 +1,31 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: paragraph: This is a word document and this ... nt an equation by line, I can do this:
|
||||
item-2 at level 1: paragraph:
|
||||
item-3 at level 1: paragraph: $a^{2}+b^{2}=c^{2} \text{ \texttimes } 23$
|
||||
item-4 at level 1: paragraph: And that is an equation by itself. Cheers!
|
||||
item-5 at level 1: paragraph:
|
||||
item-6 at level 1: paragraph: This is another equation:
|
||||
item-7 at level 1: paragraph: $f\left(x\right)=a_{0}+\sum_{n=1 ... )+b_{n}\sin(\frac{n \pi x}{L})\right)$
|
||||
item-8 at level 1: paragraph:
|
||||
item-9 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text.
|
||||
item-10 at level 1: paragraph:
|
||||
item-11 at level 1: paragraph:
|
||||
item-12 at level 1: paragraph: This is a word document and this ... nt an equation by line, I can do this:
|
||||
item-13 at level 1: paragraph:
|
||||
item-14 at level 1: paragraph: $\left(x+a\right)^{n}=\sum_{k=0} ... c{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}$
|
||||
item-15 at level 1: paragraph:
|
||||
item-16 at level 1: paragraph: And that is an equation by itself. Cheers!
|
||||
item-17 at level 1: paragraph:
|
||||
item-18 at level 1: paragraph: This is another equation:
|
||||
item-19 at level 1: paragraph:
|
||||
item-20 at level 1: paragraph: $\left(1+x\right)^{n}=1+\frac{nx ... t)x^{2}}{2!}+ \text{ \textellipsis } $
|
||||
item-21 at level 1: paragraph:
|
||||
item-22 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text.
|
||||
item-23 at level 1: paragraph:
|
||||
item-24 at level 1: paragraph:
|
||||
item-25 at level 1: paragraph: This is a word document and this ... nt an equation by line, I can do this:
|
||||
item-26 at level 1: paragraph:
|
||||
item-27 at level 1: paragraph: $e^{x}=1+\frac{x}{1!}+\frac{x^{2 ... ellipsis } , - \infty < x < \infty $
|
||||
item-28 at level 1: paragraph:
|
||||
item-29 at level 1: paragraph: And that is an equation by itself. Cheers!
|
||||
item-30 at level 1: paragraph:
|
450
tests/data/groundtruth/docling_v2/equations.docx.json
Normal file
450
tests/data/groundtruth/docling_v2/equations.docx.json
Normal file
@ -0,0 +1,450 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.0.0",
|
||||
"name": "equations",
|
||||
"origin": {
|
||||
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"binary_hash": 11121138535595486899,
|
||||
"filename": "equations.docx"
|
||||
},
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
"children": [],
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"body": {
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/2"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/3"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/4"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/5"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/6"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/7"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/8"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/9"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/10"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/11"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/12"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/13"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/14"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/15"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/16"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/17"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/18"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/19"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/20"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/21"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/22"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/23"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/24"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/25"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/26"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/27"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/28"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/29"
|
||||
}
|
||||
],
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [],
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:",
|
||||
"text": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/2",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "$a^{2}+b^{2}=c^{2} \\text{ \\texttimes } 23$",
|
||||
"text": "$a^{2}+b^{2}=c^{2} \\text{ \\texttimes } 23$"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/3",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "And that is an equation by itself. Cheers!",
|
||||
"text": "And that is an equation by itself. Cheers!"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/4",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/5",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "This is another equation:",
|
||||
"text": "This is another equation:"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/6",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "$f\\left(x\\right)=a_{0}+\\sum_{n=1}^{ \\infty }\\left(a_{n}\\cos(\\frac{n \\pi x}{L})+b_{n}\\sin(\\frac{n \\pi x}{L})\\right)$",
|
||||
"text": "$f\\left(x\\right)=a_{0}+\\sum_{n=1}^{ \\infty }\\left(a_{n}\\cos(\\frac{n \\pi x}{L})+b_{n}\\sin(\\frac{n \\pi x}{L})\\right)$"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/7",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/8",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
|
||||
"text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text."
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/9",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/10",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/11",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:",
|
||||
"text": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/12",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/13",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "$\\left(x+a\\right)^{n}=\\sum_{k=0}^{n}\\left(\\genfrac{}{}{0pt}{}{n}{k}\\right)x^{k}a^{n-k}$",
|
||||
"text": "$\\left(x+a\\right)^{n}=\\sum_{k=0}^{n}\\left(\\genfrac{}{}{0pt}{}{n}{k}\\right)x^{k}a^{n-k}$"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/14",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/15",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "And that is an equation by itself. Cheers!",
|
||||
"text": "And that is an equation by itself. Cheers!"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/16",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/17",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "This is another equation:",
|
||||
"text": "This is another equation:"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/18",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/19",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "$\\left(1+x\\right)^{n}=1+\\frac{nx}{1!}+\\frac{n\\left(n-1\\right)x^{2}}{2!}+ \\text{ \\textellipsis } $",
|
||||
"text": "$\\left(1+x\\right)^{n}=1+\\frac{nx}{1!}+\\frac{n\\left(n-1\\right)x^{2}}{2!}+ \\text{ \\textellipsis } $"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/20",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/21",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
|
||||
"text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text."
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/22",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/23",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/24",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:",
|
||||
"text": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/25",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/26",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "$e^{x}=1+\\frac{x}{1!}+\\frac{x^{2}}{2!}+\\frac{x^{3}}{3!}+ \\text{ \\textellipsis } , - \\infty < x < \\infty $",
|
||||
"text": "$e^{x}=1+\\frac{x}{1!}+\\frac{x^{2}}{2!}+\\frac{x^{3}}{3!}+ \\text{ \\textellipsis } , - \\infty < x < \\infty $"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/27",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/28",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "And that is an equation by itself. Cheers!",
|
||||
"text": "And that is an equation by itself. Cheers!"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/29",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"tables": [],
|
||||
"key_value_items": [],
|
||||
"pages": {}
|
||||
}
|
29
tests/data/groundtruth/docling_v2/equations.docx.md
Normal file
29
tests/data/groundtruth/docling_v2/equations.docx.md
Normal file
@ -0,0 +1,29 @@
|
||||
This is a word document and this is an inline equation: $A= \pi r^{2} $. If instead, I want an equation by line, I can do this:
|
||||
|
||||
$a^{2}+b^{2}=c^{2} \text{ \texttimes } 23$
|
||||
|
||||
And that is an equation by itself. Cheers!
|
||||
|
||||
This is another equation:
|
||||
|
||||
$f\left(x\right)=a\_{0}+\sum\_{n=1}^{ \infty }\left(a\_{n}\cos(\frac{n \pi x}{L})+b\_{n}\sin(\frac{n \pi x}{L})\right)$
|
||||
|
||||
This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.
|
||||
|
||||
This is a word document and this is an inline equation: $A= \pi r^{2} $. If instead, I want an equation by line, I can do this:
|
||||
|
||||
$\left(x+a\right)^{n}=\sum\_{k=0}^{n}\left(\genfrac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}$
|
||||
|
||||
And that is an equation by itself. Cheers!
|
||||
|
||||
This is another equation:
|
||||
|
||||
$\left(1+x\right)^{n}=1+\frac{nx}{1!}+\frac{n\left(n-1\right)x^{2}}{2!}+ \text{ \textellipsis } $
|
||||
|
||||
This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.
|
||||
|
||||
This is a word document and this is an inline equation: $A= \pi r^{2} $. If instead, I want an equation by line, I can do this:
|
||||
|
||||
$e^{x}=1+\frac{x}{1!}+\frac{x^{2}}{2!}+\frac{x^{3}}{3!}+ \text{ \textellipsis } , - \infty < x < \infty $
|
||||
|
||||
And that is an equation by itself. Cheers!
|
Loading…
Reference in New Issue
Block a user