mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 03:55:00 +00:00
* fix(ocr): tesseract support mis-oriented documents Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): update missing test data Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): rotate image to the natural orientation before layout prediction Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): move bounding bow rotation util to orientation.py Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): refactor rotation utilities Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): revert layout updates Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): update e2e OCR test data Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): avoid to swallow tesseract errors causing orientation detection failures Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): revert layout updates Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): update e2e OCR test data * chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrCliModel` * chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrModel` * chore(ocr): default `TesseractOcrCliModel._is_auto` to `False` * fix(ocr): fix `TesseractOcrCliModel._is_auto` computation * chore(ocr): improve logging in case of OSD failure in `TesseractOcrCliModel` and `TesseractOcrModel` --------- Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
658 lines
16 KiB
JSON
658 lines
16 KiB
JSON
{
|
|
"schema_name": "DoclingDocument",
|
|
"version": "1.3.0",
|
|
"name": "equations",
|
|
"origin": {
|
|
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
"binary_hash": 11121138535595486899,
|
|
"filename": "equations.docx"
|
|
},
|
|
"furniture": {
|
|
"self_ref": "#/furniture",
|
|
"children": [],
|
|
"content_layer": "furniture",
|
|
"name": "_root_",
|
|
"label": "unspecified"
|
|
},
|
|
"body": {
|
|
"self_ref": "#/body",
|
|
"children": [
|
|
{
|
|
"$ref": "#/groups/0"
|
|
},
|
|
{
|
|
"$ref": "#/texts/3"
|
|
},
|
|
{
|
|
"$ref": "#/texts/4"
|
|
},
|
|
{
|
|
"$ref": "#/texts/5"
|
|
},
|
|
{
|
|
"$ref": "#/texts/6"
|
|
},
|
|
{
|
|
"$ref": "#/texts/7"
|
|
},
|
|
{
|
|
"$ref": "#/texts/8"
|
|
},
|
|
{
|
|
"$ref": "#/texts/9"
|
|
},
|
|
{
|
|
"$ref": "#/texts/10"
|
|
},
|
|
{
|
|
"$ref": "#/texts/11"
|
|
},
|
|
{
|
|
"$ref": "#/texts/12"
|
|
},
|
|
{
|
|
"$ref": "#/groups/1"
|
|
},
|
|
{
|
|
"$ref": "#/texts/16"
|
|
},
|
|
{
|
|
"$ref": "#/texts/17"
|
|
},
|
|
{
|
|
"$ref": "#/texts/18"
|
|
},
|
|
{
|
|
"$ref": "#/texts/19"
|
|
},
|
|
{
|
|
"$ref": "#/texts/20"
|
|
},
|
|
{
|
|
"$ref": "#/texts/21"
|
|
},
|
|
{
|
|
"$ref": "#/texts/22"
|
|
},
|
|
{
|
|
"$ref": "#/texts/23"
|
|
},
|
|
{
|
|
"$ref": "#/texts/24"
|
|
},
|
|
{
|
|
"$ref": "#/texts/25"
|
|
},
|
|
{
|
|
"$ref": "#/texts/26"
|
|
},
|
|
{
|
|
"$ref": "#/texts/27"
|
|
},
|
|
{
|
|
"$ref": "#/groups/2"
|
|
},
|
|
{
|
|
"$ref": "#/texts/31"
|
|
},
|
|
{
|
|
"$ref": "#/texts/32"
|
|
},
|
|
{
|
|
"$ref": "#/texts/33"
|
|
},
|
|
{
|
|
"$ref": "#/texts/34"
|
|
},
|
|
{
|
|
"$ref": "#/texts/35"
|
|
}
|
|
],
|
|
"content_layer": "body",
|
|
"name": "_root_",
|
|
"label": "unspecified"
|
|
},
|
|
"groups": [
|
|
{
|
|
"self_ref": "#/groups/0",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [
|
|
{
|
|
"$ref": "#/texts/0"
|
|
},
|
|
{
|
|
"$ref": "#/texts/1"
|
|
},
|
|
{
|
|
"$ref": "#/texts/2"
|
|
}
|
|
],
|
|
"content_layer": "body",
|
|
"name": "group",
|
|
"label": "inline"
|
|
},
|
|
{
|
|
"self_ref": "#/groups/1",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [
|
|
{
|
|
"$ref": "#/texts/13"
|
|
},
|
|
{
|
|
"$ref": "#/texts/14"
|
|
},
|
|
{
|
|
"$ref": "#/texts/15"
|
|
}
|
|
],
|
|
"content_layer": "body",
|
|
"name": "group",
|
|
"label": "inline"
|
|
},
|
|
{
|
|
"self_ref": "#/groups/2",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [
|
|
{
|
|
"$ref": "#/texts/28"
|
|
},
|
|
{
|
|
"$ref": "#/texts/29"
|
|
},
|
|
{
|
|
"$ref": "#/texts/30"
|
|
}
|
|
],
|
|
"content_layer": "body",
|
|
"name": "group",
|
|
"label": "inline"
|
|
}
|
|
],
|
|
"texts": [
|
|
{
|
|
"self_ref": "#/texts/0",
|
|
"parent": {
|
|
"$ref": "#/groups/0"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "This is a word document and this is an inline equation: ",
|
|
"text": "This is a word document and this is an inline equation: "
|
|
},
|
|
{
|
|
"self_ref": "#/texts/1",
|
|
"parent": {
|
|
"$ref": "#/groups/0"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "formula",
|
|
"prov": [],
|
|
"orig": "A= \\pi r^{2}",
|
|
"text": "A= \\pi r^{2}"
|
|
},
|
|
{
|
|
"self_ref": "#/texts/2",
|
|
"parent": {
|
|
"$ref": "#/groups/0"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": ". If instead, I want an equation by line, I can do this:",
|
|
"text": ". If instead, I want an equation by line, I can do this:"
|
|
},
|
|
{
|
|
"self_ref": "#/texts/3",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "",
|
|
"text": ""
|
|
},
|
|
{
|
|
"self_ref": "#/texts/4",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "formula",
|
|
"prov": [],
|
|
"orig": "a^{2}+b^{2}=c^{2} \\text{ \\texttimes } 23",
|
|
"text": "a^{2}+b^{2}=c^{2} \\text{ \\texttimes } 23"
|
|
},
|
|
{
|
|
"self_ref": "#/texts/5",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "And that is an equation by itself. Cheers!",
|
|
"text": "And that is an equation by itself. Cheers!",
|
|
"formatting": {
|
|
"bold": false,
|
|
"italic": false,
|
|
"underline": false,
|
|
"strikethrough": false
|
|
}
|
|
},
|
|
{
|
|
"self_ref": "#/texts/6",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "",
|
|
"text": ""
|
|
},
|
|
{
|
|
"self_ref": "#/texts/7",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "This is another equation:",
|
|
"text": "This is another equation:",
|
|
"formatting": {
|
|
"bold": false,
|
|
"italic": false,
|
|
"underline": false,
|
|
"strikethrough": false
|
|
}
|
|
},
|
|
{
|
|
"self_ref": "#/texts/8",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "formula",
|
|
"prov": [],
|
|
"orig": "f\\left(x\\right)=a_{0}+\\sum_{n=1}^{ \\infty }\\left(a_{n}\\cos(\\frac{n \\pi x}{L})+b_{n}\\sin(\\frac{n \\pi x}{L})\\right)",
|
|
"text": "f\\left(x\\right)=a_{0}+\\sum_{n=1}^{ \\infty }\\left(a_{n}\\cos(\\frac{n \\pi x}{L})+b_{n}\\sin(\\frac{n \\pi x}{L})\\right)"
|
|
},
|
|
{
|
|
"self_ref": "#/texts/9",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "",
|
|
"text": ""
|
|
},
|
|
{
|
|
"self_ref": "#/texts/10",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
|
|
"text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
|
|
"formatting": {
|
|
"bold": false,
|
|
"italic": false,
|
|
"underline": false,
|
|
"strikethrough": false
|
|
}
|
|
},
|
|
{
|
|
"self_ref": "#/texts/11",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "",
|
|
"text": ""
|
|
},
|
|
{
|
|
"self_ref": "#/texts/12",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "",
|
|
"text": ""
|
|
},
|
|
{
|
|
"self_ref": "#/texts/13",
|
|
"parent": {
|
|
"$ref": "#/groups/1"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "This is a word document and this is an inline equation: ",
|
|
"text": "This is a word document and this is an inline equation: "
|
|
},
|
|
{
|
|
"self_ref": "#/texts/14",
|
|
"parent": {
|
|
"$ref": "#/groups/1"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "formula",
|
|
"prov": [],
|
|
"orig": "A= \\pi r^{2}",
|
|
"text": "A= \\pi r^{2}"
|
|
},
|
|
{
|
|
"self_ref": "#/texts/15",
|
|
"parent": {
|
|
"$ref": "#/groups/1"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": ". If instead, I want an equation by line, I can do this:",
|
|
"text": ". If instead, I want an equation by line, I can do this:"
|
|
},
|
|
{
|
|
"self_ref": "#/texts/16",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "",
|
|
"text": ""
|
|
},
|
|
{
|
|
"self_ref": "#/texts/17",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "formula",
|
|
"prov": [],
|
|
"orig": "\\left(x+a\\right)^{n}=\\sum_{k=0}^{n}\\left(\\genfrac{}{}{0pt}{}{n}{k}\\right)x^{k}a^{n-k}",
|
|
"text": "\\left(x+a\\right)^{n}=\\sum_{k=0}^{n}\\left(\\genfrac{}{}{0pt}{}{n}{k}\\right)x^{k}a^{n-k}"
|
|
},
|
|
{
|
|
"self_ref": "#/texts/18",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "",
|
|
"text": ""
|
|
},
|
|
{
|
|
"self_ref": "#/texts/19",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "And that is an equation by itself. Cheers!",
|
|
"text": "And that is an equation by itself. Cheers!",
|
|
"formatting": {
|
|
"bold": false,
|
|
"italic": false,
|
|
"underline": false,
|
|
"strikethrough": false
|
|
}
|
|
},
|
|
{
|
|
"self_ref": "#/texts/20",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "",
|
|
"text": ""
|
|
},
|
|
{
|
|
"self_ref": "#/texts/21",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "This is another equation:",
|
|
"text": "This is another equation:",
|
|
"formatting": {
|
|
"bold": false,
|
|
"italic": false,
|
|
"underline": false,
|
|
"strikethrough": false
|
|
}
|
|
},
|
|
{
|
|
"self_ref": "#/texts/22",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "",
|
|
"text": ""
|
|
},
|
|
{
|
|
"self_ref": "#/texts/23",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "formula",
|
|
"prov": [],
|
|
"orig": "\\left(1+x\\right)^{n}=1+\\frac{nx}{1!}+\\frac{n\\left(n-1\\right)x^{2}}{2!}+ \\text{ \\textellipsis }",
|
|
"text": "\\left(1+x\\right)^{n}=1+\\frac{nx}{1!}+\\frac{n\\left(n-1\\right)x^{2}}{2!}+ \\text{ \\textellipsis }"
|
|
},
|
|
{
|
|
"self_ref": "#/texts/24",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "",
|
|
"text": ""
|
|
},
|
|
{
|
|
"self_ref": "#/texts/25",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
|
|
"text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
|
|
"formatting": {
|
|
"bold": false,
|
|
"italic": false,
|
|
"underline": false,
|
|
"strikethrough": false
|
|
}
|
|
},
|
|
{
|
|
"self_ref": "#/texts/26",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "",
|
|
"text": ""
|
|
},
|
|
{
|
|
"self_ref": "#/texts/27",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "",
|
|
"text": ""
|
|
},
|
|
{
|
|
"self_ref": "#/texts/28",
|
|
"parent": {
|
|
"$ref": "#/groups/2"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "This is a word document and this is an inline equation: ",
|
|
"text": "This is a word document and this is an inline equation: "
|
|
},
|
|
{
|
|
"self_ref": "#/texts/29",
|
|
"parent": {
|
|
"$ref": "#/groups/2"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "formula",
|
|
"prov": [],
|
|
"orig": "A= \\pi r^{2}",
|
|
"text": "A= \\pi r^{2}"
|
|
},
|
|
{
|
|
"self_ref": "#/texts/30",
|
|
"parent": {
|
|
"$ref": "#/groups/2"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": ". If instead, I want an equation by line, I can do this:",
|
|
"text": ". If instead, I want an equation by line, I can do this:"
|
|
},
|
|
{
|
|
"self_ref": "#/texts/31",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "",
|
|
"text": ""
|
|
},
|
|
{
|
|
"self_ref": "#/texts/32",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "formula",
|
|
"prov": [],
|
|
"orig": "e^{x}=1+\\frac{x}{1!}+\\frac{x^{2}}{2!}+\\frac{x^{3}}{3!}+ \\text{ \\textellipsis } , - \\infty < x < \\infty",
|
|
"text": "e^{x}=1+\\frac{x}{1!}+\\frac{x^{2}}{2!}+\\frac{x^{3}}{3!}+ \\text{ \\textellipsis } , - \\infty < x < \\infty"
|
|
},
|
|
{
|
|
"self_ref": "#/texts/33",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "",
|
|
"text": ""
|
|
},
|
|
{
|
|
"self_ref": "#/texts/34",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "And that is an equation by itself. Cheers!",
|
|
"text": "And that is an equation by itself. Cheers!",
|
|
"formatting": {
|
|
"bold": false,
|
|
"italic": false,
|
|
"underline": false,
|
|
"strikethrough": false
|
|
}
|
|
},
|
|
{
|
|
"self_ref": "#/texts/35",
|
|
"parent": {
|
|
"$ref": "#/body"
|
|
},
|
|
"children": [],
|
|
"content_layer": "body",
|
|
"label": "paragraph",
|
|
"prov": [],
|
|
"orig": "",
|
|
"text": ""
|
|
}
|
|
],
|
|
"pictures": [],
|
|
"tables": [],
|
|
"key_value_items": [],
|
|
"form_items": [],
|
|
"pages": {}
|
|
} |