mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
refactor(HTML): handle text from styled html (#1960)
* A new HTML backend that handles styled HTML (ignores it) as well as images. Images are parsed as placeholders with a caption, if it exists. Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Co-authored-by: vaaale <2428222+vaaale@users.noreply.github.com> Signed-off-by: Alexander Vaagan <alexander.vaagan@gmail.com> Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Signed-off-by: vaaale <2428222+vaaale@users.noreply.github.com> * tests(HTML): re-enable test_ordered_lists Re-enable test_ordered_lists regression test for the HTML backend since docling-core now supports ordered lists with a custom start value. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --------- Signed-off-by: Alexander Vaagan <alexander.vaagan@gmail.com> Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Signed-off-by: vaaale <2428222+vaaale@users.noreply.github.com> Co-authored-by: Alexander Vaagan <2428222+vaaale@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
5d98bcea1b
commit
a069b1175b
@@ -19,6 +19,9 @@
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/4"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@@ -33,10 +36,10 @@
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/4"
|
||||
"$ref": "#/texts/5"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/5"
|
||||
"$ref": "#/texts/6"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@@ -50,10 +53,10 @@
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/6"
|
||||
"$ref": "#/texts/7"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/7"
|
||||
"$ref": "#/texts/8"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@@ -67,10 +70,10 @@
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/8"
|
||||
"$ref": "#/texts/9"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/9"
|
||||
"$ref": "#/texts/10"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@@ -153,6 +156,18 @@
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/4",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "caption",
|
||||
"prov": [],
|
||||
"orig": "Example image",
|
||||
"text": "Example image"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/5",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
@@ -166,7 +181,7 @@
|
||||
"marker": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/5",
|
||||
"self_ref": "#/texts/6",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
@@ -180,7 +195,7 @@
|
||||
"marker": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/6",
|
||||
"self_ref": "#/texts/7",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
@@ -194,7 +209,7 @@
|
||||
"marker": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/7",
|
||||
"self_ref": "#/texts/8",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
@@ -208,7 +223,7 @@
|
||||
"marker": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/8",
|
||||
"self_ref": "#/texts/9",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
@@ -222,7 +237,7 @@
|
||||
"marker": "42."
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/9",
|
||||
"self_ref": "#/texts/10",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
@@ -246,7 +261,11 @@
|
||||
"content_layer": "body",
|
||||
"label": "picture",
|
||||
"prov": [],
|
||||
"captions": [],
|
||||
"captions": [
|
||||
{
|
||||
"$ref": "#/texts/4"
|
||||
}
|
||||
],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"annotations": []
|
||||
|
||||
Reference in New Issue
Block a user