fix: support escaped characters in markdown backend (#2304)

fix: improve markdown backend to support input documents with escaped characters

Signed-off-by: Lucas Morin <lucas.morin222@gmail.com>
This commit is contained in:
Lucas Morin
2025-09-23 18:00:16 +02:00
committed by GitHub
parent d599177547
commit 9d67bb9ed6
7 changed files with 772 additions and 3 deletions

View File

@@ -0,0 +1,675 @@
{
"schema_name": "DoclingDocument",
"version": "1.7.0",
"name": "escaped_characters",
"origin": {
"mimetype": "text/html",
"binary_hash": 10682185258371912110,
"filename": "escaped_characters.md"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/1"
},
{
"$ref": "#/texts/4"
},
{
"$ref": "#/texts/7"
},
{
"$ref": "#/texts/9"
},
{
"$ref": "#/texts/11"
},
{
"$ref": "#/texts/12"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/texts/4"
},
"children": [
{
"$ref": "#/texts/5"
}
],
"content_layer": "body",
"name": "ordered list",
"label": "list"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/texts/4"
},
"children": [
{
"$ref": "#/texts/6"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "furniture",
"label": "title",
"prov": [],
"orig": "escaped_characters",
"text": "escaped_characters"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/2"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Headers:",
"text": "Headers:"
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/texts/3"
}
],
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "& < > \" '",
"text": "& < > \" '",
"level": 1
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/texts/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Text: 00:16.000 ----> 00:18.000 & < > \" '",
"text": "Text: 00:16.000 ----> 00:18.000 & < > \" '"
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/groups/0"
},
{
"$ref": "#/groups/1"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Lists",
"text": "Lists"
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "& < > \" '",
"text": "& < > \" '",
"enumerated": true,
"marker": ""
},
{
"self_ref": "#/texts/6",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "& < > \" '",
"text": "& < > \" '",
"enumerated": false,
"marker": ""
},
{
"self_ref": "#/texts/7",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/8"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Inline code",
"text": "Inline code"
},
{
"self_ref": "#/texts/8",
"parent": {
"$ref": "#/texts/7"
},
"children": [],
"content_layer": "body",
"label": "code",
"prov": [],
"orig": "& < > \" '",
"text": "& < > \" '",
"captions": [],
"references": [],
"footnotes": [],
"code_language": "unknown"
},
{
"self_ref": "#/texts/9",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/10"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Code block",
"text": "Code block"
},
{
"self_ref": "#/texts/10",
"parent": {
"$ref": "#/texts/9"
},
"children": [],
"content_layer": "body",
"label": "code",
"prov": [],
"orig": "& < > \" '",
"text": "& < > \" '",
"captions": [],
"references": [],
"footnotes": [],
"code_language": "unknown"
},
{
"self_ref": "#/texts/11",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/tables/0"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Table",
"text": "Table"
},
{
"self_ref": "#/texts/12",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/13"
},
{
"$ref": "#/texts/14"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Raw HTML",
"text": "Raw HTML"
},
{
"self_ref": "#/texts/13",
"parent": {
"$ref": "#/texts/12"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "& < > \" '/div>",
"text": "& < > \" '/div>"
},
{
"self_ref": "#/texts/14",
"parent": {
"$ref": "#/texts/12"
},
"children": [
{
"$ref": "#/texts/15"
}
],
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "Link",
"text": "Link",
"level": 1
},
{
"self_ref": "#/texts/15",
"parent": {
"$ref": "#/texts/14"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "& < > \" '",
"text": "& < > \" '",
"hyperlink": "https://en.wikipedia.org/wiki/Albert_Einstein"
}
],
"pictures": [],
"tables": [
{
"self_ref": "#/tables/0",
"parent": {
"$ref": "#/texts/11"
},
"children": [],
"content_layer": "body",
"label": "table",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Key",
"column_header": true,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Example",
"column_header": true,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Ampersand",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "&",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Less-than",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "<",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Greater-than",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": ">",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Quotes",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "\"",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 5,
"end_row_offset_idx": 6,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Apostrophes",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 5,
"end_row_offset_idx": 6,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "'",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
"num_rows": 6,
"num_cols": 2,
"grid": [
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Key",
"column_header": true,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Example",
"column_header": true,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Ampersand",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "&",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Less-than",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "<",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Greater-than",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": ">",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Quotes",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "\"",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 5,
"end_row_offset_idx": 6,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Apostrophes",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 5,
"end_row_offset_idx": 6,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "'",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
]
]
},
"annotations": []
}
],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@@ -0,0 +1,41 @@
# Headers:
## &amp; &lt; &gt; " '
Text: 00:16.000 ----&gt; 00:18.000 &amp; &lt; &gt; " '
# Lists
1. &amp; &lt; &gt; " '
- &amp; &lt; &gt; " '
# Inline code
```
& < > " '
```
# Code block
```
& < > " '
```
# Table
| Key | Example |
|--------------|-----------|
| Ampersand | & |
| Less-than | < |
| Greater-than | > |
| Quotes | " |
| Apostrophes | ' |
# Raw HTML
&amp; &lt; &gt; " '/div&gt;
## Link
[&amp; &lt; &gt; " '](https://en.wikipedia.org/wiki/Albert_Einstein)

View File

@@ -186,6 +186,7 @@ tables:
column_header: true
end_col_offset_idx: 1
end_row_offset_idx: 1
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -196,6 +197,7 @@ tables:
column_header: true
end_col_offset_idx: 2
end_row_offset_idx: 1
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -206,6 +208,7 @@ tables:
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -216,6 +219,7 @@ tables:
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -229,6 +233,7 @@ tables:
column_header: true
end_col_offset_idx: 1
end_row_offset_idx: 1
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -239,6 +244,7 @@ tables:
column_header: true
end_col_offset_idx: 2
end_row_offset_idx: 1
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -249,6 +255,7 @@ tables:
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -259,6 +266,7 @@ tables:
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -269,6 +277,7 @@ tables:
column_header: true
end_col_offset_idx: 1
end_row_offset_idx: 1
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -279,6 +288,7 @@ tables:
column_header: true
end_col_offset_idx: 2
end_row_offset_idx: 1
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -289,6 +299,7 @@ tables:
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -299,6 +310,7 @@ tables:
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -878,4 +890,4 @@ texts:
prov: []
self_ref: '#/texts/48'
text: Table Heading
version: 1.6.0
version: 1.7.0

View File

@@ -136,4 +136,4 @@ texts:
prov: []
self_ref: '#/texts/7'
text: The end!
version: 1.6.0
version: 1.7.0

33
tests/data/md/escaped_characters.md vendored Normal file
View File

@@ -0,0 +1,33 @@
# Headers:
## &amp; &lt; &gt; &quot; &#39;
Text:
00:16.000 ----&gt; 00:18.000
&amp; &lt; &gt; &quot; &#39;
# Lists
1. &amp; &lt; &gt; &quot; &#39;
- &amp; &lt; &gt; &quot; &#39;
# Inline code
`&amp; &lt; &gt; &quot; &#39; `
# Code block
```
&amp; &lt; &gt; &quot; &#39;
```
# Table
| Key | Example |
| ------------------- | ----------------- |
| Ampersand | &amp; |
| Less-than | &lt; |
| Greater-than | &gt; |
| Quotes | &quot; |
| Apostrophes | &#39; |
# Raw HTML
<div title="">&amp; &lt; &gt; &quot; &#39;/div>
## Link
[&amp; &lt; &gt; &quot; &#39;](https://en.wikipedia.org/wiki/Albert_Einstein)