fix: support escaped characters in markdown backend (#2304)

fix: improve markdown backend to support input documents with escaped characters

Signed-off-by: Lucas Morin <lucas.morin222@gmail.com>
This commit is contained in:
Lucas Morin
2025-09-23 18:00:16 +02:00
committed by GitHub
parent d599177547
commit 9d67bb9ed6
7 changed files with 772 additions and 3 deletions

View File

@@ -3,6 +3,7 @@ import re
import warnings
from copy import deepcopy
from enum import Enum
from html import unescape
from io import BytesIO
from pathlib import Path
from typing import Literal, Optional, Union, cast
@@ -321,9 +322,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
fig_caption: Optional[TextItem] = None
if element.title is not None and element.title != "":
title = unescape(element.title)
fig_caption = doc.add_text(
label=DocItemLabel.CAPTION,
text=element.title,
text=title,
formatting=formatting,
hyperlink=hyperlink,
)
@@ -351,6 +353,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
snippet_text = (
element.children.strip() if isinstance(element.children, str) else ""
)
snippet_text = unescape(snippet_text)
# Detect start of the table:
if "|" in snippet_text or self.in_table:
# most likely part of the markdown table

View File

@@ -0,0 +1,675 @@
{
"schema_name": "DoclingDocument",
"version": "1.7.0",
"name": "escaped_characters",
"origin": {
"mimetype": "text/html",
"binary_hash": 10682185258371912110,
"filename": "escaped_characters.md"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/1"
},
{
"$ref": "#/texts/4"
},
{
"$ref": "#/texts/7"
},
{
"$ref": "#/texts/9"
},
{
"$ref": "#/texts/11"
},
{
"$ref": "#/texts/12"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/texts/4"
},
"children": [
{
"$ref": "#/texts/5"
}
],
"content_layer": "body",
"name": "ordered list",
"label": "list"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/texts/4"
},
"children": [
{
"$ref": "#/texts/6"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "furniture",
"label": "title",
"prov": [],
"orig": "escaped_characters",
"text": "escaped_characters"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/2"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Headers:",
"text": "Headers:"
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/texts/3"
}
],
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "& < > \" '",
"text": "& < > \" '",
"level": 1
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/texts/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Text: 00:16.000 ----> 00:18.000 & < > \" '",
"text": "Text: 00:16.000 ----> 00:18.000 & < > \" '"
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/groups/0"
},
{
"$ref": "#/groups/1"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Lists",
"text": "Lists"
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "& < > \" '",
"text": "& < > \" '",
"enumerated": true,
"marker": ""
},
{
"self_ref": "#/texts/6",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "& < > \" '",
"text": "& < > \" '",
"enumerated": false,
"marker": ""
},
{
"self_ref": "#/texts/7",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/8"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Inline code",
"text": "Inline code"
},
{
"self_ref": "#/texts/8",
"parent": {
"$ref": "#/texts/7"
},
"children": [],
"content_layer": "body",
"label": "code",
"prov": [],
"orig": "& < > \" '",
"text": "& < > \" '",
"captions": [],
"references": [],
"footnotes": [],
"code_language": "unknown"
},
{
"self_ref": "#/texts/9",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/10"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Code block",
"text": "Code block"
},
{
"self_ref": "#/texts/10",
"parent": {
"$ref": "#/texts/9"
},
"children": [],
"content_layer": "body",
"label": "code",
"prov": [],
"orig": "& < > \" '",
"text": "& < > \" '",
"captions": [],
"references": [],
"footnotes": [],
"code_language": "unknown"
},
{
"self_ref": "#/texts/11",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/tables/0"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Table",
"text": "Table"
},
{
"self_ref": "#/texts/12",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/13"
},
{
"$ref": "#/texts/14"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Raw HTML",
"text": "Raw HTML"
},
{
"self_ref": "#/texts/13",
"parent": {
"$ref": "#/texts/12"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "& < > \" '/div>",
"text": "& < > \" '/div>"
},
{
"self_ref": "#/texts/14",
"parent": {
"$ref": "#/texts/12"
},
"children": [
{
"$ref": "#/texts/15"
}
],
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "Link",
"text": "Link",
"level": 1
},
{
"self_ref": "#/texts/15",
"parent": {
"$ref": "#/texts/14"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "& < > \" '",
"text": "& < > \" '",
"hyperlink": "https://en.wikipedia.org/wiki/Albert_Einstein"
}
],
"pictures": [],
"tables": [
{
"self_ref": "#/tables/0",
"parent": {
"$ref": "#/texts/11"
},
"children": [],
"content_layer": "body",
"label": "table",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Key",
"column_header": true,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Example",
"column_header": true,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Ampersand",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "&",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Less-than",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "<",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Greater-than",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": ">",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Quotes",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "\"",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 5,
"end_row_offset_idx": 6,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Apostrophes",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 5,
"end_row_offset_idx": 6,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "'",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
"num_rows": 6,
"num_cols": 2,
"grid": [
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Key",
"column_header": true,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Example",
"column_header": true,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Ampersand",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "&",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Less-than",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "<",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Greater-than",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": ">",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Quotes",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "\"",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 5,
"end_row_offset_idx": 6,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Apostrophes",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 5,
"end_row_offset_idx": 6,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "'",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
]
]
},
"annotations": []
}
],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@@ -0,0 +1,41 @@
# Headers:
## &amp; &lt; &gt; " '
Text: 00:16.000 ----&gt; 00:18.000 &amp; &lt; &gt; " '
# Lists
1. &amp; &lt; &gt; " '
- &amp; &lt; &gt; " '
# Inline code
```
& < > " '
```
# Code block
```
& < > " '
```
# Table
| Key | Example |
|--------------|-----------|
| Ampersand | & |
| Less-than | < |
| Greater-than | > |
| Quotes | " |
| Apostrophes | ' |
# Raw HTML
&amp; &lt; &gt; " '/div&gt;
## Link
[&amp; &lt; &gt; " '](https://en.wikipedia.org/wiki/Albert_Einstein)

View File

@@ -186,6 +186,7 @@ tables:
column_header: true
end_col_offset_idx: 1
end_row_offset_idx: 1
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -196,6 +197,7 @@ tables:
column_header: true
end_col_offset_idx: 2
end_row_offset_idx: 1
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -206,6 +208,7 @@ tables:
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -216,6 +219,7 @@ tables:
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -229,6 +233,7 @@ tables:
column_header: true
end_col_offset_idx: 1
end_row_offset_idx: 1
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -239,6 +244,7 @@ tables:
column_header: true
end_col_offset_idx: 2
end_row_offset_idx: 1
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -249,6 +255,7 @@ tables:
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -259,6 +266,7 @@ tables:
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -269,6 +277,7 @@ tables:
column_header: true
end_col_offset_idx: 1
end_row_offset_idx: 1
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -279,6 +288,7 @@ tables:
column_header: true
end_col_offset_idx: 2
end_row_offset_idx: 1
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -289,6 +299,7 @@ tables:
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -299,6 +310,7 @@ tables:
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -878,4 +890,4 @@ texts:
prov: []
self_ref: '#/texts/48'
text: Table Heading
version: 1.6.0
version: 1.7.0

View File

@@ -136,4 +136,4 @@ texts:
prov: []
self_ref: '#/texts/7'
text: The end!
version: 1.6.0
version: 1.7.0

33
tests/data/md/escaped_characters.md vendored Normal file
View File

@@ -0,0 +1,33 @@
# Headers:
## &amp; &lt; &gt; &quot; &#39;
Text:
00:16.000 ----&gt; 00:18.000
&amp; &lt; &gt; &quot; &#39;
# Lists
1. &amp; &lt; &gt; &quot; &#39;
- &amp; &lt; &gt; &quot; &#39;
# Inline code
`&amp; &lt; &gt; &quot; &#39; `
# Code block
```
&amp; &lt; &gt; &quot; &#39;
```
# Table
| Key | Example |
| ------------------- | ----------------- |
| Ampersand | &amp; |
| Less-than | &lt; |
| Greater-than | &gt; |
| Quotes | &quot; |
| Apostrophes | &#39; |
# Raw HTML
<div title="">&amp; &lt; &gt; &quot; &#39;/div>
## Link
[&amp; &lt; &gt; &quot; &#39;](https://en.wikipedia.org/wiki/Albert_Einstein)

View File

@@ -26,10 +26,12 @@ def test_convert_valid():
assert len(relevant_paths) > 0
yaml_filter = ["inline_and_formatting", "mixed_without_h1"]
json_filter = ["escaped_characters"]
for in_path in relevant_paths:
md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
yaml_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.yaml"
json_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.json"
in_doc = InputDocument(
path_or_stream=in_path,
@@ -45,6 +47,9 @@ def test_convert_valid():
act_doc = backend.convert()
act_data = act_doc.export_to_markdown()
if in_path.stem in json_filter:
assert verify_document(act_doc, json_gt_path, GENERATE), "export to json"
if GEN_TEST_DATA:
with open(md_gt_path, mode="w", encoding="utf-8") as f:
f.write(f"{act_data}\n")