From 6a34f6f5c5dd8e7dd595cf14a95d52401eb5a3b8 Mon Sep 17 00:00:00 2001 From: Michael Honaker Date: Wed, 18 Jun 2025 18:12:24 -0700 Subject: [PATCH] fix: update md table classification Signed-off-by: Michael Honaker --- docling/backend/md_backend.py | 2 +- .../docling_v2/inline_and_formatting.md.md | 6 +- .../docling_v2/inline_and_formatting.md.yaml | 150 +++++++++++++++++- tests/data/md/inline_and_formatting.md | 4 + 4 files changed, 157 insertions(+), 5 deletions(-) diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index b8b0e6d0..d6055d2a 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -276,7 +276,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): _log.debug(f" - Paragraph (raw text): {element.children}") snippet_text = element.children.strip() # Detect start of the table: - if "|" in snippet_text: + if "|" in snippet_text or self.in_table: # most likely part of the markdown table self.in_table = True if len(self.md_table_buffer) > 0: diff --git a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md index 31c3f3be..f8f71a37 100644 --- a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md +++ b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md @@ -12,9 +12,13 @@ Create your feature branch: `git checkout -b feature/AmazingFeature` . 4. Push to the branch ( `git push origin feature/AmazingFeature` ) 5. Open a Pull Request -## +## *Second* section - **First** : Lorem ipsum. - **Second** : Dolor `sit` amet. + +| Bold Heading | Italic Heading | +|----------------|------------------| +| data a | data b | \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml index 0cdc5c54..4ef8e189 100644 --- a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml +++ b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml @@ -7,6 +7,8 @@ body: - $ref: '#/groups/2' - $ref: '#/texts/27' - $ref: '#/groups/8' + - $ref: '#/groups/11' + - $ref: '#/tables/0' content_layer: body label: unspecified name: _root_ @@ -131,16 +133,158 @@ groups: parent: $ref: '#/texts/33' self_ref: '#/groups/10' +- children: [] + content_layer: body + label: inline + name: group + parent: + $ref: '#/body' + self_ref: '#/groups/11' key_value_items: [] name: inline_and_formatting origin: - binary_hash: 9342273634728023910 + binary_hash: 15980020574215496313 filename: inline_and_formatting.md mimetype: text/markdown pages: {} pictures: [] schema_name: DoclingDocument -tables: [] +tables: +- captions: [] + children: [] + content_layer: body + data: + grid: + - - col_span: 1 + column_header: true + end_col_offset_idx: 1 + end_row_offset_idx: 1 + row_header: false + row_section: false + row_span: 1 + start_col_offset_idx: 0 + start_row_offset_idx: 0 + text: Bold Heading + - col_span: 1 + column_header: true + end_col_offset_idx: 2 + end_row_offset_idx: 1 + row_header: false + row_section: false + row_span: 1 + start_col_offset_idx: 1 + start_row_offset_idx: 0 + text: Italic Heading + - - col_span: 1 + column_header: false + end_col_offset_idx: 1 + end_row_offset_idx: 2 + row_header: false + row_section: false + row_span: 1 + start_col_offset_idx: 0 + start_row_offset_idx: 1 + text: data a + - col_span: 1 + column_header: false + end_col_offset_idx: 2 + end_row_offset_idx: 2 + row_header: false + row_section: false + row_span: 1 + start_col_offset_idx: 1 + start_row_offset_idx: 1 + text: data b + num_cols: 2 + num_rows: 2 + table_cells: + - col_span: 1 + column_header: true + end_col_offset_idx: 1 + end_row_offset_idx: 1 + row_header: false + row_section: false + row_span: 1 + start_col_offset_idx: 0 + start_row_offset_idx: 0 + text: Bold Heading + - col_span: 1 + column_header: true + end_col_offset_idx: 2 + end_row_offset_idx: 1 + row_header: false + row_section: false + row_span: 1 + start_col_offset_idx: 1 + start_row_offset_idx: 0 + text: Italic Heading + - col_span: 1 + column_header: false + end_col_offset_idx: 1 + end_row_offset_idx: 2 + row_header: false + row_section: false + row_span: 1 + start_col_offset_idx: 0 + start_row_offset_idx: 1 + text: data a + - col_span: 1 + column_header: false + end_col_offset_idx: 2 + end_row_offset_idx: 2 + row_header: false + row_section: false + row_span: 1 + start_col_offset_idx: 1 + start_row_offset_idx: 1 + text: data b + - col_span: 1 + column_header: true + end_col_offset_idx: 1 + end_row_offset_idx: 1 + row_header: false + row_section: false + row_span: 1 + start_col_offset_idx: 0 + start_row_offset_idx: 0 + text: Bold Heading + - col_span: 1 + column_header: true + end_col_offset_idx: 2 + end_row_offset_idx: 1 + row_header: false + row_section: false + row_span: 1 + start_col_offset_idx: 1 + start_row_offset_idx: 0 + text: Italic Heading + - col_span: 1 + column_header: false + end_col_offset_idx: 1 + end_row_offset_idx: 2 + row_header: false + row_section: false + row_span: 1 + start_col_offset_idx: 0 + start_row_offset_idx: 1 + text: data a + - col_span: 1 + column_header: false + end_col_offset_idx: 2 + end_row_offset_idx: 2 + row_header: false + row_section: false + row_span: 1 + start_col_offset_idx: 1 + start_row_offset_idx: 1 + text: data b + footnotes: [] + label: table + parent: + $ref: '#/body' + prov: [] + references: [] + self_ref: '#/tables/0' texts: - children: [] content_layer: body @@ -562,4 +706,4 @@ texts: prov: [] self_ref: '#/texts/37' text: amet. -version: 1.3.0 +version: 1.3.0 \ No newline at end of file diff --git a/tests/data/md/inline_and_formatting.md b/tests/data/md/inline_and_formatting.md index e18a46c5..0393386d 100644 --- a/tests/data/md/inline_and_formatting.md +++ b/tests/data/md/inline_and_formatting.md @@ -16,3 +16,7 @@ Create your feature branch: `git checkout -b feature/AmazingFeature`. - **First**: Lorem ipsum. - **Second**: Dolor `sit` amet. + +| **Bold Heading** | *Italic Heading* | +|------------------|------------------| +| data a | data b |