mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-10 21:58:15 +00:00
Merge branch 'main' of https://github.com/docling-project/docling
This commit is contained in:
16
CHANGELOG.md
16
CHANGELOG.md
@@ -1,3 +1,19 @@
|
||||
## [v2.54.0](https://github.com/docling-project/docling/releases/tag/v2.54.0) - 2025-09-22
|
||||
|
||||
### Feature
|
||||
|
||||
* Rich tables for MSWord backend ([#2291](https://github.com/docling-project/docling/issues/2291)) ([`e2482a2`](https://github.com/docling-project/docling/commit/e2482a2ada52b2b8a41c4402b27e125adbe4385f))
|
||||
* Add a backend parser for WebVTT files ([#2288](https://github.com/docling-project/docling/issues/2288)) ([`46efaae`](https://github.com/docling-project/docling/commit/46efaaefee17a6b83e02a050f9f3c8a51afbbd53))
|
||||
|
||||
### Fix
|
||||
|
||||
* Correct y-axis scaling in draw_table_cells ([#2287](https://github.com/docling-project/docling/issues/2287)) ([`b5628f1`](https://github.com/docling-project/docling/commit/b5628f12273297d9db1393f4b734cfa337caa8c9))
|
||||
|
||||
### Documentation
|
||||
|
||||
* Update API VLM example with granite-docling ([#2294](https://github.com/docling-project/docling/issues/2294)) ([`8b7e83a`](https://github.com/docling-project/docling/commit/8b7e83a8c7b9e333c31d5ae0b96213e3c70c6bf3))
|
||||
* Fix examples rendering ([#2281](https://github.com/docling-project/docling/issues/2281)) ([`8322c2e`](https://github.com/docling-project/docling/commit/8322c2ea9b4fbb1625bcbf1ec1b3dea6c1cd3ed0))
|
||||
|
||||
## [v2.53.0](https://github.com/docling-project/docling/releases/tag/v2.53.0) - 2025-09-17
|
||||
|
||||
### Feature
|
||||
|
||||
@@ -29,7 +29,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
||||
|
||||
## Features
|
||||
|
||||
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
|
||||
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
|
||||
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
|
||||
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
|
||||
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
|
||||
@@ -45,13 +45,13 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
||||
* 📤 Structured [information extraction][extraction] \[🧪 beta\]
|
||||
* 📑 New layout model (**Heron**) by default, for faster PDF parsing
|
||||
* 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
|
||||
* 💬 Parsing of Web Video Text Tracks (WebVTT) files
|
||||
|
||||
### Coming soon
|
||||
|
||||
* 📝 Metadata extraction, including title, authors, references & language
|
||||
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
|
||||
* 📝 Complex chemistry understanding (Molecular structures)
|
||||
* 📝 Parsing of Web Video Text Tracks (WebVTT) files
|
||||
|
||||
## Installation
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ import re
|
||||
import warnings
|
||||
from copy import deepcopy
|
||||
from enum import Enum
|
||||
from html import unescape
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Literal, Optional, Union, cast
|
||||
@@ -321,9 +322,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
fig_caption: Optional[TextItem] = None
|
||||
if element.title is not None and element.title != "":
|
||||
title = unescape(element.title)
|
||||
fig_caption = doc.add_text(
|
||||
label=DocItemLabel.CAPTION,
|
||||
text=element.title,
|
||||
text=title,
|
||||
formatting=formatting,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
@@ -351,6 +353,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
snippet_text = (
|
||||
element.children.strip() if isinstance(element.children, str) else ""
|
||||
)
|
||||
snippet_text = unescape(snippet_text)
|
||||
# Detect start of the table:
|
||||
if "|" in snippet_text or self.in_table:
|
||||
# most likely part of the markdown table
|
||||
|
||||
@@ -12,8 +12,11 @@ from docling_core.types.doc import (
|
||||
ImageRef,
|
||||
ListGroup,
|
||||
NodeItem,
|
||||
RefItem,
|
||||
RichTableCell,
|
||||
TableCell,
|
||||
TableData,
|
||||
TextItem,
|
||||
)
|
||||
from docling_core.types.doc.document import Formatting
|
||||
from docx import Document
|
||||
@@ -128,7 +131,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
||||
if self.is_valid():
|
||||
assert self.docx_obj is not None
|
||||
doc = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
||||
doc, _ = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
||||
# doc, _ = doc_info
|
||||
return doc
|
||||
else:
|
||||
raise RuntimeError(
|
||||
@@ -172,7 +176,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
body: BaseOxmlElement,
|
||||
docx_obj: DocxDocument,
|
||||
doc: DoclingDocument,
|
||||
) -> DoclingDocument:
|
||||
# parent:
|
||||
) -> tuple[DoclingDocument, list[RefItem]]:
|
||||
added_elements = []
|
||||
for element in body:
|
||||
tag_name = etree.QName(element).localname
|
||||
# Check for Inline Images (blip elements)
|
||||
@@ -230,8 +236,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
parent=self.parents[level - 1],
|
||||
name="shape-text",
|
||||
)
|
||||
added_elements.append(shape_group.get_ref())
|
||||
doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
label=DocItemLabel.TEXT,
|
||||
parent=shape_group,
|
||||
text=text_content,
|
||||
)
|
||||
@@ -246,23 +253,27 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
_log.debug(
|
||||
f"Found textbox content with {len(textbox_elements)} elements"
|
||||
)
|
||||
self._handle_textbox_content(textbox_elements, docx_obj, doc)
|
||||
tbc = self._handle_textbox_content(textbox_elements, docx_obj, doc)
|
||||
added_elements.extend(tbc)
|
||||
|
||||
# Check for Tables
|
||||
if element.tag.endswith("tbl"):
|
||||
try:
|
||||
self._handle_tables(element, docx_obj, doc)
|
||||
t = self._handle_tables(element, docx_obj, doc)
|
||||
added_elements.extend(t)
|
||||
except Exception:
|
||||
_log.debug("could not parse a table, broken docx table")
|
||||
# Check for Image
|
||||
elif drawing_blip:
|
||||
self._handle_pictures(docx_obj, drawing_blip, doc)
|
||||
pics = self._handle_pictures(docx_obj, drawing_blip, doc)
|
||||
added_elements.extend(pics)
|
||||
# Check for Text after the Image
|
||||
if (
|
||||
tag_name in ["p"]
|
||||
and element.find(".//w:t", namespaces=namespaces) is not None
|
||||
):
|
||||
self._handle_text_elements(element, docx_obj, doc)
|
||||
te1 = self._handle_text_elements(element, docx_obj, doc)
|
||||
added_elements.extend(te1)
|
||||
# Check for the sdt containers, like table of contents
|
||||
elif tag_name in ["sdt"]:
|
||||
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
||||
@@ -270,15 +281,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
# Iterate paragraphs, runs, or text inside <w:sdtContent>.
|
||||
paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
|
||||
for p in paragraphs:
|
||||
self._handle_text_elements(p, docx_obj, doc)
|
||||
te = self._handle_text_elements(p, docx_obj, doc)
|
||||
added_elements.extend(te)
|
||||
# Check for Text
|
||||
elif tag_name in ["p"]:
|
||||
# "tcPr", "sectPr"
|
||||
self._handle_text_elements(element, docx_obj, doc)
|
||||
te = self._handle_text_elements(element, docx_obj, doc)
|
||||
added_elements.extend(te)
|
||||
else:
|
||||
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
||||
|
||||
return doc
|
||||
return doc, added_elements
|
||||
|
||||
def _str_to_int(
|
||||
self, s: Optional[str], default: Optional[int] = 0
|
||||
@@ -674,14 +687,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
textbox_elements: list,
|
||||
docx_obj: DocxDocument,
|
||||
doc: DoclingDocument,
|
||||
) -> None:
|
||||
) -> List[RefItem]:
|
||||
elem_ref: List[RefItem] = []
|
||||
"""Process textbox content and add it to the document structure."""
|
||||
level = self._get_level()
|
||||
# Create a textbox group to contain all text from the textbox
|
||||
textbox_group = doc.add_group(
|
||||
label=GroupLabel.SECTION, parent=self.parents[level - 1], name="textbox"
|
||||
)
|
||||
|
||||
elem_ref.append(textbox_group.get_ref())
|
||||
# Set this as the current parent to ensure textbox content
|
||||
# is properly nested in document structure
|
||||
original_parent = self.parents[level]
|
||||
@@ -729,11 +743,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
# Mark this paragraph as processed
|
||||
processed_paragraphs.add(paragraph_id)
|
||||
|
||||
self._handle_text_elements(p, docx_obj, doc)
|
||||
elem_ref.extend(self._handle_text_elements(p, docx_obj, doc))
|
||||
|
||||
# Restore original parent
|
||||
self.parents[level] = original_parent
|
||||
return
|
||||
return elem_ref
|
||||
|
||||
def _handle_equations_in_text(self, element, text):
|
||||
only_texts = []
|
||||
@@ -803,7 +817,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
element: BaseOxmlElement,
|
||||
docx_obj: DocxDocument,
|
||||
doc: DoclingDocument,
|
||||
) -> None:
|
||||
) -> List[RefItem]:
|
||||
elem_ref: List[RefItem] = []
|
||||
paragraph = Paragraph(element, docx_obj)
|
||||
paragraph_elements = self._get_paragraph_elements(paragraph)
|
||||
text, equations = self._handle_equations_in_text(
|
||||
@@ -811,7 +826,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
|
||||
if text is None:
|
||||
return
|
||||
return elem_ref
|
||||
text = text.strip()
|
||||
|
||||
# Common styles for bullet and numbered lists.
|
||||
@@ -832,15 +847,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
# Check if this is actually a numbered list by examining the numFmt
|
||||
is_numbered = self._is_numbered_list(docx_obj, numid, ilevel)
|
||||
|
||||
self._add_list_item(
|
||||
li = self._add_list_item(
|
||||
doc=doc,
|
||||
numid=numid,
|
||||
ilevel=ilevel,
|
||||
elements=paragraph_elements,
|
||||
is_numbered=is_numbered,
|
||||
)
|
||||
elem_ref.extend(li) # MUST BE REF!!!
|
||||
self._update_history(p_style_id, p_level, numid, ilevel)
|
||||
return
|
||||
return elem_ref
|
||||
elif (
|
||||
numid is None
|
||||
and self._prev_numid() is not None
|
||||
@@ -860,9 +876,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if p_style_id in ["Title"]:
|
||||
for key in range(len(self.parents)):
|
||||
self.parents[key] = None
|
||||
self.parents[0] = doc.add_text(
|
||||
parent=None, label=DocItemLabel.TITLE, text=text
|
||||
)
|
||||
te = doc.add_text(parent=None, label=DocItemLabel.TITLE, text=text)
|
||||
self.parents[0] = te
|
||||
elem_ref.append(te.get_ref())
|
||||
elif "Heading" in p_style_id:
|
||||
style_element = getattr(paragraph.style, "element", None)
|
||||
if style_element is not None:
|
||||
@@ -871,7 +887,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
else:
|
||||
is_numbered_style = False
|
||||
self._add_header(doc, p_level, text, is_numbered_style)
|
||||
h1 = self._add_header(doc, p_level, text, is_numbered_style)
|
||||
elem_ref.extend(h1)
|
||||
|
||||
elif len(equations) > 0:
|
||||
if (paragraph.text is None or len(paragraph.text.strip()) == 0) and len(
|
||||
@@ -879,15 +896,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
) > 0:
|
||||
# Standalone equation
|
||||
level = self._get_level()
|
||||
doc.add_text(
|
||||
t1 = doc.add_text(
|
||||
label=DocItemLabel.FORMULA,
|
||||
parent=self.parents[level - 1],
|
||||
text=text.replace("<eq>", "").replace("</eq>", ""),
|
||||
)
|
||||
elem_ref.append(t1.get_ref())
|
||||
else:
|
||||
# Inline equation
|
||||
level = self._get_level()
|
||||
inline_equation = doc.add_inline_group(parent=self.parents[level - 1])
|
||||
elem_ref.append(inline_equation.get_ref())
|
||||
text_tmp = text
|
||||
for eq in equations:
|
||||
if len(text_tmp) == 0:
|
||||
@@ -899,23 +918,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
text_tmp = "" if len(split_text_tmp) == 1 else split_text_tmp[1]
|
||||
|
||||
if len(pre_eq_text) > 0:
|
||||
doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
e1 = doc.add_text(
|
||||
label=DocItemLabel.TEXT,
|
||||
parent=inline_equation,
|
||||
text=pre_eq_text,
|
||||
)
|
||||
doc.add_text(
|
||||
elem_ref.append(e1.get_ref())
|
||||
e2 = doc.add_text(
|
||||
label=DocItemLabel.FORMULA,
|
||||
parent=inline_equation,
|
||||
text=eq.replace("<eq>", "").replace("</eq>", ""),
|
||||
)
|
||||
elem_ref.append(e2.get_ref())
|
||||
|
||||
if len(text_tmp) > 0:
|
||||
doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
e3 = doc.add_text(
|
||||
label=DocItemLabel.TEXT,
|
||||
parent=inline_equation,
|
||||
text=text_tmp.strip(),
|
||||
)
|
||||
elem_ref.append(e3.get_ref())
|
||||
|
||||
elif p_style_id in [
|
||||
"Paragraph",
|
||||
@@ -934,13 +956,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
paragraph_elements=paragraph_elements,
|
||||
)
|
||||
for text, format, hyperlink in paragraph_elements:
|
||||
doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
t2 = doc.add_text(
|
||||
label=DocItemLabel.TEXT,
|
||||
parent=parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
elem_ref.append(t2.get_ref())
|
||||
|
||||
else:
|
||||
# Text style names can, and will have, not only default values but user values too
|
||||
@@ -952,16 +975,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
paragraph_elements=paragraph_elements,
|
||||
)
|
||||
for text, format, hyperlink in paragraph_elements:
|
||||
doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
t3 = doc.add_text(
|
||||
label=DocItemLabel.TEXT,
|
||||
parent=parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
elem_ref.append(t3.get_ref())
|
||||
|
||||
self._update_history(p_style_id, p_level, numid, ilevel)
|
||||
return
|
||||
return elem_ref
|
||||
|
||||
def _add_header(
|
||||
self,
|
||||
@@ -969,17 +993,21 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
curr_level: Optional[int],
|
||||
text: str,
|
||||
is_numbered_style: bool = False,
|
||||
) -> None:
|
||||
) -> List[RefItem]:
|
||||
elem_ref: List[RefItem] = []
|
||||
level = self._get_level()
|
||||
if isinstance(curr_level, int):
|
||||
if curr_level > level:
|
||||
# add invisible group
|
||||
for i in range(level, curr_level):
|
||||
self.parents[i] = doc.add_group(
|
||||
gr1 = doc.add_group(
|
||||
parent=self.parents[i - 1],
|
||||
label=GroupLabel.SECTION,
|
||||
name=f"header-{i}",
|
||||
)
|
||||
elem_ref.append(gr1.get_ref())
|
||||
self.parents[i] = gr1
|
||||
|
||||
elif curr_level < level:
|
||||
# remove the tail
|
||||
for key in range(len(self.parents)):
|
||||
@@ -1019,12 +1047,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
text = f"{self.numbered_headers[previous_level]}.{text}"
|
||||
previous_level -= 1
|
||||
|
||||
self.parents[current_level] = doc.add_heading(
|
||||
hd = doc.add_heading(
|
||||
parent=self.parents[parent_level],
|
||||
text=text,
|
||||
level=add_level,
|
||||
)
|
||||
return
|
||||
self.parents[current_level] = hd
|
||||
elem_ref.append(hd.get_ref())
|
||||
return elem_ref
|
||||
|
||||
def _add_formatted_list_item(
|
||||
self,
|
||||
@@ -1033,12 +1063,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
marker: str,
|
||||
enumerated: bool,
|
||||
level: int,
|
||||
) -> None:
|
||||
) -> List[RefItem]:
|
||||
elem_ref: List[RefItem] = []
|
||||
# This should not happen by construction
|
||||
if not isinstance(self.parents[level], ListGroup):
|
||||
return
|
||||
return elem_ref
|
||||
if not elements:
|
||||
return
|
||||
return elem_ref
|
||||
|
||||
if len(elements) == 1:
|
||||
text, format, hyperlink = elements[0]
|
||||
@@ -1068,6 +1099,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
return elem_ref
|
||||
|
||||
def _add_list_item(
|
||||
self,
|
||||
@@ -1077,10 +1109,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
ilevel: int,
|
||||
elements: list,
|
||||
is_numbered: bool = False,
|
||||
) -> None:
|
||||
# TODO: this method is always called with is_numbered. Numbered lists should be properly addressed.
|
||||
) -> List[RefItem]:
|
||||
elem_ref: List[RefItem] = []
|
||||
# this method is always called with is_numbered. Numbered lists should be properly addressed.
|
||||
if not elements:
|
||||
return None
|
||||
return elem_ref
|
||||
enum_marker = ""
|
||||
|
||||
level = self._get_level()
|
||||
@@ -1091,9 +1124,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
# Reset counters for the new numbering sequence
|
||||
self._reset_list_counters_for_new_sequence(numid)
|
||||
|
||||
self.parents[level] = doc.add_list_group(
|
||||
name="list", parent=self.parents[level - 1]
|
||||
)
|
||||
list_gr = doc.add_list_group(name="list", parent=self.parents[level - 1])
|
||||
self.parents[level] = list_gr
|
||||
elem_ref.append(list_gr.get_ref())
|
||||
|
||||
# Set marker and enumerated arguments if this is an enumeration element.
|
||||
if is_numbered:
|
||||
@@ -1114,9 +1147,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.level_at_new_list + prev_indent + 1,
|
||||
self.level_at_new_list + ilevel + 1,
|
||||
):
|
||||
self.parents[i] = doc.add_list_group(
|
||||
name="list", parent=self.parents[i - 1]
|
||||
)
|
||||
list_gr1 = doc.add_list_group(name="list", parent=self.parents[i - 1])
|
||||
self.parents[i] = list_gr1
|
||||
elem_ref.append(list_gr1.get_ref())
|
||||
|
||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||
if is_numbered:
|
||||
@@ -1156,7 +1189,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
|
||||
elif self._prev_numid() == numid or prev_indent == ilevel:
|
||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||
# Set marker and enumerated arguments if this is an enumeration element.
|
||||
if is_numbered:
|
||||
counter = self._get_list_counter(numid, ilevel)
|
||||
enum_marker = str(counter) + "."
|
||||
@@ -1165,15 +1198,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self._add_formatted_list_item(
|
||||
doc, elements, enum_marker, is_numbered, level - 1
|
||||
)
|
||||
|
||||
return
|
||||
return elem_ref
|
||||
|
||||
def _handle_tables(
|
||||
self,
|
||||
element: BaseOxmlElement,
|
||||
docx_obj: DocxDocument,
|
||||
doc: DoclingDocument,
|
||||
) -> None:
|
||||
) -> List[RefItem]:
|
||||
elem_ref: List[RefItem] = []
|
||||
table: Table = Table(element, docx_obj)
|
||||
num_rows = len(table.rows)
|
||||
num_cols = len(table.columns)
|
||||
@@ -1184,9 +1217,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
# In case we have a table of only 1 cell, we consider it furniture
|
||||
# And proceed processing the content of the cell as though it's in the document body
|
||||
self._walk_linear(cell_element._element, docx_obj, doc)
|
||||
return
|
||||
return elem_ref
|
||||
|
||||
data = TableData(num_rows=num_rows, num_cols=num_cols)
|
||||
level = self._get_level()
|
||||
docling_table = doc.add_table(data=data, parent=self.parents[level - 1])
|
||||
elem_ref.append(docling_table.get_ref())
|
||||
|
||||
cell_set: set[CT_Tc] = set()
|
||||
for row_idx, row in enumerate(table.rows):
|
||||
_log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells")
|
||||
@@ -1223,7 +1260,70 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
else:
|
||||
text = text.replace("<eq>", "$").replace("</eq>", "$")
|
||||
|
||||
table_cell = TableCell(
|
||||
provs_in_cell: List[RefItem] = []
|
||||
_, provs_in_cell = self._walk_linear(cell._element, docx_obj, doc)
|
||||
ref_for_rich_cell = provs_in_cell[0]
|
||||
rich_table_cell = False
|
||||
|
||||
def group_cell_elements(
    group_name: str, doc: DoclingDocument, provs_in_cell: List[RefItem]
) -> RefItem:
    """Move the items created for one table cell under a new group.

    A group named ``group_name`` is added to ``doc`` as a child of
    ``docling_table`` (taken from the enclosing scope). Every referenced
    item is appended to that group's children, removed from its previous
    parent's children list when it is listed there, and has its ``parent``
    pointed at the new group.

    Returns:
        The reference of the newly created group.
    """
    cell_group = doc.add_group(
        label=GroupLabel.UNSPECIFIED,
        name=group_name,
        parent=docling_table,
    )
    for item_ref in provs_in_cell:
        cell_group.children.append(item_ref)
        item = item_ref.resolve(doc)
        former_parent = item.parent.resolve(doc)
        own_ref = item.get_ref()
        # Detach from the old parent so the item is not listed twice.
        if own_ref in former_parent.children:
            former_parent.children.remove(own_ref)
        item.parent = cell_group.get_ref()
    return cell_group.get_ref()
|
||||
|
||||
if len(provs_in_cell) > 1:
|
||||
# Cell has multiple elements, we need to group them
|
||||
rich_table_cell = True
|
||||
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}"
|
||||
ref_for_rich_cell = group_cell_elements(
|
||||
group_name, doc, provs_in_cell
|
||||
)
|
||||
|
||||
elif len(provs_in_cell) == 1:
|
||||
item_ref = provs_in_cell[0]
|
||||
pr_item = item_ref.resolve(doc)
|
||||
if isinstance(pr_item, TextItem):
|
||||
# Cell has only one element and it's just a text
|
||||
rich_table_cell = False
|
||||
doc.delete_items(node_items=[pr_item])
|
||||
else:
|
||||
rich_table_cell = True
|
||||
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}"
|
||||
ref_for_rich_cell = group_cell_elements(
|
||||
group_name, doc, provs_in_cell
|
||||
)
|
||||
else:
|
||||
rich_table_cell = False
|
||||
|
||||
if rich_table_cell:
|
||||
rich_cell = RichTableCell(
|
||||
text=text,
|
||||
row_span=spanned_idx - row_idx,
|
||||
col_span=cell.grid_span,
|
||||
start_row_offset_idx=row.grid_cols_before + row_idx,
|
||||
end_row_offset_idx=row.grid_cols_before + spanned_idx,
|
||||
start_col_offset_idx=col_idx,
|
||||
end_col_offset_idx=col_idx + cell.grid_span,
|
||||
column_header=row.grid_cols_before + row_idx == 0,
|
||||
row_header=False,
|
||||
ref=ref_for_rich_cell, # points to an artificial group around children
|
||||
)
|
||||
doc.add_table_cell(table_item=docling_table, cell=rich_cell)
|
||||
col_idx += cell.grid_span
|
||||
else:
|
||||
simple_cell = TableCell(
|
||||
text=text,
|
||||
row_span=spanned_idx - row_idx,
|
||||
col_span=cell.grid_span,
|
||||
@@ -1234,16 +1334,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
column_header=row.grid_cols_before + row_idx == 0,
|
||||
row_header=False,
|
||||
)
|
||||
data.table_cells.append(table_cell)
|
||||
doc.add_table_cell(table_item=docling_table, cell=simple_cell)
|
||||
col_idx += cell.grid_span
|
||||
|
||||
level = self._get_level()
|
||||
doc.add_table(data=data, parent=self.parents[level - 1])
|
||||
return
|
||||
return elem_ref
|
||||
|
||||
def _handle_pictures(
|
||||
self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
|
||||
) -> None:
|
||||
) -> List[RefItem]:
|
||||
def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
|
||||
image_data: Optional[bytes] = None
|
||||
rId = drawing_blip[0].get(
|
||||
@@ -1255,28 +1352,32 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
image_data = image_part.blob # Get the binary image data
|
||||
return image_data
|
||||
|
||||
elem_ref: List[RefItem] = []
|
||||
level = self._get_level()
|
||||
# Open the BytesIO object with PIL to create an Image
|
||||
image_data: Optional[bytes] = get_docx_image(drawing_blip)
|
||||
if image_data is None:
|
||||
_log.warning("Warning: image cannot be found")
|
||||
doc.add_picture(
|
||||
p1 = doc.add_picture(
|
||||
parent=self.parents[level - 1],
|
||||
caption=None,
|
||||
)
|
||||
elem_ref.append(p1.get_ref())
|
||||
else:
|
||||
try:
|
||||
image_bytes = BytesIO(image_data)
|
||||
pil_image = Image.open(image_bytes)
|
||||
doc.add_picture(
|
||||
p2 = doc.add_picture(
|
||||
parent=self.parents[level - 1],
|
||||
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
||||
caption=None,
|
||||
)
|
||||
elem_ref.append(p2.get_ref())
|
||||
except (UnidentifiedImageError, OSError):
|
||||
_log.warning("Warning: image cannot be loaded by Pillow")
|
||||
doc.add_picture(
|
||||
p3 = doc.add_picture(
|
||||
parent=self.parents[level - 1],
|
||||
caption=None,
|
||||
)
|
||||
return
|
||||
elem_ref.append(p3.get_ref())
|
||||
return elem_ref
|
||||
|
||||
572
docling/backend/webvtt_backend.py
Normal file
572
docling/backend/webvtt_backend.py
Normal file
@@ -0,0 +1,572 @@
|
||||
import logging
|
||||
import re
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Annotated, ClassVar, Literal, Optional, Union, cast
|
||||
|
||||
from docling_core.types.doc import (
|
||||
ContentLayer,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
Formatting,
|
||||
GroupLabel,
|
||||
NodeItem,
|
||||
)
|
||||
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
|
||||
from pydantic.types import StringConstraints
|
||||
from typing_extensions import Self, override
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class _WebVTTTimestamp(BaseModel):
    """Model representing a WebVTT timestamp.

    A WebVTT timestamp is always interpreted relative to the current playback position
    of the media data that the WebVTT file is to be synchronized with.

    The accepted textual form is ``[hh:]mm:ss.ttt``: an optional hours field of
    two or more digits, two-digit minutes and seconds (each ``00``-``59``), and
    exactly three digits of milliseconds.
    """

    model_config = ConfigDict(regex_engine="python-re")

    raw: Annotated[
        str,
        Field(
            description="A representation of the WebVTT Timestamp as a single string"
        ),
    ]

    _pattern: ClassVar[re.Pattern] = re.compile(
        r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$"
    )
    # Parsed components of ``raw``; populated by ``validate_raw``.
    _hours: int
    _minutes: int
    _seconds: int
    _millis: int

    @model_validator(mode="after")
    def validate_raw(self) -> Self:
        """Parse ``raw`` into its hour/minute/second/millisecond components.

        Raises:
            ValueError: If ``raw`` is not a well-formed WebVTT timestamp.
        """
        # ``fullmatch`` rather than ``match``: with ``match`` the trailing ``$``
        # also succeeds just before a final newline, so "00:01.000\n" would be
        # accepted as a valid timestamp.
        m = self._pattern.fullmatch(self.raw)
        if not m:
            raise ValueError(f"Invalid WebVTT timestamp format: {self.raw}")

        # The pattern restricts minutes and seconds to ``[0-5]\d``, so no
        # separate 0-59 range check is needed after conversion.
        self._hours = int(m.group(1)) if m.group(1) else 0
        self._minutes = int(m.group(2))
        self._seconds = int(m.group(3))
        self._millis = int(m.group(4))

        return self

    @property
    def seconds(self) -> float:
        """A representation of the WebVTT Timestamp in seconds"""
        return (
            self._hours * 3600
            + self._minutes * 60
            + self._seconds
            + self._millis / 1000.0
        )

    @override
    def __str__(self) -> str:
        return self.raw
|
||||
|
||||
|
||||
# Type of a WebVTT cue identifier: a string of one or more characters, none of
# them a line feed or carriage return, that does not contain the substring
# "-->" (rejected by the negative lookahead at the start of the pattern).
# NOTE(review): the lookahead presumably requires pydantic's "python-re" regex
# engine on any model that uses this type -- confirm.
_WebVTTCueIdentifier = Annotated[
|
||||
str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")
|
||||
]
|
||||
|
||||
|
||||
class _WebVTTCueTimings(BaseModel):
    """Model representing WebVTT cue timings (a start and an end timestamp)."""

    start: Annotated[
        _WebVTTTimestamp, Field(description="Start time offset of the cue")
    ]
    end: Annotated[_WebVTTTimestamp, Field(description="End time offset of the cue")]

    @model_validator(mode="after")
    def check_order(self) -> Self:
        """Reject timings whose end is not strictly later than their start."""
        begin, finish = self.start, self.end
        if not (begin and finish):
            return self
        if finish.seconds <= begin.seconds:
            raise ValueError("End timestamp must be greater than start timestamp")
        return self

    @override
    def __str__(self):
        # Rendered in cue-timings syntax: "<start> --> <end>".
        return " --> ".join((str(self.start), str(self.end)))
|
||||
|
||||
|
||||
class _WebVTTCueTextSpan(BaseModel):
    """Model representing a WebVTT cue text span (plain text, no markup)."""

    text: str
    span_type: Literal["text"] = "text"

    @field_validator("text", mode="after")
    @classmethod
    def validate_text(cls, value: str) -> str:
        """Ensure the text is non-empty and free of line breaks, '&' and '<'."""
        for forbidden in ("\n", "\r", "&", "<"):
            if forbidden in value:
                raise ValueError("Cue text span contains invalid characters")
        if not value:
            raise ValueError("Cue text span cannot be empty")
        return value

    @override
    def __str__(self):
        return self.text
|
||||
|
||||
|
||||
class _WebVTTCueVoiceSpan(BaseModel):
    """Model representing a WebVTT cue voice span.

    ``__str__`` renders the span as ``<v[.class...] annotation>components</v>``.
    """

    annotation: Annotated[
        str,
        Field(
            description=(
                "Cue span start tag annotation text representing the name of the "
                "voice"
            )
        ),
    ]
    classes: Annotated[
        list[str],
        Field(description="List of classes representing the cue span's significance"),
    ] = []
    components: Annotated[
        list["_WebVTTCueComponent"],
        Field(description="The components representing the cue internal text"),
    ] = []
    span_type: Literal["v"] = "v"

    @field_validator("annotation", mode="after")
    @classmethod
    def validate_annotation(cls, value: str) -> str:
        """Ensure the annotation is non-empty and has no line breaks, '&' or '>'.

        Raises:
            ValueError: If the annotation is empty or contains a forbidden
                character.
        """
        if any(ch in value for ch in {"\n", "\r", "&", ">"}):
            raise ValueError(
                "Cue span start tag annotation contains invalid characters"
            )
        if not value:
            # The message names the annotation; it previously referred to a
            # "cue text span", which is a different component.
            raise ValueError("Cue span start tag annotation cannot be empty")
        return value

    @field_validator("classes", mode="after")
    @classmethod
    def validate_classes(cls, value: list[str]) -> list[str]:
        """Ensure every class is non-empty and has no whitespace, '&', '<', '>' or '.'.

        Raises:
            ValueError: If any class is empty or contains a forbidden character.
        """
        for item in value:
            if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}):
                raise ValueError(
                    "A cue span start tag class contains invalid characters"
                )
            if not item:
                raise ValueError("Cue span start tag classes cannot be empty")
        return value

    @override
    def __str__(self):
        # Classes are appended to the tag name, dot-separated: "v.loud.first".
        tag = f"v.{'.'.join(self.classes)}" if self.classes else "v"
        inner = "".join(str(span) for span in self.components)
        return f"<{tag} {self.annotation}>{inner}</v>"
|
||||
|
||||
|
||||
class _WebVTTCueClassSpan(BaseModel):
    """Model representing a WebVTT cue class span, rendered as ``<c>...</c>``."""

    span_type: Literal["c"] = "c"
    components: list["_WebVTTCueComponent"]

    @override
    def __str__(self):
        # Concatenate the rendered child components and wrap them in the tag.
        rendered = map(str, self.components)
        return "<c>" + "".join(rendered) + "</c>"
|
||||
|
||||
|
||||
class _WebVTTCueItalicSpan(BaseModel):
    """Model representing a WebVTT cue italic span, rendered as ``<i>...</i>``."""

    span_type: Literal["i"] = "i"
    components: list["_WebVTTCueComponent"]

    @override
    def __str__(self):
        # Concatenate the rendered child components and wrap them in the tag.
        rendered = map(str, self.components)
        return "<i>" + "".join(rendered) + "</i>"
|
||||
|
||||
|
||||
class _WebVTTCueBoldSpan(BaseModel):
    """Model representing a WebVTT cue bold span, rendered as ``<b>...</b>``."""

    span_type: Literal["b"] = "b"
    components: list["_WebVTTCueComponent"]

    @override
    def __str__(self):
        # Concatenate the rendered child components and wrap them in the tag.
        rendered = map(str, self.components)
        return "<b>" + "".join(rendered) + "</b>"
|
||||
|
||||
|
||||
class _WebVTTCueUnderlineSpan(BaseModel):
    """Model representing a WebVTT cue underline span, rendered as ``<u>...</u>``."""

    span_type: Literal["u"] = "u"
    components: list["_WebVTTCueComponent"]

    @override
    def __str__(self):
        # Concatenate the rendered child components and wrap them in the tag.
        rendered = map(str, self.components)
        return "<u>" + "".join(rendered) + "</u>"
|
||||
|
||||
|
||||
# Union of every supported cue component model (text, class, italic, bold,
# underline and voice spans). The ``span_type`` field of each member is used as
# the discriminator, so each member must declare a distinct ``span_type``
# literal. The span models refer to this alias by its string name in their
# ``components`` annotations, since it is defined after them.
_WebVTTCueComponent = Annotated[
|
||||
Union[
|
||||
_WebVTTCueTextSpan,
|
||||
_WebVTTCueClassSpan,
|
||||
_WebVTTCueItalicSpan,
|
||||
_WebVTTCueBoldSpan,
|
||||
_WebVTTCueUnderlineSpan,
|
||||
_WebVTTCueVoiceSpan,
|
||||
],
|
||||
Field(discriminator="span_type", description="The WebVTT cue component"),
|
||||
]
|
||||
|
||||
|
||||
class _WebVTTCueBlock(BaseModel):
    """Model representing a WebVTT cue block.

    The optional WebVTT cue settings list is not supported.
    The cue payload is limited to the following spans: text, class, italic, bold,
    underline, and voice.
    """

    model_config = ConfigDict(regex_engine="python-re")

    identifier: Optional[_WebVTTCueIdentifier] = Field(
        None, description="The WebVTT cue identifier"
    )
    timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")]
    payload: Annotated[list[_WebVTTCueComponent], Field(description="The cue payload")]

    # Matches any supported start or end tag: group 1 is "/" for end tags,
    # group 2 the tag name (with dot-separated classes for voice tags) and
    # group 3 the optional annotation.
    _pattern_block: ClassVar[re.Pattern] = re.compile(
        r"<(/?)(i|b|c|u|v(?:\.[^\t\n\r &<>.]+)*)(?:\s+([^>]*))?>"
    )
    _pattern_voice_tag: ClassVar[re.Pattern] = re.compile(
        r"^<v(?P<class>\.[^\t\n\r &<>]+)?"  # zero or more classes
        r"[ \t]+(?P<annotation>[^\n\r&>]+)>"  # required space and annotation
    )
    # Span models that only wrap child components, keyed by tag name.
    _simple_spans: ClassVar[dict[str, type[BaseModel]]] = {
        "c": _WebVTTCueClassSpan,
        "i": _WebVTTCueItalicSpan,
        "b": _WebVTTCueBoldSpan,
        "u": _WebVTTCueUnderlineSpan,
    }

    @field_validator("payload", mode="after")
    @classmethod
    def validate_payload(cls, payload):
        """Reject payloads whose serialized form contains the '-->' separator."""
        for component in payload:
            if "-->" in str(component):
                raise ValueError("Cue payload must not contain '-->'")
        return payload

    @classmethod
    def _close_span(cls, kind: str, start_tag: str, children: list) -> list:
        """Build the component(s) for a span that has just been closed.

        Args:
            kind: The tag name without classes ("c", "i", "b", "u" or "v").
            start_tag: The full text of the start tag that opened the span.
            children: The components collected inside the span.

        Returns:
            A one-element list with the span model or, for a voice tag that
            does not carry the required annotation, the children themselves
            so that their text is not lost.
        """
        if kind != "v":
            return [cls._simple_spans[kind](components=children)]
        voice_match = cls._pattern_voice_tag.match(start_tag)
        if voice_match is None:
            return children
        class_string = voice_match.group("class")
        classes: list[str] = (
            [c for c in class_string.split(".") if c] if class_string else []
        )
        return [
            _WebVTTCueVoiceSpan(
                annotation=voice_match.group("annotation").strip(),
                classes=classes,
                components=children,
            )
        ]

    @classmethod
    def _parse_payload(cls, cue_text: str) -> list:
        """Parse cue text into a tree of cue components.

        End tags that do not match the innermost open span are ignored, and
        spans still open at the end of the text are closed implicitly (this
        also covers voice spans written without an end tag).
        """
        root: list = []
        # Each entry is (tag kind, full start tag text, children so far).
        open_spans: list[tuple[str, str, list]] = []
        pos = 0
        for match in cls._pattern_block.finditer(cue_text):
            current = open_spans[-1][2] if open_spans else root
            if match.start() > pos:
                current.append(_WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
            pos = match.end()

            kind = match.group(2).split(".", 1)[0]
            if not match.group(1):
                open_spans.append((kind, match.group(0), []))
            elif open_spans and open_spans[-1][0] == kind:
                _, start_tag, children = open_spans.pop()
                parent = open_spans[-1][2] if open_spans else root
                parent.extend(cls._close_span(kind, start_tag, children))
            # Any other end tag has no matching open span and is skipped.

        if pos < len(cue_text):
            current = open_spans[-1][2] if open_spans else root
            current.append(_WebVTTCueTextSpan(text=cue_text[pos:]))

        while open_spans:
            kind, start_tag, children = open_spans.pop()
            parent = open_spans[-1][2] if open_spans else root
            parent.extend(cls._close_span(kind, start_tag, children))
        return root

    @classmethod
    def parse(cls, raw: str) -> "_WebVTTCueBlock":
        """Parse the raw text of a single cue block.

        Args:
            raw: The block text: an optional identifier line, the timings
                line and the payload lines.

        Returns:
            The parsed cue block.

        Raises:
            ValueError: If the block is empty, lacks a timings line, or the
                timings line does not contain exactly one '-->' separator.
        """
        lines = raw.strip().splitlines()
        if not lines:
            raise ValueError("Cue block must have at least one line")
        identifier: Optional[_WebVTTCueIdentifier] = None
        timing_line = lines[0]
        if "-->" not in timing_line and len(lines) > 1:
            identifier = timing_line
            timing_line = lines[1]
            cue_lines = lines[2:]
        else:
            cue_lines = lines[1:]

        if "-->" not in timing_line:
            raise ValueError("Cue block must contain WebVTT cue timings")

        timing_parts = timing_line.split("-->")
        if len(timing_parts) != 2:
            raise ValueError("Cue timings must contain exactly one '-->' separator")
        start = timing_parts[0].strip()
        # Ignore the cue settings list that may follow the end timestamp.
        end = re.split(r"[ \t]", timing_parts[1].strip(), maxsplit=1)[0]
        timings: _WebVTTCueTimings = _WebVTTCueTimings(
            start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end)
        )

        cue_text = " ".join(cue_lines).strip()
        return cls(
            identifier=identifier,
            timings=timings,
            payload=cls._parse_payload(cue_text),
        )

    def __str__(self):
        """Serialize the block: identifier line (if any), timings line, payload."""
        parts = []
        if self.identifier:
            parts.append(f"{self.identifier}\n")
        timings_line = str(self.timings)
        parts.append(timings_line + "\n")
        for idx, span in enumerate(self.payload):
            if idx == 0 and len(self.payload) == 1 and span.span_type == "v":
                # the end tag may be omitted for brevity
                parts.append(str(span).removesuffix("</v>"))
            else:
                parts.append(str(span))

        return "".join(parts)
|
||||
|
||||
|
||||
class _WebVTTFile(BaseModel):
    """A model representing a WebVTT file."""

    cue_blocks: list[_WebVTTCueBlock]

    # Blocks that are not cues: a comment block starts with "NOTE" followed by
    # a space, a tab or the end of the line; style and region blocks start
    # with a line holding only "STYLE" / "REGION" and optional whitespace.
    _pattern_non_cue_block: ClassVar[re.Pattern] = re.compile(
        r"(?:NOTE(?:[ \t]|$)|(?:STYLE|REGION)[ \t]*$)", flags=re.MULTILINE
    )

    @staticmethod
    def verify_signature(content: str) -> bool:
        """Check whether the content starts with a WebVTT signature.

        An optional leading byte order mark is skipped. The string "WEBVTT"
        must then be either the whole content or be followed by a space, a
        tab or a line terminator (LF or CR).

        Args:
            content: The decoded file content.

        Returns:
            True if the signature is present, False otherwise.
        """
        if not content:
            return False
        content = content.removeprefix("\ufeff")
        if not content.startswith("WEBVTT"):
            return False
        return len(content) == 6 or content[6] in (" ", "\t", "\n", "\r")

    @classmethod
    def parse(cls, raw: str) -> "_WebVTTFile":
        """Parse the content of a WebVTT file into cue blocks.

        NOTE, STYLE and REGION blocks are skipped. Blocks that cannot be
        parsed as cues are logged as warnings and left out.

        Args:
            raw: The decoded file content.

        Returns:
            The parsed file with the cue blocks in file order.

        Raises:
            ValueError: If the content lacks the WebVTT signature.
        """
        # Normalize newlines to LF
        raw = raw.replace("\r\n", "\n").replace("\r", "\n")

        # Check WebVTT signature
        if not cls.verify_signature(raw):
            raise ValueError("Invalid WebVTT file signature")

        # Strip "WEBVTT" header line
        _, _, body = raw.partition("\n")

        # Split into blocks and keep only the cue blocks
        cues: list[_WebVTTCueBlock] = []
        for block in re.split(r"\n\s*\n", body.strip()):
            if not block.strip():
                # Nothing to parse, e.g. a file holding only the signature.
                continue
            if cls._pattern_non_cue_block.match(block):
                continue
            try:
                cues.append(_WebVTTCueBlock.parse(block))
            except ValueError as e:
                _log.warning(f"Failed to parse cue block:\n{block}\n{e}")

        return cls(cue_blocks=cues)

    def __iter__(self):
        """Iterate over the cue blocks."""
        return iter(self.cue_blocks)

    def __getitem__(self, idx):
        """Return the cue block(s) at the given index or slice."""
        return self.cue_blocks[idx]

    def __len__(self):
        """Return the number of cue blocks."""
        return len(self.cue_blocks)
|
||||
|
||||
|
||||
class WebVTTDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend for WebVTT (.vtt) files.

    The file content is parsed into cue blocks following the W3C
    specification (https://www.w3.org/TR/webvtt1) and converted to a
    DoclingDocument. Every cue block becomes a section group holding text
    items for its identifier, its timings and its payload, in file order.
    """

    @override
    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
        """Load the whole input as UTF-8 text into ``self.content``.

        Raises:
            RuntimeError: If the input cannot be read or decoded.
        """
        super().__init__(in_doc, path_or_stream)

        self.content: str = ""
        source = self.path_or_stream
        try:
            if isinstance(source, BytesIO):
                self.content = source.getvalue().decode("utf-8")
            elif isinstance(source, Path):
                self.content = source.read_text(encoding="utf-8")
        except Exception as e:
            raise RuntimeError(
                "Could not initialize the WebVTT backend for file with hash "
                f"{self.document_hash}."
            ) from e

    @override
    def is_valid(self) -> bool:
        """Tell whether the loaded content carries a WebVTT signature."""
        return _WebVTTFile.verify_signature(self.content)

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        """WebVTT documents have no pages."""
        return False

    @override
    def unload(self):
        """Close the input stream, if any, and drop the reference to it."""
        stream = self.path_or_stream
        if isinstance(stream, BytesIO):
            stream.close()
        self.path_or_stream = None

    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        """Return the input formats handled by this backend."""
        return {InputFormat.VTT}

    @staticmethod
    def _add_text_from_component(
        doc: DoclingDocument, item: _WebVTTCueComponent, parent: Optional[NodeItem]
    ) -> None:
        """Add the text of one cue component to the document as a TextItem.

        Italic, bold and underline spans set the matching formatting flag.
        For non-text spans only the direct text children are used; nothing is
        added when the resulting text is blank.

        TODO: address nesting
        """
        formatting = Formatting()
        if isinstance(item, _WebVTTCueTextSpan):
            raw_text = item.text
        else:
            if isinstance(item, _WebVTTCueItalicSpan):
                formatting.italic = True
            elif isinstance(item, _WebVTTCueBoldSpan):
                formatting.bold = True
            elif isinstance(item, _WebVTTCueUnderlineSpan):
                formatting.underline = True
            # TODO: address nesting
            raw_text = "".join(
                child.text
                for child in item.components
                if isinstance(child, _WebVTTCueTextSpan)
            )

        cleaned = raw_text.strip()
        if not cleaned:
            return
        doc.add_text(
            label=DocItemLabel.TEXT,
            text=cleaned,
            parent=parent,
            content_layer=ContentLayer.BODY,
            formatting=formatting,
        )

    @override
    def convert(self) -> DoclingDocument:
        """Convert the loaded WebVTT content into a DoclingDocument.

        Raises:
            RuntimeError: If the content has no valid WebVTT signature.
        """
        _log.debug("Starting WebVTT conversion...")
        if not self.is_valid():
            raise RuntimeError("Invalid WebVTT document.")

        origin = DocumentOrigin(
            filename=self.file.name or "file",
            mimetype="text/vtt",
            binary_hash=self.document_hash,
        )
        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
        add_component = WebVTTDocumentBackend._add_text_from_component

        vtt: _WebVTTFile = _WebVTTFile.parse(self.content)
        for block in vtt.cue_blocks:
            block_group = doc.add_group(
                label=GroupLabel.SECTION,
                name="WebVTT cue block",
                parent=None,
                content_layer=ContentLayer.BODY,
            )
            if block.identifier:
                doc.add_text(
                    label=DocItemLabel.TEXT,
                    text=str(block.identifier),
                    parent=block_group,
                    content_layer=ContentLayer.BODY,
                )
            doc.add_text(
                label=DocItemLabel.TEXT,
                text=str(block.timings),
                parent=block_group,
                content_layer=ContentLayer.BODY,
            )
            for cue_span in block.payload:
                if not isinstance(cue_span, _WebVTTCueVoiceSpan):
                    add_component(doc, cue_span, block_group)
                    continue

                # A voice span gets its own inline group: first the speaker
                # label, then one text item per child component.
                voice_group = doc.add_group(
                    label=GroupLabel.INLINE,
                    name="WebVTT cue voice span",
                    parent=block_group,
                    content_layer=ContentLayer.BODY,
                )
                speaker = cue_span.annotation
                if cue_span.classes:
                    speaker = f"{speaker} ({', '.join(cue_span.classes)})"
                speaker = f"{speaker}: "
                doc.add_text(
                    label=DocItemLabel.TEXT,
                    text=speaker,
                    parent=voice_group,
                    content_layer=ContentLayer.BODY,
                )
                for item in cue_span.components:
                    add_component(doc, item, voice_group)

        return doc
|
||||
@@ -1,7 +1,6 @@
|
||||
import math
|
||||
from collections import defaultdict
|
||||
from enum import Enum
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union
|
||||
from typing import TYPE_CHECKING, Optional, Type, Union
|
||||
|
||||
import numpy as np
|
||||
from docling_core.types.doc import (
|
||||
@@ -14,9 +13,7 @@ from docling_core.types.doc import (
|
||||
)
|
||||
from docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float
|
||||
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
||||
from docling_core.types.io import (
|
||||
DocumentStream,
|
||||
)
|
||||
from docling_core.types.io import DocumentStream
|
||||
|
||||
# DO NOT REMOVE; explicitly exposed from this location
|
||||
from PIL.Image import Image
|
||||
@@ -71,6 +68,7 @@ class InputFormat(str, Enum):
|
||||
METS_GBS = "mets_gbs"
|
||||
JSON_DOCLING = "json_docling"
|
||||
AUDIO = "audio"
|
||||
VTT = "vtt"
|
||||
|
||||
|
||||
class OutputFormat(str, Enum):
|
||||
@@ -82,7 +80,7 @@ class OutputFormat(str, Enum):
|
||||
DOCTAGS = "doctags"
|
||||
|
||||
|
||||
FormatToExtensions: Dict[InputFormat, List[str]] = {
|
||||
FormatToExtensions: dict[InputFormat, list[str]] = {
|
||||
InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
|
||||
InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
|
||||
InputFormat.PDF: ["pdf"],
|
||||
@@ -97,9 +95,10 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
||||
InputFormat.METS_GBS: ["tar.gz"],
|
||||
InputFormat.JSON_DOCLING: ["json"],
|
||||
InputFormat.AUDIO: ["wav", "mp3"],
|
||||
InputFormat.VTT: ["vtt"],
|
||||
}
|
||||
|
||||
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
||||
FormatToMimeType: dict[InputFormat, list[str]] = {
|
||||
InputFormat.DOCX: [
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
|
||||
@@ -130,6 +129,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
||||
InputFormat.METS_GBS: ["application/mets+xml"],
|
||||
InputFormat.JSON_DOCLING: ["application/json"],
|
||||
InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
|
||||
InputFormat.VTT: ["text/vtt"],
|
||||
}
|
||||
|
||||
MimeTypeToFormat: dict[str, list[InputFormat]] = {
|
||||
@@ -162,8 +162,8 @@ class Cluster(BaseModel):
|
||||
label: DocItemLabel
|
||||
bbox: BoundingBox
|
||||
confidence: float = 1.0
|
||||
cells: List[TextCell] = []
|
||||
children: List["Cluster"] = [] # Add child cluster support
|
||||
cells: list[TextCell] = []
|
||||
children: list["Cluster"] = [] # Add child cluster support
|
||||
|
||||
@field_serializer("confidence")
|
||||
def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
|
||||
@@ -179,7 +179,7 @@ class BasePageElement(BaseModel):
|
||||
|
||||
|
||||
class LayoutPrediction(BaseModel):
|
||||
clusters: List[Cluster] = []
|
||||
clusters: list[Cluster] = []
|
||||
|
||||
|
||||
class VlmPredictionToken(BaseModel):
|
||||
@@ -201,14 +201,14 @@ class ContainerElement(
|
||||
|
||||
|
||||
class Table(BasePageElement):
|
||||
otsl_seq: List[str]
|
||||
otsl_seq: list[str]
|
||||
num_rows: int = 0
|
||||
num_cols: int = 0
|
||||
table_cells: List[TableCell]
|
||||
table_cells: list[TableCell]
|
||||
|
||||
|
||||
class TableStructurePrediction(BaseModel):
|
||||
table_map: Dict[int, Table] = {}
|
||||
table_map: dict[int, Table] = {}
|
||||
|
||||
|
||||
class TextElement(BasePageElement):
|
||||
@@ -216,7 +216,7 @@ class TextElement(BasePageElement):
|
||||
|
||||
|
||||
class FigureElement(BasePageElement):
|
||||
annotations: List[PictureDataType] = []
|
||||
annotations: list[PictureDataType] = []
|
||||
provenance: Optional[str] = None
|
||||
predicted_class: Optional[str] = None
|
||||
confidence: Optional[float] = None
|
||||
@@ -234,12 +234,12 @@ class FigureElement(BasePageElement):
|
||||
|
||||
class FigureClassificationPrediction(BaseModel):
|
||||
figure_count: int = 0
|
||||
figure_map: Dict[int, FigureElement] = {}
|
||||
figure_map: dict[int, FigureElement] = {}
|
||||
|
||||
|
||||
class EquationPrediction(BaseModel):
|
||||
equation_count: int = 0
|
||||
equation_map: Dict[int, TextElement] = {}
|
||||
equation_map: dict[int, TextElement] = {}
|
||||
|
||||
|
||||
class PagePredictions(BaseModel):
|
||||
@@ -254,9 +254,9 @@ PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
|
||||
|
||||
|
||||
class AssembledUnit(BaseModel):
|
||||
elements: List[PageElement] = []
|
||||
body: List[PageElement] = []
|
||||
headers: List[PageElement] = []
|
||||
elements: list[PageElement] = []
|
||||
body: list[PageElement] = []
|
||||
headers: list[PageElement] = []
|
||||
|
||||
|
||||
class ItemAndImageEnrichmentElement(BaseModel):
|
||||
@@ -280,12 +280,12 @@ class Page(BaseModel):
|
||||
None # Internal PDF backend. By default it is cleared during assembling.
|
||||
)
|
||||
_default_image_scale: float = 1.0 # Default image scale for external usage.
|
||||
_image_cache: Dict[
|
||||
_image_cache: dict[
|
||||
float, Image
|
||||
] = {} # Cache of images in different scales. By default it is cleared during assembling.
|
||||
|
||||
@property
|
||||
def cells(self) -> List[TextCell]:
|
||||
def cells(self) -> list[TextCell]:
|
||||
"""Return text cells as a read-only view of parsed_page.textline_cells."""
|
||||
if self.parsed_page is not None:
|
||||
return self.parsed_page.textline_cells
|
||||
@@ -354,7 +354,7 @@ class OpenAiApiResponse(BaseModel):
|
||||
|
||||
id: str
|
||||
model: Optional[str] = None # returned by openai
|
||||
choices: List[OpenAiResponseChoice]
|
||||
choices: list[OpenAiResponseChoice]
|
||||
created: int
|
||||
usage: OpenAiResponseUsage
|
||||
|
||||
@@ -430,7 +430,7 @@ class PageConfidenceScores(BaseModel):
|
||||
|
||||
|
||||
class ConfidenceReport(PageConfidenceScores):
|
||||
pages: Dict[int, PageConfidenceScores] = Field(
|
||||
pages: dict[int, PageConfidenceScores] = Field(
|
||||
default_factory=lambda: defaultdict(PageConfidenceScores)
|
||||
)
|
||||
|
||||
|
||||
@@ -394,6 +394,8 @@ class _DocumentConversionInput(BaseModel):
|
||||
mime = FormatToMimeType[InputFormat.PPTX][0]
|
||||
elif ext in FormatToExtensions[InputFormat.XLSX]:
|
||||
mime = FormatToMimeType[InputFormat.XLSX][0]
|
||||
elif ext in FormatToExtensions[InputFormat.VTT]:
|
||||
mime = FormatToMimeType[InputFormat.VTT][0]
|
||||
|
||||
return mime
|
||||
|
||||
|
||||
@@ -25,6 +25,7 @@ from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.backend.noop_backend import NoOpBackend
|
||||
from docling.backend.webvtt_backend import WebVTTDocumentBackend
|
||||
from docling.backend.xml.jats_backend import JatsDocumentBackend
|
||||
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
|
||||
from docling.datamodel.base_models import (
|
||||
@@ -170,6 +171,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
||||
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
|
||||
),
|
||||
InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
|
||||
InputFormat.VTT: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=WebVTTDocumentBackend
|
||||
),
|
||||
}
|
||||
if (options := format_to_default_options.get(format)) is not None:
|
||||
return options
|
||||
|
||||
4
docs/examples/minimal_vlm_pipeline.py
vendored
4
docs/examples/minimal_vlm_pipeline.py
vendored
@@ -3,7 +3,7 @@
|
||||
#
|
||||
# What this example does
|
||||
# - Runs the VLM-powered pipeline on a PDF (by URL) and prints Markdown output.
|
||||
# - Shows two setups: default (Transformers/SmolDocling) and macOS MPS/MLX.
|
||||
# - Shows two setups: default (Transformers/GraniteDocling) and macOS MPS/MLX.
|
||||
#
|
||||
# Prerequisites
|
||||
# - Install Docling with VLM extras and the appropriate backend (Transformers or MLX).
|
||||
@@ -15,7 +15,7 @@
|
||||
#
|
||||
# Notes
|
||||
# - `source` may be a local path or a URL to a PDF.
|
||||
# - The second section demonstrates macOS MPS acceleration via MLX (`vlm_model_specs.SMOLDOCLING_MLX`).
|
||||
# - The second section demonstrates macOS MPS acceleration via MLX (`vlm_model_specs.GRANITEDOCLING_MLX`).
|
||||
# - For more configurations and model comparisons, see `docs/examples/compare_vlm_models.py`.
|
||||
|
||||
# %%
|
||||
|
||||
4
docs/index.md
vendored
4
docs/index.md
vendored
@@ -21,7 +21,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
||||
|
||||
## Features
|
||||
|
||||
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
|
||||
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
|
||||
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
|
||||
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
|
||||
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
|
||||
@@ -37,13 +37,13 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
||||
* 📤 Structured [information extraction][extraction] \[🧪 beta\]
|
||||
* 📑 New layout model (**Heron**) by default, for faster PDF parsing
|
||||
* 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
|
||||
* 💬 Parsing of Web Video Text Tracks (WebVTT) files
|
||||
|
||||
### Coming soon
|
||||
|
||||
* 📝 Metadata extraction, including title, authors, references & language
|
||||
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
|
||||
* 📝 Complex chemistry understanding (Molecular structures)
|
||||
* 📝 Parsing of Web Video Text Tracks (WebVTT) files
|
||||
|
||||
## Get started
|
||||
|
||||
|
||||
5
docs/usage/supported_formats.md
vendored
5
docs/usage/supported_formats.md
vendored
@@ -11,10 +11,11 @@ Below you can find a listing of all supported input and output formats.
|
||||
| PDF | |
|
||||
| DOCX, XLSX, PPTX | Default formats in MS Office 2007+, based on Office Open XML |
|
||||
| Markdown | |
|
||||
| AsciiDoc | |
|
||||
| AsciiDoc | Human-readable, plain-text markup language for structured technical content |
|
||||
| HTML, XHTML | |
|
||||
| CSV | |
|
||||
| PNG, JPEG, TIFF, BMP, WEBP | Image formats |
|
||||
| WebVTT | Web Video Text Tracks format for displaying timed text |
|
||||
|
||||
Schema-specific support:
|
||||
|
||||
@@ -32,4 +33,4 @@ Schema-specific support:
|
||||
| Markdown | |
|
||||
| JSON | Lossless serialization of Docling Document |
|
||||
| Text | Plain text, i.e. without Markdown markers |
|
||||
| Doctags | |
|
||||
| [Doctags](https://arxiv.org/pdf/2503.11576) | Markup format for efficiently representing the full content and layout characteristics of a document |
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "docling"
|
||||
version = "2.53.0" # DO NOT EDIT, updated automatically
|
||||
version = "2.54.0" # DO NOT EDIT, updated automatically
|
||||
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
||||
license = "MIT"
|
||||
keywords = [
|
||||
@@ -44,7 +44,7 @@ authors = [
|
||||
requires-python = '>=3.9,<4.0'
|
||||
dependencies = [
|
||||
'pydantic (>=2.0.0,<3.0.0)',
|
||||
'docling-core[chunking] (>=2.48.0,<3.0.0)',
|
||||
'docling-core[chunking] (>=2.48.2,<3.0.0)',
|
||||
'docling-parse (>=4.4.0,<5.0.0)',
|
||||
"docling-ibm-models>=3.9.1,<4",
|
||||
'filetype (>=1.2.0,<2.0.0)',
|
||||
|
||||
@@ -1,40 +1,40 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: inline: group group
|
||||
item-2 at level 2: paragraph: This is a word document and this is an inline equation:
|
||||
item-2 at level 2: text: This is a word document and this is an inline equation:
|
||||
item-3 at level 2: formula: A= \pi r^{2}
|
||||
item-4 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
|
||||
item-5 at level 1: paragraph:
|
||||
item-4 at level 2: text: . If instead, I want an equation by line, I can do this:
|
||||
item-5 at level 1: text:
|
||||
item-6 at level 1: formula: a^{2}+b^{2}=c^{2} \text{ \texttimes } 23
|
||||
item-7 at level 1: paragraph: And that is an equation by itself. Cheers!
|
||||
item-8 at level 1: paragraph:
|
||||
item-9 at level 1: paragraph: This is another equation:
|
||||
item-7 at level 1: text: And that is an equation by itself. Cheers!
|
||||
item-8 at level 1: text:
|
||||
item-9 at level 1: text: This is another equation:
|
||||
item-10 at level 1: formula: f\left(x\right)=a_{0}+\sum_{n=1} ... })+b_{n}\sin(\frac{n \pi x}{L})\right)
|
||||
item-11 at level 1: paragraph:
|
||||
item-12 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text.
|
||||
item-13 at level 1: paragraph:
|
||||
item-14 at level 1: paragraph:
|
||||
item-11 at level 1: text:
|
||||
item-12 at level 1: text: This is text. This is text. This ... s is text. This is text. This is text.
|
||||
item-13 at level 1: text:
|
||||
item-14 at level 1: text:
|
||||
item-15 at level 1: inline: group group
|
||||
item-16 at level 2: paragraph: This is a word document and this is an inline equation:
|
||||
item-16 at level 2: text: This is a word document and this is an inline equation:
|
||||
item-17 at level 2: formula: A= \pi r^{2}
|
||||
item-18 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
|
||||
item-19 at level 1: paragraph:
|
||||
item-18 at level 2: text: . If instead, I want an equation by line, I can do this:
|
||||
item-19 at level 1: text:
|
||||
item-20 at level 1: formula: \left(x+a\right)^{n}=\sum_{k=0}^ ... ac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}
|
||||
item-21 at level 1: paragraph:
|
||||
item-22 at level 1: paragraph: And that is an equation by itself. Cheers!
|
||||
item-23 at level 1: paragraph:
|
||||
item-24 at level 1: paragraph: This is another equation:
|
||||
item-25 at level 1: paragraph:
|
||||
item-21 at level 1: text:
|
||||
item-22 at level 1: text: And that is an equation by itself. Cheers!
|
||||
item-23 at level 1: text:
|
||||
item-24 at level 1: text: This is another equation:
|
||||
item-25 at level 1: text:
|
||||
item-26 at level 1: formula: \left(1+x\right)^{n}=1+\frac{nx} ... ght)x^{2}}{2!}+ \text{ \textellipsis }
|
||||
item-27 at level 1: paragraph:
|
||||
item-28 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text.
|
||||
item-29 at level 1: paragraph:
|
||||
item-30 at level 1: paragraph:
|
||||
item-27 at level 1: text:
|
||||
item-28 at level 1: text: This is text. This is text. This ... s is text. This is text. This is text.
|
||||
item-29 at level 1: text:
|
||||
item-30 at level 1: text:
|
||||
item-31 at level 1: inline: group group
|
||||
item-32 at level 2: paragraph: This is a word document and this is an inline equation:
|
||||
item-32 at level 2: text: This is a word document and this is an inline equation:
|
||||
item-33 at level 2: formula: A= \pi r^{2}
|
||||
item-34 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
|
||||
item-35 at level 1: paragraph:
|
||||
item-34 at level 2: text: . If instead, I want an equation by line, I can do this:
|
||||
item-35 at level 1: text:
|
||||
item-36 at level 1: formula: e^{x}=1+\frac{x}{1!}+\frac{x^{2} ... xtellipsis } , - \infty < x < \infty
|
||||
item-37 at level 1: paragraph:
|
||||
item-38 at level 1: paragraph: And that is an equation by itself. Cheers!
|
||||
item-39 at level 1: paragraph:
|
||||
item-37 at level 1: text:
|
||||
item-38 at level 1: text: And that is an equation by itself. Cheers!
|
||||
item-39 at level 1: text:
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "equations",
|
||||
"origin": {
|
||||
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
@@ -182,7 +182,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "This is a word document and this is an inline equation: ",
|
||||
"text": "This is a word document and this is an inline equation: "
|
||||
@@ -206,7 +206,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": ". If instead, I want an equation by line, I can do this:",
|
||||
"text": ". If instead, I want an equation by line, I can do this:"
|
||||
@@ -218,7 +218,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -242,7 +242,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "And that is an equation by itself. Cheers!",
|
||||
"text": "And that is an equation by itself. Cheers!",
|
||||
@@ -261,7 +261,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -273,7 +273,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "This is another equation:",
|
||||
"text": "This is another equation:",
|
||||
@@ -304,7 +304,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -316,7 +316,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
|
||||
"text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
|
||||
@@ -335,7 +335,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -347,7 +347,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -359,7 +359,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "This is a word document and this is an inline equation: ",
|
||||
"text": "This is a word document and this is an inline equation: "
|
||||
@@ -383,7 +383,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": ". If instead, I want an equation by line, I can do this:",
|
||||
"text": ". If instead, I want an equation by line, I can do this:"
|
||||
@@ -395,7 +395,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -419,7 +419,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -431,7 +431,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "And that is an equation by itself. Cheers!",
|
||||
"text": "And that is an equation by itself. Cheers!",
|
||||
@@ -450,7 +450,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -462,7 +462,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "This is another equation:",
|
||||
"text": "This is another equation:",
|
||||
@@ -481,7 +481,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -505,7 +505,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -517,7 +517,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
|
||||
"text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
|
||||
@@ -536,7 +536,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -548,7 +548,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -560,7 +560,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "This is a word document and this is an inline equation: ",
|
||||
"text": "This is a word document and this is an inline equation: "
|
||||
@@ -584,7 +584,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": ". If instead, I want an equation by line, I can do this:",
|
||||
"text": ". If instead, I want an equation by line, I can do this:"
|
||||
@@ -596,7 +596,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -620,7 +620,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -632,7 +632,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "And that is an equation by itself. Cheers!",
|
||||
"text": "And that is an equation by itself. Cheers!",
|
||||
@@ -651,7 +651,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
|
||||
675
tests/data/groundtruth/docling_v2/escaped_characters.md.json
vendored
Normal file
675
tests/data/groundtruth/docling_v2/escaped_characters.md.json
vendored
Normal file
@@ -0,0 +1,675 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.7.0",
|
||||
"name": "escaped_characters",
|
||||
"origin": {
|
||||
"mimetype": "text/html",
|
||||
"binary_hash": 10682185258371912110,
|
||||
"filename": "escaped_characters.md"
|
||||
},
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"body": {
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/4"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/7"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/9"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/11"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/12"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [
|
||||
{
|
||||
"self_ref": "#/groups/0",
|
||||
"parent": {
|
||||
"$ref": "#/texts/4"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/5"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "ordered list",
|
||||
"label": "list"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/1",
|
||||
"parent": {
|
||||
"$ref": "#/texts/4"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/6"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "list",
|
||||
"label": "list"
|
||||
}
|
||||
],
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"label": "title",
|
||||
"prov": [],
|
||||
"orig": "escaped_characters",
|
||||
"text": "escaped_characters"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/2"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "title",
|
||||
"prov": [],
|
||||
"orig": "Headers:",
|
||||
"text": "Headers:"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/2",
|
||||
"parent": {
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/3"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "section_header",
|
||||
"prov": [],
|
||||
"orig": "& < > \" '",
|
||||
"text": "& < > \" '",
|
||||
"level": 1
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/3",
|
||||
"parent": {
|
||||
"$ref": "#/texts/2"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Text: 00:16.000 ----> 00:18.000 & < > \" '",
|
||||
"text": "Text: 00:16.000 ----> 00:18.000 & < > \" '"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/4",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/1"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "title",
|
||||
"prov": [],
|
||||
"orig": "Lists",
|
||||
"text": "Lists"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/5",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "& < > \" '",
|
||||
"text": "& < > \" '",
|
||||
"enumerated": true,
|
||||
"marker": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/6",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "& < > \" '",
|
||||
"text": "& < > \" '",
|
||||
"enumerated": false,
|
||||
"marker": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/7",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/8"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "title",
|
||||
"prov": [],
|
||||
"orig": "Inline code",
|
||||
"text": "Inline code"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/8",
|
||||
"parent": {
|
||||
"$ref": "#/texts/7"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "code",
|
||||
"prov": [],
|
||||
"orig": "& < > \" '",
|
||||
"text": "& < > \" '",
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"code_language": "unknown"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/9",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/10"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "title",
|
||||
"prov": [],
|
||||
"orig": "Code block",
|
||||
"text": "Code block"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/10",
|
||||
"parent": {
|
||||
"$ref": "#/texts/9"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "code",
|
||||
"prov": [],
|
||||
"orig": "& < > \" '",
|
||||
"text": "& < > \" '",
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"code_language": "unknown"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/11",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/tables/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "title",
|
||||
"prov": [],
|
||||
"orig": "Table",
|
||||
"text": "Table"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/12",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/13"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/14"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "title",
|
||||
"prov": [],
|
||||
"orig": "Raw HTML",
|
||||
"text": "Raw HTML"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/13",
|
||||
"parent": {
|
||||
"$ref": "#/texts/12"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "& < > \" '/div>",
|
||||
"text": "& < > \" '/div>"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/14",
|
||||
"parent": {
|
||||
"$ref": "#/texts/12"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/15"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "section_header",
|
||||
"prov": [],
|
||||
"orig": "Link",
|
||||
"text": "Link",
|
||||
"level": 1
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/15",
|
||||
"parent": {
|
||||
"$ref": "#/texts/14"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "& < > \" '",
|
||||
"text": "& < > \" '",
|
||||
"hyperlink": "https://en.wikipedia.org/wiki/Albert_Einstein"
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"tables": [
|
||||
{
|
||||
"self_ref": "#/tables/0",
|
||||
"parent": {
|
||||
"$ref": "#/texts/11"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "table",
|
||||
"prov": [],
|
||||
"captions": [],
|
||||
"references": [],
|
||||
"footnotes": [],
|
||||
"data": {
|
||||
"table_cells": [
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "Key",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "Example",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "Ampersand",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "&",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 2,
|
||||
"end_row_offset_idx": 3,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "Less-than",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 2,
|
||||
"end_row_offset_idx": 3,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "<",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 3,
|
||||
"end_row_offset_idx": 4,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "Greater-than",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 3,
|
||||
"end_row_offset_idx": 4,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": ">",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 4,
|
||||
"end_row_offset_idx": 5,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "Quotes",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 4,
|
||||
"end_row_offset_idx": 5,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "\"",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 5,
|
||||
"end_row_offset_idx": 6,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "Apostrophes",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 5,
|
||||
"end_row_offset_idx": 6,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "'",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
"num_rows": 6,
|
||||
"num_cols": 2,
|
||||
"grid": [
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "Key",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 0,
|
||||
"end_row_offset_idx": 1,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "Example",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "Ampersand",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 1,
|
||||
"end_row_offset_idx": 2,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "&",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 2,
|
||||
"end_row_offset_idx": 3,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "Less-than",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 2,
|
||||
"end_row_offset_idx": 3,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "<",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 3,
|
||||
"end_row_offset_idx": 4,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "Greater-than",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 3,
|
||||
"end_row_offset_idx": 4,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": ">",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 4,
|
||||
"end_row_offset_idx": 5,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "Quotes",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 4,
|
||||
"end_row_offset_idx": 5,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "\"",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 5,
|
||||
"end_row_offset_idx": 6,
|
||||
"start_col_offset_idx": 0,
|
||||
"end_col_offset_idx": 1,
|
||||
"text": "Apostrophes",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
"col_span": 1,
|
||||
"start_row_offset_idx": 5,
|
||||
"end_row_offset_idx": 6,
|
||||
"start_col_offset_idx": 1,
|
||||
"end_col_offset_idx": 2,
|
||||
"text": "'",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
]
|
||||
]
|
||||
},
|
||||
"annotations": []
|
||||
}
|
||||
],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {}
|
||||
}
|
||||
41
tests/data/groundtruth/docling_v2/escaped_characters.md.md
vendored
Normal file
41
tests/data/groundtruth/docling_v2/escaped_characters.md.md
vendored
Normal file
@@ -0,0 +1,41 @@
|
||||
# Headers:
|
||||
|
||||
## & < > " '
|
||||
|
||||
Text: 00:16.000 ----> 00:18.000 & < > " '
|
||||
|
||||
# Lists
|
||||
|
||||
1. & < > " '
|
||||
|
||||
- & < > " '
|
||||
|
||||
# Inline code
|
||||
|
||||
```
|
||||
& < > " '
|
||||
```
|
||||
|
||||
# Code block
|
||||
|
||||
```
|
||||
& < > " '
|
||||
```
|
||||
|
||||
# Table
|
||||
|
||||
| Key | Example |
|
||||
|--------------|-----------|
|
||||
| Ampersand | & |
|
||||
| Less-than | < |
|
||||
| Greater-than | > |
|
||||
| Quotes | " |
|
||||
| Apostrophes | ' |
|
||||
|
||||
# Raw HTML
|
||||
|
||||
& < > " '/div>
|
||||
|
||||
## Link
|
||||
|
||||
[& < > " '](https://en.wikipedia.org/wiki/Albert_Einstein)
|
||||
@@ -186,6 +186,7 @@ tables:
|
||||
column_header: true
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 1
|
||||
fillable: false
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
@@ -196,6 +197,7 @@ tables:
|
||||
column_header: true
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 1
|
||||
fillable: false
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
@@ -206,6 +208,7 @@ tables:
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 2
|
||||
fillable: false
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
@@ -216,6 +219,7 @@ tables:
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 2
|
||||
fillable: false
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
@@ -229,6 +233,7 @@ tables:
|
||||
column_header: true
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 1
|
||||
fillable: false
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
@@ -239,6 +244,7 @@ tables:
|
||||
column_header: true
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 1
|
||||
fillable: false
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
@@ -249,6 +255,7 @@ tables:
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 2
|
||||
fillable: false
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
@@ -259,6 +266,7 @@ tables:
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 2
|
||||
fillable: false
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
@@ -269,6 +277,7 @@ tables:
|
||||
column_header: true
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 1
|
||||
fillable: false
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
@@ -279,6 +288,7 @@ tables:
|
||||
column_header: true
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 1
|
||||
fillable: false
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
@@ -289,6 +299,7 @@ tables:
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 2
|
||||
fillable: false
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
@@ -299,6 +310,7 @@ tables:
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 2
|
||||
fillable: false
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
@@ -878,4 +890,4 @@ texts:
|
||||
prov: []
|
||||
self_ref: '#/texts/48'
|
||||
text: Table Heading
|
||||
version: 1.6.0
|
||||
version: 1.7.0
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: paragraph: Lorem ipsum dolor sit amet, cons ... quam non, sodales sem. Nulla facilisi.
|
||||
item-2 at level 1: paragraph:
|
||||
item-3 at level 1: paragraph: Duis condimentum dui eget ullamc ... cus tempor, et tristique ante aliquet.
|
||||
item-4 at level 1: paragraph:
|
||||
item-5 at level 1: paragraph: Maecenas id neque pharetra, elei ... ulla faucibus eu. Donec ut nisl metus.
|
||||
item-6 at level 1: paragraph:
|
||||
item-7 at level 1: paragraph: Duis ac tellus sed turpis feugia ... pellentesque rhoncus, blandit eu nisl.
|
||||
item-8 at level 1: paragraph:
|
||||
item-9 at level 1: paragraph: Nunc vehicula mattis erat ac con ... udin, vehicula turpis eu, tempus nibh.
|
||||
item-1 at level 1: text: Lorem ipsum dolor sit amet, cons ... quam non, sodales sem. Nulla facilisi.
|
||||
item-2 at level 1: text:
|
||||
item-3 at level 1: text: Duis condimentum dui eget ullamc ... cus tempor, et tristique ante aliquet.
|
||||
item-4 at level 1: text:
|
||||
item-5 at level 1: text: Maecenas id neque pharetra, elei ... ulla faucibus eu. Donec ut nisl metus.
|
||||
item-6 at level 1: text:
|
||||
item-7 at level 1: text: Duis ac tellus sed turpis feugia ... pellentesque rhoncus, blandit eu nisl.
|
||||
item-8 at level 1: text:
|
||||
item-9 at level 1: text: Nunc vehicula mattis erat ac con ... udin, vehicula turpis eu, tempus nibh.
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "lorem_ipsum",
|
||||
"origin": {
|
||||
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
@@ -58,7 +58,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin elit mi, fermentum vitae dolor facilisis, porttitor mollis quam. Cras quam massa, venenatis faucibus libero vel, euismod sollicitudin ipsum. Aliquam semper sapien leo, ac ultrices nibh mollis congue. Cras luctus ultrices est, ut scelerisque eros euismod ut. Curabitur ac tincidunt felis, non scelerisque lectus. Praesent sollicitudin vulputate est id consequat. Vestibulum pharetra ligula sit amet varius porttitor. Sed eros diam, gravida non varius at, scelerisque in libero. Ut auctor finibus mauris sit amet ornare. Sed facilisis leo at urna rhoncus, in facilisis arcu eleifend. Sed tincidunt lacinia fermentum. Cras non purus fringilla, semper quam non, sodales sem. Nulla facilisi.",
|
||||
"text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin elit mi, fermentum vitae dolor facilisis, porttitor mollis quam. Cras quam massa, venenatis faucibus libero vel, euismod sollicitudin ipsum. Aliquam semper sapien leo, ac ultrices nibh mollis congue. Cras luctus ultrices est, ut scelerisque eros euismod ut. Curabitur ac tincidunt felis, non scelerisque lectus. Praesent sollicitudin vulputate est id consequat. Vestibulum pharetra ligula sit amet varius porttitor. Sed eros diam, gravida non varius at, scelerisque in libero. Ut auctor finibus mauris sit amet ornare. Sed facilisis leo at urna rhoncus, in facilisis arcu eleifend. Sed tincidunt lacinia fermentum. Cras non purus fringilla, semper quam non, sodales sem. Nulla facilisi.",
|
||||
@@ -77,7 +77,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -89,7 +89,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Duis condimentum dui eget ullamcorper maximus. Nulla tortor lectus, hendrerit at diam fermentum, euismod ornare orci. Integer ac mauris sed augue ultricies pellentesque. Etiam condimentum turpis a risus dictum, sed tempor arcu vestibulum. Quisque at venenatis tellus. Morbi id lobortis elit. In gravida metus at ornare suscipit. Donec euismod nibh sit amet commodo porttitor. Integer commodo sit amet nisi vel accumsan. Donec lacinia posuere porta. Pellentesque vulputate porta risus, vel consectetur nisl gravida sit amet. Nam scelerisque enim sodales lacus tempor, et tristique ante aliquet.",
|
||||
"text": "Duis condimentum dui eget ullamcorper maximus. Nulla tortor lectus, hendrerit at diam fermentum, euismod ornare orci. Integer ac mauris sed augue ultricies pellentesque. Etiam condimentum turpis a risus dictum, sed tempor arcu vestibulum. Quisque at venenatis tellus. Morbi id lobortis elit. In gravida metus at ornare suscipit. Donec euismod nibh sit amet commodo porttitor. Integer commodo sit amet nisi vel accumsan. Donec lacinia posuere porta. Pellentesque vulputate porta risus, vel consectetur nisl gravida sit amet. Nam scelerisque enim sodales lacus tempor, et tristique ante aliquet.",
|
||||
@@ -108,7 +108,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -120,7 +120,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Maecenas id neque pharetra, eleifend lectus a, vehicula sapien. Aliquam erat volutpat. Ut arcu erat, blandit id elementum at, aliquet pretium mauris. Nulla at semper orci. Nunc sed maximus metus. Duis eget tristique arcu. Phasellus fringilla augue est, ut bibendum est bibendum vitae. Nam et urna interdum, egestas velit a, consectetur metus. Pellentesque facilisis vehicula orci, eu posuere justo imperdiet non. Vestibulum tincidunt orci ac lorem consequat semper. Fusce semper sollicitudin orci, id lacinia nulla faucibus eu. Donec ut nisl metus.",
|
||||
"text": "Maecenas id neque pharetra, eleifend lectus a, vehicula sapien. Aliquam erat volutpat. Ut arcu erat, blandit id elementum at, aliquet pretium mauris. Nulla at semper orci. Nunc sed maximus metus. Duis eget tristique arcu. Phasellus fringilla augue est, ut bibendum est bibendum vitae. Nam et urna interdum, egestas velit a, consectetur metus. Pellentesque facilisis vehicula orci, eu posuere justo imperdiet non. Vestibulum tincidunt orci ac lorem consequat semper. Fusce semper sollicitudin orci, id lacinia nulla faucibus eu. Donec ut nisl metus.",
|
||||
@@ -139,7 +139,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -151,7 +151,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Duis ac tellus sed turpis feugiat aliquam sed vel justo. Fusce sit amet volutpat massa. Duis tristique finibus metus quis tincidunt. Etiam dapibus fringilla diam at pharetra. Vivamus dolor est, hendrerit ac ligula nec, pharetra lacinia sapien. Phasellus at malesuada orci. Maecenas est justo, mollis non ultrices ut, sagittis commodo odio. Integer viverra mauris pellentesque bibendum vestibulum. Sed eu felis mattis, efficitur justo non, finibus lorem. Phasellus viverra diam et sapien imperdiet interdum. Cras a convallis libero. Integer maximus dui vel lorem hendrerit, sit amet convallis ligula lobortis. Duis eu lacus elementum, scelerisque nunc eget, dignissim libero. Suspendisse mi quam, vehicula sit amet pellentesque rhoncus, blandit eu nisl.",
|
||||
"text": "Duis ac tellus sed turpis feugiat aliquam sed vel justo. Fusce sit amet volutpat massa. Duis tristique finibus metus quis tincidunt. Etiam dapibus fringilla diam at pharetra. Vivamus dolor est, hendrerit ac ligula nec, pharetra lacinia sapien. Phasellus at malesuada orci. Maecenas est justo, mollis non ultrices ut, sagittis commodo odio. Integer viverra mauris pellentesque bibendum vestibulum. Sed eu felis mattis, efficitur justo non, finibus lorem. Phasellus viverra diam et sapien imperdiet interdum. Cras a convallis libero. Integer maximus dui vel lorem hendrerit, sit amet convallis ligula lobortis. Duis eu lacus elementum, scelerisque nunc eget, dignissim libero. Suspendisse mi quam, vehicula sit amet pellentesque rhoncus, blandit eu nisl.",
|
||||
@@ -170,7 +170,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -182,7 +182,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Nunc vehicula mattis erat ac consectetur. Etiam pharetra mauris ut tempor pellentesque. Sed vel libero vitae ante tempus sagittis vel sit amet dolor. Etiam faucibus viverra sodales. Pellentesque ullamcorper magna libero, non malesuada dui bibendum quis. Donec sed dolor non sem luctus volutpat. Morbi vel diam ut urna euismod gravida a id lectus. Vestibulum vel mauris eu tellus hendrerit dapibus. Etiam scelerisque lacus vel ante ultricies vulputate. In ullamcorper malesuada justo, vel scelerisque nisl lacinia at. Donec sodales interdum ipsum, ac bibendum ipsum pharetra interdum. Vivamus condimentum ac ante vel aliquam. Ut consectetur eu nibh nec gravida. Vestibulum accumsan, purus at mollis rutrum, sapien tortor accumsan purus, vitae fermentum urna mauris ut lacus. Fusce vitae leo sollicitudin, vehicula turpis eu, tempus nibh.",
|
||||
"text": "Nunc vehicula mattis erat ac consectetur. Etiam pharetra mauris ut tempor pellentesque. Sed vel libero vitae ante tempus sagittis vel sit amet dolor. Etiam faucibus viverra sodales. Pellentesque ullamcorper magna libero, non malesuada dui bibendum quis. Donec sed dolor non sem luctus volutpat. Morbi vel diam ut urna euismod gravida a id lectus. Vestibulum vel mauris eu tellus hendrerit dapibus. Etiam scelerisque lacus vel ante ultricies vulputate. In ullamcorper malesuada justo, vel scelerisque nisl lacinia at. Donec sodales interdum ipsum, ac bibendum ipsum pharetra interdum. Vivamus condimentum ac ante vel aliquam. Ut consectetur eu nibh nec gravida. Vestibulum accumsan, purus at mollis rutrum, sapien tortor accumsan purus, vitae fermentum urna mauris ut lacus. Fusce vitae leo sollicitudin, vehicula turpis eu, tempus nibh.",
|
||||
|
||||
@@ -136,4 +136,4 @@ texts:
|
||||
prov: []
|
||||
self_ref: '#/texts/7'
|
||||
text: The end!
|
||||
version: 1.6.0
|
||||
version: 1.7.0
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: table with [2x2]
|
||||
item-2 at level 1: paragraph:
|
||||
item-2 at level 1: text:
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "table_with_equations",
|
||||
"origin": {
|
||||
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
@@ -37,7 +37,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -69,7 +69,8 @@
|
||||
"text": "The next cell has an equation",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -81,7 +82,8 @@
|
||||
"text": "$A= \\pi r^{2}$",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -93,7 +95,8 @@
|
||||
"text": "The next cell has another equation",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -105,7 +108,8 @@
|
||||
"text": "$x=\\frac{-b \\pm \\sqrt{b^{2}-4ac}}{2a}$",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
"num_rows": 2,
|
||||
@@ -122,7 +126,8 @@
|
||||
"text": "The next cell has an equation",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -134,7 +139,8 @@
|
||||
"text": "$A= \\pi r^{2}$",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
@@ -148,7 +154,8 @@
|
||||
"text": "The next cell has another equation",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -160,7 +167,8 @@
|
||||
"text": "$x=\\frac{-b \\pm \\sqrt{b^{2}-4ac}}{2a}$",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
]
|
||||
]
|
||||
|
||||
@@ -2,9 +2,9 @@ item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: list: group list
|
||||
item-2 at level 2: list_item: Hello world1
|
||||
item-3 at level 2: list_item: Hello2
|
||||
item-4 at level 1: paragraph:
|
||||
item-5 at level 1: paragraph: Some text before
|
||||
item-4 at level 1: text:
|
||||
item-5 at level 1: text: Some text before
|
||||
item-6 at level 1: table with [3x3]
|
||||
item-7 at level 1: paragraph:
|
||||
item-8 at level 1: paragraph:
|
||||
item-9 at level 1: paragraph: Some text after
|
||||
item-7 at level 1: text:
|
||||
item-8 at level 1: text:
|
||||
item-9 at level 1: text: Some text after
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "tablecell",
|
||||
"origin": {
|
||||
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
@@ -112,7 +112,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -124,7 +124,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Some text before",
|
||||
"text": "Some text before",
|
||||
@@ -143,7 +143,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -155,7 +155,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -167,7 +167,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Some text after",
|
||||
"text": "Some text after",
|
||||
@@ -206,7 +206,8 @@
|
||||
"text": "Tab1",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -218,7 +219,8 @@
|
||||
"text": "Tab2",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -230,7 +232,8 @@
|
||||
"text": "Tab3",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -242,7 +245,8 @@
|
||||
"text": "A",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -254,7 +258,8 @@
|
||||
"text": "B",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -266,7 +271,8 @@
|
||||
"text": "C",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -278,7 +284,8 @@
|
||||
"text": "D",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -290,7 +297,8 @@
|
||||
"text": "E",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -302,7 +310,8 @@
|
||||
"text": "F",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
"num_rows": 3,
|
||||
@@ -319,7 +328,8 @@
|
||||
"text": "Tab1",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -331,7 +341,8 @@
|
||||
"text": "Tab2",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -343,7 +354,8 @@
|
||||
"text": "Tab3",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
@@ -357,7 +369,8 @@
|
||||
"text": "A",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -369,7 +382,8 @@
|
||||
"text": "B",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -381,7 +395,8 @@
|
||||
"text": "C",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
@@ -395,7 +410,8 @@
|
||||
"text": "D",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -407,7 +423,8 @@
|
||||
"text": "E",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -419,7 +436,8 @@
|
||||
"text": "F",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
]
|
||||
]
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: paragraph: Test with three images in unusual formats
|
||||
item-2 at level 1: paragraph: Raster in emf:
|
||||
item-1 at level 1: text: Test with three images in unusual formats
|
||||
item-2 at level 1: text: Raster in emf:
|
||||
item-3 at level 1: picture
|
||||
item-4 at level 1: paragraph: Vector in emf:
|
||||
item-4 at level 1: text: Vector in emf:
|
||||
item-5 at level 1: picture
|
||||
item-6 at level 1: paragraph: Raster in webp:
|
||||
item-6 at level 1: text: Raster in webp:
|
||||
item-7 at level 1: picture
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "test_emf_docx",
|
||||
"origin": {
|
||||
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
@@ -52,7 +52,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Test with three images in unusual formats",
|
||||
"text": "Test with three images in unusual formats",
|
||||
@@ -71,7 +71,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Raster in emf:",
|
||||
"text": "Raster in emf:",
|
||||
@@ -90,7 +90,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Vector in emf:",
|
||||
"text": "Vector in emf:",
|
||||
@@ -109,7 +109,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Raster in webp:",
|
||||
"text": "Raster in webp:",
|
||||
|
||||
120
tests/data/groundtruth/docling_v2/textbox.docx.itxt
vendored
120
tests/data/groundtruth/docling_v2/textbox.docx.itxt
vendored
@@ -1,90 +1,90 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: paragraph: Chiayi County Shuishang Township ... mentary School Affiliated Kindergarten
|
||||
item-2 at level 1: paragraph: Infectious Disease Reporting Pro ... r the 113th Academic Year Kindergarten
|
||||
item-3 at level 1: paragraph:
|
||||
item-1 at level 1: text: Chiayi County Shuishang Township ... mentary School Affiliated Kindergarten
|
||||
item-2 at level 1: text: Infectious Disease Reporting Pro ... r the 113th Academic Year Kindergarten
|
||||
item-3 at level 1: text:
|
||||
item-4 at level 1: section: group textbox
|
||||
item-5 at level 2: paragraph: Student falls ill
|
||||
item-6 at level 2: paragraph:
|
||||
item-5 at level 2: text: Student falls ill
|
||||
item-6 at level 2: text:
|
||||
item-7 at level 2: list: group list
|
||||
item-8 at level 3: list_item: Suggested Reportable Symptoms:
|
||||
* ... sh
|
||||
* Blisters
|
||||
* Headache
|
||||
* Sore throat
|
||||
item-9 at level 1: paragraph:
|
||||
item-10 at level 1: paragraph:
|
||||
item-9 at level 1: text:
|
||||
item-10 at level 1: text:
|
||||
item-11 at level 1: section: group textbox
|
||||
item-12 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms
|
||||
item-13 at level 1: paragraph:
|
||||
item-14 at level 1: paragraph:
|
||||
item-15 at level 1: paragraph:
|
||||
item-16 at level 1: paragraph:
|
||||
item-12 at level 2: text: If a caregiver suspects that wit ... the same suggested reportable symptoms
|
||||
item-13 at level 1: text:
|
||||
item-14 at level 1: text:
|
||||
item-15 at level 1: text:
|
||||
item-16 at level 1: text:
|
||||
item-17 at level 1: section: group textbox
|
||||
item-18 at level 2: paragraph: Yes
|
||||
item-19 at level 1: paragraph:
|
||||
item-20 at level 1: paragraph:
|
||||
item-18 at level 2: text: Yes
|
||||
item-19 at level 1: text:
|
||||
item-20 at level 1: text:
|
||||
item-21 at level 1: section: group textbox
|
||||
item-22 at level 2: list: group list
|
||||
item-23 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network.
|
||||
item-24 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System.
|
||||
item-25 at level 2: paragraph:
|
||||
item-25 at level 2: text:
|
||||
item-26 at level 1: list: group list
|
||||
item-27 at level 1: paragraph:
|
||||
item-28 at level 1: paragraph:
|
||||
item-29 at level 1: paragraph:
|
||||
item-30 at level 1: paragraph:
|
||||
item-31 at level 1: paragraph:
|
||||
item-27 at level 1: text:
|
||||
item-28 at level 1: text:
|
||||
item-29 at level 1: text:
|
||||
item-30 at level 1: text:
|
||||
item-31 at level 1: text:
|
||||
item-32 at level 1: section: group textbox
|
||||
item-33 at level 2: paragraph: Health Bureau:
|
||||
item-34 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
|
||||
item-33 at level 2: text: Health Bureau:
|
||||
item-34 at level 2: text: Upon receiving a report from the ... rt to the Centers for Disease Control.
|
||||
item-35 at level 2: list: group list
|
||||
item-36 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
|
||||
item-37 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act.
|
||||
item-38 at level 2: paragraph:
|
||||
item-38 at level 2: text:
|
||||
item-39 at level 1: list: group list
|
||||
item-40 at level 1: paragraph:
|
||||
item-40 at level 1: text:
|
||||
item-41 at level 1: section: group textbox
|
||||
item-42 at level 2: paragraph: Department of Education:
|
||||
item-42 at level 2: text: Department of Education:
|
||||
Collabo ... vention measures at all school levels.
|
||||
item-43 at level 1: paragraph:
|
||||
item-44 at level 1: paragraph:
|
||||
item-45 at level 1: paragraph:
|
||||
item-46 at level 1: paragraph:
|
||||
item-47 at level 1: paragraph:
|
||||
item-48 at level 1: paragraph:
|
||||
item-49 at level 1: paragraph:
|
||||
item-43 at level 1: text:
|
||||
item-44 at level 1: text:
|
||||
item-45 at level 1: text:
|
||||
item-46 at level 1: text:
|
||||
item-47 at level 1: text:
|
||||
item-48 at level 1: text:
|
||||
item-49 at level 1: text:
|
||||
item-50 at level 1: section: group textbox
|
||||
item-51 at level 2: inline: group group
|
||||
item-52 at level 3: paragraph: The Health Bureau will handle
|
||||
item-53 at level 3: paragraph: reporting and specimen collection
|
||||
item-54 at level 3: paragraph: .
|
||||
item-55 at level 2: paragraph:
|
||||
item-56 at level 1: paragraph:
|
||||
item-57 at level 1: paragraph:
|
||||
item-58 at level 1: paragraph:
|
||||
item-52 at level 3: text: The Health Bureau will handle
|
||||
item-53 at level 3: text: reporting and specimen collection
|
||||
item-54 at level 3: text: .
|
||||
item-55 at level 2: text:
|
||||
item-56 at level 1: text:
|
||||
item-57 at level 1: text:
|
||||
item-58 at level 1: text:
|
||||
item-59 at level 1: section: group textbox
|
||||
item-60 at level 2: paragraph: Whether the epidemic has eased.
|
||||
item-61 at level 2: paragraph:
|
||||
item-62 at level 1: paragraph:
|
||||
item-60 at level 2: text: Whether the epidemic has eased.
|
||||
item-61 at level 2: text:
|
||||
item-62 at level 1: text:
|
||||
item-63 at level 1: section: group textbox
|
||||
item-64 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
|
||||
item-65 at level 2: paragraph: No
|
||||
item-66 at level 1: paragraph:
|
||||
item-67 at level 1: paragraph:
|
||||
item-64 at level 2: text: Whether the test results are pos ... legally designated infectious disease.
|
||||
item-65 at level 2: text: No
|
||||
item-66 at level 1: text:
|
||||
item-67 at level 1: text:
|
||||
item-68 at level 1: section: group textbox
|
||||
item-69 at level 2: paragraph: Yes
|
||||
item-70 at level 1: paragraph:
|
||||
item-69 at level 2: text: Yes
|
||||
item-70 at level 1: text:
|
||||
item-71 at level 1: section: group textbox
|
||||
item-72 at level 2: paragraph: Yes
|
||||
item-73 at level 1: paragraph:
|
||||
item-74 at level 1: paragraph:
|
||||
item-72 at level 2: text: Yes
|
||||
item-73 at level 1: text:
|
||||
item-74 at level 1: text:
|
||||
item-75 at level 1: section: group textbox
|
||||
item-76 at level 2: paragraph: Case closed.
|
||||
item-77 at level 2: paragraph:
|
||||
item-78 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
|
||||
item-79 at level 1: paragraph:
|
||||
item-76 at level 2: text: Case closed.
|
||||
item-77 at level 2: text:
|
||||
item-78 at level 2: text: The Health Bureau will carry out ... ters for Disease Control if necessary.
|
||||
item-79 at level 1: text:
|
||||
item-80 at level 1: section: group textbox
|
||||
item-81 at level 2: paragraph: No
|
||||
item-82 at level 1: paragraph:
|
||||
item-83 at level 1: paragraph:
|
||||
item-84 at level 1: paragraph:
|
||||
item-81 at level 2: text: No
|
||||
item-82 at level 1: text:
|
||||
item-83 at level 1: text:
|
||||
item-84 at level 1: text:
|
||||
122
tests/data/groundtruth/docling_v2/textbox.docx.json
vendored
122
tests/data/groundtruth/docling_v2/textbox.docx.json
vendored
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "textbox",
|
||||
"origin": {
|
||||
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
@@ -491,7 +491,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Chiayi County Shuishang Township Nanjing Elementary School Affiliated Kindergarten",
|
||||
"text": "Chiayi County Shuishang Township Nanjing Elementary School Affiliated Kindergarten",
|
||||
@@ -510,7 +510,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Infectious Disease Reporting Procedure for the 113th Academic Year Kindergarten",
|
||||
"text": "Infectious Disease Reporting Procedure for the 113th Academic Year Kindergarten",
|
||||
@@ -529,7 +529,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -541,7 +541,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Student falls ill",
|
||||
"text": "Student falls ill",
|
||||
@@ -560,7 +560,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -593,7 +593,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -605,7 +605,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -617,7 +617,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "If a caregiver suspects that within one week, a fifth of the class (for classes with more than 15 students) or more than three students (for classes with 15 or fewer students)\nshow the same suggested reportable symptoms",
|
||||
"text": "If a caregiver suspects that within one week, a fifth of the class (for classes with more than 15 students) or more than three students (for classes with 15 or fewer students)\nshow the same suggested reportable symptoms",
|
||||
@@ -636,7 +636,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -648,7 +648,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -660,7 +660,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -672,7 +672,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -684,7 +684,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Yes",
|
||||
"text": "Yes",
|
||||
@@ -703,7 +703,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -715,7 +715,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -769,7 +769,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -781,7 +781,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -793,7 +793,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -805,7 +805,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -817,7 +817,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -829,7 +829,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -841,7 +841,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Health Bureau:",
|
||||
"text": "Health Bureau:",
|
||||
@@ -860,7 +860,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Upon receiving a report from the kindergarten, conduct a preliminary assessment of the case, and depending on the situation and type of illness, carry out an epidemiological investigation and report to the Centers for Disease Control.",
|
||||
"text": "Upon receiving a report from the kindergarten, conduct a preliminary assessment of the case, and depending on the situation and type of illness, carry out an epidemiological investigation and report to the Centers for Disease Control.",
|
||||
@@ -921,7 +921,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -933,7 +933,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -945,7 +945,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.",
|
||||
"text": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.",
|
||||
@@ -964,7 +964,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -976,7 +976,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -988,7 +988,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -1000,7 +1000,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -1012,7 +1012,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -1024,7 +1024,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -1036,7 +1036,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -1048,7 +1048,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "The Health Bureau will handle",
|
||||
"text": "The Health Bureau will handle",
|
||||
@@ -1067,7 +1067,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "reporting and specimen collection",
|
||||
"text": "reporting and specimen collection",
|
||||
@@ -1086,7 +1086,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": ".",
|
||||
"text": ".",
|
||||
@@ -1105,7 +1105,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -1117,7 +1117,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -1129,7 +1129,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -1141,7 +1141,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -1153,7 +1153,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Whether the epidemic has eased.",
|
||||
"text": "Whether the epidemic has eased.",
|
||||
@@ -1172,7 +1172,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -1184,7 +1184,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -1196,7 +1196,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Whether the test results are positive for a legally designated infectious disease.",
|
||||
"text": "Whether the test results are positive for a legally designated infectious disease.",
|
||||
@@ -1215,7 +1215,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "No",
|
||||
"text": "No",
|
||||
@@ -1234,7 +1234,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -1246,7 +1246,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -1258,7 +1258,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Yes",
|
||||
"text": "Yes",
|
||||
@@ -1277,7 +1277,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -1289,7 +1289,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Yes",
|
||||
"text": "Yes",
|
||||
@@ -1308,7 +1308,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -1320,7 +1320,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -1332,7 +1332,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Case closed.",
|
||||
"text": "Case closed.",
|
||||
@@ -1351,7 +1351,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -1363,7 +1363,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary.",
|
||||
"text": "The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary.",
|
||||
@@ -1382,7 +1382,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -1394,7 +1394,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "No",
|
||||
"text": "No",
|
||||
@@ -1413,7 +1413,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -1425,7 +1425,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -1437,7 +1437,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: paragraph: italic
|
||||
item-2 at level 1: paragraph: bold
|
||||
item-3 at level 1: paragraph: underline
|
||||
item-4 at level 1: paragraph: hyperlink
|
||||
item-5 at level 1: paragraph: italic and bold hyperlink
|
||||
item-1 at level 1: text: italic
|
||||
item-2 at level 1: text: bold
|
||||
item-3 at level 1: text: underline
|
||||
item-4 at level 1: text: hyperlink
|
||||
item-5 at level 1: text: italic and bold hyperlink
|
||||
item-6 at level 1: inline: group group
|
||||
item-7 at level 2: paragraph: Normal
|
||||
item-8 at level 2: paragraph: italic
|
||||
item-9 at level 2: paragraph: bold
|
||||
item-10 at level 2: paragraph: underline
|
||||
item-11 at level 2: paragraph: and
|
||||
item-12 at level 2: paragraph: hyperlink
|
||||
item-13 at level 2: paragraph: on the same line
|
||||
item-14 at level 1: paragraph:
|
||||
item-7 at level 2: text: Normal
|
||||
item-8 at level 2: text: italic
|
||||
item-9 at level 2: text: bold
|
||||
item-10 at level 2: text: underline
|
||||
item-11 at level 2: text: and
|
||||
item-12 at level 2: text: hyperlink
|
||||
item-13 at level 2: text: on the same line
|
||||
item-14 at level 1: text:
|
||||
item-15 at level 1: list: group list
|
||||
item-16 at level 2: list_item: Italic bullet 1
|
||||
item-17 at level 2: list_item: Bold bullet 2
|
||||
@@ -29,4 +29,4 @@ item-0 at level 0: unspecified: group _root_
|
||||
item-28 at level 5: text: Nested
|
||||
item-29 at level 5: text: italic
|
||||
item-30 at level 5: text: bold
|
||||
item-31 at level 1: paragraph:
|
||||
item-31 at level 1: text:
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "unit_test_formatting",
|
||||
"origin": {
|
||||
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
@@ -174,7 +174,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "italic",
|
||||
"text": "italic",
|
||||
@@ -193,7 +193,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "bold",
|
||||
"text": "bold",
|
||||
@@ -212,7 +212,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "underline",
|
||||
"text": "underline",
|
||||
@@ -231,7 +231,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "hyperlink",
|
||||
"text": "hyperlink",
|
||||
@@ -251,7 +251,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "italic and bold hyperlink",
|
||||
"text": "italic and bold hyperlink",
|
||||
@@ -271,7 +271,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Normal",
|
||||
"text": "Normal",
|
||||
@@ -290,7 +290,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "italic",
|
||||
"text": "italic",
|
||||
@@ -309,7 +309,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "bold",
|
||||
"text": "bold",
|
||||
@@ -328,7 +328,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "underline",
|
||||
"text": "underline",
|
||||
@@ -347,7 +347,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "and",
|
||||
"text": "and",
|
||||
@@ -366,7 +366,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "hyperlink",
|
||||
"text": "hyperlink",
|
||||
@@ -386,7 +386,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "on the same line",
|
||||
"text": "on the same line",
|
||||
@@ -405,7 +405,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -649,7 +649,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
|
||||
@@ -1,48 +1,48 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: title: Test Document
|
||||
item-2 at level 2: paragraph:
|
||||
item-2 at level 2: text:
|
||||
item-3 at level 2: section_header: Section 1
|
||||
item-4 at level 3: paragraph:
|
||||
item-5 at level 3: paragraph: Paragraph 1.1
|
||||
item-6 at level 3: paragraph:
|
||||
item-7 at level 3: paragraph: Paragraph 1.2
|
||||
item-8 at level 3: paragraph:
|
||||
item-4 at level 3: text:
|
||||
item-5 at level 3: text: Paragraph 1.1
|
||||
item-6 at level 3: text:
|
||||
item-7 at level 3: text: Paragraph 1.2
|
||||
item-8 at level 3: text:
|
||||
item-9 at level 3: section_header: Section 1.1
|
||||
item-10 at level 4: paragraph:
|
||||
item-11 at level 4: paragraph: Paragraph 1.1.1
|
||||
item-12 at level 4: paragraph:
|
||||
item-13 at level 4: paragraph: Paragraph 1.1.2
|
||||
item-14 at level 4: paragraph:
|
||||
item-10 at level 4: text:
|
||||
item-11 at level 4: text: Paragraph 1.1.1
|
||||
item-12 at level 4: text:
|
||||
item-13 at level 4: text: Paragraph 1.1.2
|
||||
item-14 at level 4: text:
|
||||
item-15 at level 3: section_header: Section 1.2
|
||||
item-16 at level 4: paragraph:
|
||||
item-17 at level 4: paragraph: Paragraph 1.1.1
|
||||
item-18 at level 4: paragraph:
|
||||
item-19 at level 4: paragraph: Paragraph 1.1.2
|
||||
item-20 at level 4: paragraph:
|
||||
item-16 at level 4: text:
|
||||
item-17 at level 4: text: Paragraph 1.1.1
|
||||
item-18 at level 4: text:
|
||||
item-19 at level 4: text: Paragraph 1.1.2
|
||||
item-20 at level 4: text:
|
||||
item-21 at level 4: section_header: Section 1.2.3
|
||||
item-22 at level 5: paragraph:
|
||||
item-23 at level 5: paragraph: Paragraph 1.2.3.1
|
||||
item-24 at level 5: paragraph:
|
||||
item-25 at level 5: paragraph: Paragraph 1.2.3.1
|
||||
item-26 at level 5: paragraph:
|
||||
item-27 at level 5: paragraph:
|
||||
item-22 at level 5: text:
|
||||
item-23 at level 5: text: Paragraph 1.2.3.1
|
||||
item-24 at level 5: text:
|
||||
item-25 at level 5: text: Paragraph 1.2.3.1
|
||||
item-26 at level 5: text:
|
||||
item-27 at level 5: text:
|
||||
item-28 at level 2: section_header: Section 2
|
||||
item-29 at level 3: paragraph:
|
||||
item-30 at level 3: paragraph: Paragraph 2.1
|
||||
item-31 at level 3: paragraph:
|
||||
item-32 at level 3: paragraph: Paragraph 2.2
|
||||
item-33 at level 3: paragraph:
|
||||
item-29 at level 3: text:
|
||||
item-30 at level 3: text: Paragraph 2.1
|
||||
item-31 at level 3: text:
|
||||
item-32 at level 3: text: Paragraph 2.2
|
||||
item-33 at level 3: text:
|
||||
item-34 at level 3: section: group header-2
|
||||
item-35 at level 4: section_header: Section 2.1.1
|
||||
item-36 at level 5: paragraph:
|
||||
item-37 at level 5: paragraph: Paragraph 2.1.1.1
|
||||
item-38 at level 5: paragraph:
|
||||
item-39 at level 5: paragraph: Paragraph 2.1.1.1
|
||||
item-40 at level 5: paragraph:
|
||||
item-36 at level 5: text:
|
||||
item-37 at level 5: text: Paragraph 2.1.1.1
|
||||
item-38 at level 5: text:
|
||||
item-39 at level 5: text: Paragraph 2.1.1.1
|
||||
item-40 at level 5: text:
|
||||
item-41 at level 3: section_header: Section 2.1
|
||||
item-42 at level 4: paragraph:
|
||||
item-43 at level 4: paragraph: Paragraph 2.1.1
|
||||
item-44 at level 4: paragraph:
|
||||
item-45 at level 4: paragraph: Paragraph 2.1.2
|
||||
item-46 at level 4: paragraph:
|
||||
item-47 at level 4: paragraph:
|
||||
item-42 at level 4: text:
|
||||
item-43 at level 4: text: Paragraph 2.1.1
|
||||
item-44 at level 4: text:
|
||||
item-45 at level 4: text: Paragraph 2.1.2
|
||||
item-46 at level 4: text:
|
||||
item-47 at level 4: text:
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "unit_test_headers",
|
||||
"origin": {
|
||||
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
@@ -71,7 +71,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -118,7 +118,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -130,7 +130,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 1.1",
|
||||
"text": "Paragraph 1.1",
|
||||
@@ -149,7 +149,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -161,7 +161,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 1.2",
|
||||
"text": "Paragraph 1.2",
|
||||
@@ -180,7 +180,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -221,7 +221,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -233,7 +233,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 1.1.1",
|
||||
"text": "Paragraph 1.1.1",
|
||||
@@ -252,7 +252,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -264,7 +264,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 1.1.2",
|
||||
"text": "Paragraph 1.1.2",
|
||||
@@ -283,7 +283,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -327,7 +327,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -339,7 +339,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 1.1.1",
|
||||
"text": "Paragraph 1.1.1",
|
||||
@@ -358,7 +358,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -370,7 +370,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 1.1.2",
|
||||
"text": "Paragraph 1.1.2",
|
||||
@@ -389,7 +389,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -433,7 +433,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -445,7 +445,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 1.2.3.1",
|
||||
"text": "Paragraph 1.2.3.1",
|
||||
@@ -464,7 +464,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -476,7 +476,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 1.2.3.1",
|
||||
"text": "Paragraph 1.2.3.1",
|
||||
@@ -495,7 +495,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -507,7 +507,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -554,7 +554,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -566,7 +566,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 2.1",
|
||||
"text": "Paragraph 2.1",
|
||||
@@ -585,7 +585,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -597,7 +597,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 2.2",
|
||||
"text": "Paragraph 2.2",
|
||||
@@ -616,7 +616,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -657,7 +657,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -669,7 +669,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 2.1.1.1",
|
||||
"text": "Paragraph 2.1.1.1",
|
||||
@@ -688,7 +688,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -700,7 +700,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 2.1.1.1",
|
||||
"text": "Paragraph 2.1.1.1",
|
||||
@@ -719,7 +719,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -763,7 +763,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -775,7 +775,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 2.1.1",
|
||||
"text": "Paragraph 2.1.1",
|
||||
@@ -794,7 +794,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -806,7 +806,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 2.1.2",
|
||||
"text": "Paragraph 2.1.2",
|
||||
@@ -825,7 +825,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -837,7 +837,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
|
||||
@@ -1,52 +1,52 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: title: Test Document
|
||||
item-2 at level 2: paragraph:
|
||||
item-2 at level 2: text:
|
||||
item-3 at level 2: section_header: 1 Section 1
|
||||
item-4 at level 1: paragraph:
|
||||
item-5 at level 1: paragraph: Paragraph 1.1
|
||||
item-6 at level 1: paragraph:
|
||||
item-7 at level 1: paragraph: Paragraph 1.2
|
||||
item-8 at level 1: paragraph:
|
||||
item-4 at level 1: text:
|
||||
item-5 at level 1: text: Paragraph 1.1
|
||||
item-6 at level 1: text:
|
||||
item-7 at level 1: text: Paragraph 1.2
|
||||
item-8 at level 1: text:
|
||||
item-9 at level 1: section: group header-0
|
||||
item-10 at level 2: section: group header-1
|
||||
item-11 at level 3: section_header: 1.1 Section 1.1
|
||||
item-12 at level 4: paragraph:
|
||||
item-13 at level 4: paragraph: Paragraph 1.1.1
|
||||
item-14 at level 4: paragraph:
|
||||
item-15 at level 4: paragraph: Paragraph 1.1.2
|
||||
item-16 at level 4: paragraph:
|
||||
item-12 at level 4: text:
|
||||
item-13 at level 4: text: Paragraph 1.1.1
|
||||
item-14 at level 4: text:
|
||||
item-15 at level 4: text: Paragraph 1.1.2
|
||||
item-16 at level 4: text:
|
||||
item-17 at level 3: section_header: 1.2 Section 1.2
|
||||
item-18 at level 4: paragraph:
|
||||
item-19 at level 4: paragraph: Paragraph 1.1.1
|
||||
item-20 at level 4: paragraph:
|
||||
item-21 at level 4: paragraph: Paragraph 1.1.2
|
||||
item-22 at level 4: paragraph:
|
||||
item-18 at level 4: text:
|
||||
item-19 at level 4: text: Paragraph 1.1.1
|
||||
item-20 at level 4: text:
|
||||
item-21 at level 4: text: Paragraph 1.1.2
|
||||
item-22 at level 4: text:
|
||||
item-23 at level 4: section_header: 1.2.1 Section 1.2.3
|
||||
item-24 at level 5: paragraph:
|
||||
item-25 at level 5: paragraph: Paragraph 1.2.3.1
|
||||
item-26 at level 5: paragraph:
|
||||
item-27 at level 5: paragraph: Paragraph 1.2.3.1
|
||||
item-28 at level 5: paragraph:
|
||||
item-29 at level 5: paragraph:
|
||||
item-24 at level 5: text:
|
||||
item-25 at level 5: text: Paragraph 1.2.3.1
|
||||
item-26 at level 5: text:
|
||||
item-27 at level 5: text: Paragraph 1.2.3.1
|
||||
item-28 at level 5: text:
|
||||
item-29 at level 5: text:
|
||||
item-30 at level 2: section_header: 2 Section 2
|
||||
item-31 at level 1: paragraph:
|
||||
item-32 at level 1: paragraph: Paragraph 2.1
|
||||
item-33 at level 1: paragraph:
|
||||
item-34 at level 1: paragraph: Paragraph 2.2
|
||||
item-35 at level 1: paragraph:
|
||||
item-31 at level 1: text:
|
||||
item-32 at level 1: text: Paragraph 2.1
|
||||
item-33 at level 1: text:
|
||||
item-34 at level 1: text: Paragraph 2.2
|
||||
item-35 at level 1: text:
|
||||
item-36 at level 1: section: group header-0
|
||||
item-37 at level 2: section: group header-1
|
||||
item-38 at level 3: section: group header-2
|
||||
item-39 at level 4: section_header: 2.1.1 Section 2.1.1
|
||||
item-40 at level 5: paragraph:
|
||||
item-41 at level 5: paragraph: Paragraph 2.1.1.1
|
||||
item-42 at level 5: paragraph:
|
||||
item-43 at level 5: paragraph: Paragraph 2.1.1.1
|
||||
item-44 at level 5: paragraph:
|
||||
item-40 at level 5: text:
|
||||
item-41 at level 5: text: Paragraph 2.1.1.1
|
||||
item-42 at level 5: text:
|
||||
item-43 at level 5: text: Paragraph 2.1.1.1
|
||||
item-44 at level 5: text:
|
||||
item-45 at level 3: section_header: 2.2 Section 2.1
|
||||
item-46 at level 4: paragraph:
|
||||
item-47 at level 4: paragraph: Paragraph 2.1.1
|
||||
item-48 at level 4: paragraph:
|
||||
item-49 at level 4: paragraph: Paragraph 2.1.2
|
||||
item-50 at level 4: paragraph:
|
||||
item-51 at level 4: paragraph:
|
||||
item-46 at level 4: text:
|
||||
item-47 at level 4: text: Paragraph 2.1.1
|
||||
item-48 at level 4: text:
|
||||
item-49 at level 4: text: Paragraph 2.1.2
|
||||
item-50 at level 4: text:
|
||||
item-51 at level 4: text:
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "unit_test_headers_numbered",
|
||||
"origin": {
|
||||
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
@@ -169,7 +169,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -194,7 +194,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -206,7 +206,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 1.1",
|
||||
"text": "Paragraph 1.1",
|
||||
@@ -225,7 +225,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -237,7 +237,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 1.2",
|
||||
"text": "Paragraph 1.2",
|
||||
@@ -256,7 +256,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -297,7 +297,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -309,7 +309,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 1.1.1",
|
||||
"text": "Paragraph 1.1.1",
|
||||
@@ -328,7 +328,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -340,7 +340,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 1.1.2",
|
||||
"text": "Paragraph 1.1.2",
|
||||
@@ -359,7 +359,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -403,7 +403,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -415,7 +415,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 1.1.1",
|
||||
"text": "Paragraph 1.1.1",
|
||||
@@ -434,7 +434,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -446,7 +446,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 1.1.2",
|
||||
"text": "Paragraph 1.1.2",
|
||||
@@ -465,7 +465,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -509,7 +509,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -521,7 +521,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 1.2.3.1",
|
||||
"text": "Paragraph 1.2.3.1",
|
||||
@@ -540,7 +540,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -552,7 +552,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 1.2.3.1",
|
||||
"text": "Paragraph 1.2.3.1",
|
||||
@@ -571,7 +571,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -583,7 +583,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -608,7 +608,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -620,7 +620,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 2.1",
|
||||
"text": "Paragraph 2.1",
|
||||
@@ -639,7 +639,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -651,7 +651,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 2.2",
|
||||
"text": "Paragraph 2.2",
|
||||
@@ -670,7 +670,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -711,7 +711,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -723,7 +723,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 2.1.1.1",
|
||||
"text": "Paragraph 2.1.1.1",
|
||||
@@ -742,7 +742,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -754,7 +754,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 2.1.1.1",
|
||||
"text": "Paragraph 2.1.1.1",
|
||||
@@ -773,7 +773,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -817,7 +817,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -829,7 +829,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 2.1.1",
|
||||
"text": "Paragraph 2.1.1",
|
||||
@@ -848,7 +848,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -860,7 +860,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 2.1.2",
|
||||
"text": "Paragraph 2.1.2",
|
||||
@@ -879,7 +879,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -891,7 +891,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
|
||||
@@ -1,25 +1,25 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: section: group header-0
|
||||
item-2 at level 2: section_header: Test Document
|
||||
item-3 at level 3: paragraph:
|
||||
item-4 at level 3: paragraph:
|
||||
item-5 at level 3: paragraph: Paragraph 2.1.1
|
||||
item-6 at level 3: paragraph:
|
||||
item-7 at level 3: paragraph: Paragraph 2.1.2
|
||||
item-8 at level 3: paragraph:
|
||||
item-3 at level 3: text:
|
||||
item-4 at level 3: text:
|
||||
item-5 at level 3: text: Paragraph 2.1.1
|
||||
item-6 at level 3: text:
|
||||
item-7 at level 3: text: Paragraph 2.1.2
|
||||
item-8 at level 3: text:
|
||||
item-9 at level 3: section: group header-2
|
||||
item-10 at level 4: section_header: Test 1:
|
||||
item-11 at level 5: list: group list
|
||||
item-12 at level 6: list_item: List item 1
|
||||
item-13 at level 6: list_item: List item 2
|
||||
item-14 at level 6: list_item: List item 3
|
||||
item-15 at level 5: paragraph:
|
||||
item-15 at level 5: text:
|
||||
item-16 at level 4: section_header: Test 2:
|
||||
item-17 at level 5: list: group list
|
||||
item-18 at level 6: list_item: List item a
|
||||
item-19 at level 6: list_item: List item b
|
||||
item-20 at level 6: list_item: List item c
|
||||
item-21 at level 5: paragraph:
|
||||
item-21 at level 5: text:
|
||||
item-22 at level 4: section_header: Test 3:
|
||||
item-23 at level 5: list: group list
|
||||
item-24 at level 6: list_item: List item 1
|
||||
@@ -29,14 +29,14 @@ item-0 at level 0: unspecified: group _root_
|
||||
item-28 at level 7: list_item: List item 1.2
|
||||
item-29 at level 7: list_item: List item 1.3
|
||||
item-30 at level 6: list_item: List item 3
|
||||
item-31 at level 5: paragraph:
|
||||
item-31 at level 5: text:
|
||||
item-32 at level 4: section_header: Test 4:
|
||||
item-33 at level 5: list: group list
|
||||
item-34 at level 6: list_item: List item 1
|
||||
item-35 at level 6: list: group list
|
||||
item-36 at level 7: list_item: List item 1.1
|
||||
item-37 at level 6: list_item: List item 2
|
||||
item-38 at level 5: paragraph:
|
||||
item-38 at level 5: text:
|
||||
item-39 at level 4: section_header: Test 5:
|
||||
item-40 at level 5: list: group list
|
||||
item-41 at level 6: list_item: List item 1
|
||||
@@ -45,7 +45,7 @@ item-0 at level 0: unspecified: group _root_
|
||||
item-44 at level 7: list: group list
|
||||
item-45 at level 8: list_item: List item 1.1.1
|
||||
item-46 at level 6: list_item: List item 3
|
||||
item-47 at level 5: paragraph:
|
||||
item-47 at level 5: text:
|
||||
item-48 at level 4: section_header: Test 6:
|
||||
item-49 at level 5: list: group list
|
||||
item-50 at level 6: list_item: List item 1
|
||||
@@ -56,6 +56,6 @@ item-0 at level 0: unspecified: group _root_
|
||||
item-55 at level 7: list: group list
|
||||
item-56 at level 8: list_item: List item 1.2.1
|
||||
item-57 at level 6: list_item: List item 3
|
||||
item-58 at level 5: paragraph:
|
||||
item-59 at level 5: paragraph:
|
||||
item-60 at level 5: paragraph:
|
||||
item-58 at level 5: text:
|
||||
item-59 at level 5: text:
|
||||
item-60 at level 5: text:
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "unit_test_lists",
|
||||
"origin": {
|
||||
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
@@ -338,7 +338,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -350,7 +350,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -362,7 +362,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 2.1.1",
|
||||
"text": "Paragraph 2.1.1",
|
||||
@@ -381,7 +381,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -393,7 +393,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Paragraph 2.1.2",
|
||||
"text": "Paragraph 2.1.2",
|
||||
@@ -412,7 +412,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -507,7 +507,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -602,7 +602,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -760,7 +760,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -855,7 +855,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -971,7 +971,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -1135,7 +1135,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -1147,7 +1147,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -1159,7 +1159,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
|
||||
66
tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
vendored
Normal file
66
tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
vendored
Normal file
@@ -0,0 +1,66 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: section: group WebVTT cue block
|
||||
item-2 at level 2: text: 00:11.000 --> 00:13.000
|
||||
item-3 at level 2: inline: group WebVTT cue voice span
|
||||
item-4 at level 3: text: Roger Bingham:
|
||||
item-5 at level 3: text: We are in New York City
|
||||
item-6 at level 1: section: group WebVTT cue block
|
||||
item-7 at level 2: text: 00:13.000 --> 00:16.000
|
||||
item-8 at level 2: inline: group WebVTT cue voice span
|
||||
item-9 at level 3: text: Roger Bingham:
|
||||
item-10 at level 3: text: We’re actually at the Lucern Hotel, just down the street
|
||||
item-11 at level 1: section: group WebVTT cue block
|
||||
item-12 at level 2: text: 00:16.000 --> 00:18.000
|
||||
item-13 at level 2: inline: group WebVTT cue voice span
|
||||
item-14 at level 3: text: Roger Bingham:
|
||||
item-15 at level 3: text: from the American Museum of Natural History
|
||||
item-16 at level 1: section: group WebVTT cue block
|
||||
item-17 at level 2: text: 00:18.000 --> 00:20.000
|
||||
item-18 at level 2: inline: group WebVTT cue voice span
|
||||
item-19 at level 3: text: Roger Bingham:
|
||||
item-20 at level 3: text: And with me is Neil deGrasse Tyson
|
||||
item-21 at level 1: section: group WebVTT cue block
|
||||
item-22 at level 2: text: 00:20.000 --> 00:22.000
|
||||
item-23 at level 2: inline: group WebVTT cue voice span
|
||||
item-24 at level 3: text: Roger Bingham:
|
||||
item-25 at level 3: text: Astrophysicist, Director of the Hayden Planetarium
|
||||
item-26 at level 1: section: group WebVTT cue block
|
||||
item-27 at level 2: text: 00:22.000 --> 00:24.000
|
||||
item-28 at level 2: inline: group WebVTT cue voice span
|
||||
item-29 at level 3: text: Roger Bingham:
|
||||
item-30 at level 3: text: at the AMNH.
|
||||
item-31 at level 1: section: group WebVTT cue block
|
||||
item-32 at level 2: text: 00:24.000 --> 00:26.000
|
||||
item-33 at level 2: inline: group WebVTT cue voice span
|
||||
item-34 at level 3: text: Roger Bingham:
|
||||
item-35 at level 3: text: Thank you for walking down here.
|
||||
item-36 at level 1: section: group WebVTT cue block
|
||||
item-37 at level 2: text: 00:27.000 --> 00:30.000
|
||||
item-38 at level 2: inline: group WebVTT cue voice span
|
||||
item-39 at level 3: text: Roger Bingham:
|
||||
item-40 at level 3: text: And I want to do a follow-up on the last conversation we did.
|
||||
item-41 at level 1: section: group WebVTT cue block
|
||||
item-42 at level 2: text: 00:30.000 --> 00:31.500
|
||||
item-43 at level 2: inline: group WebVTT cue voice span
|
||||
item-44 at level 3: text: Roger Bingham:
|
||||
item-45 at level 3: text: When we e-mailed—
|
||||
item-46 at level 1: section: group WebVTT cue block
|
||||
item-47 at level 2: text: 00:30.500 --> 00:32.500
|
||||
item-48 at level 2: inline: group WebVTT cue voice span
|
||||
item-49 at level 3: text: Neil deGrasse Tyson:
|
||||
item-50 at level 3: text: Didn’t we talk about enough in that conversation?
|
||||
item-51 at level 1: section: group WebVTT cue block
|
||||
item-52 at level 2: text: 00:32.000 --> 00:35.500
|
||||
item-53 at level 2: inline: group WebVTT cue voice span
|
||||
item-54 at level 3: text: Roger Bingham:
|
||||
item-55 at level 3: text: No! No no no no; 'cos 'cos obviously 'cos
|
||||
item-56 at level 1: section: group WebVTT cue block
|
||||
item-57 at level 2: text: 00:32.500 --> 00:33.500
|
||||
item-58 at level 2: inline: group WebVTT cue voice span
|
||||
item-59 at level 3: text: Neil deGrasse Tyson:
|
||||
item-60 at level 3: text: Laughs
|
||||
item-61 at level 1: section: group WebVTT cue block
|
||||
item-62 at level 2: text: 00:35.500 --> 00:38.000
|
||||
item-63 at level 2: inline: group WebVTT cue voice span
|
||||
item-64 at level 3: text: Roger Bingham:
|
||||
item-65 at level 3: text: You know I’m so excited my glasses are falling off here.
|
||||
1074
tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
vendored
Normal file
1074
tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
vendored
Normal file
File diff suppressed because it is too large
Load Diff
51
tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
vendored
Normal file
51
tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
vendored
Normal file
@@ -0,0 +1,51 @@
|
||||
00:11.000 --> 00:13.000
|
||||
|
||||
Roger Bingham: We are in New York City
|
||||
|
||||
00:13.000 --> 00:16.000
|
||||
|
||||
Roger Bingham: We’re actually at the Lucern Hotel, just down the street
|
||||
|
||||
00:16.000 --> 00:18.000
|
||||
|
||||
Roger Bingham: from the American Museum of Natural History
|
||||
|
||||
00:18.000 --> 00:20.000
|
||||
|
||||
Roger Bingham: And with me is Neil deGrasse Tyson
|
||||
|
||||
00:20.000 --> 00:22.000
|
||||
|
||||
Roger Bingham: Astrophysicist, Director of the Hayden Planetarium
|
||||
|
||||
00:22.000 --> 00:24.000
|
||||
|
||||
Roger Bingham: at the AMNH.
|
||||
|
||||
00:24.000 --> 00:26.000
|
||||
|
||||
Roger Bingham: Thank you for walking down here.
|
||||
|
||||
00:27.000 --> 00:30.000
|
||||
|
||||
Roger Bingham: And I want to do a follow-up on the last conversation we did.
|
||||
|
||||
00:30.000 --> 00:31.500
|
||||
|
||||
Roger Bingham: When we e-mailed—
|
||||
|
||||
00:30.500 --> 00:32.500
|
||||
|
||||
Neil deGrasse Tyson: Didn’t we talk about enough in that conversation?
|
||||
|
||||
00:32.000 --> 00:35.500
|
||||
|
||||
Roger Bingham: No! No no no no; 'cos 'cos obviously 'cos
|
||||
|
||||
00:32.500 --> 00:33.500
|
||||
|
||||
Neil deGrasse Tyson: *Laughs*
|
||||
|
||||
00:35.500 --> 00:38.000
|
||||
|
||||
Roger Bingham: You know I’m so excited my glasses are falling off here.
|
||||
22
tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
vendored
Normal file
22
tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: section: group WebVTT cue block
|
||||
item-2 at level 2: text: 00:00.000 --> 00:02.000
|
||||
item-3 at level 2: inline: group WebVTT cue voice span
|
||||
item-4 at level 3: text: Esme (first, loud):
|
||||
item-5 at level 3: text: It’s a blue apple tree!
|
||||
item-6 at level 1: section: group WebVTT cue block
|
||||
item-7 at level 2: text: 00:02.000 --> 00:04.000
|
||||
item-8 at level 2: inline: group WebVTT cue voice span
|
||||
item-9 at level 3: text: Mary:
|
||||
item-10 at level 3: text: No way!
|
||||
item-11 at level 1: section: group WebVTT cue block
|
||||
item-12 at level 2: text: 00:04.000 --> 00:06.000
|
||||
item-13 at level 2: inline: group WebVTT cue voice span
|
||||
item-14 at level 3: text: Esme:
|
||||
item-15 at level 3: text: Hee!
|
||||
item-16 at level 2: text: laughter
|
||||
item-17 at level 1: section: group WebVTT cue block
|
||||
item-18 at level 2: text: 00:06.000 --> 00:08.000
|
||||
item-19 at level 2: inline: group WebVTT cue voice span
|
||||
item-20 at level 3: text: Mary (loud):
|
||||
item-21 at level 3: text: That’s awesome!
|
||||
376
tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
vendored
Normal file
376
tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
vendored
Normal file
@@ -0,0 +1,376 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"name": "webvtt_example_02",
|
||||
"origin": {
|
||||
"mimetype": "text/vtt",
|
||||
"binary_hash": 12867774546881601731,
|
||||
"filename": "webvtt_example_02.vtt"
|
||||
},
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"body": {
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/6"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [
|
||||
{
|
||||
"self_ref": "#/groups/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/1"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "WebVTT cue block",
|
||||
"label": "section"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/1",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/2"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "WebVTT cue voice span",
|
||||
"label": "inline"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/2",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/3"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/3"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "WebVTT cue block",
|
||||
"label": "section"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/3",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/4"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/5"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "WebVTT cue voice span",
|
||||
"label": "inline"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/4",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/6"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/5"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/9"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "WebVTT cue block",
|
||||
"label": "section"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/5",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/7"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/8"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "WebVTT cue voice span",
|
||||
"label": "inline"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/6",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/10"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/7"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "WebVTT cue block",
|
||||
"label": "section"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/7",
|
||||
"parent": {
|
||||
"$ref": "#/groups/6"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/11"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/12"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "WebVTT cue voice span",
|
||||
"label": "inline"
|
||||
}
|
||||
],
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "00:00.000 --> 00:02.000",
|
||||
"text": "00:00.000 --> 00:02.000"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Esme (first, loud): ",
|
||||
"text": "Esme (first, loud): "
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/2",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "It’s a blue apple tree!",
|
||||
"text": "It’s a blue apple tree!",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/3",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "00:02.000 --> 00:04.000",
|
||||
"text": "00:02.000 --> 00:04.000"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/4",
|
||||
"parent": {
|
||||
"$ref": "#/groups/3"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Mary: ",
|
||||
"text": "Mary: "
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/5",
|
||||
"parent": {
|
||||
"$ref": "#/groups/3"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "No way!",
|
||||
"text": "No way!",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/6",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "00:04.000 --> 00:06.000",
|
||||
"text": "00:04.000 --> 00:06.000"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/7",
|
||||
"parent": {
|
||||
"$ref": "#/groups/5"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Esme: ",
|
||||
"text": "Esme: "
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/8",
|
||||
"parent": {
|
||||
"$ref": "#/groups/5"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Hee!",
|
||||
"text": "Hee!",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/9",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "laughter",
|
||||
"text": "laughter",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": true,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/10",
|
||||
"parent": {
|
||||
"$ref": "#/groups/6"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "00:06.000 --> 00:08.000",
|
||||
"text": "00:06.000 --> 00:08.000"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/11",
|
||||
"parent": {
|
||||
"$ref": "#/groups/7"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Mary (loud): ",
|
||||
"text": "Mary (loud): "
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/12",
|
||||
"parent": {
|
||||
"$ref": "#/groups/7"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "That’s awesome!",
|
||||
"text": "That’s awesome!",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false,
|
||||
"script": "baseline"
|
||||
}
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"tables": [],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {}
|
||||
}
|
||||
17
tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
vendored
Normal file
17
tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
vendored
Normal file
@@ -0,0 +1,17 @@
|
||||
00:00.000 --> 00:02.000
|
||||
|
||||
Esme (first, loud): It’s a blue apple tree!
|
||||
|
||||
00:02.000 --> 00:04.000
|
||||
|
||||
Mary: No way!
|
||||
|
||||
00:04.000 --> 00:06.000
|
||||
|
||||
Esme: Hee!
|
||||
|
||||
*laughter*
|
||||
|
||||
00:06.000 --> 00:08.000
|
||||
|
||||
Mary (loud): That’s awesome!
|
||||
77
tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
vendored
Normal file
77
tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
vendored
Normal file
@@ -0,0 +1,77 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: section: group WebVTT cue block
|
||||
item-2 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
|
||||
item-3 at level 2: text: 00:00:04.963 --> 00:00:08.571
|
||||
item-4 at level 2: inline: group WebVTT cue voice span
|
||||
item-5 at level 3: text: Speaker A:
|
||||
item-6 at level 3: text: OK, I think now we should be recording
|
||||
item-7 at level 1: section: group WebVTT cue block
|
||||
item-8 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
|
||||
item-9 at level 2: text: 00:00:08.571 --> 00:00:09.403
|
||||
item-10 at level 2: inline: group WebVTT cue voice span
|
||||
item-11 at level 3: text: Speaker A:
|
||||
item-12 at level 3: text: properly.
|
||||
item-13 at level 1: section: group WebVTT cue block
|
||||
item-14 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
|
||||
item-15 at level 2: text: 00:00:10.683 --> 00:00:11.563
|
||||
item-16 at level 2: text: Good.
|
||||
item-17 at level 1: section: group WebVTT cue block
|
||||
item-18 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
|
||||
item-19 at level 2: text: 00:00:13.363 --> 00:00:13.803
|
||||
item-20 at level 2: inline: group WebVTT cue voice span
|
||||
item-21 at level 3: text: Speaker A:
|
||||
item-22 at level 3: text: Yeah.
|
||||
item-23 at level 1: section: group WebVTT cue block
|
||||
item-24 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
|
||||
item-25 at level 2: text: 00:00:49.603 --> 00:00:53.363
|
||||
item-26 at level 2: inline: group WebVTT cue voice span
|
||||
item-27 at level 3: text: Speaker B:
|
||||
item-28 at level 3: text: I was also thinking.
|
||||
item-29 at level 1: section: group WebVTT cue block
|
||||
item-30 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
|
||||
item-31 at level 2: text: 00:00:54.963 --> 00:01:02.072
|
||||
item-32 at level 2: inline: group WebVTT cue voice span
|
||||
item-33 at level 3: text: Speaker B:
|
||||
item-34 at level 3: text: Would be maybe good to create items,
|
||||
item-35 at level 1: section: group WebVTT cue block
|
||||
item-36 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
|
||||
item-37 at level 2: text: 00:01:02.072 --> 00:01:06.811
|
||||
item-38 at level 2: inline: group WebVTT cue voice span
|
||||
item-39 at level 3: text: Speaker B:
|
||||
item-40 at level 3: text: some metadata, some options that can be specific.
|
||||
item-41 at level 1: section: group WebVTT cue block
|
||||
item-42 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
|
||||
item-43 at level 2: text: 00:01:10.243 --> 00:01:13.014
|
||||
item-44 at level 2: inline: group WebVTT cue voice span
|
||||
item-45 at level 3: text: Speaker A:
|
||||
item-46 at level 3: text: Yeah, I mean I think you went even more than
|
||||
item-47 at level 1: section: group WebVTT cue block
|
||||
item-48 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
|
||||
item-49 at level 2: text: 00:01:10.563 --> 00:01:12.643
|
||||
item-50 at level 2: inline: group WebVTT cue voice span
|
||||
item-51 at level 3: text: Speaker B:
|
||||
item-52 at level 3: text: But we preserved the atoms.
|
||||
item-53 at level 1: section: group WebVTT cue block
|
||||
item-54 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
|
||||
item-55 at level 2: text: 00:01:13.014 --> 00:01:15.907
|
||||
item-56 at level 2: inline: group WebVTT cue voice span
|
||||
item-57 at level 3: text: Speaker A:
|
||||
item-58 at level 3: text: than me. I just opened the format.
|
||||
item-59 at level 1: section: group WebVTT cue block
|
||||
item-60 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
|
||||
item-61 at level 2: text: 00:01:50.222 --> 00:01:51.643
|
||||
item-62 at level 2: inline: group WebVTT cue voice span
|
||||
item-63 at level 3: text: Speaker A:
|
||||
item-64 at level 3: text: give it a try, yeah.
|
||||
item-65 at level 1: section: group WebVTT cue block
|
||||
item-66 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
|
||||
item-67 at level 2: text: 00:01:52.043 --> 00:01:55.043
|
||||
item-68 at level 2: inline: group WebVTT cue voice span
|
||||
item-69 at level 3: text: Speaker B:
|
||||
item-70 at level 3: text: Okay, talk to you later.
|
||||
item-71 at level 1: section: group WebVTT cue block
|
||||
item-72 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
|
||||
item-73 at level 2: text: 00:01:54.603 --> 00:01:55.283
|
||||
item-74 at level 2: inline: group WebVTT cue voice span
|
||||
item-75 at level 3: text: Speaker A:
|
||||
item-76 at level 3: text: See you.
|
||||
1240
tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
vendored
Normal file
1240
tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
vendored
Normal file
File diff suppressed because it is too large
Load Diff
77
tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
vendored
Normal file
77
tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
vendored
Normal file
@@ -0,0 +1,77 @@
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
|
||||
|
||||
00:00:04.963 --> 00:00:08.571
|
||||
|
||||
Speaker A: OK, I think now we should be recording
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
|
||||
|
||||
00:00:08.571 --> 00:00:09.403
|
||||
|
||||
Speaker A: properly.
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
|
||||
|
||||
00:00:10.683 --> 00:00:11.563
|
||||
|
||||
Good.
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
|
||||
|
||||
00:00:13.363 --> 00:00:13.803
|
||||
|
||||
Speaker A: Yeah.
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
|
||||
|
||||
00:00:49.603 --> 00:00:53.363
|
||||
|
||||
Speaker B: I was also thinking.
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
|
||||
|
||||
00:00:54.963 --> 00:01:02.072
|
||||
|
||||
Speaker B: Would be maybe good to create items,
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
|
||||
|
||||
00:01:02.072 --> 00:01:06.811
|
||||
|
||||
Speaker B: some metadata, some options that can be specific.
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
|
||||
|
||||
00:01:10.243 --> 00:01:13.014
|
||||
|
||||
Speaker A: Yeah, I mean I think you went even more than
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
|
||||
|
||||
00:01:10.563 --> 00:01:12.643
|
||||
|
||||
Speaker B: But we preserved the atoms.
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
|
||||
|
||||
00:01:13.014 --> 00:01:15.907
|
||||
|
||||
Speaker A: than me. I just opened the format.
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
|
||||
|
||||
00:01:50.222 --> 00:01:51.643
|
||||
|
||||
Speaker A: give it a try, yeah.
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
|
||||
|
||||
00:01:52.043 --> 00:01:55.043
|
||||
|
||||
Speaker B: Okay, talk to you later.
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
|
||||
|
||||
00:01:54.603 --> 00:01:55.283
|
||||
|
||||
Speaker A: See you.
|
||||
@@ -1,16 +1,16 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: paragraph: Transcript
|
||||
item-2 at level 1: paragraph: February 20, 2025, 8:32PM
|
||||
item-1 at level 1: text: Transcript
|
||||
item-2 at level 1: text: February 20, 2025, 8:32PM
|
||||
item-3 at level 1: picture
|
||||
item-4 at level 1: inline: group group
|
||||
item-5 at level 2: paragraph: This is test 1
|
||||
item-6 at level 2: paragraph: 0:08
|
||||
item-5 at level 2: text: This is test 1
|
||||
item-6 at level 2: text: 0:08
|
||||
Correct, he is not.
|
||||
item-7 at level 1: paragraph:
|
||||
item-7 at level 1: text:
|
||||
item-8 at level 1: picture
|
||||
item-9 at level 1: inline: group group
|
||||
item-10 at level 2: paragraph: This is test 2
|
||||
item-11 at level 2: paragraph: 0:16
|
||||
item-10 at level 2: text: This is test 2
|
||||
item-11 at level 2: text: 0:16
|
||||
Yeah, exactly.
|
||||
item-12 at level 1: paragraph:
|
||||
item-13 at level 1: paragraph:
|
||||
item-12 at level 1: text:
|
||||
item-13 at level 1: text:
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "word_image_anchors",
|
||||
"origin": {
|
||||
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
@@ -93,7 +93,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Transcript",
|
||||
"text": "Transcript",
|
||||
@@ -112,7 +112,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "February 20, 2025, 8:32PM",
|
||||
"text": "February 20, 2025, 8:32PM",
|
||||
@@ -131,7 +131,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "This is test 1",
|
||||
"text": "This is test 1",
|
||||
@@ -150,7 +150,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "0:08\nCorrect, he is not.",
|
||||
"text": "0:08\nCorrect, he is not.",
|
||||
@@ -169,7 +169,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -181,7 +181,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "This is test 2",
|
||||
"text": "This is test 2",
|
||||
@@ -200,7 +200,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "0:16\nYeah, exactly.",
|
||||
"text": "0:16\nYeah, exactly.",
|
||||
@@ -219,7 +219,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -231,7 +231,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
|
||||
@@ -1,28 +1,28 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: paragraph: Summer activities
|
||||
item-1 at level 1: text: Summer activities
|
||||
item-2 at level 1: title: Swimming in the lake
|
||||
item-3 at level 2: paragraph: Duck
|
||||
item-3 at level 2: text: Duck
|
||||
item-4 at level 2: picture
|
||||
item-5 at level 2: paragraph: Figure 1: This is a cute duckling
|
||||
item-5 at level 2: text: Figure 1: This is a cute duckling
|
||||
item-6 at level 2: section_header: Let’s swim!
|
||||
item-7 at level 3: paragraph: To get started with swimming, fi ... down in a water and try not to drown:
|
||||
item-7 at level 3: text: To get started with swimming, fi ... down in a water and try not to drown:
|
||||
item-8 at level 3: list: group list
|
||||
item-9 at level 4: list_item: You can relax and look around
|
||||
item-10 at level 4: list_item: Paddle about
|
||||
item-11 at level 4: list_item: Enjoy summer warmth
|
||||
item-12 at level 3: paragraph: Also, don’t forget:
|
||||
item-12 at level 3: text: Also, don’t forget:
|
||||
item-13 at level 3: list: group list
|
||||
item-14 at level 4: list_item: Wear sunglasses
|
||||
item-15 at level 4: list_item: Don’t forget to drink water
|
||||
item-16 at level 4: list_item: Use sun cream
|
||||
item-17 at level 3: paragraph: Hmm, what else…
|
||||
item-17 at level 3: text: Hmm, what else…
|
||||
item-18 at level 3: section_header: Let’s eat
|
||||
item-19 at level 4: paragraph: After we had a good day of swimm ... , it’s important to eat something nice
|
||||
item-20 at level 4: paragraph: I like to eat leaves
|
||||
item-21 at level 4: paragraph: Here are some interesting things a respectful duck could eat:
|
||||
item-19 at level 4: text: After we had a good day of swimm ... , it’s important to eat something nice
|
||||
item-20 at level 4: text: I like to eat leaves
|
||||
item-21 at level 4: text: Here are some interesting things a respectful duck could eat:
|
||||
item-22 at level 4: table with [4x3]
|
||||
item-23 at level 4: paragraph:
|
||||
item-24 at level 4: paragraph: And let’s add another list in the end:
|
||||
item-23 at level 4: text:
|
||||
item-24 at level 4: text: And let’s add another list in the end:
|
||||
item-25 at level 4: list: group list
|
||||
item-26 at level 5: list_item: Leaves
|
||||
item-27 at level 5: list_item: Berries
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.6.0",
|
||||
"version": "1.7.0",
|
||||
"name": "word_sample",
|
||||
"origin": {
|
||||
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
@@ -98,7 +98,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Summer activities",
|
||||
"text": "Summer activities",
|
||||
@@ -142,7 +142,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Duck",
|
||||
"text": "Duck",
|
||||
@@ -161,7 +161,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Figure 1: This is a cute duckling",
|
||||
"text": "Figure 1: This is a cute duckling",
|
||||
@@ -212,7 +212,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "To get started with swimming, first lay down in a water and try not to drown:",
|
||||
"text": "To get started with swimming, first lay down in a water and try not to drown:",
|
||||
@@ -294,7 +294,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Also, don’t forget:",
|
||||
"text": "Also, don’t forget:",
|
||||
@@ -376,7 +376,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Hmm, what else…",
|
||||
"text": "Hmm, what else…",
|
||||
@@ -430,7 +430,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "After we had a good day of swimming in the lake, it’s important to eat something nice",
|
||||
"text": "After we had a good day of swimming in the lake, it’s important to eat something nice",
|
||||
@@ -449,7 +449,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "I like to eat leaves",
|
||||
"text": "I like to eat leaves",
|
||||
@@ -468,7 +468,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Here are some interesting things a respectful duck could eat:",
|
||||
"text": "Here are some interesting things a respectful duck could eat:",
|
||||
@@ -487,7 +487,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
@@ -499,7 +499,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "And let’s add another list in the end:",
|
||||
"text": "And let’s add another list in the end:",
|
||||
@@ -625,7 +625,8 @@
|
||||
"text": "",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -637,7 +638,8 @@
|
||||
"text": "Food",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -649,7 +651,8 @@
|
||||
"text": "Calories per portion",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -661,7 +664,8 @@
|
||||
"text": "Leaves",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -673,7 +677,8 @@
|
||||
"text": "Ash, Elm, Maple",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -685,7 +690,8 @@
|
||||
"text": "50",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -697,7 +703,8 @@
|
||||
"text": "Berries",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -709,7 +716,8 @@
|
||||
"text": "Blueberry, Strawberry, Cranberry",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -721,7 +729,8 @@
|
||||
"text": "150",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -733,7 +742,8 @@
|
||||
"text": "Grain",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -745,7 +755,8 @@
|
||||
"text": "Corn, Buckwheat, Barley",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -757,7 +768,8 @@
|
||||
"text": "200",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
"num_rows": 4,
|
||||
@@ -774,7 +786,8 @@
|
||||
"text": "",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -786,7 +799,8 @@
|
||||
"text": "Food",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -798,7 +812,8 @@
|
||||
"text": "Calories per portion",
|
||||
"column_header": true,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
@@ -812,7 +827,8 @@
|
||||
"text": "Leaves",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -824,7 +840,8 @@
|
||||
"text": "Ash, Elm, Maple",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -836,7 +853,8 @@
|
||||
"text": "50",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
@@ -850,7 +868,8 @@
|
||||
"text": "Berries",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -862,7 +881,8 @@
|
||||
"text": "Blueberry, Strawberry, Cranberry",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -874,7 +894,8 @@
|
||||
"text": "150",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
],
|
||||
[
|
||||
@@ -888,7 +909,8 @@
|
||||
"text": "Grain",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -900,7 +922,8 @@
|
||||
"text": "Corn, Buckwheat, Barley",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
},
|
||||
{
|
||||
"row_span": 1,
|
||||
@@ -912,7 +935,8 @@
|
||||
"text": "200",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
"row_section": false,
|
||||
"fillable": false
|
||||
}
|
||||
]
|
||||
]
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: section: group header-0
|
||||
item-2 at level 2: section_header: Test with tables
|
||||
item-3 at level 3: paragraph: A uniform table
|
||||
item-3 at level 3: text: A uniform table
|
||||
item-4 at level 3: table with [3x3]
|
||||
item-5 at level 3: paragraph:
|
||||
item-6 at level 3: paragraph: A non-uniform table with horizontal spans
|
||||
item-5 at level 3: text:
|
||||
item-6 at level 3: text: A non-uniform table with horizontal spans
|
||||
item-7 at level 3: table with [3x3]
|
||||
item-8 at level 3: paragraph:
|
||||
item-9 at level 3: paragraph: A non-uniform table with horizontal spans in inner columns
|
||||
item-8 at level 3: text:
|
||||
item-9 at level 3: text: A non-uniform table with horizontal spans in inner columns
|
||||
item-10 at level 3: table with [3x4]
|
||||
item-11 at level 3: paragraph:
|
||||
item-12 at level 3: paragraph: A non-uniform table with vertical spans
|
||||
item-11 at level 3: text:
|
||||
item-12 at level 3: text: A non-uniform table with vertical spans
|
||||
item-13 at level 3: table with [5x3]
|
||||
item-14 at level 3: paragraph:
|
||||
item-15 at level 3: paragraph: A non-uniform table with all kinds of spans and empty cells
|
||||
item-14 at level 3: text:
|
||||
item-15 at level 3: text: A non-uniform table with all kinds of spans and empty cells
|
||||
item-16 at level 3: table with [9x5]
|
||||
item-17 at level 3: paragraph:
|
||||
item-18 at level 3: paragraph:
|
||||
item-17 at level 3: text:
|
||||
item-18 at level 3: text:
|
||||
File diff suppressed because it is too large
Load Diff
33
tests/data/md/escaped_characters.md
vendored
Normal file
33
tests/data/md/escaped_characters.md
vendored
Normal file
@@ -0,0 +1,33 @@
|
||||
# Headers:
|
||||
## & < > " '
|
||||
|
||||
Text:
|
||||
00:16.000 ----> 00:18.000
|
||||
& < > " '
|
||||
|
||||
# Lists
|
||||
1. & < > " '
|
||||
- & < > " '
|
||||
|
||||
# Inline code
|
||||
`& < > " ' `
|
||||
|
||||
# Code block
|
||||
```
|
||||
& < > " '
|
||||
```
|
||||
|
||||
# Table
|
||||
| Key | Example |
|
||||
| ------------------- | ----------------- |
|
||||
| Ampersand | & |
|
||||
| Less-than | < |
|
||||
| Greater-than | > |
|
||||
| Quotes | " |
|
||||
| Apostrophes | ' |
|
||||
|
||||
# Raw HTML
|
||||
<div title="">& < > " '/div>
|
||||
|
||||
## Link
|
||||
[& < > " '](https://en.wikipedia.org/wiki/Albert_Einstein)
|
||||
42
tests/data/webvtt/webvtt_example_01.vtt
vendored
Normal file
42
tests/data/webvtt/webvtt_example_01.vtt
vendored
Normal file
@@ -0,0 +1,42 @@
|
||||
WEBVTT
|
||||
|
||||
NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
|
||||
|
||||
00:11.000 --> 00:13.000
|
||||
<v Roger Bingham>We are in New York City
|
||||
|
||||
00:13.000 --> 00:16.000
|
||||
<v Roger Bingham>We’re actually at the Lucern Hotel, just down the street
|
||||
|
||||
00:16.000 --> 00:18.000
|
||||
<v Roger Bingham>from the American Museum of Natural History
|
||||
|
||||
00:18.000 --> 00:20.000
|
||||
<v Roger Bingham>And with me is Neil deGrasse Tyson
|
||||
|
||||
00:20.000 --> 00:22.000
|
||||
<v Roger Bingham>Astrophysicist, Director of the Hayden Planetarium
|
||||
|
||||
00:22.000 --> 00:24.000
|
||||
<v Roger Bingham>at the AMNH.
|
||||
|
||||
00:24.000 --> 00:26.000
|
||||
<v Roger Bingham>Thank you for walking down here.
|
||||
|
||||
00:27.000 --> 00:30.000
|
||||
<v Roger Bingham>And I want to do a follow-up on the last conversation we did.
|
||||
|
||||
00:30.000 --> 00:31.500 align:right size:50%
|
||||
<v Roger Bingham>When we e-mailed—
|
||||
|
||||
00:30.500 --> 00:32.500 align:left size:50%
|
||||
<v Neil deGrasse Tyson>Didn’t we talk about enough in that conversation?
|
||||
|
||||
00:32.000 --> 00:35.500 align:right size:50%
|
||||
<v Roger Bingham>No! No no no no; 'cos 'cos obviously 'cos
|
||||
|
||||
00:32.500 --> 00:33.500 align:left size:50%
|
||||
<v Neil deGrasse Tyson><i>Laughs</i>
|
||||
|
||||
00:35.500 --> 00:38.000
|
||||
<v Roger Bingham>You know I’m so excited my glasses are falling off here.
|
||||
15
tests/data/webvtt/webvtt_example_02.vtt
vendored
Normal file
15
tests/data/webvtt/webvtt_example_02.vtt
vendored
Normal file
@@ -0,0 +1,15 @@
|
||||
WEBVTT
|
||||
|
||||
NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
|
||||
|
||||
00:00.000 --> 00:02.000
|
||||
<v.first.loud Esme>It’s a blue apple tree!
|
||||
|
||||
00:02.000 --> 00:04.000
|
||||
<v Mary>No way!
|
||||
|
||||
00:04.000 --> 00:06.000
|
||||
<v Esme>Hee!</v> <i>laughter</i>
|
||||
|
||||
00:06.000 --> 00:08.000
|
||||
<v.loud Mary>That’s awesome!
|
||||
57
tests/data/webvtt/webvtt_example_03.vtt
vendored
Normal file
57
tests/data/webvtt/webvtt_example_03.vtt
vendored
Normal file
@@ -0,0 +1,57 @@
|
||||
WEBVTT
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
|
||||
00:00:04.963 --> 00:00:08.571
|
||||
<v Speaker A>OK,
|
||||
I think now we should be recording</v>
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
|
||||
00:00:08.571 --> 00:00:09.403
|
||||
<v Speaker A>properly.</v>
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
|
||||
00:00:10.683 --> 00:00:11.563
|
||||
Good.
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
|
||||
00:00:13.363 --> 00:00:13.803
|
||||
<v Speaker A>Yeah.</v>
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
|
||||
00:00:49.603 --> 00:00:53.363
|
||||
<v Speaker B>I was also thinking.</v>
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
|
||||
00:00:54.963 --> 00:01:02.072
|
||||
<v Speaker B>Would be maybe good to create items,</v>
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
|
||||
00:01:02.072 --> 00:01:06.811
|
||||
<v Speaker B>some metadata,
|
||||
some options that can be specific.</v>
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
|
||||
00:01:10.243 --> 00:01:13.014
|
||||
<v Speaker A>Yeah,
|
||||
I mean I think you went even more than</v>
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
|
||||
00:01:10.563 --> 00:01:12.643
|
||||
<v Speaker B>But we preserved the atoms.</v>
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
|
||||
00:01:13.014 --> 00:01:15.907
|
||||
<v Speaker A>than me.
|
||||
I just opened the format.</v>
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
|
||||
00:01:50.222 --> 00:01:51.643
|
||||
<v Speaker A>give it a try, yeah.</v>
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
|
||||
00:01:52.043 --> 00:01:55.043
|
||||
<v Speaker B>Okay, talk to you later.</v>
|
||||
|
||||
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
|
||||
00:01:54.603 --> 00:01:55.283
|
||||
<v Speaker A>See you.</v>
|
||||
@@ -26,10 +26,12 @@ def test_convert_valid():
|
||||
assert len(relevant_paths) > 0
|
||||
|
||||
yaml_filter = ["inline_and_formatting", "mixed_without_h1"]
|
||||
json_filter = ["escaped_characters"]
|
||||
|
||||
for in_path in relevant_paths:
|
||||
md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
|
||||
yaml_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.yaml"
|
||||
json_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.json"
|
||||
|
||||
in_doc = InputDocument(
|
||||
path_or_stream=in_path,
|
||||
@@ -45,6 +47,9 @@ def test_convert_valid():
|
||||
act_doc = backend.convert()
|
||||
act_data = act_doc.export_to_markdown()
|
||||
|
||||
if in_path.stem in json_filter:
|
||||
assert verify_document(act_doc, json_gt_path, GENERATE), "export to json"
|
||||
|
||||
if GEN_TEST_DATA:
|
||||
with open(md_gt_path, mode="w", encoding="utf-8") as f:
|
||||
f.write(f"{act_data}\n")
|
||||
|
||||
232
tests/test_backend_vtt.py
Normal file
232
tests/test_backend_vtt.py
Normal file
@@ -0,0 +1,232 @@
|
||||
# Assisted by watsonx Code Assistant
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from docling_core.types.doc import DoclingDocument
|
||||
from pydantic import ValidationError
|
||||
|
||||
from docling.backend.webvtt_backend import (
|
||||
_WebVTTCueItalicSpan,
|
||||
_WebVTTCueTextSpan,
|
||||
_WebVTTCueTimings,
|
||||
_WebVTTCueVoiceSpan,
|
||||
_WebVTTFile,
|
||||
_WebVTTTimestamp,
|
||||
)
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
from .test_data_gen_flag import GEN_TEST_DATA
|
||||
from .verify_utils import verify_document, verify_export
|
||||
|
||||
# Local alias of the imported GEN_TEST_DATA flag; the tests in this module hand
# it to verify_export / verify_document as their `generate` argument.
GENERATE = GEN_TEST_DATA
|
||||
|
||||
|
||||
def test_vtt_cue_commponents():
    """Exercise the WebVTT cue building blocks: timestamps, timings and spans."""
    # Valid timestamps, each paired with the total number of seconds it encodes.
    timestamp_cases = [
        ("00:01:02.345", 1 * 60 + 2.345),
        ("12:34:56.789", 12 * 3600 + 34 * 60 + 56.789),
        ("02:34.567", 2 * 60 + 34.567),
        ("00:00:00.000", 0.0),
    ]
    for raw, expected_seconds in timestamp_cases:
        parsed = _WebVTTTimestamp(raw=raw)
        assert parsed.seconds == expected_seconds

    # Malformed timestamps must fail validation.
    rejected_timestamps = (
        "00:60:02.345",  # minutes > 59
        "00:01:60.345",  # seconds > 59
        "00:01:02.1000",  # milliseconds > 999
        "01:02:03",  # missing milliseconds
        "01:02",  # missing milliseconds
        ":01:02.345",  # extra : for missing hours
        "abc:01:02.345",  # invalid format
    )
    for raw in rejected_timestamps:
        with pytest.raises(ValidationError):
            _WebVTTTimestamp(raw=raw)

    # The string form of a timestamp is its raw text.
    parsed = _WebVTTTimestamp(raw="00:01:02.345")
    assert str(parsed) == "00:01:02.345"

    # Well-formed cue timings keep both endpoints and render as "start --> end".
    begin = _WebVTTTimestamp(raw="00:10.005")
    finish = _WebVTTTimestamp(raw="00:14.007")
    timings = _WebVTTCueTimings(start=begin, end=finish)
    assert timings.start == begin
    assert timings.end == finish
    assert str(timings) == "00:10.005 --> 00:14.007"

    # An end timestamp that lies before the start is rejected.
    begin = _WebVTTTimestamp(raw="00:10.700")
    finish = _WebVTTTimestamp(raw="00:10.500")
    with pytest.raises(ValidationError) as excinfo:
        _WebVTTCueTimings(start=begin, end=finish)
    assert "End timestamp must be greater than start timestamp" in str(excinfo.value)

    # Both endpoints are mandatory: first omit the end ...
    begin = _WebVTTTimestamp(raw="00:10.500")
    with pytest.raises(ValidationError) as excinfo:
        _WebVTTCueTimings(start=begin)
    assert "Field required" in str(excinfo.value)

    # ... then omit the start.
    finish = _WebVTTTimestamp(raw="00:10.500")
    with pytest.raises(ValidationError) as excinfo:
        _WebVTTCueTimings(end=finish)
    assert "Field required" in str(excinfo.value)

    # A plain text span stores its text and renders it unchanged.
    plain_text = "This is a valid cue text span."
    text_span = _WebVTTCueTextSpan(text=plain_text)
    assert text_span.text == plain_text
    assert str(text_span) == plain_text

    # Text spans reject newlines, ampersands, less-than signs and empty text.
    rejected_texts = (
        "This cue text span\ncontains a newline.",
        "This cue text span contains &.",
        "This cue text span contains <.",
        "",
    )
    for bad_text in rejected_texts:
        with pytest.raises(ValidationError):
            _WebVTTCueTextSpan(text=bad_text)

    # Voice span annotation: a newline is invalid, a simple token is accepted.
    with pytest.raises(ValidationError):
        _WebVTTCueVoiceSpan(annotation="invalid\nannotation")
    assert _WebVTTCueVoiceSpan(annotation="valid-annotation")

    # Voice span classes: newlines and empty class names are invalid.
    speaker = "speaker name"
    with pytest.raises(ValidationError):
        _WebVTTCueVoiceSpan(
            annotation=speaker, classes=["class\nwith\nnewlines", ""]
        )
    assert _WebVTTCueVoiceSpan(annotation=speaker, classes=["class1", "class2"])

    # Voice span components: only cue component objects are accepted.
    speaker = "speaker name"
    with pytest.raises(ValidationError):
        _WebVTTCueVoiceSpan(annotation=speaker, components=[123, "not a component"])
    assert _WebVTTCueVoiceSpan(
        annotation=speaker, components=[_WebVTTCueTextSpan(text="random text")]
    )

    # String rendering of a voice span with classes ...
    voice_span = _WebVTTCueVoiceSpan(
        annotation="speaker",
        classes=["loud", "clear"],
        components=[_WebVTTCueTextSpan(text="random text")],
    )
    assert str(voice_span) == "<v.loud.clear speaker>random text</v>"

    # ... and without classes.
    voice_span = _WebVTTCueVoiceSpan(
        annotation="speaker",
        components=[_WebVTTCueTextSpan(text="random text")],
    )
    assert str(voice_span) == "<v speaker>random text</v>"
|
||||
|
||||
|
||||
def test_webvtt_file():
    """Parse the three sample WebVTT files and check the resulting cue blocks.

    * ``webvtt_example_01.vtt``: 13 cues; cue 11 is a voice span wrapping an
      italic span wrapping the text "Laughs".
    * ``webvtt_example_02.vtt``: 4 cues; re-serialising the cue blocks after the
      header and NOTE block must reproduce the file content exactly.
    * ``webvtt_example_03.vtt``: 13 cues, each carrying an identifier; cue 0 is
      a voice span and cue 2 is a bare text span.

    The paths are relative, so the test expects to run from the repository root.
    """
    with open("./tests/data/webvtt/webvtt_example_01.vtt", encoding="utf-8") as f:
        content = f.read()
    vtt = _WebVTTFile.parse(content)
    assert len(vtt) == 13
    block = vtt.cue_blocks[11]
    assert str(block.timings) == "00:32.500 --> 00:33.500"
    assert len(block.payload) == 1
    cue_span = block.payload[0]
    assert isinstance(cue_span, _WebVTTCueVoiceSpan)
    assert cue_span.annotation == "Neil deGrasse Tyson"
    assert not cue_span.classes
    assert len(cue_span.components) == 1
    comp = cue_span.components[0]
    assert isinstance(comp, _WebVTTCueItalicSpan)
    assert len(comp.components) == 1
    comp2 = comp.components[0]
    assert isinstance(comp2, _WebVTTCueTextSpan)
    assert comp2.text == "Laughs"

    with open("./tests/data/webvtt/webvtt_example_02.vtt", encoding="utf-8") as f:
        content = f.read()
    vtt = _WebVTTFile.parse(content)
    assert len(vtt) == 4
    # Round trip: header + NOTE block followed by the serialised cue blocks
    # must equal the original file content.
    reverse = (
        "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
        "https://www.w3.org/TR/webvtt1/\n\n"
    )
    reverse += "\n\n".join([str(block) for block in vtt.cue_blocks])
    assert content == reverse

    with open("./tests/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f:
        content = f.read()
    vtt = _WebVTTFile.parse(content)
    assert len(vtt) == 13
    # Every cue in this file has an identifier line.
    for block in vtt:
        assert block.identifier
    block = vtt.cue_blocks[0]
    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
    assert str(block.timings) == "00:00:04.963 --> 00:00:08.571"
    assert len(block.payload) == 1
    assert isinstance(block.payload[0], _WebVTTCueVoiceSpan)
    # Cue 2 has no <v> tag. The previous version re-asserted the stale
    # `cue_span` from the first file here, which checked nothing about this
    # block; the payload type is verified directly below instead.
    block = vtt.cue_blocks[2]
    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
    assert str(block.timings) == "00:00:10.683 --> 00:00:11.563"
    assert len(block.payload) == 1
    assert isinstance(block.payload[0], _WebVTTCueTextSpan)
    assert block.payload[0].text == "Good."
|
||||
|
||||
|
||||
def test_e2e_vtt_conversions():
    """Convert every sample ``.vtt`` file and compare against the ground truth.

    For each file the Markdown export, the indented-text export and the
    document itself are checked against the matching ``.md``, ``.itxt`` and
    ``.json`` files under ``groundtruth/docling_v2``.
    """
    data_dir = Path("./tests/data/webvtt/")
    sample_files = sorted(data_dir.rglob("*.vtt"))
    converter = DocumentConverter(allowed_formats=[InputFormat.VTT])

    for sample in sample_files:
        # Ground-truth files share the sample's file name plus an extra suffix.
        gt_prefix = str(sample.parent.parent / "groundtruth" / "docling_v2" / sample.name)

        result: ConversionResult = converter.convert(sample)
        document: DoclingDocument = result.document

        markdown: str = document.export_to_markdown(escape_html=False)
        md_ok = verify_export(markdown, gt_prefix + ".md", generate=GENERATE)
        assert md_ok, "export to md"

        indented: str = document._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        itxt_ok = verify_export(indented, gt_prefix + ".itxt", generate=GENERATE)
        assert itxt_ok, "export to indented-text"

        assert verify_document(document, gt_prefix + ".json", GENERATE)
|
||||
@@ -206,6 +206,11 @@ def test_guess_format(tmp_path):
|
||||
doc_path.write_text("xyz", encoding="utf-8")
|
||||
assert dci._guess_format(doc_path) is None
|
||||
|
||||
# Valid WebVTT
|
||||
buf = BytesIO(Path("./tests/data/webvtt/webvtt_example_01.vtt").open("rb").read())
|
||||
stream = DocumentStream(name="webvtt_example_01.vtt", stream=buf)
|
||||
assert dci._guess_format(stream) == InputFormat.VTT
|
||||
|
||||
# Valid Docling JSON
|
||||
test_str = '{"name": ""}'
|
||||
stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode()))
|
||||
|
||||
10
uv.lock
generated
10
uv.lock
generated
@@ -1049,7 +1049,7 @@ wheels = [
|
||||
|
||||
[[package]]
|
||||
name = "docling"
|
||||
version = "2.53.0"
|
||||
version = "2.54.0"
|
||||
source = { editable = "." }
|
||||
dependencies = [
|
||||
{ name = "accelerate" },
|
||||
@@ -1154,7 +1154,7 @@ requires-dist = [
|
||||
{ name = "accelerate", marker = "extra == 'vlm'", specifier = ">=1.2.1,<2.0.0" },
|
||||
{ name = "beautifulsoup4", specifier = ">=4.12.3,<5.0.0" },
|
||||
{ name = "certifi", specifier = ">=2024.7.4" },
|
||||
{ name = "docling-core", extras = ["chunking"], specifier = ">=2.48.0,<3.0.0" },
|
||||
{ name = "docling-core", extras = ["chunking"], specifier = ">=2.48.2,<3.0.0" },
|
||||
{ name = "docling-ibm-models", specifier = ">=3.9.1,<4" },
|
||||
{ name = "docling-parse", specifier = ">=4.4.0,<5.0.0" },
|
||||
{ name = "easyocr", specifier = ">=1.7,<2.0" },
|
||||
@@ -1233,7 +1233,7 @@ examples = [
|
||||
|
||||
[[package]]
|
||||
name = "docling-core"
|
||||
version = "2.48.1"
|
||||
version = "2.48.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "jsonref" },
|
||||
@@ -1247,9 +1247,9 @@ dependencies = [
|
||||
{ name = "typer" },
|
||||
{ name = "typing-extensions" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/f9/0c/dce7f80e99e56570d143885fc40536107e8a39ef4de2888959e055b39607/docling_core-2.48.1.tar.gz", hash = "sha256:48cb77575dfd020a51413957e96b165e45f6d1027c641710fddb389dcb9b189c", size = 161311, upload-time = "2025-09-11T12:33:22.46Z" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/dd/e6/922de61f2a7b7d337ffc781f8e85f5581b12801fe193827066ccd6c5ba04/docling_core-2.48.2.tar.gz", hash = "sha256:01c12a1d3c9877c6658d0d6adf5cdcefd56cb814d8083860ba2d77ab882ac2d0", size = 161344, upload-time = "2025-09-22T08:39:41.431Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/90/fe/1b96120c9d94c97016716ccf46ad2708a2e76157e52dfcca4101db70fc21/docling_core-2.48.1-py3-none-any.whl", hash = "sha256:a3985999ac2067e15e589ef0f11ccde264deacaea403c0f94049242f10a6189a", size = 164330, upload-time = "2025-09-11T12:33:20.935Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/97/bc/a77739cc31d7de2be9d6682f880761083a2038355e513e813a73a041c644/docling_core-2.48.2-py3-none-any.whl", hash = "sha256:d1f2fe9be9a9f7e7a2fb6ddcc9d9fcbf437bfb02e0c6005cdec1ece1cf4aed44", size = 164376, upload-time = "2025-09-22T08:39:39.704Z" },
|
||||
]
|
||||
|
||||
[package.optional-dependencies]
|
||||
|
||||
Reference in New Issue
Block a user