This commit is contained in:
utsavMongoDB
2025-09-29 12:56:11 +05:30
59 changed files with 5859 additions and 828 deletions

View File

@@ -1,3 +1,19 @@
## [v2.54.0](https://github.com/docling-project/docling/releases/tag/v2.54.0) - 2025-09-22
### Feature
* Rich tables for MSWord backend ([#2291](https://github.com/docling-project/docling/issues/2291)) ([`e2482a2`](https://github.com/docling-project/docling/commit/e2482a2ada52b2b8a41c4402b27e125adbe4385f))
* Add a backend parser for WebVTT files ([#2288](https://github.com/docling-project/docling/issues/2288)) ([`46efaae`](https://github.com/docling-project/docling/commit/46efaaefee17a6b83e02a050f9f3c8a51afbbd53))
### Fix
* Correct y-axis scaling in draw_table_cells ([#2287](https://github.com/docling-project/docling/issues/2287)) ([`b5628f1`](https://github.com/docling-project/docling/commit/b5628f12273297d9db1393f4b734cfa337caa8c9))
### Documentation
* Update API VLM example with granite-docling ([#2294](https://github.com/docling-project/docling/issues/2294)) ([`8b7e83a`](https://github.com/docling-project/docling/commit/8b7e83a8c7b9e333c31d5ae0b96213e3c70c6bf3))
* Fix examples rendering ([#2281](https://github.com/docling-project/docling/issues/2281)) ([`8322c2e`](https://github.com/docling-project/docling/commit/8322c2ea9b4fbb1625bcbf1ec1b3dea6c1cd3ed0))
## [v2.53.0](https://github.com/docling-project/docling/releases/tag/v2.53.0) - 2025-09-17 ## [v2.53.0](https://github.com/docling-project/docling/releases/tag/v2.53.0) - 2025-09-17
### Feature ### Feature

View File

@@ -29,7 +29,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
## Features ## Features
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
@@ -45,13 +45,13 @@ Docling simplifies document processing, parsing diverse formats — including ad
* 📤 Structured [information extraction][extraction] \[🧪 beta\] * 📤 Structured [information extraction][extraction] \[🧪 beta\]
* 📑 New layout model (**Heron**) by default, for faster PDF parsing * 📑 New layout model (**Heron**) by default, for faster PDF parsing
* 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications * 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
* 💬 Parsing of Web Video Text Tracks (WebVTT) files
### Coming soon ### Coming soon
* 📝 Metadata extraction, including title, authors, references & language * 📝 Metadata extraction, including title, authors, references & language
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc) * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
* 📝 Complex chemistry understanding (Molecular structures) * 📝 Complex chemistry understanding (Molecular structures)
* 📝 Parsing of Web Video Text Tracks (WebVTT) files
## Installation ## Installation

View File

@@ -3,6 +3,7 @@ import re
import warnings import warnings
from copy import deepcopy from copy import deepcopy
from enum import Enum from enum import Enum
from html import unescape
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Literal, Optional, Union, cast from typing import Literal, Optional, Union, cast
@@ -321,9 +322,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
fig_caption: Optional[TextItem] = None fig_caption: Optional[TextItem] = None
if element.title is not None and element.title != "": if element.title is not None and element.title != "":
title = unescape(element.title)
fig_caption = doc.add_text( fig_caption = doc.add_text(
label=DocItemLabel.CAPTION, label=DocItemLabel.CAPTION,
text=element.title, text=title,
formatting=formatting, formatting=formatting,
hyperlink=hyperlink, hyperlink=hyperlink,
) )
@@ -351,6 +353,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
snippet_text = ( snippet_text = (
element.children.strip() if isinstance(element.children, str) else "" element.children.strip() if isinstance(element.children, str) else ""
) )
snippet_text = unescape(snippet_text)
# Detect start of the table: # Detect start of the table:
if "|" in snippet_text or self.in_table: if "|" in snippet_text or self.in_table:
# most likely part of the markdown table # most likely part of the markdown table

View File

@@ -12,8 +12,11 @@ from docling_core.types.doc import (
ImageRef, ImageRef,
ListGroup, ListGroup,
NodeItem, NodeItem,
RefItem,
RichTableCell,
TableCell, TableCell,
TableData, TableData,
TextItem,
) )
from docling_core.types.doc.document import Formatting from docling_core.types.doc.document import Formatting
from docx import Document from docx import Document
@@ -128,7 +131,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
doc = DoclingDocument(name=self.file.stem or "file", origin=origin) doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
if self.is_valid(): if self.is_valid():
assert self.docx_obj is not None assert self.docx_obj is not None
doc = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc) doc, _ = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
# doc, _ = doc_info
return doc return doc
else: else:
raise RuntimeError( raise RuntimeError(
@@ -172,7 +176,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
body: BaseOxmlElement, body: BaseOxmlElement,
docx_obj: DocxDocument, docx_obj: DocxDocument,
doc: DoclingDocument, doc: DoclingDocument,
) -> DoclingDocument: # parent:
) -> tuple[DoclingDocument, list[RefItem]]:
added_elements = []
for element in body: for element in body:
tag_name = etree.QName(element).localname tag_name = etree.QName(element).localname
# Check for Inline Images (blip elements) # Check for Inline Images (blip elements)
@@ -230,8 +236,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
parent=self.parents[level - 1], parent=self.parents[level - 1],
name="shape-text", name="shape-text",
) )
added_elements.append(shape_group.get_ref())
doc.add_text( doc.add_text(
label=DocItemLabel.PARAGRAPH, label=DocItemLabel.TEXT,
parent=shape_group, parent=shape_group,
text=text_content, text=text_content,
) )
@@ -246,23 +253,27 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
_log.debug( _log.debug(
f"Found textbox content with {len(textbox_elements)} elements" f"Found textbox content with {len(textbox_elements)} elements"
) )
self._handle_textbox_content(textbox_elements, docx_obj, doc) tbc = self._handle_textbox_content(textbox_elements, docx_obj, doc)
added_elements.extend(tbc)
# Check for Tables # Check for Tables
if element.tag.endswith("tbl"): if element.tag.endswith("tbl"):
try: try:
self._handle_tables(element, docx_obj, doc) t = self._handle_tables(element, docx_obj, doc)
added_elements.extend(t)
except Exception: except Exception:
_log.debug("could not parse a table, broken docx table") _log.debug("could not parse a table, broken docx table")
# Check for Image # Check for Image
elif drawing_blip: elif drawing_blip:
self._handle_pictures(docx_obj, drawing_blip, doc) pics = self._handle_pictures(docx_obj, drawing_blip, doc)
added_elements.extend(pics)
# Check for Text after the Image # Check for Text after the Image
if ( if (
tag_name in ["p"] tag_name in ["p"]
and element.find(".//w:t", namespaces=namespaces) is not None and element.find(".//w:t", namespaces=namespaces) is not None
): ):
self._handle_text_elements(element, docx_obj, doc) te1 = self._handle_text_elements(element, docx_obj, doc)
added_elements.extend(te1)
# Check for the sdt containers, like table of contents # Check for the sdt containers, like table of contents
elif tag_name in ["sdt"]: elif tag_name in ["sdt"]:
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces) sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@@ -270,15 +281,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Iterate paragraphs, runs, or text inside <w:sdtContent>. # Iterate paragraphs, runs, or text inside <w:sdtContent>.
paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces) paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
for p in paragraphs: for p in paragraphs:
self._handle_text_elements(p, docx_obj, doc) te = self._handle_text_elements(p, docx_obj, doc)
added_elements.extend(te)
# Check for Text # Check for Text
elif tag_name in ["p"]: elif tag_name in ["p"]:
# "tcPr", "sectPr" # "tcPr", "sectPr"
self._handle_text_elements(element, docx_obj, doc) te = self._handle_text_elements(element, docx_obj, doc)
added_elements.extend(te)
else: else:
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}") _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
return doc return doc, added_elements
def _str_to_int( def _str_to_int(
self, s: Optional[str], default: Optional[int] = 0 self, s: Optional[str], default: Optional[int] = 0
@@ -674,14 +687,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
textbox_elements: list, textbox_elements: list,
docx_obj: DocxDocument, docx_obj: DocxDocument,
doc: DoclingDocument, doc: DoclingDocument,
) -> None: ) -> List[RefItem]:
elem_ref: List[RefItem] = []
"""Process textbox content and add it to the document structure.""" """Process textbox content and add it to the document structure."""
level = self._get_level() level = self._get_level()
# Create a textbox group to contain all text from the textbox # Create a textbox group to contain all text from the textbox
textbox_group = doc.add_group( textbox_group = doc.add_group(
label=GroupLabel.SECTION, parent=self.parents[level - 1], name="textbox" label=GroupLabel.SECTION, parent=self.parents[level - 1], name="textbox"
) )
elem_ref.append(textbox_group.get_ref())
# Set this as the current parent to ensure textbox content # Set this as the current parent to ensure textbox content
# is properly nested in document structure # is properly nested in document structure
original_parent = self.parents[level] original_parent = self.parents[level]
@@ -729,11 +743,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Mark this paragraph as processed # Mark this paragraph as processed
processed_paragraphs.add(paragraph_id) processed_paragraphs.add(paragraph_id)
self._handle_text_elements(p, docx_obj, doc) elem_ref.extend(self._handle_text_elements(p, docx_obj, doc))
# Restore original parent # Restore original parent
self.parents[level] = original_parent self.parents[level] = original_parent
return return elem_ref
def _handle_equations_in_text(self, element, text): def _handle_equations_in_text(self, element, text):
only_texts = [] only_texts = []
@@ -803,7 +817,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
element: BaseOxmlElement, element: BaseOxmlElement,
docx_obj: DocxDocument, docx_obj: DocxDocument,
doc: DoclingDocument, doc: DoclingDocument,
) -> None: ) -> List[RefItem]:
elem_ref: List[RefItem] = []
paragraph = Paragraph(element, docx_obj) paragraph = Paragraph(element, docx_obj)
paragraph_elements = self._get_paragraph_elements(paragraph) paragraph_elements = self._get_paragraph_elements(paragraph)
text, equations = self._handle_equations_in_text( text, equations = self._handle_equations_in_text(
@@ -811,7 +826,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
) )
if text is None: if text is None:
return return elem_ref
text = text.strip() text = text.strip()
# Common styles for bullet and numbered lists. # Common styles for bullet and numbered lists.
@@ -832,15 +847,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Check if this is actually a numbered list by examining the numFmt # Check if this is actually a numbered list by examining the numFmt
is_numbered = self._is_numbered_list(docx_obj, numid, ilevel) is_numbered = self._is_numbered_list(docx_obj, numid, ilevel)
self._add_list_item( li = self._add_list_item(
doc=doc, doc=doc,
numid=numid, numid=numid,
ilevel=ilevel, ilevel=ilevel,
elements=paragraph_elements, elements=paragraph_elements,
is_numbered=is_numbered, is_numbered=is_numbered,
) )
elem_ref.extend(li) # MUST BE REF!!!
self._update_history(p_style_id, p_level, numid, ilevel) self._update_history(p_style_id, p_level, numid, ilevel)
return return elem_ref
elif ( elif (
numid is None numid is None
and self._prev_numid() is not None and self._prev_numid() is not None
@@ -860,9 +876,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if p_style_id in ["Title"]: if p_style_id in ["Title"]:
for key in range(len(self.parents)): for key in range(len(self.parents)):
self.parents[key] = None self.parents[key] = None
self.parents[0] = doc.add_text( te = doc.add_text(parent=None, label=DocItemLabel.TITLE, text=text)
parent=None, label=DocItemLabel.TITLE, text=text self.parents[0] = te
) elem_ref.append(te.get_ref())
elif "Heading" in p_style_id: elif "Heading" in p_style_id:
style_element = getattr(paragraph.style, "element", None) style_element = getattr(paragraph.style, "element", None)
if style_element is not None: if style_element is not None:
@@ -871,7 +887,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
) )
else: else:
is_numbered_style = False is_numbered_style = False
self._add_header(doc, p_level, text, is_numbered_style) h1 = self._add_header(doc, p_level, text, is_numbered_style)
elem_ref.extend(h1)
elif len(equations) > 0: elif len(equations) > 0:
if (paragraph.text is None or len(paragraph.text.strip()) == 0) and len( if (paragraph.text is None or len(paragraph.text.strip()) == 0) and len(
@@ -879,15 +896,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
) > 0: ) > 0:
# Standalone equation # Standalone equation
level = self._get_level() level = self._get_level()
doc.add_text( t1 = doc.add_text(
label=DocItemLabel.FORMULA, label=DocItemLabel.FORMULA,
parent=self.parents[level - 1], parent=self.parents[level - 1],
text=text.replace("<eq>", "").replace("</eq>", ""), text=text.replace("<eq>", "").replace("</eq>", ""),
) )
elem_ref.append(t1.get_ref())
else: else:
# Inline equation # Inline equation
level = self._get_level() level = self._get_level()
inline_equation = doc.add_inline_group(parent=self.parents[level - 1]) inline_equation = doc.add_inline_group(parent=self.parents[level - 1])
elem_ref.append(inline_equation.get_ref())
text_tmp = text text_tmp = text
for eq in equations: for eq in equations:
if len(text_tmp) == 0: if len(text_tmp) == 0:
@@ -899,23 +918,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
text_tmp = "" if len(split_text_tmp) == 1 else split_text_tmp[1] text_tmp = "" if len(split_text_tmp) == 1 else split_text_tmp[1]
if len(pre_eq_text) > 0: if len(pre_eq_text) > 0:
doc.add_text( e1 = doc.add_text(
label=DocItemLabel.PARAGRAPH, label=DocItemLabel.TEXT,
parent=inline_equation, parent=inline_equation,
text=pre_eq_text, text=pre_eq_text,
) )
doc.add_text( elem_ref.append(e1.get_ref())
e2 = doc.add_text(
label=DocItemLabel.FORMULA, label=DocItemLabel.FORMULA,
parent=inline_equation, parent=inline_equation,
text=eq.replace("<eq>", "").replace("</eq>", ""), text=eq.replace("<eq>", "").replace("</eq>", ""),
) )
elem_ref.append(e2.get_ref())
if len(text_tmp) > 0: if len(text_tmp) > 0:
doc.add_text( e3 = doc.add_text(
label=DocItemLabel.PARAGRAPH, label=DocItemLabel.TEXT,
parent=inline_equation, parent=inline_equation,
text=text_tmp.strip(), text=text_tmp.strip(),
) )
elem_ref.append(e3.get_ref())
elif p_style_id in [ elif p_style_id in [
"Paragraph", "Paragraph",
@@ -934,13 +956,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
paragraph_elements=paragraph_elements, paragraph_elements=paragraph_elements,
) )
for text, format, hyperlink in paragraph_elements: for text, format, hyperlink in paragraph_elements:
doc.add_text( t2 = doc.add_text(
label=DocItemLabel.PARAGRAPH, label=DocItemLabel.TEXT,
parent=parent, parent=parent,
text=text, text=text,
formatting=format, formatting=format,
hyperlink=hyperlink, hyperlink=hyperlink,
) )
elem_ref.append(t2.get_ref())
else: else:
# Text style names can, and will have, not only default values but user values too # Text style names can, and will have, not only default values but user values too
@@ -952,16 +975,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
paragraph_elements=paragraph_elements, paragraph_elements=paragraph_elements,
) )
for text, format, hyperlink in paragraph_elements: for text, format, hyperlink in paragraph_elements:
doc.add_text( t3 = doc.add_text(
label=DocItemLabel.PARAGRAPH, label=DocItemLabel.TEXT,
parent=parent, parent=parent,
text=text, text=text,
formatting=format, formatting=format,
hyperlink=hyperlink, hyperlink=hyperlink,
) )
elem_ref.append(t3.get_ref())
self._update_history(p_style_id, p_level, numid, ilevel) self._update_history(p_style_id, p_level, numid, ilevel)
return return elem_ref
def _add_header( def _add_header(
self, self,
@@ -969,17 +993,21 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
curr_level: Optional[int], curr_level: Optional[int],
text: str, text: str,
is_numbered_style: bool = False, is_numbered_style: bool = False,
) -> None: ) -> List[RefItem]:
elem_ref: List[RefItem] = []
level = self._get_level() level = self._get_level()
if isinstance(curr_level, int): if isinstance(curr_level, int):
if curr_level > level: if curr_level > level:
# add invisible group # add invisible group
for i in range(level, curr_level): for i in range(level, curr_level):
self.parents[i] = doc.add_group( gr1 = doc.add_group(
parent=self.parents[i - 1], parent=self.parents[i - 1],
label=GroupLabel.SECTION, label=GroupLabel.SECTION,
name=f"header-{i}", name=f"header-{i}",
) )
elem_ref.append(gr1.get_ref())
self.parents[i] = gr1
elif curr_level < level: elif curr_level < level:
# remove the tail # remove the tail
for key in range(len(self.parents)): for key in range(len(self.parents)):
@@ -1019,12 +1047,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
text = f"{self.numbered_headers[previous_level]}.{text}" text = f"{self.numbered_headers[previous_level]}.{text}"
previous_level -= 1 previous_level -= 1
self.parents[current_level] = doc.add_heading( hd = doc.add_heading(
parent=self.parents[parent_level], parent=self.parents[parent_level],
text=text, text=text,
level=add_level, level=add_level,
) )
return self.parents[current_level] = hd
elem_ref.append(hd.get_ref())
return elem_ref
def _add_formatted_list_item( def _add_formatted_list_item(
self, self,
@@ -1033,12 +1063,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
marker: str, marker: str,
enumerated: bool, enumerated: bool,
level: int, level: int,
) -> None: ) -> List[RefItem]:
elem_ref: List[RefItem] = []
# This should not happen by construction # This should not happen by construction
if not isinstance(self.parents[level], ListGroup): if not isinstance(self.parents[level], ListGroup):
return return elem_ref
if not elements: if not elements:
return return elem_ref
if len(elements) == 1: if len(elements) == 1:
text, format, hyperlink = elements[0] text, format, hyperlink = elements[0]
@@ -1068,6 +1099,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
formatting=format, formatting=format,
hyperlink=hyperlink, hyperlink=hyperlink,
) )
return elem_ref
def _add_list_item( def _add_list_item(
self, self,
@@ -1077,10 +1109,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
ilevel: int, ilevel: int,
elements: list, elements: list,
is_numbered: bool = False, is_numbered: bool = False,
) -> None: ) -> List[RefItem]:
# TODO: this method is always called with is_numbered. Numbered lists should be properly addressed. elem_ref: List[RefItem] = []
# this method is always called with is_numbered. Numbered lists should be properly addressed.
if not elements: if not elements:
return None return elem_ref
enum_marker = "" enum_marker = ""
level = self._get_level() level = self._get_level()
@@ -1091,9 +1124,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Reset counters for the new numbering sequence # Reset counters for the new numbering sequence
self._reset_list_counters_for_new_sequence(numid) self._reset_list_counters_for_new_sequence(numid)
self.parents[level] = doc.add_list_group( list_gr = doc.add_list_group(name="list", parent=self.parents[level - 1])
name="list", parent=self.parents[level - 1] self.parents[level] = list_gr
) elem_ref.append(list_gr.get_ref())
# Set marker and enumerated arguments if this is an enumeration element. # Set marker and enumerated arguments if this is an enumeration element.
if is_numbered: if is_numbered:
@@ -1114,9 +1147,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.level_at_new_list + prev_indent + 1, self.level_at_new_list + prev_indent + 1,
self.level_at_new_list + ilevel + 1, self.level_at_new_list + ilevel + 1,
): ):
self.parents[i] = doc.add_list_group( list_gr1 = doc.add_list_group(name="list", parent=self.parents[i - 1])
name="list", parent=self.parents[i - 1] self.parents[i] = list_gr1
) elem_ref.append(list_gr1.get_ref())
# TODO: Set marker and enumerated arguments if this is an enumeration element. # TODO: Set marker and enumerated arguments if this is an enumeration element.
if is_numbered: if is_numbered:
@@ -1156,7 +1189,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
) )
elif self._prev_numid() == numid or prev_indent == ilevel: elif self._prev_numid() == numid or prev_indent == ilevel:
# TODO: Set marker and enumerated arguments if this is an enumeration element. # Set marker and enumerated arguments if this is an enumeration element.
if is_numbered: if is_numbered:
counter = self._get_list_counter(numid, ilevel) counter = self._get_list_counter(numid, ilevel)
enum_marker = str(counter) + "." enum_marker = str(counter) + "."
@@ -1165,15 +1198,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self._add_formatted_list_item( self._add_formatted_list_item(
doc, elements, enum_marker, is_numbered, level - 1 doc, elements, enum_marker, is_numbered, level - 1
) )
return elem_ref
return
def _handle_tables( def _handle_tables(
self, self,
element: BaseOxmlElement, element: BaseOxmlElement,
docx_obj: DocxDocument, docx_obj: DocxDocument,
doc: DoclingDocument, doc: DoclingDocument,
) -> None: ) -> List[RefItem]:
elem_ref: List[RefItem] = []
table: Table = Table(element, docx_obj) table: Table = Table(element, docx_obj)
num_rows = len(table.rows) num_rows = len(table.rows)
num_cols = len(table.columns) num_cols = len(table.columns)
@@ -1184,9 +1217,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# In case we have a table of only 1 cell, we consider it furniture # In case we have a table of only 1 cell, we consider it furniture
# And proceed processing the content of the cell as though it's in the document body # And proceed processing the content of the cell as though it's in the document body
self._walk_linear(cell_element._element, docx_obj, doc) self._walk_linear(cell_element._element, docx_obj, doc)
return return elem_ref
data = TableData(num_rows=num_rows, num_cols=num_cols) data = TableData(num_rows=num_rows, num_cols=num_cols)
level = self._get_level()
docling_table = doc.add_table(data=data, parent=self.parents[level - 1])
elem_ref.append(docling_table.get_ref())
cell_set: set[CT_Tc] = set() cell_set: set[CT_Tc] = set()
for row_idx, row in enumerate(table.rows): for row_idx, row in enumerate(table.rows):
_log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells") _log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells")
@@ -1223,27 +1260,87 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
else: else:
text = text.replace("<eq>", "$").replace("</eq>", "$") text = text.replace("<eq>", "$").replace("</eq>", "$")
table_cell = TableCell( provs_in_cell: List[RefItem] = []
text=text, _, provs_in_cell = self._walk_linear(cell._element, docx_obj, doc)
row_span=spanned_idx - row_idx, ref_for_rich_cell = provs_in_cell[0]
col_span=cell.grid_span, rich_table_cell = False
start_row_offset_idx=row.grid_cols_before + row_idx,
end_row_offset_idx=row.grid_cols_before + spanned_idx,
start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + cell.grid_span,
column_header=row.grid_cols_before + row_idx == 0,
row_header=False,
)
data.table_cells.append(table_cell)
col_idx += cell.grid_span
level = self._get_level() def group_cell_elements(
doc.add_table(data=data, parent=self.parents[level - 1]) group_name: str, doc: DoclingDocument, provs_in_cell: List[RefItem]
return ) -> RefItem:
group_element = doc.add_group(
label=GroupLabel.UNSPECIFIED,
name=group_name,
parent=docling_table,
)
for prov in provs_in_cell:
group_element.children.append(prov)
pr_item = prov.resolve(doc)
item_parent = pr_item.parent.resolve(doc)
if pr_item.get_ref() in item_parent.children:
item_parent.children.remove(pr_item.get_ref())
pr_item.parent = group_element.get_ref()
ref_for_rich_cell = group_element.get_ref()
return ref_for_rich_cell
if len(provs_in_cell) > 1:
# Cell has multiple elements, we need to group them
rich_table_cell = True
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}"
ref_for_rich_cell = group_cell_elements(
group_name, doc, provs_in_cell
)
elif len(provs_in_cell) == 1:
item_ref = provs_in_cell[0]
pr_item = item_ref.resolve(doc)
if isinstance(pr_item, TextItem):
# Cell has only one element and it's just a text
rich_table_cell = False
doc.delete_items(node_items=[pr_item])
else:
rich_table_cell = True
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}"
ref_for_rich_cell = group_cell_elements(
group_name, doc, provs_in_cell
)
else:
rich_table_cell = False
if rich_table_cell:
rich_cell = RichTableCell(
text=text,
row_span=spanned_idx - row_idx,
col_span=cell.grid_span,
start_row_offset_idx=row.grid_cols_before + row_idx,
end_row_offset_idx=row.grid_cols_before + spanned_idx,
start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + cell.grid_span,
column_header=row.grid_cols_before + row_idx == 0,
row_header=False,
ref=ref_for_rich_cell, # points to an artificial group around children
)
doc.add_table_cell(table_item=docling_table, cell=rich_cell)
col_idx += cell.grid_span
else:
simple_cell = TableCell(
text=text,
row_span=spanned_idx - row_idx,
col_span=cell.grid_span,
start_row_offset_idx=row.grid_cols_before + row_idx,
end_row_offset_idx=row.grid_cols_before + spanned_idx,
start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + cell.grid_span,
column_header=row.grid_cols_before + row_idx == 0,
row_header=False,
)
doc.add_table_cell(table_item=docling_table, cell=simple_cell)
col_idx += cell.grid_span
return elem_ref
def _handle_pictures( def _handle_pictures(
self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
) -> None: ) -> List[RefItem]:
def get_docx_image(drawing_blip: Any) -> Optional[bytes]: def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
image_data: Optional[bytes] = None image_data: Optional[bytes] = None
rId = drawing_blip[0].get( rId = drawing_blip[0].get(
@@ -1255,28 +1352,32 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
image_data = image_part.blob # Get the binary image data image_data = image_part.blob # Get the binary image data
return image_data return image_data
elem_ref: List[RefItem] = []
level = self._get_level() level = self._get_level()
# Open the BytesIO object with PIL to create an Image # Open the BytesIO object with PIL to create an Image
image_data: Optional[bytes] = get_docx_image(drawing_blip) image_data: Optional[bytes] = get_docx_image(drawing_blip)
if image_data is None: if image_data is None:
_log.warning("Warning: image cannot be found") _log.warning("Warning: image cannot be found")
doc.add_picture( p1 = doc.add_picture(
parent=self.parents[level - 1], parent=self.parents[level - 1],
caption=None, caption=None,
) )
elem_ref.append(p1.get_ref())
else: else:
try: try:
image_bytes = BytesIO(image_data) image_bytes = BytesIO(image_data)
pil_image = Image.open(image_bytes) pil_image = Image.open(image_bytes)
doc.add_picture( p2 = doc.add_picture(
parent=self.parents[level - 1], parent=self.parents[level - 1],
image=ImageRef.from_pil(image=pil_image, dpi=72), image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None, caption=None,
) )
elem_ref.append(p2.get_ref())
except (UnidentifiedImageError, OSError): except (UnidentifiedImageError, OSError):
_log.warning("Warning: image cannot be loaded by Pillow") _log.warning("Warning: image cannot be loaded by Pillow")
doc.add_picture( p3 = doc.add_picture(
parent=self.parents[level - 1], parent=self.parents[level - 1],
caption=None, caption=None,
) )
return elem_ref.append(p3.get_ref())
return elem_ref

View File

@@ -0,0 +1,572 @@
import logging
import re
from io import BytesIO
from pathlib import Path
from typing import Annotated, ClassVar, Literal, Optional, Union, cast
from docling_core.types.doc import (
ContentLayer,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
Formatting,
GroupLabel,
NodeItem,
)
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
from pydantic.types import StringConstraints
from typing_extensions import Self, override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class _WebVTTTimestamp(BaseModel):
    """Model representing a WebVTT timestamp.

    A WebVTT timestamp is always interpreted relative to the current playback position
    of the media data that the WebVTT file is to be synchronized with.

    The accepted form is ``[hh:]mm:ss.ttt``: an optional hours field of two or
    more digits, two-digit minutes and seconds in the range 00-59, and exactly
    three digits of milliseconds.
    """

    model_config = ConfigDict(regex_engine="python-re")

    raw: Annotated[
        str,
        Field(
            description="A representation of the WebVTT Timestamp as a single string"
        ),
    ]

    _pattern: ClassVar[re.Pattern] = re.compile(
        r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$"
    )

    # Components parsed out of ``raw`` by ``validate_raw``.
    _hours: int
    _minutes: int
    _seconds: int
    _millis: int

    @model_validator(mode="after")
    def validate_raw(self) -> Self:
        """Parse ``raw`` into hours, minutes, seconds and milliseconds.

        Returns:
            The validated instance, with its private components populated.

        Raises:
            ValueError: If ``raw`` is not a well-formed WebVTT timestamp.
        """
        # ``fullmatch`` rather than ``match``: with ``match`` the trailing ``$``
        # also matches just before a final newline, so a value such as
        # "00:01.000\n" would wrongly be accepted.
        m = self._pattern.fullmatch(self.raw)
        if not m:
            raise ValueError(f"Invalid WebVTT timestamp format: {self.raw}")
        hours, minutes, seconds, millis = m.groups()
        # The hours group is optional and is None when absent.
        self._hours = int(hours) if hours else 0
        # No separate range checks: the pattern already restricts minutes and
        # seconds to 00-59.
        self._minutes = int(minutes)
        self._seconds = int(seconds)
        self._millis = int(millis)
        return self

    @property
    def seconds(self) -> float:
        """A representation of the WebVTT Timestamp in seconds"""
        return (
            self._hours * 3600
            + self._minutes * 60
            + self._seconds
            + self._millis / 1000.0
        )

    @override
    def __str__(self) -> str:
        """Return the timestamp exactly as it was supplied in ``raw``."""
        return self.raw
# Cue identifier type: a non-empty string with no line breaks ("\n" or "\r")
# that does not contain the substring "-->". Strict mode is enabled on the
# string constraint.
_WebVTTCueIdentifier = Annotated[
str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")
]
class _WebVTTCueTimings(BaseModel):
    """Model representing WebVTT cue timings: a start and an end timestamp."""

    start: Annotated[
        _WebVTTTimestamp,
        Field(description="Start time offset of the cue"),
    ]
    end: Annotated[
        _WebVTTTimestamp,
        Field(description="End time offset of the cue"),
    ]

    @model_validator(mode="after")
    def check_order(self) -> Self:
        """Reject timings whose end is not strictly later than their start.

        Raises:
            ValueError: If the end offset is less than or equal to the start.
        """
        both_present = self.start and self.end
        if both_present and self.end.seconds <= self.start.seconds:
            raise ValueError("End timestamp must be greater than start timestamp")
        return self

    @override
    def __str__(self):
        """Render the timings as ``<start> --> <end>``."""
        return f"{self.start} --> {self.end}"
class _WebVTTCueTextSpan(BaseModel):
    """Model representing a WebVTT cue text span (a run of plain text)."""

    text: str
    span_type: Literal["text"] = "text"

    @field_validator("text", mode="after")
    @classmethod
    def validate_text(cls, value: str) -> str:
        """Require non-empty text free of line breaks, ``&`` and ``<``.

        Raises:
            ValueError: If the text is empty or holds a forbidden character.
        """
        if not value:
            raise ValueError("Cue text span cannot be empty")
        forbidden = {"\n", "\r", "&", "<"}
        if forbidden.intersection(value):
            raise ValueError("Cue text span contains invalid characters")
        return value

    @override
    def __str__(self):
        """Return the text unchanged."""
        return self.text
class _WebVTTCueVoiceSpan(BaseModel):
    """Model representing a WebVTT cue voice span.

    Serialized as ``<v[.class...] annotation>...</v>``, where the annotation
    names the voice and the optional classes qualify it.
    """

    annotation: Annotated[
        str,
        Field(
            description=(
                "Cue span start tag annotation text representing the name of the "
                "voice"
            )
        ),
    ]
    classes: Annotated[
        list[str],
        Field(description="List of classes representing the cue span's significance"),
    ] = []
    components: Annotated[
        list["_WebVTTCueComponent"],
        Field(description="The components representing the cue internal text"),
    ] = []
    span_type: Literal["v"] = "v"

    @field_validator("annotation", mode="after")
    @classmethod
    def validate_annotation(cls, value: str) -> str:
        """Require a non-empty annotation free of line breaks, ``&`` and ``>``.

        Raises:
            ValueError: If the annotation is empty or holds a forbidden character.
        """
        if any(ch in value for ch in {"\n", "\r", "&", ">"}):
            raise ValueError(
                "Cue span start tag annotation contains invalid characters"
            )
        if not value:
            raise ValueError("Cue span start tag annotation cannot be empty")
        return value

    @field_validator("classes", mode="after")
    @classmethod
    def validate_classes(cls, value: list[str]) -> list[str]:
        """Require every class to be non-empty and free of separators.

        Whitespace, ``&``, ``<``, ``>`` and ``.`` are rejected; the dot is the
        separator between classes in the start tag.

        Raises:
            ValueError: If a class is empty or holds a forbidden character.
        """
        for item in value:
            if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}):
                raise ValueError(
                    "A cue span start tag class contains invalid characters"
                )
            if not item:
                raise ValueError("Cue span start tag classes cannot be empty")
        return value

    @override
    def __str__(self):
        """Serialize the span as a voice start tag, its components and ``</v>``."""
        tag = f"v.{'.'.join(self.classes)}" if self.classes else "v"
        inner = "".join(str(span) for span in self.components)
        return f"<{tag} {self.annotation}>{inner}</v>"
class _WebVTTCueClassSpan(BaseModel):
    """Model representing a WebVTT cue class span (``<c>...</c>``)."""

    span_type: Literal["c"] = "c"
    components: list["_WebVTTCueComponent"]

    @override
    def __str__(self):
        """Serialize the child components wrapped in a class tag."""
        body = "".join(map(str, self.components))
        return "<c>" + body + "</c>"
class _WebVTTCueItalicSpan(BaseModel):
    """Model representing a WebVTT cue italic span (``<i>...</i>``)."""

    span_type: Literal["i"] = "i"
    components: list["_WebVTTCueComponent"]

    @override
    def __str__(self):
        """Serialize the child components wrapped in an italic tag."""
        body = "".join(map(str, self.components))
        return "<i>" + body + "</i>"
class _WebVTTCueBoldSpan(BaseModel):
    """Model representing a WebVTT cue bold span (``<b>...</b>``)."""

    span_type: Literal["b"] = "b"
    components: list["_WebVTTCueComponent"]

    @override
    def __str__(self):
        """Serialize the child components wrapped in a bold tag."""
        body = "".join(map(str, self.components))
        return "<b>" + body + "</b>"
class _WebVTTCueUnderlineSpan(BaseModel):
    """Model representing a WebVTT cue underline span (``<u>...</u>``)."""

    span_type: Literal["u"] = "u"
    components: list["_WebVTTCueComponent"]

    @override
    def __str__(self):
        """Serialize the child components wrapped in an underline tag."""
        body = "".join(map(str, self.components))
        return "<u>" + body + "</u>"
# Any component that may appear in a cue payload or inside another span.
# The union is discriminated on each model's ``span_type`` literal
# ("text", "c", "i", "b", "u" or "v").
_WebVTTCueComponent = Annotated[
    Union[
        _WebVTTCueTextSpan,
        _WebVTTCueClassSpan,
        _WebVTTCueItalicSpan,
        _WebVTTCueBoldSpan,
        _WebVTTCueUnderlineSpan,
        _WebVTTCueVoiceSpan,
    ],
    Field(discriminator="span_type", description="The WebVTT cue component"),
]
class _WebVTTCueBlock(BaseModel):
    """Model representing a WebVTT cue block.

    The optional WebVTT cue settings list is not supported.
    The cue payload is limited to the following spans: text, class, italic, bold,
    underline, and voice.
    """

    model_config = ConfigDict(regex_engine="python-re")

    identifier: Optional[_WebVTTCueIdentifier] = Field(
        None, description="The WebVTT cue identifier"
    )
    timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")]
    payload: Annotated[list[_WebVTTCueComponent], Field(description="The cue payload")]

    # Start or end tag of a supported span; only voice tags may carry classes.
    _pattern_block: ClassVar[re.Pattern] = re.compile(
        r"<(/?)(i|b|c|u|v(?:\.[^\t\n\r &<>.]+)*)(?:\s+([^>]*))?>"
    )
    _pattern_voice_tag: ClassVar[re.Pattern] = re.compile(
        r"^<v(?P<class>\.[^\t\n\r &<>]+)?"  # zero or more classes
        r"[ \t]+(?P<annotation>[^\n\r&>]+)>"  # required space and annotation
    )

    @field_validator("payload", mode="after")
    @classmethod
    def validate_payload(cls, payload):
        """Reject payloads whose serialized components contain ``-->``.

        Raises:
            ValueError: If any component serializes to text containing ``-->``.
        """
        for component in payload:
            if "-->" in str(component):
                raise ValueError("Cue payload must not contain '-->'")
        return payload

    @classmethod
    def _build_span(
        cls, name: str, tag: str, children: list[_WebVTTCueComponent]
    ) -> Optional[_WebVTTCueComponent]:
        """Create the span model for a start tag and its parsed children.

        Args:
            name: The tag name: "i", "b", "u", "c" or "v".
            tag: The complete start tag as written in the cue text.
            children: The components collected inside the span.

        Returns:
            The span, or None for a voice tag without a valid annotation, in
            which case the children are discarded.
        """
        simple_spans = {
            "i": _WebVTTCueItalicSpan,
            "b": _WebVTTCueBoldSpan,
            "u": _WebVTTCueUnderlineSpan,
            "c": _WebVTTCueClassSpan,
        }
        if name in simple_spans:
            return simple_spans[name](components=children)

        voice_match = cls._pattern_voice_tag.match(tag)
        if not voice_match:
            return None
        class_string = voice_match.group("class")
        classes: list[str] = (
            [c for c in class_string.split(".") if c] if class_string else []
        )
        return _WebVTTCueVoiceSpan(
            annotation=voice_match.group("annotation").strip(),
            classes=classes,
            components=children,
        )

    @classmethod
    def _parse_cue_text(cls, cue_text: str) -> list[_WebVTTCueComponent]:
        """Parse cue text into a tree of cue components.

        Malformed markup is handled leniently: an end tag that does not match
        the innermost open span is ignored, and spans still open at the end of
        the text are closed implicitly. A voice span may therefore omit its
        end tag.

        Raises:
            ValueError: If a text run or voice tag fails model validation.
        """
        root: list[_WebVTTCueComponent] = []
        # Open spans, innermost last: (tag name, start tag text, children).
        open_spans: list[tuple[str, str, list[_WebVTTCueComponent]]] = []

        def current() -> list[_WebVTTCueComponent]:
            return open_spans[-1][2] if open_spans else root

        def close_innermost() -> None:
            name, start_tag, children = open_spans.pop()
            span = cls._build_span(name, start_tag, children)
            if span is not None:
                current().append(span)

        pos = 0
        for match in cls._pattern_block.finditer(cue_text):
            if match.start() > pos:
                current().append(_WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
            pos = match.end()

            tag = match.group(0)
            if tag.startswith(("<i>", "<b>", "<u>", "<c>")):
                open_spans.append((tag[1], tag, []))
            elif tag.startswith("<v"):
                open_spans.append(("v", tag, []))
            else:
                closing: Optional[str] = None
                if tag in ("</i>", "</b>", "</u>", "</c>"):
                    closing = tag[2]
                elif tag.startswith("</v"):
                    closing = "v"
                # Any other tag matched by the pattern (e.g. "<i lang>") is
                # not supported and is skipped.
                # Closing only on a name match keeps a stray end tag from
                # popping the root list or an unrelated span.
                if closing and open_spans and open_spans[-1][0] == closing:
                    close_innermost()

        if pos < len(cue_text):
            current().append(_WebVTTCueTextSpan(text=cue_text[pos:]))

        # Implicitly close what is still open so that its text is kept.
        while open_spans:
            close_innermost()

        return root

    @classmethod
    def parse(cls, raw: str) -> "_WebVTTCueBlock":
        """Parse the text of a single cue block.

        Args:
            raw: An optional identifier line, the timings line, and any number
                of payload lines. Cue settings after the end timestamp are
                ignored.

        Returns:
            The parsed cue block.

        Raises:
            ValueError: If the block is empty, has no timings line, or any part
                of it fails validation.
        """
        lines = raw.strip().splitlines()
        if not lines:
            raise ValueError("Cue block must have at least one line")

        identifier: Optional[_WebVTTCueIdentifier] = None
        timing_line = lines[0]
        if "-->" not in timing_line and len(lines) > 1:
            identifier = timing_line
            timing_line = lines[1]
            cue_lines = lines[2:]
        else:
            cue_lines = lines[1:]

        if "-->" not in timing_line:
            raise ValueError("Cue block must contain WebVTT cue timings")

        start, end = [t.strip() for t in timing_line.split("-->")]
        end = re.split(" |\t", end)[0]  # ignore the cue settings list
        timings: _WebVTTCueTimings = _WebVTTCueTimings(
            start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end)
        )

        # Payload lines are joined with single spaces before parsing.
        cue_text = " ".join(cue_lines).strip()

        return cls(
            identifier=identifier,
            timings=timings,
            payload=cls._parse_cue_text(cue_text),
        )

    @override
    def __str__(self):
        """Serialize the block: optional identifier, timings line, then payload."""
        parts = []
        if self.identifier:
            parts.append(f"{self.identifier}\n")

        timings_line = str(self.timings)
        parts.append(timings_line + "\n")
        for idx, span in enumerate(self.payload):
            if idx == 0 and len(self.payload) == 1 and span.span_type == "v":
                # the end tag may be omitted for brevity
                parts.append(str(span).removesuffix("</v>"))
            else:
                parts.append(str(span))

        return "".join(parts)
class _WebVTTFile(BaseModel):
    """A model representing a WebVTT file."""

    cue_blocks: list[_WebVTTCueBlock]

    # First line of a block that is not a cue: "NOTE" followed by a space, a
    # tab or the end of the line, or "STYLE" / "REGION" alone on the line.
    _pattern_non_cue_block: ClassVar[re.Pattern] = re.compile(
        r"(?:NOTE(?:[ \t]|$)|(?:STYLE|REGION)[ \t]*$)", re.MULTILINE
    )

    @staticmethod
    def verify_signature(content: str) -> bool:
        """Check whether ``content`` starts with a WebVTT signature.

        The signature is an optional byte order mark, the string ``WEBVTT``,
        and then either the end of the content or a space, tab, LF or CR.
        """
        if not content:
            return False
        content = content.removeprefix("\ufeff")
        if not content.startswith("WEBVTT"):
            return False
        return len(content) == 6 or content[6] in (" ", "\t", "\n", "\r")

    @classmethod
    def parse(cls, raw: str) -> "_WebVTTFile":
        """Parse the text content of a WebVTT file.

        NOTE, STYLE and REGION blocks are skipped. Cue blocks that fail to
        parse are logged as warnings and left out of the result.

        Raises:
            ValueError: If the content does not start with a WebVTT signature.
        """
        # Drop an optional byte order mark and normalize newlines to LF
        raw = raw.removeprefix("\ufeff").replace("\r\n", "\n").replace("\r", "\n")

        # Check WebVTT signature
        if not cls.verify_signature(raw):
            raise ValueError("Invalid WebVTT file signature")

        # Strip "WEBVTT" header line
        _, _, body = raw.partition("\n")

        cues: list[_WebVTTCueBlock] = []
        # Split into blocks first and classify each block by its first line,
        # so that a cue payload line starting with NOTE/STYLE/REGION is never
        # mistaken for a comment, style or region block.
        for block in re.split(r"\n\s*\n", body.strip()):
            if not block or cls._pattern_non_cue_block.match(block):
                continue
            try:
                cues.append(_WebVTTCueBlock.parse(block))
            except ValueError as e:
                _log.warning(f"Failed to parse cue block:\n{block}\n{e}")

        return cls(cue_blocks=cues)

    def __iter__(self):
        """Iterate over the cue blocks."""
        return iter(self.cue_blocks)

    def __getitem__(self, idx):
        """Return the cue block(s) at ``idx``."""
        return self.cue_blocks[idx]

    def __len__(self):
        """Return the number of cue blocks."""
        return len(self.cue_blocks)
class WebVTTDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend for WebVTT (.vtt) files.

    This parser reads the content of a WebVTT file and converts
    it to a DoclingDocument, following the W3C specs on https://www.w3.org/TR/webvtt1

    Each cue block becomes a section group in the document body, added in file
    order. The group holds text items for the optional cue identifier, the cue
    timings and the cue payload; each voice span gets its own inline group.
    """

    @override
    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
        """Read and decode the WebVTT content.

        Raises:
            RuntimeError: If the content cannot be read or decoded as UTF-8.
        """
        super().__init__(in_doc, path_or_stream)

        self.content: str = ""
        try:
            if isinstance(self.path_or_stream, BytesIO):
                # "utf-8-sig" drops an optional byte order mark. Newlines are
                # normalized by hand to mirror the universal-newline
                # translation that text-mode open() applies to paths.
                text = self.path_or_stream.getvalue().decode("utf-8-sig")
                self.content = text.replace("\r\n", "\n").replace("\r", "\n")
            if isinstance(self.path_or_stream, Path):
                with open(self.path_or_stream, encoding="utf-8-sig") as f:
                    self.content = f.read()
        except Exception as e:
            raise RuntimeError(
                "Could not initialize the WebVTT backend for file with hash "
                f"{self.document_hash}."
            ) from e

    @override
    def is_valid(self) -> bool:
        """Return True if the content starts with a WebVTT signature."""
        return _WebVTTFile.verify_signature(self.content)

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        """Return False: this backend does not support pagination."""
        return False

    @override
    def unload(self):
        """Close the input stream, if any, and drop the reference to the input."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()
        self.path_or_stream = None

    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        """Return the input formats handled by this backend."""
        return {InputFormat.VTT}

    @staticmethod
    def _add_text_from_component(
        doc: DoclingDocument,
        item: _WebVTTCueComponent,
        parent: Optional[NodeItem],
        formatting: Optional[Formatting] = None,
    ) -> None:
        """Add the text of a cue component to the document.

        A text span becomes one text item. Any other span is walked
        recursively: each text span found inside becomes its own item, with the
        bold, italic and underline flags of all enclosing spans combined.
        Whitespace-only text is skipped. Annotations and classes of nested
        spans are not emitted.

        Args:
            doc: The document to add text items to.
            item: The cue component to extract text from.
            parent: The node that receives the new text items.
            formatting: Formatting inherited from enclosing spans, if any.
        """
        # A fresh instance per call, so no two text items share a Formatting.
        current = Formatting()
        if formatting is not None:
            current.bold = formatting.bold
            current.italic = formatting.italic
            current.underline = formatting.underline
        if isinstance(item, _WebVTTCueItalicSpan):
            current.italic = True
        elif isinstance(item, _WebVTTCueBoldSpan):
            current.bold = True
        elif isinstance(item, _WebVTTCueUnderlineSpan):
            current.underline = True

        if isinstance(item, _WebVTTCueTextSpan):
            if text := item.text.strip():
                doc.add_text(
                    label=DocItemLabel.TEXT,
                    text=text,
                    parent=parent,
                    content_layer=ContentLayer.BODY,
                    formatting=current,
                )
            return

        for child in item.components:
            WebVTTDocumentBackend._add_text_from_component(
                doc, child, parent, current
            )

    @override
    def convert(self) -> DoclingDocument:
        """Convert the WebVTT content into a DoclingDocument.

        Raises:
            RuntimeError: If the content has no valid WebVTT signature.
        """
        _log.debug("Starting WebVTT conversion...")
        if not self.is_valid():
            raise RuntimeError("Invalid WebVTT document.")

        origin = DocumentOrigin(
            filename=self.file.name or "file",
            mimetype="text/vtt",
            binary_hash=self.document_hash,
        )
        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)

        vtt: _WebVTTFile = _WebVTTFile.parse(self.content)

        for block in vtt.cue_blocks:
            block_group = doc.add_group(
                label=GroupLabel.SECTION,
                name="WebVTT cue block",
                parent=None,
                content_layer=ContentLayer.BODY,
            )
            if block.identifier:
                doc.add_text(
                    label=DocItemLabel.TEXT,
                    text=str(block.identifier),
                    parent=block_group,
                    content_layer=ContentLayer.BODY,
                )
            doc.add_text(
                label=DocItemLabel.TEXT,
                text=str(block.timings),
                parent=block_group,
                content_layer=ContentLayer.BODY,
            )
            for cue_span in block.payload:
                if isinstance(cue_span, _WebVTTCueVoiceSpan):
                    voice_group = doc.add_group(
                        label=GroupLabel.INLINE,
                        name="WebVTT cue voice span",
                        parent=block_group,
                        content_layer=ContentLayer.BODY,
                    )
                    # Speaker label: "<annotation> (<class>, ...): "
                    voice = cue_span.annotation
                    if classes := cue_span.classes:
                        voice += f" ({', '.join(classes)})"
                    voice += ": "
                    doc.add_text(
                        label=DocItemLabel.TEXT,
                        text=voice,
                        parent=voice_group,
                        content_layer=ContentLayer.BODY,
                    )
                    for item in cue_span.components:
                        WebVTTDocumentBackend._add_text_from_component(
                            doc, item, voice_group
                        )
                else:
                    WebVTTDocumentBackend._add_text_from_component(
                        doc, cue_span, block_group
                    )

        return doc

View File

@@ -1,7 +1,6 @@
import math
from collections import defaultdict from collections import defaultdict
from enum import Enum from enum import Enum
from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union from typing import TYPE_CHECKING, Optional, Type, Union
import numpy as np import numpy as np
from docling_core.types.doc import ( from docling_core.types.doc import (
@@ -14,9 +13,7 @@ from docling_core.types.doc import (
) )
from docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float from docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float
from docling_core.types.doc.page import SegmentedPdfPage, TextCell from docling_core.types.doc.page import SegmentedPdfPage, TextCell
from docling_core.types.io import ( from docling_core.types.io import DocumentStream
DocumentStream,
)
# DO NOT REMOVE; explicitly exposed from this location # DO NOT REMOVE; explicitly exposed from this location
from PIL.Image import Image from PIL.Image import Image
@@ -71,6 +68,7 @@ class InputFormat(str, Enum):
METS_GBS = "mets_gbs" METS_GBS = "mets_gbs"
JSON_DOCLING = "json_docling" JSON_DOCLING = "json_docling"
AUDIO = "audio" AUDIO = "audio"
VTT = "vtt"
class OutputFormat(str, Enum): class OutputFormat(str, Enum):
@@ -82,7 +80,7 @@ class OutputFormat(str, Enum):
DOCTAGS = "doctags" DOCTAGS = "doctags"
FormatToExtensions: Dict[InputFormat, List[str]] = { FormatToExtensions: dict[InputFormat, list[str]] = {
InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"], InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"], InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
InputFormat.PDF: ["pdf"], InputFormat.PDF: ["pdf"],
@@ -97,9 +95,10 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.METS_GBS: ["tar.gz"], InputFormat.METS_GBS: ["tar.gz"],
InputFormat.JSON_DOCLING: ["json"], InputFormat.JSON_DOCLING: ["json"],
InputFormat.AUDIO: ["wav", "mp3"], InputFormat.AUDIO: ["wav", "mp3"],
InputFormat.VTT: ["vtt"],
} }
FormatToMimeType: Dict[InputFormat, List[str]] = { FormatToMimeType: dict[InputFormat, list[str]] = {
InputFormat.DOCX: [ InputFormat.DOCX: [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document", "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.wordprocessingml.template", "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
@@ -130,6 +129,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
InputFormat.METS_GBS: ["application/mets+xml"], InputFormat.METS_GBS: ["application/mets+xml"],
InputFormat.JSON_DOCLING: ["application/json"], InputFormat.JSON_DOCLING: ["application/json"],
InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"], InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
InputFormat.VTT: ["text/vtt"],
} }
MimeTypeToFormat: dict[str, list[InputFormat]] = { MimeTypeToFormat: dict[str, list[InputFormat]] = {
@@ -162,8 +162,8 @@ class Cluster(BaseModel):
label: DocItemLabel label: DocItemLabel
bbox: BoundingBox bbox: BoundingBox
confidence: float = 1.0 confidence: float = 1.0
cells: List[TextCell] = [] cells: list[TextCell] = []
children: List["Cluster"] = [] # Add child cluster support children: list["Cluster"] = [] # Add child cluster support
@field_serializer("confidence") @field_serializer("confidence")
def _serialize(self, value: float, info: FieldSerializationInfo) -> float: def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
@@ -179,7 +179,7 @@ class BasePageElement(BaseModel):
class LayoutPrediction(BaseModel): class LayoutPrediction(BaseModel):
clusters: List[Cluster] = [] clusters: list[Cluster] = []
class VlmPredictionToken(BaseModel): class VlmPredictionToken(BaseModel):
@@ -201,14 +201,14 @@ class ContainerElement(
class Table(BasePageElement): class Table(BasePageElement):
otsl_seq: List[str] otsl_seq: list[str]
num_rows: int = 0 num_rows: int = 0
num_cols: int = 0 num_cols: int = 0
table_cells: List[TableCell] table_cells: list[TableCell]
class TableStructurePrediction(BaseModel): class TableStructurePrediction(BaseModel):
table_map: Dict[int, Table] = {} table_map: dict[int, Table] = {}
class TextElement(BasePageElement): class TextElement(BasePageElement):
@@ -216,7 +216,7 @@ class TextElement(BasePageElement):
class FigureElement(BasePageElement): class FigureElement(BasePageElement):
annotations: List[PictureDataType] = [] annotations: list[PictureDataType] = []
provenance: Optional[str] = None provenance: Optional[str] = None
predicted_class: Optional[str] = None predicted_class: Optional[str] = None
confidence: Optional[float] = None confidence: Optional[float] = None
@@ -234,12 +234,12 @@ class FigureElement(BasePageElement):
class FigureClassificationPrediction(BaseModel): class FigureClassificationPrediction(BaseModel):
figure_count: int = 0 figure_count: int = 0
figure_map: Dict[int, FigureElement] = {} figure_map: dict[int, FigureElement] = {}
class EquationPrediction(BaseModel): class EquationPrediction(BaseModel):
equation_count: int = 0 equation_count: int = 0
equation_map: Dict[int, TextElement] = {} equation_map: dict[int, TextElement] = {}
class PagePredictions(BaseModel): class PagePredictions(BaseModel):
@@ -254,9 +254,9 @@ PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
class AssembledUnit(BaseModel): class AssembledUnit(BaseModel):
elements: List[PageElement] = [] elements: list[PageElement] = []
body: List[PageElement] = [] body: list[PageElement] = []
headers: List[PageElement] = [] headers: list[PageElement] = []
class ItemAndImageEnrichmentElement(BaseModel): class ItemAndImageEnrichmentElement(BaseModel):
@@ -280,12 +280,12 @@ class Page(BaseModel):
None # Internal PDF backend. By default it is cleared during assembling. None # Internal PDF backend. By default it is cleared during assembling.
) )
_default_image_scale: float = 1.0 # Default image scale for external usage. _default_image_scale: float = 1.0 # Default image scale for external usage.
_image_cache: Dict[ _image_cache: dict[
float, Image float, Image
] = {} # Cache of images in different scales. By default it is cleared during assembling. ] = {} # Cache of images in different scales. By default it is cleared during assembling.
@property @property
def cells(self) -> List[TextCell]: def cells(self) -> list[TextCell]:
"""Return text cells as a read-only view of parsed_page.textline_cells.""" """Return text cells as a read-only view of parsed_page.textline_cells."""
if self.parsed_page is not None: if self.parsed_page is not None:
return self.parsed_page.textline_cells return self.parsed_page.textline_cells
@@ -354,7 +354,7 @@ class OpenAiApiResponse(BaseModel):
id: str id: str
model: Optional[str] = None # returned by openai model: Optional[str] = None # returned by openai
choices: List[OpenAiResponseChoice] choices: list[OpenAiResponseChoice]
created: int created: int
usage: OpenAiResponseUsage usage: OpenAiResponseUsage
@@ -430,7 +430,7 @@ class PageConfidenceScores(BaseModel):
class ConfidenceReport(PageConfidenceScores): class ConfidenceReport(PageConfidenceScores):
pages: Dict[int, PageConfidenceScores] = Field( pages: dict[int, PageConfidenceScores] = Field(
default_factory=lambda: defaultdict(PageConfidenceScores) default_factory=lambda: defaultdict(PageConfidenceScores)
) )

View File

@@ -394,6 +394,8 @@ class _DocumentConversionInput(BaseModel):
mime = FormatToMimeType[InputFormat.PPTX][0] mime = FormatToMimeType[InputFormat.PPTX][0]
elif ext in FormatToExtensions[InputFormat.XLSX]: elif ext in FormatToExtensions[InputFormat.XLSX]:
mime = FormatToMimeType[InputFormat.XLSX][0] mime = FormatToMimeType[InputFormat.XLSX][0]
elif ext in FormatToExtensions[InputFormat.VTT]:
mime = FormatToMimeType[InputFormat.VTT][0]
return mime return mime

View File

@@ -25,6 +25,7 @@ from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.noop_backend import NoOpBackend from docling.backend.noop_backend import NoOpBackend
from docling.backend.webvtt_backend import WebVTTDocumentBackend
from docling.backend.xml.jats_backend import JatsDocumentBackend from docling.backend.xml.jats_backend import JatsDocumentBackend
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
from docling.datamodel.base_models import ( from docling.datamodel.base_models import (
@@ -170,6 +171,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
), ),
InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend), InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
InputFormat.VTT: FormatOption(
pipeline_cls=SimplePipeline, backend=WebVTTDocumentBackend
),
} }
if (options := format_to_default_options.get(format)) is not None: if (options := format_to_default_options.get(format)) is not None:
return options return options

View File

@@ -3,7 +3,7 @@
# #
# What this example does # What this example does
# - Runs the VLM-powered pipeline on a PDF (by URL) and prints Markdown output. # - Runs the VLM-powered pipeline on a PDF (by URL) and prints Markdown output.
# - Shows two setups: default (Transformers/SmolDocling) and macOS MPS/MLX. # - Shows two setups: default (Transformers/GraniteDocling) and macOS MPS/MLX.
# #
# Prerequisites # Prerequisites
# - Install Docling with VLM extras and the appropriate backend (Transformers or MLX). # - Install Docling with VLM extras and the appropriate backend (Transformers or MLX).
@@ -15,7 +15,7 @@
# #
# Notes # Notes
# - `source` may be a local path or a URL to a PDF. # - `source` may be a local path or a URL to a PDF.
# - The second section demonstrates macOS MPS acceleration via MLX (`vlm_model_specs.SMOLDOCLING_MLX`). # - The second section demonstrates macOS MPS acceleration via MLX (`vlm_model_specs.GRANITEDOCLING_MLX`).
# - For more configurations and model comparisons, see `docs/examples/compare_vlm_models.py`. # - For more configurations and model comparisons, see `docs/examples/compare_vlm_models.py`.
# %% # %%

4
docs/index.md vendored
View File

@@ -21,7 +21,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
## Features ## Features
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
@@ -37,13 +37,13 @@ Docling simplifies document processing, parsing diverse formats — including ad
* 📤 Structured [information extraction][extraction] \[🧪 beta\] * 📤 Structured [information extraction][extraction] \[🧪 beta\]
* 📑 New layout model (**Heron**) by default, for faster PDF parsing * 📑 New layout model (**Heron**) by default, for faster PDF parsing
* 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications * 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
* 💬 Parsing of Web Video Text Tracks (WebVTT) files
### Coming soon ### Coming soon
* 📝 Metadata extraction, including title, authors, references & language * 📝 Metadata extraction, including title, authors, references & language
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc) * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
* 📝 Complex chemistry understanding (Molecular structures) * 📝 Complex chemistry understanding (Molecular structures)
* 📝 Parsing of Web Video Text Tracks (WebVTT) files
## Get started ## Get started

View File

@@ -11,10 +11,11 @@ Below you can find a listing of all supported input and output formats.
| PDF | | | PDF | |
| DOCX, XLSX, PPTX | Default formats in MS Office 2007+, based on Office Open XML | | DOCX, XLSX, PPTX | Default formats in MS Office 2007+, based on Office Open XML |
| Markdown | | | Markdown | |
| AsciiDoc | | | AsciiDoc | Human-readable, plain-text markup language for structured technical content |
| HTML, XHTML | | | HTML, XHTML | |
| CSV | | | CSV | |
| PNG, JPEG, TIFF, BMP, WEBP | Image formats | | PNG, JPEG, TIFF, BMP, WEBP | Image formats |
| WebVTT | Web Video Text Tracks format for displaying timed text |
Schema-specific support: Schema-specific support:
@@ -32,4 +33,4 @@ Schema-specific support:
| Markdown | | | Markdown | |
| JSON | Lossless serialization of Docling Document | | JSON | Lossless serialization of Docling Document |
| Text | Plain text, i.e. without Markdown markers | | Text | Plain text, i.e. without Markdown markers |
| Doctags | | | [Doctags](https://arxiv.org/pdf/2503.11576) | Markup format for efficiently representing the full content and layout characteristics of a document |

View File

@@ -1,6 +1,6 @@
[project] [project]
name = "docling" name = "docling"
version = "2.53.0" # DO NOT EDIT, updated automatically version = "2.54.0" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications." description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
license = "MIT" license = "MIT"
keywords = [ keywords = [
@@ -44,7 +44,7 @@ authors = [
requires-python = '>=3.9,<4.0' requires-python = '>=3.9,<4.0'
dependencies = [ dependencies = [
'pydantic (>=2.0.0,<3.0.0)', 'pydantic (>=2.0.0,<3.0.0)',
'docling-core[chunking] (>=2.48.0,<3.0.0)', 'docling-core[chunking] (>=2.48.2,<3.0.0)',
'docling-parse (>=4.4.0,<5.0.0)', 'docling-parse (>=4.4.0,<5.0.0)',
"docling-ibm-models>=3.9.1,<4", "docling-ibm-models>=3.9.1,<4",
'filetype (>=1.2.0,<2.0.0)', 'filetype (>=1.2.0,<2.0.0)',

View File

@@ -1,40 +1,40 @@
item-0 at level 0: unspecified: group _root_ item-0 at level 0: unspecified: group _root_
item-1 at level 1: inline: group group item-1 at level 1: inline: group group
item-2 at level 2: paragraph: This is a word document and this is an inline equation: item-2 at level 2: text: This is a word document and this is an inline equation:
item-3 at level 2: formula: A= \pi r^{2} item-3 at level 2: formula: A= \pi r^{2}
item-4 at level 2: paragraph: . If instead, I want an equation by line, I can do this: item-4 at level 2: text: . If instead, I want an equation by line, I can do this:
item-5 at level 1: paragraph: item-5 at level 1: text:
item-6 at level 1: formula: a^{2}+b^{2}=c^{2} \text{ \texttimes } 23 item-6 at level 1: formula: a^{2}+b^{2}=c^{2} \text{ \texttimes } 23
item-7 at level 1: paragraph: And that is an equation by itself. Cheers! item-7 at level 1: text: And that is an equation by itself. Cheers!
item-8 at level 1: paragraph: item-8 at level 1: text:
item-9 at level 1: paragraph: This is another equation: item-9 at level 1: text: This is another equation:
item-10 at level 1: formula: f\left(x\right)=a_{0}+\sum_{n=1} ... })+b_{n}\sin(\frac{n \pi x}{L})\right) item-10 at level 1: formula: f\left(x\right)=a_{0}+\sum_{n=1} ... })+b_{n}\sin(\frac{n \pi x}{L})\right)
item-11 at level 1: paragraph: item-11 at level 1: text:
item-12 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text. item-12 at level 1: text: This is text. This is text. This ... s is text. This is text. This is text.
item-13 at level 1: paragraph: item-13 at level 1: text:
item-14 at level 1: paragraph: item-14 at level 1: text:
item-15 at level 1: inline: group group item-15 at level 1: inline: group group
item-16 at level 2: paragraph: This is a word document and this is an inline equation: item-16 at level 2: text: This is a word document and this is an inline equation:
item-17 at level 2: formula: A= \pi r^{2} item-17 at level 2: formula: A= \pi r^{2}
item-18 at level 2: paragraph: . If instead, I want an equation by line, I can do this: item-18 at level 2: text: . If instead, I want an equation by line, I can do this:
item-19 at level 1: paragraph: item-19 at level 1: text:
item-20 at level 1: formula: \left(x+a\right)^{n}=\sum_{k=0}^ ... ac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k} item-20 at level 1: formula: \left(x+a\right)^{n}=\sum_{k=0}^ ... ac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}
item-21 at level 1: paragraph: item-21 at level 1: text:
item-22 at level 1: paragraph: And that is an equation by itself. Cheers! item-22 at level 1: text: And that is an equation by itself. Cheers!
item-23 at level 1: paragraph: item-23 at level 1: text:
item-24 at level 1: paragraph: This is another equation: item-24 at level 1: text: This is another equation:
item-25 at level 1: paragraph: item-25 at level 1: text:
item-26 at level 1: formula: \left(1+x\right)^{n}=1+\frac{nx} ... ght)x^{2}}{2!}+ \text{ \textellipsis } item-26 at level 1: formula: \left(1+x\right)^{n}=1+\frac{nx} ... ght)x^{2}}{2!}+ \text{ \textellipsis }
item-27 at level 1: paragraph: item-27 at level 1: text:
item-28 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text. item-28 at level 1: text: This is text. This is text. This ... s is text. This is text. This is text.
item-29 at level 1: paragraph: item-29 at level 1: text:
item-30 at level 1: paragraph: item-30 at level 1: text:
item-31 at level 1: inline: group group item-31 at level 1: inline: group group
item-32 at level 2: paragraph: This is a word document and this is an inline equation: item-32 at level 2: text: This is a word document and this is an inline equation:
item-33 at level 2: formula: A= \pi r^{2} item-33 at level 2: formula: A= \pi r^{2}
item-34 at level 2: paragraph: . If instead, I want an equation by line, I can do this: item-34 at level 2: text: . If instead, I want an equation by line, I can do this:
item-35 at level 1: paragraph: item-35 at level 1: text:
item-36 at level 1: formula: e^{x}=1+\frac{x}{1!}+\frac{x^{2} ... xtellipsis } , - \infty < x < \infty item-36 at level 1: formula: e^{x}=1+\frac{x}{1!}+\frac{x^{2} ... xtellipsis } , - \infty < x < \infty
item-37 at level 1: paragraph: item-37 at level 1: text:
item-38 at level 1: paragraph: And that is an equation by itself. Cheers! item-38 at level 1: text: And that is an equation by itself. Cheers!
item-39 at level 1: paragraph: item-39 at level 1: text:

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.6.0", "version": "1.7.0",
"name": "equations", "name": "equations",
"origin": { "origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -182,7 +182,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "This is a word document and this is an inline equation: ", "orig": "This is a word document and this is an inline equation: ",
"text": "This is a word document and this is an inline equation: " "text": "This is a word document and this is an inline equation: "
@@ -206,7 +206,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": ". If instead, I want an equation by line, I can do this:", "orig": ". If instead, I want an equation by line, I can do this:",
"text": ". If instead, I want an equation by line, I can do this:" "text": ". If instead, I want an equation by line, I can do this:"
@@ -218,7 +218,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -242,7 +242,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "And that is an equation by itself. Cheers!", "orig": "And that is an equation by itself. Cheers!",
"text": "And that is an equation by itself. Cheers!", "text": "And that is an equation by itself. Cheers!",
@@ -261,7 +261,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -273,7 +273,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "This is another equation:", "orig": "This is another equation:",
"text": "This is another equation:", "text": "This is another equation:",
@@ -304,7 +304,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -316,7 +316,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.", "orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
"text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.", "text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
@@ -335,7 +335,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -347,7 +347,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -359,7 +359,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "This is a word document and this is an inline equation: ", "orig": "This is a word document and this is an inline equation: ",
"text": "This is a word document and this is an inline equation: " "text": "This is a word document and this is an inline equation: "
@@ -383,7 +383,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": ". If instead, I want an equation by line, I can do this:", "orig": ". If instead, I want an equation by line, I can do this:",
"text": ". If instead, I want an equation by line, I can do this:" "text": ". If instead, I want an equation by line, I can do this:"
@@ -395,7 +395,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -419,7 +419,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -431,7 +431,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "And that is an equation by itself. Cheers!", "orig": "And that is an equation by itself. Cheers!",
"text": "And that is an equation by itself. Cheers!", "text": "And that is an equation by itself. Cheers!",
@@ -450,7 +450,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -462,7 +462,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "This is another equation:", "orig": "This is another equation:",
"text": "This is another equation:", "text": "This is another equation:",
@@ -481,7 +481,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -505,7 +505,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -517,7 +517,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.", "orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
"text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.", "text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
@@ -536,7 +536,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -548,7 +548,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -560,7 +560,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "This is a word document and this is an inline equation: ", "orig": "This is a word document and this is an inline equation: ",
"text": "This is a word document and this is an inline equation: " "text": "This is a word document and this is an inline equation: "
@@ -584,7 +584,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": ". If instead, I want an equation by line, I can do this:", "orig": ". If instead, I want an equation by line, I can do this:",
"text": ". If instead, I want an equation by line, I can do this:" "text": ". If instead, I want an equation by line, I can do this:"
@@ -596,7 +596,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -620,7 +620,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -632,7 +632,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "And that is an equation by itself. Cheers!", "orig": "And that is an equation by itself. Cheers!",
"text": "And that is an equation by itself. Cheers!", "text": "And that is an equation by itself. Cheers!",
@@ -651,7 +651,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""

View File

@@ -0,0 +1,675 @@
{
"schema_name": "DoclingDocument",
"version": "1.7.0",
"name": "escaped_characters",
"origin": {
"mimetype": "text/html",
"binary_hash": 10682185258371912110,
"filename": "escaped_characters.md"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/1"
},
{
"$ref": "#/texts/4"
},
{
"$ref": "#/texts/7"
},
{
"$ref": "#/texts/9"
},
{
"$ref": "#/texts/11"
},
{
"$ref": "#/texts/12"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/texts/4"
},
"children": [
{
"$ref": "#/texts/5"
}
],
"content_layer": "body",
"name": "ordered list",
"label": "list"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/texts/4"
},
"children": [
{
"$ref": "#/texts/6"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "furniture",
"label": "title",
"prov": [],
"orig": "escaped_characters",
"text": "escaped_characters"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/2"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Headers:",
"text": "Headers:"
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/texts/3"
}
],
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "& < > \" '",
"text": "& < > \" '",
"level": 1
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/texts/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Text: 00:16.000 ----> 00:18.000 & < > \" '",
"text": "Text: 00:16.000 ----> 00:18.000 & < > \" '"
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/groups/0"
},
{
"$ref": "#/groups/1"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Lists",
"text": "Lists"
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "& < > \" '",
"text": "& < > \" '",
"enumerated": true,
"marker": ""
},
{
"self_ref": "#/texts/6",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "& < > \" '",
"text": "& < > \" '",
"enumerated": false,
"marker": ""
},
{
"self_ref": "#/texts/7",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/8"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Inline code",
"text": "Inline code"
},
{
"self_ref": "#/texts/8",
"parent": {
"$ref": "#/texts/7"
},
"children": [],
"content_layer": "body",
"label": "code",
"prov": [],
"orig": "& < > \" '",
"text": "& < > \" '",
"captions": [],
"references": [],
"footnotes": [],
"code_language": "unknown"
},
{
"self_ref": "#/texts/9",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/10"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Code block",
"text": "Code block"
},
{
"self_ref": "#/texts/10",
"parent": {
"$ref": "#/texts/9"
},
"children": [],
"content_layer": "body",
"label": "code",
"prov": [],
"orig": "& < > \" '",
"text": "& < > \" '",
"captions": [],
"references": [],
"footnotes": [],
"code_language": "unknown"
},
{
"self_ref": "#/texts/11",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/tables/0"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Table",
"text": "Table"
},
{
"self_ref": "#/texts/12",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/13"
},
{
"$ref": "#/texts/14"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Raw HTML",
"text": "Raw HTML"
},
{
"self_ref": "#/texts/13",
"parent": {
"$ref": "#/texts/12"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "& < > \" '/div>",
"text": "& < > \" '/div>"
},
{
"self_ref": "#/texts/14",
"parent": {
"$ref": "#/texts/12"
},
"children": [
{
"$ref": "#/texts/15"
}
],
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "Link",
"text": "Link",
"level": 1
},
{
"self_ref": "#/texts/15",
"parent": {
"$ref": "#/texts/14"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "& < > \" '",
"text": "& < > \" '",
"hyperlink": "https://en.wikipedia.org/wiki/Albert_Einstein"
}
],
"pictures": [],
"tables": [
{
"self_ref": "#/tables/0",
"parent": {
"$ref": "#/texts/11"
},
"children": [],
"content_layer": "body",
"label": "table",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Key",
"column_header": true,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Example",
"column_header": true,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Ampersand",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "&",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Less-than",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "<",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Greater-than",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": ">",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Quotes",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "\"",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 5,
"end_row_offset_idx": 6,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Apostrophes",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 5,
"end_row_offset_idx": 6,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "'",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
"num_rows": 6,
"num_cols": 2,
"grid": [
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Key",
"column_header": true,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Example",
"column_header": true,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Ampersand",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "&",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Less-than",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "<",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Greater-than",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": ">",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Quotes",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "\"",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 5,
"end_row_offset_idx": 6,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Apostrophes",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 5,
"end_row_offset_idx": 6,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "'",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
]
]
},
"annotations": []
}
],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@@ -0,0 +1,41 @@
# Headers:
## &amp; &lt; &gt; " '
Text: 00:16.000 ----&gt; 00:18.000 &amp; &lt; &gt; " '
# Lists
1. &amp; &lt; &gt; " '
- &amp; &lt; &gt; " '
# Inline code
```
& < > " '
```
# Code block
```
& < > " '
```
# Table
| Key | Example |
|--------------|-----------|
| Ampersand | & |
| Less-than | < |
| Greater-than | > |
| Quotes | " |
| Apostrophes | ' |
# Raw HTML
&amp; &lt; &gt; " '/div&gt;
## Link
[&amp; &lt; &gt; " '](https://en.wikipedia.org/wiki/Albert_Einstein)

View File

@@ -186,6 +186,7 @@ tables:
column_header: true column_header: true
end_col_offset_idx: 1 end_col_offset_idx: 1
end_row_offset_idx: 1 end_row_offset_idx: 1
fillable: false
row_header: false row_header: false
row_section: false row_section: false
row_span: 1 row_span: 1
@@ -196,6 +197,7 @@ tables:
column_header: true column_header: true
end_col_offset_idx: 2 end_col_offset_idx: 2
end_row_offset_idx: 1 end_row_offset_idx: 1
fillable: false
row_header: false row_header: false
row_section: false row_section: false
row_span: 1 row_span: 1
@@ -206,6 +208,7 @@ tables:
column_header: false column_header: false
end_col_offset_idx: 1 end_col_offset_idx: 1
end_row_offset_idx: 2 end_row_offset_idx: 2
fillable: false
row_header: false row_header: false
row_section: false row_section: false
row_span: 1 row_span: 1
@@ -216,6 +219,7 @@ tables:
column_header: false column_header: false
end_col_offset_idx: 2 end_col_offset_idx: 2
end_row_offset_idx: 2 end_row_offset_idx: 2
fillable: false
row_header: false row_header: false
row_section: false row_section: false
row_span: 1 row_span: 1
@@ -229,6 +233,7 @@ tables:
column_header: true column_header: true
end_col_offset_idx: 1 end_col_offset_idx: 1
end_row_offset_idx: 1 end_row_offset_idx: 1
fillable: false
row_header: false row_header: false
row_section: false row_section: false
row_span: 1 row_span: 1
@@ -239,6 +244,7 @@ tables:
column_header: true column_header: true
end_col_offset_idx: 2 end_col_offset_idx: 2
end_row_offset_idx: 1 end_row_offset_idx: 1
fillable: false
row_header: false row_header: false
row_section: false row_section: false
row_span: 1 row_span: 1
@@ -249,6 +255,7 @@ tables:
column_header: false column_header: false
end_col_offset_idx: 1 end_col_offset_idx: 1
end_row_offset_idx: 2 end_row_offset_idx: 2
fillable: false
row_header: false row_header: false
row_section: false row_section: false
row_span: 1 row_span: 1
@@ -259,6 +266,7 @@ tables:
column_header: false column_header: false
end_col_offset_idx: 2 end_col_offset_idx: 2
end_row_offset_idx: 2 end_row_offset_idx: 2
fillable: false
row_header: false row_header: false
row_section: false row_section: false
row_span: 1 row_span: 1
@@ -269,6 +277,7 @@ tables:
column_header: true column_header: true
end_col_offset_idx: 1 end_col_offset_idx: 1
end_row_offset_idx: 1 end_row_offset_idx: 1
fillable: false
row_header: false row_header: false
row_section: false row_section: false
row_span: 1 row_span: 1
@@ -279,6 +288,7 @@ tables:
column_header: true column_header: true
end_col_offset_idx: 2 end_col_offset_idx: 2
end_row_offset_idx: 1 end_row_offset_idx: 1
fillable: false
row_header: false row_header: false
row_section: false row_section: false
row_span: 1 row_span: 1
@@ -289,6 +299,7 @@ tables:
column_header: false column_header: false
end_col_offset_idx: 1 end_col_offset_idx: 1
end_row_offset_idx: 2 end_row_offset_idx: 2
fillable: false
row_header: false row_header: false
row_section: false row_section: false
row_span: 1 row_span: 1
@@ -299,6 +310,7 @@ tables:
column_header: false column_header: false
end_col_offset_idx: 2 end_col_offset_idx: 2
end_row_offset_idx: 2 end_row_offset_idx: 2
fillable: false
row_header: false row_header: false
row_section: false row_section: false
row_span: 1 row_span: 1
@@ -878,4 +890,4 @@ texts:
prov: [] prov: []
self_ref: '#/texts/48' self_ref: '#/texts/48'
text: Table Heading text: Table Heading
version: 1.6.0 version: 1.7.0

View File

@@ -1,10 +1,10 @@
item-0 at level 0: unspecified: group _root_ item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: Lorem ipsum dolor sit amet, cons ... quam non, sodales sem. Nulla facilisi. item-1 at level 1: text: Lorem ipsum dolor sit amet, cons ... quam non, sodales sem. Nulla facilisi.
item-2 at level 1: paragraph: item-2 at level 1: text:
item-3 at level 1: paragraph: Duis condimentum dui eget ullamc ... cus tempor, et tristique ante aliquet. item-3 at level 1: text: Duis condimentum dui eget ullamc ... cus tempor, et tristique ante aliquet.
item-4 at level 1: paragraph: item-4 at level 1: text:
item-5 at level 1: paragraph: Maecenas id neque pharetra, elei ... ulla faucibus eu. Donec ut nisl metus. item-5 at level 1: text: Maecenas id neque pharetra, elei ... ulla faucibus eu. Donec ut nisl metus.
item-6 at level 1: paragraph: item-6 at level 1: text:
item-7 at level 1: paragraph: Duis ac tellus sed turpis feugia ... pellentesque rhoncus, blandit eu nisl. item-7 at level 1: text: Duis ac tellus sed turpis feugia ... pellentesque rhoncus, blandit eu nisl.
item-8 at level 1: paragraph: item-8 at level 1: text:
item-9 at level 1: paragraph: Nunc vehicula mattis erat ac con ... udin, vehicula turpis eu, tempus nibh. item-9 at level 1: text: Nunc vehicula mattis erat ac con ... udin, vehicula turpis eu, tempus nibh.

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.6.0", "version": "1.7.0",
"name": "lorem_ipsum", "name": "lorem_ipsum",
"origin": { "origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -58,7 +58,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin elit mi, fermentum vitae dolor facilisis, porttitor mollis quam. Cras quam massa, venenatis faucibus libero vel, euismod sollicitudin ipsum. Aliquam semper sapien leo, ac ultrices nibh mollis congue. Cras luctus ultrices est, ut scelerisque eros euismod ut. Curabitur ac tincidunt felis, non scelerisque lectus. Praesent sollicitudin vulputate est id consequat. Vestibulum pharetra ligula sit amet varius porttitor. Sed eros diam, gravida non varius at, scelerisque in libero. Ut auctor finibus mauris sit amet ornare. Sed facilisis leo at urna rhoncus, in facilisis arcu eleifend. Sed tincidunt lacinia fermentum. Cras non purus fringilla, semper quam non, sodales sem. Nulla facilisi.", "orig": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin elit mi, fermentum vitae dolor facilisis, porttitor mollis quam. Cras quam massa, venenatis faucibus libero vel, euismod sollicitudin ipsum. Aliquam semper sapien leo, ac ultrices nibh mollis congue. Cras luctus ultrices est, ut scelerisque eros euismod ut. Curabitur ac tincidunt felis, non scelerisque lectus. Praesent sollicitudin vulputate est id consequat. Vestibulum pharetra ligula sit amet varius porttitor. Sed eros diam, gravida non varius at, scelerisque in libero. Ut auctor finibus mauris sit amet ornare. Sed facilisis leo at urna rhoncus, in facilisis arcu eleifend. Sed tincidunt lacinia fermentum. Cras non purus fringilla, semper quam non, sodales sem. Nulla facilisi.",
"text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin elit mi, fermentum vitae dolor facilisis, porttitor mollis quam. Cras quam massa, venenatis faucibus libero vel, euismod sollicitudin ipsum. Aliquam semper sapien leo, ac ultrices nibh mollis congue. Cras luctus ultrices est, ut scelerisque eros euismod ut. Curabitur ac tincidunt felis, non scelerisque lectus. Praesent sollicitudin vulputate est id consequat. Vestibulum pharetra ligula sit amet varius porttitor. Sed eros diam, gravida non varius at, scelerisque in libero. Ut auctor finibus mauris sit amet ornare. Sed facilisis leo at urna rhoncus, in facilisis arcu eleifend. Sed tincidunt lacinia fermentum. Cras non purus fringilla, semper quam non, sodales sem. Nulla facilisi.", "text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin elit mi, fermentum vitae dolor facilisis, porttitor mollis quam. Cras quam massa, venenatis faucibus libero vel, euismod sollicitudin ipsum. Aliquam semper sapien leo, ac ultrices nibh mollis congue. Cras luctus ultrices est, ut scelerisque eros euismod ut. Curabitur ac tincidunt felis, non scelerisque lectus. Praesent sollicitudin vulputate est id consequat. Vestibulum pharetra ligula sit amet varius porttitor. Sed eros diam, gravida non varius at, scelerisque in libero. Ut auctor finibus mauris sit amet ornare. Sed facilisis leo at urna rhoncus, in facilisis arcu eleifend. Sed tincidunt lacinia fermentum. Cras non purus fringilla, semper quam non, sodales sem. Nulla facilisi.",
@@ -77,7 +77,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -89,7 +89,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Duis condimentum dui eget ullamcorper maximus. Nulla tortor lectus, hendrerit at diam fermentum, euismod ornare orci. Integer ac mauris sed augue ultricies pellentesque. Etiam condimentum turpis a risus dictum, sed tempor arcu vestibulum. Quisque at venenatis tellus. Morbi id lobortis elit. In gravida metus at ornare suscipit. Donec euismod nibh sit amet commodo porttitor. Integer commodo sit amet nisi vel accumsan. Donec lacinia posuere porta. Pellentesque vulputate porta risus, vel consectetur nisl gravida sit amet. Nam scelerisque enim sodales lacus tempor, et tristique ante aliquet.", "orig": "Duis condimentum dui eget ullamcorper maximus. Nulla tortor lectus, hendrerit at diam fermentum, euismod ornare orci. Integer ac mauris sed augue ultricies pellentesque. Etiam condimentum turpis a risus dictum, sed tempor arcu vestibulum. Quisque at venenatis tellus. Morbi id lobortis elit. In gravida metus at ornare suscipit. Donec euismod nibh sit amet commodo porttitor. Integer commodo sit amet nisi vel accumsan. Donec lacinia posuere porta. Pellentesque vulputate porta risus, vel consectetur nisl gravida sit amet. Nam scelerisque enim sodales lacus tempor, et tristique ante aliquet.",
"text": "Duis condimentum dui eget ullamcorper maximus. Nulla tortor lectus, hendrerit at diam fermentum, euismod ornare orci. Integer ac mauris sed augue ultricies pellentesque. Etiam condimentum turpis a risus dictum, sed tempor arcu vestibulum. Quisque at venenatis tellus. Morbi id lobortis elit. In gravida metus at ornare suscipit. Donec euismod nibh sit amet commodo porttitor. Integer commodo sit amet nisi vel accumsan. Donec lacinia posuere porta. Pellentesque vulputate porta risus, vel consectetur nisl gravida sit amet. Nam scelerisque enim sodales lacus tempor, et tristique ante aliquet.", "text": "Duis condimentum dui eget ullamcorper maximus. Nulla tortor lectus, hendrerit at diam fermentum, euismod ornare orci. Integer ac mauris sed augue ultricies pellentesque. Etiam condimentum turpis a risus dictum, sed tempor arcu vestibulum. Quisque at venenatis tellus. Morbi id lobortis elit. In gravida metus at ornare suscipit. Donec euismod nibh sit amet commodo porttitor. Integer commodo sit amet nisi vel accumsan. Donec lacinia posuere porta. Pellentesque vulputate porta risus, vel consectetur nisl gravida sit amet. Nam scelerisque enim sodales lacus tempor, et tristique ante aliquet.",
@@ -108,7 +108,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -120,7 +120,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Maecenas id neque pharetra, eleifend lectus a, vehicula sapien. Aliquam erat volutpat. Ut arcu erat, blandit id elementum at, aliquet pretium mauris. Nulla at semper orci. Nunc sed maximus metus. Duis eget tristique arcu. Phasellus fringilla augue est, ut bibendum est bibendum vitae. Nam et urna interdum, egestas velit a, consectetur metus. Pellentesque facilisis vehicula orci, eu posuere justo imperdiet non. Vestibulum tincidunt orci ac lorem consequat semper. Fusce semper sollicitudin orci, id lacinia nulla faucibus eu. Donec ut nisl metus.", "orig": "Maecenas id neque pharetra, eleifend lectus a, vehicula sapien. Aliquam erat volutpat. Ut arcu erat, blandit id elementum at, aliquet pretium mauris. Nulla at semper orci. Nunc sed maximus metus. Duis eget tristique arcu. Phasellus fringilla augue est, ut bibendum est bibendum vitae. Nam et urna interdum, egestas velit a, consectetur metus. Pellentesque facilisis vehicula orci, eu posuere justo imperdiet non. Vestibulum tincidunt orci ac lorem consequat semper. Fusce semper sollicitudin orci, id lacinia nulla faucibus eu. Donec ut nisl metus.",
"text": "Maecenas id neque pharetra, eleifend lectus a, vehicula sapien. Aliquam erat volutpat. Ut arcu erat, blandit id elementum at, aliquet pretium mauris. Nulla at semper orci. Nunc sed maximus metus. Duis eget tristique arcu. Phasellus fringilla augue est, ut bibendum est bibendum vitae. Nam et urna interdum, egestas velit a, consectetur metus. Pellentesque facilisis vehicula orci, eu posuere justo imperdiet non. Vestibulum tincidunt orci ac lorem consequat semper. Fusce semper sollicitudin orci, id lacinia nulla faucibus eu. Donec ut nisl metus.", "text": "Maecenas id neque pharetra, eleifend lectus a, vehicula sapien. Aliquam erat volutpat. Ut arcu erat, blandit id elementum at, aliquet pretium mauris. Nulla at semper orci. Nunc sed maximus metus. Duis eget tristique arcu. Phasellus fringilla augue est, ut bibendum est bibendum vitae. Nam et urna interdum, egestas velit a, consectetur metus. Pellentesque facilisis vehicula orci, eu posuere justo imperdiet non. Vestibulum tincidunt orci ac lorem consequat semper. Fusce semper sollicitudin orci, id lacinia nulla faucibus eu. Donec ut nisl metus.",
@@ -139,7 +139,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -151,7 +151,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Duis ac tellus sed turpis feugiat aliquam sed vel justo. Fusce sit amet volutpat massa. Duis tristique finibus metus quis tincidunt. Etiam dapibus fringilla diam at pharetra. Vivamus dolor est, hendrerit ac ligula nec, pharetra lacinia sapien. Phasellus at malesuada orci. Maecenas est justo, mollis non ultrices ut, sagittis commodo odio. Integer viverra mauris pellentesque bibendum vestibulum. Sed eu felis mattis, efficitur justo non, finibus lorem. Phasellus viverra diam et sapien imperdiet interdum. Cras a convallis libero. Integer maximus dui vel lorem hendrerit, sit amet convallis ligula lobortis. Duis eu lacus elementum, scelerisque nunc eget, dignissim libero. Suspendisse mi quam, vehicula sit amet pellentesque rhoncus, blandit eu nisl.", "orig": "Duis ac tellus sed turpis feugiat aliquam sed vel justo. Fusce sit amet volutpat massa. Duis tristique finibus metus quis tincidunt. Etiam dapibus fringilla diam at pharetra. Vivamus dolor est, hendrerit ac ligula nec, pharetra lacinia sapien. Phasellus at malesuada orci. Maecenas est justo, mollis non ultrices ut, sagittis commodo odio. Integer viverra mauris pellentesque bibendum vestibulum. Sed eu felis mattis, efficitur justo non, finibus lorem. Phasellus viverra diam et sapien imperdiet interdum. Cras a convallis libero. Integer maximus dui vel lorem hendrerit, sit amet convallis ligula lobortis. Duis eu lacus elementum, scelerisque nunc eget, dignissim libero. Suspendisse mi quam, vehicula sit amet pellentesque rhoncus, blandit eu nisl.",
"text": "Duis ac tellus sed turpis feugiat aliquam sed vel justo. Fusce sit amet volutpat massa. Duis tristique finibus metus quis tincidunt. Etiam dapibus fringilla diam at pharetra. Vivamus dolor est, hendrerit ac ligula nec, pharetra lacinia sapien. Phasellus at malesuada orci. Maecenas est justo, mollis non ultrices ut, sagittis commodo odio. Integer viverra mauris pellentesque bibendum vestibulum. Sed eu felis mattis, efficitur justo non, finibus lorem. Phasellus viverra diam et sapien imperdiet interdum. Cras a convallis libero. Integer maximus dui vel lorem hendrerit, sit amet convallis ligula lobortis. Duis eu lacus elementum, scelerisque nunc eget, dignissim libero. Suspendisse mi quam, vehicula sit amet pellentesque rhoncus, blandit eu nisl.", "text": "Duis ac tellus sed turpis feugiat aliquam sed vel justo. Fusce sit amet volutpat massa. Duis tristique finibus metus quis tincidunt. Etiam dapibus fringilla diam at pharetra. Vivamus dolor est, hendrerit ac ligula nec, pharetra lacinia sapien. Phasellus at malesuada orci. Maecenas est justo, mollis non ultrices ut, sagittis commodo odio. Integer viverra mauris pellentesque bibendum vestibulum. Sed eu felis mattis, efficitur justo non, finibus lorem. Phasellus viverra diam et sapien imperdiet interdum. Cras a convallis libero. Integer maximus dui vel lorem hendrerit, sit amet convallis ligula lobortis. Duis eu lacus elementum, scelerisque nunc eget, dignissim libero. Suspendisse mi quam, vehicula sit amet pellentesque rhoncus, blandit eu nisl.",
@@ -170,7 +170,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -182,7 +182,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Nunc vehicula mattis erat ac consectetur. Etiam pharetra mauris ut tempor pellentesque. Sed vel libero vitae ante tempus sagittis vel sit amet dolor. Etiam faucibus viverra sodales. Pellentesque ullamcorper magna libero, non malesuada dui bibendum quis. Donec sed dolor non sem luctus volutpat. Morbi vel diam ut urna euismod gravida a id lectus. Vestibulum vel mauris eu tellus hendrerit dapibus. Etiam scelerisque lacus vel ante ultricies vulputate. In ullamcorper malesuada justo, vel scelerisque nisl lacinia at. Donec sodales interdum ipsum, ac bibendum ipsum pharetra interdum. Vivamus condimentum ac ante vel aliquam. Ut consectetur eu nibh nec gravida. Vestibulum accumsan, purus at mollis rutrum, sapien tortor accumsan purus, vitae fermentum urna mauris ut lacus. Fusce vitae leo sollicitudin, vehicula turpis eu, tempus nibh.", "orig": "Nunc vehicula mattis erat ac consectetur. Etiam pharetra mauris ut tempor pellentesque. Sed vel libero vitae ante tempus sagittis vel sit amet dolor. Etiam faucibus viverra sodales. Pellentesque ullamcorper magna libero, non malesuada dui bibendum quis. Donec sed dolor non sem luctus volutpat. Morbi vel diam ut urna euismod gravida a id lectus. Vestibulum vel mauris eu tellus hendrerit dapibus. Etiam scelerisque lacus vel ante ultricies vulputate. In ullamcorper malesuada justo, vel scelerisque nisl lacinia at. Donec sodales interdum ipsum, ac bibendum ipsum pharetra interdum. Vivamus condimentum ac ante vel aliquam. Ut consectetur eu nibh nec gravida. Vestibulum accumsan, purus at mollis rutrum, sapien tortor accumsan purus, vitae fermentum urna mauris ut lacus. Fusce vitae leo sollicitudin, vehicula turpis eu, tempus nibh.",
"text": "Nunc vehicula mattis erat ac consectetur. Etiam pharetra mauris ut tempor pellentesque. Sed vel libero vitae ante tempus sagittis vel sit amet dolor. Etiam faucibus viverra sodales. Pellentesque ullamcorper magna libero, non malesuada dui bibendum quis. Donec sed dolor non sem luctus volutpat. Morbi vel diam ut urna euismod gravida a id lectus. Vestibulum vel mauris eu tellus hendrerit dapibus. Etiam scelerisque lacus vel ante ultricies vulputate. In ullamcorper malesuada justo, vel scelerisque nisl lacinia at. Donec sodales interdum ipsum, ac bibendum ipsum pharetra interdum. Vivamus condimentum ac ante vel aliquam. Ut consectetur eu nibh nec gravida. Vestibulum accumsan, purus at mollis rutrum, sapien tortor accumsan purus, vitae fermentum urna mauris ut lacus. Fusce vitae leo sollicitudin, vehicula turpis eu, tempus nibh.", "text": "Nunc vehicula mattis erat ac consectetur. Etiam pharetra mauris ut tempor pellentesque. Sed vel libero vitae ante tempus sagittis vel sit amet dolor. Etiam faucibus viverra sodales. Pellentesque ullamcorper magna libero, non malesuada dui bibendum quis. Donec sed dolor non sem luctus volutpat. Morbi vel diam ut urna euismod gravida a id lectus. Vestibulum vel mauris eu tellus hendrerit dapibus. Etiam scelerisque lacus vel ante ultricies vulputate. In ullamcorper malesuada justo, vel scelerisque nisl lacinia at. Donec sodales interdum ipsum, ac bibendum ipsum pharetra interdum. Vivamus condimentum ac ante vel aliquam. Ut consectetur eu nibh nec gravida. Vestibulum accumsan, purus at mollis rutrum, sapien tortor accumsan purus, vitae fermentum urna mauris ut lacus. Fusce vitae leo sollicitudin, vehicula turpis eu, tempus nibh.",

View File

@@ -136,4 +136,4 @@ texts:
prov: [] prov: []
self_ref: '#/texts/7' self_ref: '#/texts/7'
text: The end! text: The end!
version: 1.6.0 version: 1.7.0

View File

@@ -1,3 +1,3 @@
item-0 at level 0: unspecified: group _root_ item-0 at level 0: unspecified: group _root_
item-1 at level 1: table with [2x2] item-1 at level 1: table with [2x2]
item-2 at level 1: paragraph: item-2 at level 1: text:

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.6.0", "version": "1.7.0",
"name": "table_with_equations", "name": "table_with_equations",
"origin": { "origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -37,7 +37,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -69,7 +69,8 @@
"text": "The next cell has an equation", "text": "The next cell has an equation",
"column_header": true, "column_header": true,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -81,7 +82,8 @@
"text": "$A= \\pi r^{2}$", "text": "$A= \\pi r^{2}$",
"column_header": true, "column_header": true,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -93,7 +95,8 @@
"text": "The next cell has another equation", "text": "The next cell has another equation",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -105,7 +108,8 @@
"text": "$x=\\frac{-b \\pm \\sqrt{b^{2}-4ac}}{2a}$", "text": "$x=\\frac{-b \\pm \\sqrt{b^{2}-4ac}}{2a}$",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
} }
], ],
"num_rows": 2, "num_rows": 2,
@@ -122,7 +126,8 @@
"text": "The next cell has an equation", "text": "The next cell has an equation",
"column_header": true, "column_header": true,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -134,7 +139,8 @@
"text": "$A= \\pi r^{2}$", "text": "$A= \\pi r^{2}$",
"column_header": true, "column_header": true,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
} }
], ],
[ [
@@ -148,7 +154,8 @@
"text": "The next cell has another equation", "text": "The next cell has another equation",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -160,7 +167,8 @@
"text": "$x=\\frac{-b \\pm \\sqrt{b^{2}-4ac}}{2a}$", "text": "$x=\\frac{-b \\pm \\sqrt{b^{2}-4ac}}{2a}$",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
} }
] ]
] ]

View File

@@ -2,9 +2,9 @@ item-0 at level 0: unspecified: group _root_
item-1 at level 1: list: group list item-1 at level 1: list: group list
item-2 at level 2: list_item: Hello world1 item-2 at level 2: list_item: Hello world1
item-3 at level 2: list_item: Hello2 item-3 at level 2: list_item: Hello2
item-4 at level 1: paragraph: item-4 at level 1: text:
item-5 at level 1: paragraph: Some text before item-5 at level 1: text: Some text before
item-6 at level 1: table with [3x3] item-6 at level 1: table with [3x3]
item-7 at level 1: paragraph: item-7 at level 1: text:
item-8 at level 1: paragraph: item-8 at level 1: text:
item-9 at level 1: paragraph: Some text after item-9 at level 1: text: Some text after

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.6.0", "version": "1.7.0",
"name": "tablecell", "name": "tablecell",
"origin": { "origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -112,7 +112,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -124,7 +124,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Some text before", "orig": "Some text before",
"text": "Some text before", "text": "Some text before",
@@ -143,7 +143,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -155,7 +155,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -167,7 +167,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Some text after", "orig": "Some text after",
"text": "Some text after", "text": "Some text after",
@@ -206,7 +206,8 @@
"text": "Tab1", "text": "Tab1",
"column_header": true, "column_header": true,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -218,7 +219,8 @@
"text": "Tab2", "text": "Tab2",
"column_header": true, "column_header": true,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -230,7 +232,8 @@
"text": "Tab3", "text": "Tab3",
"column_header": true, "column_header": true,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -242,7 +245,8 @@
"text": "A", "text": "A",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -254,7 +258,8 @@
"text": "B", "text": "B",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -266,7 +271,8 @@
"text": "C", "text": "C",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -278,7 +284,8 @@
"text": "D", "text": "D",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -290,7 +297,8 @@
"text": "E", "text": "E",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -302,7 +310,8 @@
"text": "F", "text": "F",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
} }
], ],
"num_rows": 3, "num_rows": 3,
@@ -319,7 +328,8 @@
"text": "Tab1", "text": "Tab1",
"column_header": true, "column_header": true,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -331,7 +341,8 @@
"text": "Tab2", "text": "Tab2",
"column_header": true, "column_header": true,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -343,7 +354,8 @@
"text": "Tab3", "text": "Tab3",
"column_header": true, "column_header": true,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
} }
], ],
[ [
@@ -357,7 +369,8 @@
"text": "A", "text": "A",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -369,7 +382,8 @@
"text": "B", "text": "B",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -381,7 +395,8 @@
"text": "C", "text": "C",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
} }
], ],
[ [
@@ -395,7 +410,8 @@
"text": "D", "text": "D",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -407,7 +423,8 @@
"text": "E", "text": "E",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -419,7 +436,8 @@
"text": "F", "text": "F",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
} }
] ]
] ]

View File

@@ -1,8 +1,8 @@
item-0 at level 0: unspecified: group _root_ item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: Test with three images in unusual formats item-1 at level 1: text: Test with three images in unusual formats
item-2 at level 1: paragraph: Raster in emf: item-2 at level 1: text: Raster in emf:
item-3 at level 1: picture item-3 at level 1: picture
item-4 at level 1: paragraph: Vector in emf: item-4 at level 1: text: Vector in emf:
item-5 at level 1: picture item-5 at level 1: picture
item-6 at level 1: paragraph: Raster in webp: item-6 at level 1: text: Raster in webp:
item-7 at level 1: picture item-7 at level 1: picture

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.6.0", "version": "1.7.0",
"name": "test_emf_docx", "name": "test_emf_docx",
"origin": { "origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -52,7 +52,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Test with three images in unusual formats", "orig": "Test with three images in unusual formats",
"text": "Test with three images in unusual formats", "text": "Test with three images in unusual formats",
@@ -71,7 +71,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Raster in emf:", "orig": "Raster in emf:",
"text": "Raster in emf:", "text": "Raster in emf:",
@@ -90,7 +90,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Vector in emf:", "orig": "Vector in emf:",
"text": "Vector in emf:", "text": "Vector in emf:",
@@ -109,7 +109,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Raster in webp:", "orig": "Raster in webp:",
"text": "Raster in webp:", "text": "Raster in webp:",

View File

@@ -1,90 +1,90 @@
item-0 at level 0: unspecified: group _root_ item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: Chiayi County Shuishang Township ... mentary School Affiliated Kindergarten item-1 at level 1: text: Chiayi County Shuishang Township ... mentary School Affiliated Kindergarten
item-2 at level 1: paragraph: Infectious Disease Reporting Pro ... r the 113th Academic Year Kindergarten item-2 at level 1: text: Infectious Disease Reporting Pro ... r the 113th Academic Year Kindergarten
item-3 at level 1: paragraph: item-3 at level 1: text:
item-4 at level 1: section: group textbox item-4 at level 1: section: group textbox
item-5 at level 2: paragraph: Student falls ill item-5 at level 2: text: Student falls ill
item-6 at level 2: paragraph: item-6 at level 2: text:
item-7 at level 2: list: group list item-7 at level 2: list: group list
item-8 at level 3: list_item: Suggested Reportable Symptoms: item-8 at level 3: list_item: Suggested Reportable Symptoms:
... sh ... sh
Blisters Blisters
Headache Headache
Sore throat Sore throat
item-9 at level 1: paragraph: item-9 at level 1: text:
item-10 at level 1: paragraph: item-10 at level 1: text:
item-11 at level 1: section: group textbox item-11 at level 1: section: group textbox
item-12 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms item-12 at level 2: text: If a caregiver suspects that wit ... the same suggested reportable symptoms
item-13 at level 1: paragraph: item-13 at level 1: text:
item-14 at level 1: paragraph: item-14 at level 1: text:
item-15 at level 1: paragraph: item-15 at level 1: text:
item-16 at level 1: paragraph: item-16 at level 1: text:
item-17 at level 1: section: group textbox item-17 at level 1: section: group textbox
item-18 at level 2: paragraph: Yes item-18 at level 2: text: Yes
item-19 at level 1: paragraph: item-19 at level 1: text:
item-20 at level 1: paragraph: item-20 at level 1: text:
item-21 at level 1: section: group textbox item-21 at level 1: section: group textbox
item-22 at level 2: list: group list item-22 at level 2: list: group list
item-23 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network. item-23 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network.
item-24 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System. item-24 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System.
item-25 at level 2: paragraph: item-25 at level 2: text:
item-26 at level 1: list: group list item-26 at level 1: list: group list
item-27 at level 1: paragraph: item-27 at level 1: text:
item-28 at level 1: paragraph: item-28 at level 1: text:
item-29 at level 1: paragraph: item-29 at level 1: text:
item-30 at level 1: paragraph: item-30 at level 1: text:
item-31 at level 1: paragraph: item-31 at level 1: text:
item-32 at level 1: section: group textbox item-32 at level 1: section: group textbox
item-33 at level 2: paragraph: Health Bureau: item-33 at level 2: text: Health Bureau:
item-34 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control. item-34 at level 2: text: Upon receiving a report from the ... rt to the Centers for Disease Control.
item-35 at level 2: list: group list item-35 at level 2: list: group list
item-36 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection. item-36 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
item-37 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act. item-37 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act.
item-38 at level 2: paragraph: item-38 at level 2: text:
item-39 at level 1: list: group list item-39 at level 1: list: group list
item-40 at level 1: paragraph: item-40 at level 1: text:
item-41 at level 1: section: group textbox item-41 at level 1: section: group textbox
item-42 at level 2: paragraph: Department of Education: item-42 at level 2: text: Department of Education:
Collabo ... vention measures at all school levels. Collabo ... vention measures at all school levels.
item-43 at level 1: paragraph: item-43 at level 1: text:
item-44 at level 1: paragraph: item-44 at level 1: text:
item-45 at level 1: paragraph: item-45 at level 1: text:
item-46 at level 1: paragraph: item-46 at level 1: text:
item-47 at level 1: paragraph: item-47 at level 1: text:
item-48 at level 1: paragraph: item-48 at level 1: text:
item-49 at level 1: paragraph: item-49 at level 1: text:
item-50 at level 1: section: group textbox item-50 at level 1: section: group textbox
item-51 at level 2: inline: group group item-51 at level 2: inline: group group
item-52 at level 3: paragraph: The Health Bureau will handle item-52 at level 3: text: The Health Bureau will handle
item-53 at level 3: paragraph: reporting and specimen collection item-53 at level 3: text: reporting and specimen collection
item-54 at level 3: paragraph: . item-54 at level 3: text: .
item-55 at level 2: paragraph: item-55 at level 2: text:
item-56 at level 1: paragraph: item-56 at level 1: text:
item-57 at level 1: paragraph: item-57 at level 1: text:
item-58 at level 1: paragraph: item-58 at level 1: text:
item-59 at level 1: section: group textbox item-59 at level 1: section: group textbox
item-60 at level 2: paragraph: Whether the epidemic has eased. item-60 at level 2: text: Whether the epidemic has eased.
item-61 at level 2: paragraph: item-61 at level 2: text:
item-62 at level 1: paragraph: item-62 at level 1: text:
item-63 at level 1: section: group textbox item-63 at level 1: section: group textbox
item-64 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease. item-64 at level 2: text: Whether the test results are pos ... legally designated infectious disease.
item-65 at level 2: paragraph: No item-65 at level 2: text: No
item-66 at level 1: paragraph: item-66 at level 1: text:
item-67 at level 1: paragraph: item-67 at level 1: text:
item-68 at level 1: section: group textbox item-68 at level 1: section: group textbox
item-69 at level 2: paragraph: Yes item-69 at level 2: text: Yes
item-70 at level 1: paragraph: item-70 at level 1: text:
item-71 at level 1: section: group textbox item-71 at level 1: section: group textbox
item-72 at level 2: paragraph: Yes item-72 at level 2: text: Yes
item-73 at level 1: paragraph: item-73 at level 1: text:
item-74 at level 1: paragraph: item-74 at level 1: text:
item-75 at level 1: section: group textbox item-75 at level 1: section: group textbox
item-76 at level 2: paragraph: Case closed. item-76 at level 2: text: Case closed.
item-77 at level 2: paragraph: item-77 at level 2: text:
item-78 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary. item-78 at level 2: text: The Health Bureau will carry out ... ters for Disease Control if necessary.
item-79 at level 1: paragraph: item-79 at level 1: text:
item-80 at level 1: section: group textbox item-80 at level 1: section: group textbox
item-81 at level 2: paragraph: No item-81 at level 2: text: No
item-82 at level 1: paragraph: item-82 at level 1: text:
item-83 at level 1: paragraph: item-83 at level 1: text:
item-84 at level 1: paragraph: item-84 at level 1: text:

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.6.0", "version": "1.7.0",
"name": "textbox", "name": "textbox",
"origin": { "origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -491,7 +491,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Chiayi County Shuishang Township Nanjing Elementary School Affiliated Kindergarten", "orig": "Chiayi County Shuishang Township Nanjing Elementary School Affiliated Kindergarten",
"text": "Chiayi County Shuishang Township Nanjing Elementary School Affiliated Kindergarten", "text": "Chiayi County Shuishang Township Nanjing Elementary School Affiliated Kindergarten",
@@ -510,7 +510,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Infectious Disease Reporting Procedure for the 113th Academic Year Kindergarten", "orig": "Infectious Disease Reporting Procedure for the 113th Academic Year Kindergarten",
"text": "Infectious Disease Reporting Procedure for the 113th Academic Year Kindergarten", "text": "Infectious Disease Reporting Procedure for the 113th Academic Year Kindergarten",
@@ -529,7 +529,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -541,7 +541,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Student falls ill", "orig": "Student falls ill",
"text": "Student falls ill", "text": "Student falls ill",
@@ -560,7 +560,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -593,7 +593,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -605,7 +605,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -617,7 +617,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "If a caregiver suspects that within one week, a fifth of the class (for classes with more than 15 students) or more than three students (for classes with 15 or fewer students)\nshow the same suggested reportable symptoms", "orig": "If a caregiver suspects that within one week, a fifth of the class (for classes with more than 15 students) or more than three students (for classes with 15 or fewer students)\nshow the same suggested reportable symptoms",
"text": "If a caregiver suspects that within one week, a fifth of the class (for classes with more than 15 students) or more than three students (for classes with 15 or fewer students)\nshow the same suggested reportable symptoms", "text": "If a caregiver suspects that within one week, a fifth of the class (for classes with more than 15 students) or more than three students (for classes with 15 or fewer students)\nshow the same suggested reportable symptoms",
@@ -636,7 +636,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -648,7 +648,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -660,7 +660,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -672,7 +672,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -684,7 +684,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Yes", "orig": "Yes",
"text": "Yes", "text": "Yes",
@@ -703,7 +703,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -715,7 +715,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -769,7 +769,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -781,7 +781,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -793,7 +793,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -805,7 +805,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -817,7 +817,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -829,7 +829,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -841,7 +841,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Health Bureau:", "orig": "Health Bureau:",
"text": "Health Bureau:", "text": "Health Bureau:",
@@ -860,7 +860,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Upon receiving a report from the kindergarten, conduct a preliminary assessment of the case, and depending on the situation and type of illness, carry out an epidemiological investigation and report to the Centers for Disease Control.", "orig": "Upon receiving a report from the kindergarten, conduct a preliminary assessment of the case, and depending on the situation and type of illness, carry out an epidemiological investigation and report to the Centers for Disease Control.",
"text": "Upon receiving a report from the kindergarten, conduct a preliminary assessment of the case, and depending on the situation and type of illness, carry out an epidemiological investigation and report to the Centers for Disease Control.", "text": "Upon receiving a report from the kindergarten, conduct a preliminary assessment of the case, and depending on the situation and type of illness, carry out an epidemiological investigation and report to the Centers for Disease Control.",
@@ -921,7 +921,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -933,7 +933,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -945,7 +945,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.", "orig": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.",
"text": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.", "text": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.",
@@ -964,7 +964,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -976,7 +976,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -988,7 +988,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -1000,7 +1000,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -1012,7 +1012,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -1024,7 +1024,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -1036,7 +1036,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -1048,7 +1048,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "The Health Bureau will handle", "orig": "The Health Bureau will handle",
"text": "The Health Bureau will handle", "text": "The Health Bureau will handle",
@@ -1067,7 +1067,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "reporting and specimen collection", "orig": "reporting and specimen collection",
"text": "reporting and specimen collection", "text": "reporting and specimen collection",
@@ -1086,7 +1086,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": ".", "orig": ".",
"text": ".", "text": ".",
@@ -1105,7 +1105,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -1117,7 +1117,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -1129,7 +1129,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -1141,7 +1141,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -1153,7 +1153,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Whether the epidemic has eased.", "orig": "Whether the epidemic has eased.",
"text": "Whether the epidemic has eased.", "text": "Whether the epidemic has eased.",
@@ -1172,7 +1172,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -1184,7 +1184,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -1196,7 +1196,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Whether the test results are positive for a legally designated infectious disease.", "orig": "Whether the test results are positive for a legally designated infectious disease.",
"text": "Whether the test results are positive for a legally designated infectious disease.", "text": "Whether the test results are positive for a legally designated infectious disease.",
@@ -1215,7 +1215,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "No", "orig": "No",
"text": "No", "text": "No",
@@ -1234,7 +1234,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -1246,7 +1246,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -1258,7 +1258,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Yes", "orig": "Yes",
"text": "Yes", "text": "Yes",
@@ -1277,7 +1277,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -1289,7 +1289,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Yes", "orig": "Yes",
"text": "Yes", "text": "Yes",
@@ -1308,7 +1308,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -1320,7 +1320,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -1332,7 +1332,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Case closed.", "orig": "Case closed.",
"text": "Case closed.", "text": "Case closed.",
@@ -1351,7 +1351,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -1363,7 +1363,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary.", "orig": "The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary.",
"text": "The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary.", "text": "The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary.",
@@ -1382,7 +1382,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -1394,7 +1394,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "No", "orig": "No",
"text": "No", "text": "No",
@@ -1413,7 +1413,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -1425,7 +1425,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -1437,7 +1437,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""

View File

@@ -1,18 +1,18 @@
item-0 at level 0: unspecified: group _root_ item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: italic item-1 at level 1: text: italic
item-2 at level 1: paragraph: bold item-2 at level 1: text: bold
item-3 at level 1: paragraph: underline item-3 at level 1: text: underline
item-4 at level 1: paragraph: hyperlink item-4 at level 1: text: hyperlink
item-5 at level 1: paragraph: italic and bold hyperlink item-5 at level 1: text: italic and bold hyperlink
item-6 at level 1: inline: group group item-6 at level 1: inline: group group
item-7 at level 2: paragraph: Normal item-7 at level 2: text: Normal
item-8 at level 2: paragraph: italic item-8 at level 2: text: italic
item-9 at level 2: paragraph: bold item-9 at level 2: text: bold
item-10 at level 2: paragraph: underline item-10 at level 2: text: underline
item-11 at level 2: paragraph: and item-11 at level 2: text: and
item-12 at level 2: paragraph: hyperlink item-12 at level 2: text: hyperlink
item-13 at level 2: paragraph: on the same line item-13 at level 2: text: on the same line
item-14 at level 1: paragraph: item-14 at level 1: text:
item-15 at level 1: list: group list item-15 at level 1: list: group list
item-16 at level 2: list_item: Italic bullet 1 item-16 at level 2: list_item: Italic bullet 1
item-17 at level 2: list_item: Bold bullet 2 item-17 at level 2: list_item: Bold bullet 2
@@ -29,4 +29,4 @@ item-0 at level 0: unspecified: group _root_
item-28 at level 5: text: Nested item-28 at level 5: text: Nested
item-29 at level 5: text: italic item-29 at level 5: text: italic
item-30 at level 5: text: bold item-30 at level 5: text: bold
item-31 at level 1: paragraph: item-31 at level 1: text:

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.6.0", "version": "1.7.0",
"name": "unit_test_formatting", "name": "unit_test_formatting",
"origin": { "origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -174,7 +174,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "italic", "orig": "italic",
"text": "italic", "text": "italic",
@@ -193,7 +193,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "bold", "orig": "bold",
"text": "bold", "text": "bold",
@@ -212,7 +212,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "underline", "orig": "underline",
"text": "underline", "text": "underline",
@@ -231,7 +231,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "hyperlink", "orig": "hyperlink",
"text": "hyperlink", "text": "hyperlink",
@@ -251,7 +251,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "italic and bold hyperlink", "orig": "italic and bold hyperlink",
"text": "italic and bold hyperlink", "text": "italic and bold hyperlink",
@@ -271,7 +271,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Normal", "orig": "Normal",
"text": "Normal", "text": "Normal",
@@ -290,7 +290,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "italic", "orig": "italic",
"text": "italic", "text": "italic",
@@ -309,7 +309,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "bold", "orig": "bold",
"text": "bold", "text": "bold",
@@ -328,7 +328,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "underline", "orig": "underline",
"text": "underline", "text": "underline",
@@ -347,7 +347,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "and", "orig": "and",
"text": "and", "text": "and",
@@ -366,7 +366,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "hyperlink", "orig": "hyperlink",
"text": "hyperlink", "text": "hyperlink",
@@ -386,7 +386,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "on the same line", "orig": "on the same line",
"text": "on the same line", "text": "on the same line",
@@ -405,7 +405,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -649,7 +649,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""

View File

@@ -1,48 +1,48 @@
item-0 at level 0: unspecified: group _root_ item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Test Document item-1 at level 1: title: Test Document
item-2 at level 2: paragraph: item-2 at level 2: text:
item-3 at level 2: section_header: Section 1 item-3 at level 2: section_header: Section 1
item-4 at level 3: paragraph: item-4 at level 3: text:
item-5 at level 3: paragraph: Paragraph 1.1 item-5 at level 3: text: Paragraph 1.1
item-6 at level 3: paragraph: item-6 at level 3: text:
item-7 at level 3: paragraph: Paragraph 1.2 item-7 at level 3: text: Paragraph 1.2
item-8 at level 3: paragraph: item-8 at level 3: text:
item-9 at level 3: section_header: Section 1.1 item-9 at level 3: section_header: Section 1.1
item-10 at level 4: paragraph: item-10 at level 4: text:
item-11 at level 4: paragraph: Paragraph 1.1.1 item-11 at level 4: text: Paragraph 1.1.1
item-12 at level 4: paragraph: item-12 at level 4: text:
item-13 at level 4: paragraph: Paragraph 1.1.2 item-13 at level 4: text: Paragraph 1.1.2
item-14 at level 4: paragraph: item-14 at level 4: text:
item-15 at level 3: section_header: Section 1.2 item-15 at level 3: section_header: Section 1.2
item-16 at level 4: paragraph: item-16 at level 4: text:
item-17 at level 4: paragraph: Paragraph 1.1.1 item-17 at level 4: text: Paragraph 1.1.1
item-18 at level 4: paragraph: item-18 at level 4: text:
item-19 at level 4: paragraph: Paragraph 1.1.2 item-19 at level 4: text: Paragraph 1.1.2
item-20 at level 4: paragraph: item-20 at level 4: text:
item-21 at level 4: section_header: Section 1.2.3 item-21 at level 4: section_header: Section 1.2.3
item-22 at level 5: paragraph: item-22 at level 5: text:
item-23 at level 5: paragraph: Paragraph 1.2.3.1 item-23 at level 5: text: Paragraph 1.2.3.1
item-24 at level 5: paragraph: item-24 at level 5: text:
item-25 at level 5: paragraph: Paragraph 1.2.3.1 item-25 at level 5: text: Paragraph 1.2.3.1
item-26 at level 5: paragraph: item-26 at level 5: text:
item-27 at level 5: paragraph: item-27 at level 5: text:
item-28 at level 2: section_header: Section 2 item-28 at level 2: section_header: Section 2
item-29 at level 3: paragraph: item-29 at level 3: text:
item-30 at level 3: paragraph: Paragraph 2.1 item-30 at level 3: text: Paragraph 2.1
item-31 at level 3: paragraph: item-31 at level 3: text:
item-32 at level 3: paragraph: Paragraph 2.2 item-32 at level 3: text: Paragraph 2.2
item-33 at level 3: paragraph: item-33 at level 3: text:
item-34 at level 3: section: group header-2 item-34 at level 3: section: group header-2
item-35 at level 4: section_header: Section 2.1.1 item-35 at level 4: section_header: Section 2.1.1
item-36 at level 5: paragraph: item-36 at level 5: text:
item-37 at level 5: paragraph: Paragraph 2.1.1.1 item-37 at level 5: text: Paragraph 2.1.1.1
item-38 at level 5: paragraph: item-38 at level 5: text:
item-39 at level 5: paragraph: Paragraph 2.1.1.1 item-39 at level 5: text: Paragraph 2.1.1.1
item-40 at level 5: paragraph: item-40 at level 5: text:
item-41 at level 3: section_header: Section 2.1 item-41 at level 3: section_header: Section 2.1
item-42 at level 4: paragraph: item-42 at level 4: text:
item-43 at level 4: paragraph: Paragraph 2.1.1 item-43 at level 4: text: Paragraph 2.1.1
item-44 at level 4: paragraph: item-44 at level 4: text:
item-45 at level 4: paragraph: Paragraph 2.1.2 item-45 at level 4: text: Paragraph 2.1.2
item-46 at level 4: paragraph: item-46 at level 4: text:
item-47 at level 4: paragraph: item-47 at level 4: text:

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.6.0", "version": "1.7.0",
"name": "unit_test_headers", "name": "unit_test_headers",
"origin": { "origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -71,7 +71,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -118,7 +118,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -130,7 +130,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 1.1", "orig": "Paragraph 1.1",
"text": "Paragraph 1.1", "text": "Paragraph 1.1",
@@ -149,7 +149,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -161,7 +161,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 1.2", "orig": "Paragraph 1.2",
"text": "Paragraph 1.2", "text": "Paragraph 1.2",
@@ -180,7 +180,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -221,7 +221,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -233,7 +233,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 1.1.1", "orig": "Paragraph 1.1.1",
"text": "Paragraph 1.1.1", "text": "Paragraph 1.1.1",
@@ -252,7 +252,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -264,7 +264,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 1.1.2", "orig": "Paragraph 1.1.2",
"text": "Paragraph 1.1.2", "text": "Paragraph 1.1.2",
@@ -283,7 +283,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -327,7 +327,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -339,7 +339,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 1.1.1", "orig": "Paragraph 1.1.1",
"text": "Paragraph 1.1.1", "text": "Paragraph 1.1.1",
@@ -358,7 +358,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -370,7 +370,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 1.1.2", "orig": "Paragraph 1.1.2",
"text": "Paragraph 1.1.2", "text": "Paragraph 1.1.2",
@@ -389,7 +389,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -433,7 +433,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -445,7 +445,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 1.2.3.1", "orig": "Paragraph 1.2.3.1",
"text": "Paragraph 1.2.3.1", "text": "Paragraph 1.2.3.1",
@@ -464,7 +464,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -476,7 +476,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 1.2.3.1", "orig": "Paragraph 1.2.3.1",
"text": "Paragraph 1.2.3.1", "text": "Paragraph 1.2.3.1",
@@ -495,7 +495,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -507,7 +507,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -554,7 +554,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -566,7 +566,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 2.1", "orig": "Paragraph 2.1",
"text": "Paragraph 2.1", "text": "Paragraph 2.1",
@@ -585,7 +585,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -597,7 +597,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 2.2", "orig": "Paragraph 2.2",
"text": "Paragraph 2.2", "text": "Paragraph 2.2",
@@ -616,7 +616,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -657,7 +657,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -669,7 +669,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 2.1.1.1", "orig": "Paragraph 2.1.1.1",
"text": "Paragraph 2.1.1.1", "text": "Paragraph 2.1.1.1",
@@ -688,7 +688,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -700,7 +700,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 2.1.1.1", "orig": "Paragraph 2.1.1.1",
"text": "Paragraph 2.1.1.1", "text": "Paragraph 2.1.1.1",
@@ -719,7 +719,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -763,7 +763,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -775,7 +775,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 2.1.1", "orig": "Paragraph 2.1.1",
"text": "Paragraph 2.1.1", "text": "Paragraph 2.1.1",
@@ -794,7 +794,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -806,7 +806,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 2.1.2", "orig": "Paragraph 2.1.2",
"text": "Paragraph 2.1.2", "text": "Paragraph 2.1.2",
@@ -825,7 +825,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -837,7 +837,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""

View File

@@ -1,52 +1,52 @@
item-0 at level 0: unspecified: group _root_ item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Test Document item-1 at level 1: title: Test Document
item-2 at level 2: paragraph: item-2 at level 2: text:
item-3 at level 2: section_header: 1 Section 1 item-3 at level 2: section_header: 1 Section 1
item-4 at level 1: paragraph: item-4 at level 1: text:
item-5 at level 1: paragraph: Paragraph 1.1 item-5 at level 1: text: Paragraph 1.1
item-6 at level 1: paragraph: item-6 at level 1: text:
item-7 at level 1: paragraph: Paragraph 1.2 item-7 at level 1: text: Paragraph 1.2
item-8 at level 1: paragraph: item-8 at level 1: text:
item-9 at level 1: section: group header-0 item-9 at level 1: section: group header-0
item-10 at level 2: section: group header-1 item-10 at level 2: section: group header-1
item-11 at level 3: section_header: 1.1 Section 1.1 item-11 at level 3: section_header: 1.1 Section 1.1
item-12 at level 4: paragraph: item-12 at level 4: text:
item-13 at level 4: paragraph: Paragraph 1.1.1 item-13 at level 4: text: Paragraph 1.1.1
item-14 at level 4: paragraph: item-14 at level 4: text:
item-15 at level 4: paragraph: Paragraph 1.1.2 item-15 at level 4: text: Paragraph 1.1.2
item-16 at level 4: paragraph: item-16 at level 4: text:
item-17 at level 3: section_header: 1.2 Section 1.2 item-17 at level 3: section_header: 1.2 Section 1.2
item-18 at level 4: paragraph: item-18 at level 4: text:
item-19 at level 4: paragraph: Paragraph 1.1.1 item-19 at level 4: text: Paragraph 1.1.1
item-20 at level 4: paragraph: item-20 at level 4: text:
item-21 at level 4: paragraph: Paragraph 1.1.2 item-21 at level 4: text: Paragraph 1.1.2
item-22 at level 4: paragraph: item-22 at level 4: text:
item-23 at level 4: section_header: 1.2.1 Section 1.2.3 item-23 at level 4: section_header: 1.2.1 Section 1.2.3
item-24 at level 5: paragraph: item-24 at level 5: text:
item-25 at level 5: paragraph: Paragraph 1.2.3.1 item-25 at level 5: text: Paragraph 1.2.3.1
item-26 at level 5: paragraph: item-26 at level 5: text:
item-27 at level 5: paragraph: Paragraph 1.2.3.1 item-27 at level 5: text: Paragraph 1.2.3.1
item-28 at level 5: paragraph: item-28 at level 5: text:
item-29 at level 5: paragraph: item-29 at level 5: text:
item-30 at level 2: section_header: 2 Section 2 item-30 at level 2: section_header: 2 Section 2
item-31 at level 1: paragraph: item-31 at level 1: text:
item-32 at level 1: paragraph: Paragraph 2.1 item-32 at level 1: text: Paragraph 2.1
item-33 at level 1: paragraph: item-33 at level 1: text:
item-34 at level 1: paragraph: Paragraph 2.2 item-34 at level 1: text: Paragraph 2.2
item-35 at level 1: paragraph: item-35 at level 1: text:
item-36 at level 1: section: group header-0 item-36 at level 1: section: group header-0
item-37 at level 2: section: group header-1 item-37 at level 2: section: group header-1
item-38 at level 3: section: group header-2 item-38 at level 3: section: group header-2
item-39 at level 4: section_header: 2.1.1 Section 2.1.1 item-39 at level 4: section_header: 2.1.1 Section 2.1.1
item-40 at level 5: paragraph: item-40 at level 5: text:
item-41 at level 5: paragraph: Paragraph 2.1.1.1 item-41 at level 5: text: Paragraph 2.1.1.1
item-42 at level 5: paragraph: item-42 at level 5: text:
item-43 at level 5: paragraph: Paragraph 2.1.1.1 item-43 at level 5: text: Paragraph 2.1.1.1
item-44 at level 5: paragraph: item-44 at level 5: text:
item-45 at level 3: section_header: 2.2 Section 2.1 item-45 at level 3: section_header: 2.2 Section 2.1
item-46 at level 4: paragraph: item-46 at level 4: text:
item-47 at level 4: paragraph: Paragraph 2.1.1 item-47 at level 4: text: Paragraph 2.1.1
item-48 at level 4: paragraph: item-48 at level 4: text:
item-49 at level 4: paragraph: Paragraph 2.1.2 item-49 at level 4: text: Paragraph 2.1.2
item-50 at level 4: paragraph: item-50 at level 4: text:
item-51 at level 4: paragraph: item-51 at level 4: text:

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.6.0", "version": "1.7.0",
"name": "unit_test_headers_numbered", "name": "unit_test_headers_numbered",
"origin": { "origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -169,7 +169,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -194,7 +194,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -206,7 +206,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 1.1", "orig": "Paragraph 1.1",
"text": "Paragraph 1.1", "text": "Paragraph 1.1",
@@ -225,7 +225,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -237,7 +237,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 1.2", "orig": "Paragraph 1.2",
"text": "Paragraph 1.2", "text": "Paragraph 1.2",
@@ -256,7 +256,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -297,7 +297,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -309,7 +309,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 1.1.1", "orig": "Paragraph 1.1.1",
"text": "Paragraph 1.1.1", "text": "Paragraph 1.1.1",
@@ -328,7 +328,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -340,7 +340,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 1.1.2", "orig": "Paragraph 1.1.2",
"text": "Paragraph 1.1.2", "text": "Paragraph 1.1.2",
@@ -359,7 +359,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -403,7 +403,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -415,7 +415,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 1.1.1", "orig": "Paragraph 1.1.1",
"text": "Paragraph 1.1.1", "text": "Paragraph 1.1.1",
@@ -434,7 +434,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -446,7 +446,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 1.1.2", "orig": "Paragraph 1.1.2",
"text": "Paragraph 1.1.2", "text": "Paragraph 1.1.2",
@@ -465,7 +465,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -509,7 +509,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -521,7 +521,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 1.2.3.1", "orig": "Paragraph 1.2.3.1",
"text": "Paragraph 1.2.3.1", "text": "Paragraph 1.2.3.1",
@@ -540,7 +540,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -552,7 +552,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 1.2.3.1", "orig": "Paragraph 1.2.3.1",
"text": "Paragraph 1.2.3.1", "text": "Paragraph 1.2.3.1",
@@ -571,7 +571,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -583,7 +583,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -608,7 +608,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -620,7 +620,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 2.1", "orig": "Paragraph 2.1",
"text": "Paragraph 2.1", "text": "Paragraph 2.1",
@@ -639,7 +639,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -651,7 +651,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 2.2", "orig": "Paragraph 2.2",
"text": "Paragraph 2.2", "text": "Paragraph 2.2",
@@ -670,7 +670,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -711,7 +711,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -723,7 +723,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 2.1.1.1", "orig": "Paragraph 2.1.1.1",
"text": "Paragraph 2.1.1.1", "text": "Paragraph 2.1.1.1",
@@ -742,7 +742,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -754,7 +754,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 2.1.1.1", "orig": "Paragraph 2.1.1.1",
"text": "Paragraph 2.1.1.1", "text": "Paragraph 2.1.1.1",
@@ -773,7 +773,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -817,7 +817,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -829,7 +829,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 2.1.1", "orig": "Paragraph 2.1.1",
"text": "Paragraph 2.1.1", "text": "Paragraph 2.1.1",
@@ -848,7 +848,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -860,7 +860,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 2.1.2", "orig": "Paragraph 2.1.2",
"text": "Paragraph 2.1.2", "text": "Paragraph 2.1.2",
@@ -879,7 +879,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -891,7 +891,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""

View File

@@ -1,25 +1,25 @@
item-0 at level 0: unspecified: group _root_ item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group header-0 item-1 at level 1: section: group header-0
item-2 at level 2: section_header: Test Document item-2 at level 2: section_header: Test Document
item-3 at level 3: paragraph: item-3 at level 3: text:
item-4 at level 3: paragraph: item-4 at level 3: text:
item-5 at level 3: paragraph: Paragraph 2.1.1 item-5 at level 3: text: Paragraph 2.1.1
item-6 at level 3: paragraph: item-6 at level 3: text:
item-7 at level 3: paragraph: Paragraph 2.1.2 item-7 at level 3: text: Paragraph 2.1.2
item-8 at level 3: paragraph: item-8 at level 3: text:
item-9 at level 3: section: group header-2 item-9 at level 3: section: group header-2
item-10 at level 4: section_header: Test 1: item-10 at level 4: section_header: Test 1:
item-11 at level 5: list: group list item-11 at level 5: list: group list
item-12 at level 6: list_item: List item 1 item-12 at level 6: list_item: List item 1
item-13 at level 6: list_item: List item 2 item-13 at level 6: list_item: List item 2
item-14 at level 6: list_item: List item 3 item-14 at level 6: list_item: List item 3
item-15 at level 5: paragraph: item-15 at level 5: text:
item-16 at level 4: section_header: Test 2: item-16 at level 4: section_header: Test 2:
item-17 at level 5: list: group list item-17 at level 5: list: group list
item-18 at level 6: list_item: List item a item-18 at level 6: list_item: List item a
item-19 at level 6: list_item: List item b item-19 at level 6: list_item: List item b
item-20 at level 6: list_item: List item c item-20 at level 6: list_item: List item c
item-21 at level 5: paragraph: item-21 at level 5: text:
item-22 at level 4: section_header: Test 3: item-22 at level 4: section_header: Test 3:
item-23 at level 5: list: group list item-23 at level 5: list: group list
item-24 at level 6: list_item: List item 1 item-24 at level 6: list_item: List item 1
@@ -29,14 +29,14 @@ item-0 at level 0: unspecified: group _root_
item-28 at level 7: list_item: List item 1.2 item-28 at level 7: list_item: List item 1.2
item-29 at level 7: list_item: List item 1.3 item-29 at level 7: list_item: List item 1.3
item-30 at level 6: list_item: List item 3 item-30 at level 6: list_item: List item 3
item-31 at level 5: paragraph: item-31 at level 5: text:
item-32 at level 4: section_header: Test 4: item-32 at level 4: section_header: Test 4:
item-33 at level 5: list: group list item-33 at level 5: list: group list
item-34 at level 6: list_item: List item 1 item-34 at level 6: list_item: List item 1
item-35 at level 6: list: group list item-35 at level 6: list: group list
item-36 at level 7: list_item: List item 1.1 item-36 at level 7: list_item: List item 1.1
item-37 at level 6: list_item: List item 2 item-37 at level 6: list_item: List item 2
item-38 at level 5: paragraph: item-38 at level 5: text:
item-39 at level 4: section_header: Test 5: item-39 at level 4: section_header: Test 5:
item-40 at level 5: list: group list item-40 at level 5: list: group list
item-41 at level 6: list_item: List item 1 item-41 at level 6: list_item: List item 1
@@ -45,7 +45,7 @@ item-0 at level 0: unspecified: group _root_
item-44 at level 7: list: group list item-44 at level 7: list: group list
item-45 at level 8: list_item: List item 1.1.1 item-45 at level 8: list_item: List item 1.1.1
item-46 at level 6: list_item: List item 3 item-46 at level 6: list_item: List item 3
item-47 at level 5: paragraph: item-47 at level 5: text:
item-48 at level 4: section_header: Test 6: item-48 at level 4: section_header: Test 6:
item-49 at level 5: list: group list item-49 at level 5: list: group list
item-50 at level 6: list_item: List item 1 item-50 at level 6: list_item: List item 1
@@ -56,6 +56,6 @@ item-0 at level 0: unspecified: group _root_
item-55 at level 7: list: group list item-55 at level 7: list: group list
item-56 at level 8: list_item: List item 1.2.1 item-56 at level 8: list_item: List item 1.2.1
item-57 at level 6: list_item: List item 3 item-57 at level 6: list_item: List item 3
item-58 at level 5: paragraph: item-58 at level 5: text:
item-59 at level 5: paragraph: item-59 at level 5: text:
item-60 at level 5: paragraph: item-60 at level 5: text:

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.6.0", "version": "1.7.0",
"name": "unit_test_lists", "name": "unit_test_lists",
"origin": { "origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -338,7 +338,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -350,7 +350,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -362,7 +362,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 2.1.1", "orig": "Paragraph 2.1.1",
"text": "Paragraph 2.1.1", "text": "Paragraph 2.1.1",
@@ -381,7 +381,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -393,7 +393,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Paragraph 2.1.2", "orig": "Paragraph 2.1.2",
"text": "Paragraph 2.1.2", "text": "Paragraph 2.1.2",
@@ -412,7 +412,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -507,7 +507,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -602,7 +602,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -760,7 +760,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -855,7 +855,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -971,7 +971,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -1135,7 +1135,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -1147,7 +1147,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -1159,7 +1159,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""

View File

@@ -0,0 +1,66 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group WebVTT cue block
item-2 at level 2: text: 00:11.000 --> 00:13.000
item-3 at level 2: inline: group WebVTT cue voice span
item-4 at level 3: text: Roger Bingham:
item-5 at level 3: text: We are in New York City
item-6 at level 1: section: group WebVTT cue block
item-7 at level 2: text: 00:13.000 --> 00:16.000
item-8 at level 2: inline: group WebVTT cue voice span
item-9 at level 3: text: Roger Bingham:
item-10 at level 3: text: Were actually at the Lucern Hotel, just down the street
item-11 at level 1: section: group WebVTT cue block
item-12 at level 2: text: 00:16.000 --> 00:18.000
item-13 at level 2: inline: group WebVTT cue voice span
item-14 at level 3: text: Roger Bingham:
item-15 at level 3: text: from the American Museum of Natural History
item-16 at level 1: section: group WebVTT cue block
item-17 at level 2: text: 00:18.000 --> 00:20.000
item-18 at level 2: inline: group WebVTT cue voice span
item-19 at level 3: text: Roger Bingham:
item-20 at level 3: text: And with me is Neil deGrasse Tyson
item-21 at level 1: section: group WebVTT cue block
item-22 at level 2: text: 00:20.000 --> 00:22.000
item-23 at level 2: inline: group WebVTT cue voice span
item-24 at level 3: text: Roger Bingham:
item-25 at level 3: text: Astrophysicist, Director of the Hayden Planetarium
item-26 at level 1: section: group WebVTT cue block
item-27 at level 2: text: 00:22.000 --> 00:24.000
item-28 at level 2: inline: group WebVTT cue voice span
item-29 at level 3: text: Roger Bingham:
item-30 at level 3: text: at the AMNH.
item-31 at level 1: section: group WebVTT cue block
item-32 at level 2: text: 00:24.000 --> 00:26.000
item-33 at level 2: inline: group WebVTT cue voice span
item-34 at level 3: text: Roger Bingham:
item-35 at level 3: text: Thank you for walking down here.
item-36 at level 1: section: group WebVTT cue block
item-37 at level 2: text: 00:27.000 --> 00:30.000
item-38 at level 2: inline: group WebVTT cue voice span
item-39 at level 3: text: Roger Bingham:
item-40 at level 3: text: And I want to do a follow-up on the last conversation we did.
item-41 at level 1: section: group WebVTT cue block
item-42 at level 2: text: 00:30.000 --> 00:31.500
item-43 at level 2: inline: group WebVTT cue voice span
item-44 at level 3: text: Roger Bingham:
item-45 at level 3: text: When we e-mailed—
item-46 at level 1: section: group WebVTT cue block
item-47 at level 2: text: 00:30.500 --> 00:32.500
item-48 at level 2: inline: group WebVTT cue voice span
item-49 at level 3: text: Neil deGrasse Tyson:
item-50 at level 3: text: Didnt we talk about enough in that conversation?
item-51 at level 1: section: group WebVTT cue block
item-52 at level 2: text: 00:32.000 --> 00:35.500
item-53 at level 2: inline: group WebVTT cue voice span
item-54 at level 3: text: Roger Bingham:
item-55 at level 3: text: No! No no no no; 'cos 'cos obviously 'cos
item-56 at level 1: section: group WebVTT cue block
item-57 at level 2: text: 00:32.500 --> 00:33.500
item-58 at level 2: inline: group WebVTT cue voice span
item-59 at level 3: text: Neil deGrasse Tyson:
item-60 at level 3: text: Laughs
item-61 at level 1: section: group WebVTT cue block
item-62 at level 2: text: 00:35.500 --> 00:38.000
item-63 at level 2: inline: group WebVTT cue voice span
item-64 at level 3: text: Roger Bingham:
item-65 at level 3: text: You know Im so excited my glasses are falling off here.

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,51 @@
00:11.000 --> 00:13.000
Roger Bingham: We are in New York City
00:13.000 --> 00:16.000
Roger Bingham: Were actually at the Lucern Hotel, just down the street
00:16.000 --> 00:18.000
Roger Bingham: from the American Museum of Natural History
00:18.000 --> 00:20.000
Roger Bingham: And with me is Neil deGrasse Tyson
00:20.000 --> 00:22.000
Roger Bingham: Astrophysicist, Director of the Hayden Planetarium
00:22.000 --> 00:24.000
Roger Bingham: at the AMNH.
00:24.000 --> 00:26.000
Roger Bingham: Thank you for walking down here.
00:27.000 --> 00:30.000
Roger Bingham: And I want to do a follow-up on the last conversation we did.
00:30.000 --> 00:31.500
Roger Bingham: When we e-mailed—
00:30.500 --> 00:32.500
Neil deGrasse Tyson: Didnt we talk about enough in that conversation?
00:32.000 --> 00:35.500
Roger Bingham: No! No no no no; 'cos 'cos obviously 'cos
00:32.500 --> 00:33.500
Neil deGrasse Tyson: *Laughs*
00:35.500 --> 00:38.000
Roger Bingham: You know Im so excited my glasses are falling off here.

View File

@@ -0,0 +1,22 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group WebVTT cue block
item-2 at level 2: text: 00:00.000 --> 00:02.000
item-3 at level 2: inline: group WebVTT cue voice span
item-4 at level 3: text: Esme (first, loud):
item-5 at level 3: text: Its a blue apple tree!
item-6 at level 1: section: group WebVTT cue block
item-7 at level 2: text: 00:02.000 --> 00:04.000
item-8 at level 2: inline: group WebVTT cue voice span
item-9 at level 3: text: Mary:
item-10 at level 3: text: No way!
item-11 at level 1: section: group WebVTT cue block
item-12 at level 2: text: 00:04.000 --> 00:06.000
item-13 at level 2: inline: group WebVTT cue voice span
item-14 at level 3: text: Esme:
item-15 at level 3: text: Hee!
item-16 at level 2: text: laughter
item-17 at level 1: section: group WebVTT cue block
item-18 at level 2: text: 00:06.000 --> 00:08.000
item-19 at level 2: inline: group WebVTT cue voice span
item-20 at level 3: text: Mary (loud):
item-21 at level 3: text: Thats awesome!

View File

@@ -0,0 +1,376 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"name": "webvtt_example_02",
"origin": {
"mimetype": "text/vtt",
"binary_hash": 12867774546881601731,
"filename": "webvtt_example_02.vtt"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/groups/0"
},
{
"$ref": "#/groups/2"
},
{
"$ref": "#/groups/4"
},
{
"$ref": "#/groups/6"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/groups/1"
}
],
"content_layer": "body",
"name": "WebVTT cue block",
"label": "section"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/groups/0"
},
"children": [
{
"$ref": "#/texts/1"
},
{
"$ref": "#/texts/2"
}
],
"content_layer": "body",
"name": "WebVTT cue voice span",
"label": "inline"
},
{
"self_ref": "#/groups/2",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/3"
},
{
"$ref": "#/groups/3"
}
],
"content_layer": "body",
"name": "WebVTT cue block",
"label": "section"
},
{
"self_ref": "#/groups/3",
"parent": {
"$ref": "#/groups/2"
},
"children": [
{
"$ref": "#/texts/4"
},
{
"$ref": "#/texts/5"
}
],
"content_layer": "body",
"name": "WebVTT cue voice span",
"label": "inline"
},
{
"self_ref": "#/groups/4",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/6"
},
{
"$ref": "#/groups/5"
},
{
"$ref": "#/texts/9"
}
],
"content_layer": "body",
"name": "WebVTT cue block",
"label": "section"
},
{
"self_ref": "#/groups/5",
"parent": {
"$ref": "#/groups/4"
},
"children": [
{
"$ref": "#/texts/7"
},
{
"$ref": "#/texts/8"
}
],
"content_layer": "body",
"name": "WebVTT cue voice span",
"label": "inline"
},
{
"self_ref": "#/groups/6",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/10"
},
{
"$ref": "#/groups/7"
}
],
"content_layer": "body",
"name": "WebVTT cue block",
"label": "section"
},
{
"self_ref": "#/groups/7",
"parent": {
"$ref": "#/groups/6"
},
"children": [
{
"$ref": "#/texts/11"
},
{
"$ref": "#/texts/12"
}
],
"content_layer": "body",
"name": "WebVTT cue voice span",
"label": "inline"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "00:00.000 --> 00:02.000",
"text": "00:00.000 --> 00:02.000"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Esme (first, loud): ",
"text": "Esme (first, loud): "
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Its a blue apple tree!",
"text": "Its a blue apple tree!",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "00:02.000 --> 00:04.000",
"text": "00:02.000 --> 00:04.000"
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/groups/3"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Mary: ",
"text": "Mary: "
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/groups/3"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "No way!",
"text": "No way!",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/6",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "00:04.000 --> 00:06.000",
"text": "00:04.000 --> 00:06.000"
},
{
"self_ref": "#/texts/7",
"parent": {
"$ref": "#/groups/5"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Esme: ",
"text": "Esme: "
},
{
"self_ref": "#/texts/8",
"parent": {
"$ref": "#/groups/5"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Hee!",
"text": "Hee!",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/9",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "laughter",
"text": "laughter",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/10",
"parent": {
"$ref": "#/groups/6"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "00:06.000 --> 00:08.000",
"text": "00:06.000 --> 00:08.000"
},
{
"self_ref": "#/texts/11",
"parent": {
"$ref": "#/groups/7"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Mary (loud): ",
"text": "Mary (loud): "
},
{
"self_ref": "#/texts/12",
"parent": {
"$ref": "#/groups/7"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Thats awesome!",
"text": "Thats awesome!",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@@ -0,0 +1,17 @@
00:00.000 --> 00:02.000
Esme (first, loud): Its a blue apple tree!
00:02.000 --> 00:04.000
Mary: No way!
00:04.000 --> 00:06.000
Esme: Hee!
*laughter*
00:06.000 --> 00:08.000
Mary (loud): Thats awesome!

View File

@@ -0,0 +1,77 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group WebVTT cue block
item-2 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
item-3 at level 2: text: 00:00:04.963 --> 00:00:08.571
item-4 at level 2: inline: group WebVTT cue voice span
item-5 at level 3: text: Speaker A:
item-6 at level 3: text: OK, I think now we should be recording
item-7 at level 1: section: group WebVTT cue block
item-8 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
item-9 at level 2: text: 00:00:08.571 --> 00:00:09.403
item-10 at level 2: inline: group WebVTT cue voice span
item-11 at level 3: text: Speaker A:
item-12 at level 3: text: properly.
item-13 at level 1: section: group WebVTT cue block
item-14 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
item-15 at level 2: text: 00:00:10.683 --> 00:00:11.563
item-16 at level 2: text: Good.
item-17 at level 1: section: group WebVTT cue block
item-18 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
item-19 at level 2: text: 00:00:13.363 --> 00:00:13.803
item-20 at level 2: inline: group WebVTT cue voice span
item-21 at level 3: text: Speaker A:
item-22 at level 3: text: Yeah.
item-23 at level 1: section: group WebVTT cue block
item-24 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
item-25 at level 2: text: 00:00:49.603 --> 00:00:53.363
item-26 at level 2: inline: group WebVTT cue voice span
item-27 at level 3: text: Speaker B:
item-28 at level 3: text: I was also thinking.
item-29 at level 1: section: group WebVTT cue block
item-30 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
item-31 at level 2: text: 00:00:54.963 --> 00:01:02.072
item-32 at level 2: inline: group WebVTT cue voice span
item-33 at level 3: text: Speaker B:
item-34 at level 3: text: Would be maybe good to create items,
item-35 at level 1: section: group WebVTT cue block
item-36 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
item-37 at level 2: text: 00:01:02.072 --> 00:01:06.811
item-38 at level 2: inline: group WebVTT cue voice span
item-39 at level 3: text: Speaker B:
item-40 at level 3: text: some metadata, some options that can be specific.
item-41 at level 1: section: group WebVTT cue block
item-42 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
item-43 at level 2: text: 00:01:10.243 --> 00:01:13.014
item-44 at level 2: inline: group WebVTT cue voice span
item-45 at level 3: text: Speaker A:
item-46 at level 3: text: Yeah, I mean I think you went even more than
item-47 at level 1: section: group WebVTT cue block
item-48 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
item-49 at level 2: text: 00:01:10.563 --> 00:01:12.643
item-50 at level 2: inline: group WebVTT cue voice span
item-51 at level 3: text: Speaker B:
item-52 at level 3: text: But we preserved the atoms.
item-53 at level 1: section: group WebVTT cue block
item-54 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
item-55 at level 2: text: 00:01:13.014 --> 00:01:15.907
item-56 at level 2: inline: group WebVTT cue voice span
item-57 at level 3: text: Speaker A:
item-58 at level 3: text: than me. I just opened the format.
item-59 at level 1: section: group WebVTT cue block
item-60 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
item-61 at level 2: text: 00:01:50.222 --> 00:01:51.643
item-62 at level 2: inline: group WebVTT cue voice span
item-63 at level 3: text: Speaker A:
item-64 at level 3: text: give it a try, yeah.
item-65 at level 1: section: group WebVTT cue block
item-66 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
item-67 at level 2: text: 00:01:52.043 --> 00:01:55.043
item-68 at level 2: inline: group WebVTT cue voice span
item-69 at level 3: text: Speaker B:
item-70 at level 3: text: Okay, talk to you later.
item-71 at level 1: section: group WebVTT cue block
item-72 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
item-73 at level 2: text: 00:01:54.603 --> 00:01:55.283
item-74 at level 2: inline: group WebVTT cue voice span
item-75 at level 3: text: Speaker A:
item-76 at level 3: text: See you.

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,77 @@
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
00:00:04.963 --> 00:00:08.571
Speaker A: OK, I think now we should be recording
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
00:00:08.571 --> 00:00:09.403
Speaker A: properly.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
00:00:10.683 --> 00:00:11.563
Good.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
00:00:13.363 --> 00:00:13.803
Speaker A: Yeah.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
00:00:49.603 --> 00:00:53.363
Speaker B: I was also thinking.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
00:00:54.963 --> 00:01:02.072
Speaker B: Would be maybe good to create items,
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
00:01:02.072 --> 00:01:06.811
Speaker B: some metadata, some options that can be specific.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
00:01:10.243 --> 00:01:13.014
Speaker A: Yeah, I mean I think you went even more than
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
00:01:10.563 --> 00:01:12.643
Speaker B: But we preserved the atoms.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
00:01:13.014 --> 00:01:15.907
Speaker A: than me. I just opened the format.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
00:01:50.222 --> 00:01:51.643
Speaker A: give it a try, yeah.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
00:01:52.043 --> 00:01:55.043
Speaker B: Okay, talk to you later.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
00:01:54.603 --> 00:01:55.283
Speaker A: See you.

View File

@@ -1,16 +1,16 @@
item-0 at level 0: unspecified: group _root_ item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: Transcript item-1 at level 1: text: Transcript
item-2 at level 1: paragraph: February 20, 2025, 8:32PM item-2 at level 1: text: February 20, 2025, 8:32PM
item-3 at level 1: picture item-3 at level 1: picture
item-4 at level 1: inline: group group item-4 at level 1: inline: group group
item-5 at level 2: paragraph: This is test 1 item-5 at level 2: text: This is test 1
item-6 at level 2: paragraph: 0:08 item-6 at level 2: text: 0:08
Correct, he is not. Correct, he is not.
item-7 at level 1: paragraph: item-7 at level 1: text:
item-8 at level 1: picture item-8 at level 1: picture
item-9 at level 1: inline: group group item-9 at level 1: inline: group group
item-10 at level 2: paragraph: This is test 2 item-10 at level 2: text: This is test 2
item-11 at level 2: paragraph: 0:16 item-11 at level 2: text: 0:16
Yeah, exactly. Yeah, exactly.
item-12 at level 1: paragraph: item-12 at level 1: text:
item-13 at level 1: paragraph: item-13 at level 1: text:

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.6.0", "version": "1.7.0",
"name": "word_image_anchors", "name": "word_image_anchors",
"origin": { "origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -93,7 +93,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Transcript", "orig": "Transcript",
"text": "Transcript", "text": "Transcript",
@@ -112,7 +112,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "February 20, 2025, 8:32PM", "orig": "February 20, 2025, 8:32PM",
"text": "February 20, 2025, 8:32PM", "text": "February 20, 2025, 8:32PM",
@@ -131,7 +131,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "This is test 1", "orig": "This is test 1",
"text": "This is test 1", "text": "This is test 1",
@@ -150,7 +150,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "0:08\nCorrect, he is not.", "orig": "0:08\nCorrect, he is not.",
"text": "0:08\nCorrect, he is not.", "text": "0:08\nCorrect, he is not.",
@@ -169,7 +169,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -181,7 +181,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "This is test 2", "orig": "This is test 2",
"text": "This is test 2", "text": "This is test 2",
@@ -200,7 +200,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "0:16\nYeah, exactly.", "orig": "0:16\nYeah, exactly.",
"text": "0:16\nYeah, exactly.", "text": "0:16\nYeah, exactly.",
@@ -219,7 +219,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -231,7 +231,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""

View File

@@ -1,28 +1,28 @@
item-0 at level 0: unspecified: group _root_ item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: Summer activities item-1 at level 1: text: Summer activities
item-2 at level 1: title: Swimming in the lake item-2 at level 1: title: Swimming in the lake
item-3 at level 2: paragraph: Duck item-3 at level 2: text: Duck
item-4 at level 2: picture item-4 at level 2: picture
item-5 at level 2: paragraph: Figure 1: This is a cute duckling item-5 at level 2: text: Figure 1: This is a cute duckling
item-6 at level 2: section_header: Lets swim! item-6 at level 2: section_header: Lets swim!
item-7 at level 3: paragraph: To get started with swimming, fi ... down in a water and try not to drown: item-7 at level 3: text: To get started with swimming, fi ... down in a water and try not to drown:
item-8 at level 3: list: group list item-8 at level 3: list: group list
item-9 at level 4: list_item: You can relax and look around item-9 at level 4: list_item: You can relax and look around
item-10 at level 4: list_item: Paddle about item-10 at level 4: list_item: Paddle about
item-11 at level 4: list_item: Enjoy summer warmth item-11 at level 4: list_item: Enjoy summer warmth
item-12 at level 3: paragraph: Also, dont forget: item-12 at level 3: text: Also, dont forget:
item-13 at level 3: list: group list item-13 at level 3: list: group list
item-14 at level 4: list_item: Wear sunglasses item-14 at level 4: list_item: Wear sunglasses
item-15 at level 4: list_item: Dont forget to drink water item-15 at level 4: list_item: Dont forget to drink water
item-16 at level 4: list_item: Use sun cream item-16 at level 4: list_item: Use sun cream
item-17 at level 3: paragraph: Hmm, what else… item-17 at level 3: text: Hmm, what else…
item-18 at level 3: section_header: Lets eat item-18 at level 3: section_header: Lets eat
item-19 at level 4: paragraph: After we had a good day of swimm ... , its important to eat something nice item-19 at level 4: text: After we had a good day of swimm ... , its important to eat something nice
item-20 at level 4: paragraph: I like to eat leaves item-20 at level 4: text: I like to eat leaves
item-21 at level 4: paragraph: Here are some interesting things a respectful duck could eat: item-21 at level 4: text: Here are some interesting things a respectful duck could eat:
item-22 at level 4: table with [4x3] item-22 at level 4: table with [4x3]
item-23 at level 4: paragraph: item-23 at level 4: text:
item-24 at level 4: paragraph: And lets add another list in the end: item-24 at level 4: text: And lets add another list in the end:
item-25 at level 4: list: group list item-25 at level 4: list: group list
item-26 at level 5: list_item: Leaves item-26 at level 5: list_item: Leaves
item-27 at level 5: list_item: Berries item-27 at level 5: list_item: Berries

View File

@@ -1,6 +1,6 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.6.0", "version": "1.7.0",
"name": "word_sample", "name": "word_sample",
"origin": { "origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -98,7 +98,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Summer activities", "orig": "Summer activities",
"text": "Summer activities", "text": "Summer activities",
@@ -142,7 +142,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Duck", "orig": "Duck",
"text": "Duck", "text": "Duck",
@@ -161,7 +161,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Figure 1: This is a cute duckling", "orig": "Figure 1: This is a cute duckling",
"text": "Figure 1: This is a cute duckling", "text": "Figure 1: This is a cute duckling",
@@ -212,7 +212,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "To get started with swimming, first lay down in a water and try not to drown:", "orig": "To get started with swimming, first lay down in a water and try not to drown:",
"text": "To get started with swimming, first lay down in a water and try not to drown:", "text": "To get started with swimming, first lay down in a water and try not to drown:",
@@ -294,7 +294,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Also, dont forget:", "orig": "Also, dont forget:",
"text": "Also, dont forget:", "text": "Also, dont forget:",
@@ -376,7 +376,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Hmm, what else…", "orig": "Hmm, what else…",
"text": "Hmm, what else…", "text": "Hmm, what else…",
@@ -430,7 +430,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "After we had a good day of swimming in the lake, its important to eat something nice", "orig": "After we had a good day of swimming in the lake, its important to eat something nice",
"text": "After we had a good day of swimming in the lake, its important to eat something nice", "text": "After we had a good day of swimming in the lake, its important to eat something nice",
@@ -449,7 +449,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "I like to eat leaves", "orig": "I like to eat leaves",
"text": "I like to eat leaves", "text": "I like to eat leaves",
@@ -468,7 +468,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "Here are some interesting things a respectful duck could eat:", "orig": "Here are some interesting things a respectful duck could eat:",
"text": "Here are some interesting things a respectful duck could eat:", "text": "Here are some interesting things a respectful duck could eat:",
@@ -487,7 +487,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "", "orig": "",
"text": "" "text": ""
@@ -499,7 +499,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "paragraph", "label": "text",
"prov": [], "prov": [],
"orig": "And lets add another list in the end:", "orig": "And lets add another list in the end:",
"text": "And lets add another list in the end:", "text": "And lets add another list in the end:",
@@ -625,7 +625,8 @@
"text": "", "text": "",
"column_header": true, "column_header": true,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -637,7 +638,8 @@
"text": "Food", "text": "Food",
"column_header": true, "column_header": true,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -649,7 +651,8 @@
"text": "Calories per portion", "text": "Calories per portion",
"column_header": true, "column_header": true,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -661,7 +664,8 @@
"text": "Leaves", "text": "Leaves",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -673,7 +677,8 @@
"text": "Ash, Elm, Maple", "text": "Ash, Elm, Maple",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -685,7 +690,8 @@
"text": "50", "text": "50",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -697,7 +703,8 @@
"text": "Berries", "text": "Berries",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -709,7 +716,8 @@
"text": "Blueberry, Strawberry, Cranberry", "text": "Blueberry, Strawberry, Cranberry",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -721,7 +729,8 @@
"text": "150", "text": "150",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -733,7 +742,8 @@
"text": "Grain", "text": "Grain",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -745,7 +755,8 @@
"text": "Corn, Buckwheat, Barley", "text": "Corn, Buckwheat, Barley",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -757,7 +768,8 @@
"text": "200", "text": "200",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
} }
], ],
"num_rows": 4, "num_rows": 4,
@@ -774,7 +786,8 @@
"text": "", "text": "",
"column_header": true, "column_header": true,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -786,7 +799,8 @@
"text": "Food", "text": "Food",
"column_header": true, "column_header": true,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -798,7 +812,8 @@
"text": "Calories per portion", "text": "Calories per portion",
"column_header": true, "column_header": true,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
} }
], ],
[ [
@@ -812,7 +827,8 @@
"text": "Leaves", "text": "Leaves",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -824,7 +840,8 @@
"text": "Ash, Elm, Maple", "text": "Ash, Elm, Maple",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -836,7 +853,8 @@
"text": "50", "text": "50",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
} }
], ],
[ [
@@ -850,7 +868,8 @@
"text": "Berries", "text": "Berries",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -862,7 +881,8 @@
"text": "Blueberry, Strawberry, Cranberry", "text": "Blueberry, Strawberry, Cranberry",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -874,7 +894,8 @@
"text": "150", "text": "150",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
} }
], ],
[ [
@@ -888,7 +909,8 @@
"text": "Grain", "text": "Grain",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -900,7 +922,8 @@
"text": "Corn, Buckwheat, Barley", "text": "Corn, Buckwheat, Barley",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
}, },
{ {
"row_span": 1, "row_span": 1,
@@ -912,7 +935,8 @@
"text": "200", "text": "200",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false,
"fillable": false
} }
] ]
] ]

View File

@@ -1,19 +1,19 @@
item-0 at level 0: unspecified: group _root_ item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group header-0 item-1 at level 1: section: group header-0
item-2 at level 2: section_header: Test with tables item-2 at level 2: section_header: Test with tables
item-3 at level 3: paragraph: A uniform table item-3 at level 3: text: A uniform table
item-4 at level 3: table with [3x3] item-4 at level 3: table with [3x3]
item-5 at level 3: paragraph: item-5 at level 3: text:
item-6 at level 3: paragraph: A non-uniform table with horizontal spans item-6 at level 3: text: A non-uniform table with horizontal spans
item-7 at level 3: table with [3x3] item-7 at level 3: table with [3x3]
item-8 at level 3: paragraph: item-8 at level 3: text:
item-9 at level 3: paragraph: A non-uniform table with horizontal spans in inner columns item-9 at level 3: text: A non-uniform table with horizontal spans in inner columns
item-10 at level 3: table with [3x4] item-10 at level 3: table with [3x4]
item-11 at level 3: paragraph: item-11 at level 3: text:
item-12 at level 3: paragraph: A non-uniform table with vertical spans item-12 at level 3: text: A non-uniform table with vertical spans
item-13 at level 3: table with [5x3] item-13 at level 3: table with [5x3]
item-14 at level 3: paragraph: item-14 at level 3: text:
item-15 at level 3: paragraph: A non-uniform table with all kinds of spans and empty cells item-15 at level 3: text: A non-uniform table with all kinds of spans and empty cells
item-16 at level 3: table with [9x5] item-16 at level 3: table with [9x5]
item-17 at level 3: paragraph: item-17 at level 3: text:
item-18 at level 3: paragraph: item-18 at level 3: text:

File diff suppressed because it is too large Load Diff

33
tests/data/md/escaped_characters.md vendored Normal file
View File

@@ -0,0 +1,33 @@
# Headers:
## &amp; &lt; &gt; &quot; &#39;
Text:
00:16.000 ----&gt; 00:18.000
&amp; &lt; &gt; &quot; &#39;
# Lists
1. &amp; &lt; &gt; &quot; &#39;
- &amp; &lt; &gt; &quot; &#39;
# Inline code
`&amp; &lt; &gt; &quot; &#39; `
# Code block
```
&amp; &lt; &gt; &quot; &#39;
```
# Table
| Key | Example |
| ------------------- | ----------------- |
| Ampersand | &amp; |
| Less-than | &lt; |
| Greater-than | &gt; |
| Quotes | &quot; |
| Apostrophes | &#39; |
# Raw HTML
<div title="">&amp; &lt; &gt; &quot; &#39;/div>
## Link
[&amp; &lt; &gt; &quot; &#39;](https://en.wikipedia.org/wiki/Albert_Einstein)

42
tests/data/webvtt/webvtt_example_01.vtt vendored Normal file
View File

@@ -0,0 +1,42 @@
WEBVTT
NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
00:11.000 --> 00:13.000
<v Roger Bingham>We are in New York City
00:13.000 --> 00:16.000
<v Roger Bingham>Were actually at the Lucern Hotel, just down the street
00:16.000 --> 00:18.000
<v Roger Bingham>from the American Museum of Natural History
00:18.000 --> 00:20.000
<v Roger Bingham>And with me is Neil deGrasse Tyson
00:20.000 --> 00:22.000
<v Roger Bingham>Astrophysicist, Director of the Hayden Planetarium
00:22.000 --> 00:24.000
<v Roger Bingham>at the AMNH.
00:24.000 --> 00:26.000
<v Roger Bingham>Thank you for walking down here.
00:27.000 --> 00:30.000
<v Roger Bingham>And I want to do a follow-up on the last conversation we did.
00:30.000 --> 00:31.500 align:right size:50%
<v Roger Bingham>When we e-mailed—
00:30.500 --> 00:32.500 align:left size:50%
<v Neil deGrasse Tyson>Didnt we talk about enough in that conversation?
00:32.000 --> 00:35.500 align:right size:50%
<v Roger Bingham>No! No no no no; 'cos 'cos obviously 'cos
00:32.500 --> 00:33.500 align:left size:50%
<v Neil deGrasse Tyson><i>Laughs</i>
00:35.500 --> 00:38.000
<v Roger Bingham>You know Im so excited my glasses are falling off here.

15
tests/data/webvtt/webvtt_example_02.vtt vendored Normal file
View File

@@ -0,0 +1,15 @@
WEBVTT
NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
00:00.000 --> 00:02.000
<v.first.loud Esme>Its a blue apple tree!
00:02.000 --> 00:04.000
<v Mary>No way!
00:04.000 --> 00:06.000
<v Esme>Hee!</v> <i>laughter</i>
00:06.000 --> 00:08.000
<v.loud Mary>Thats awesome!

57
tests/data/webvtt/webvtt_example_03.vtt vendored Normal file
View File

@@ -0,0 +1,57 @@
WEBVTT
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
00:00:04.963 --> 00:00:08.571
<v Speaker A>OK,
I think now we should be recording</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
00:00:08.571 --> 00:00:09.403
<v Speaker A>properly.</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
00:00:10.683 --> 00:00:11.563
Good.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
00:00:13.363 --> 00:00:13.803
<v Speaker A>Yeah.</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
00:00:49.603 --> 00:00:53.363
<v Speaker B>I was also thinking.</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
00:00:54.963 --> 00:01:02.072
<v Speaker B>Would be maybe good to create items,</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
00:01:02.072 --> 00:01:06.811
<v Speaker B>some metadata,
some options that can be specific.</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
00:01:10.243 --> 00:01:13.014
<v Speaker A>Yeah,
I mean I think you went even more than</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
00:01:10.563 --> 00:01:12.643
<v Speaker B>But we preserved the atoms.</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
00:01:13.014 --> 00:01:15.907
<v Speaker A>than me.
I just opened the format.</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
00:01:50.222 --> 00:01:51.643
<v Speaker A>give it a try, yeah.</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
00:01:52.043 --> 00:01:55.043
<v Speaker B>Okay, talk to you later.</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
00:01:54.603 --> 00:01:55.283
<v Speaker A>See you.</v>

View File

@@ -26,10 +26,12 @@ def test_convert_valid():
assert len(relevant_paths) > 0 assert len(relevant_paths) > 0
yaml_filter = ["inline_and_formatting", "mixed_without_h1"] yaml_filter = ["inline_and_formatting", "mixed_without_h1"]
json_filter = ["escaped_characters"]
for in_path in relevant_paths: for in_path in relevant_paths:
md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md" md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
yaml_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.yaml" yaml_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.yaml"
json_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.json"
in_doc = InputDocument( in_doc = InputDocument(
path_or_stream=in_path, path_or_stream=in_path,
@@ -45,6 +47,9 @@ def test_convert_valid():
act_doc = backend.convert() act_doc = backend.convert()
act_data = act_doc.export_to_markdown() act_data = act_doc.export_to_markdown()
if in_path.stem in json_filter:
assert verify_document(act_doc, json_gt_path, GENERATE), "export to json"
if GEN_TEST_DATA: if GEN_TEST_DATA:
with open(md_gt_path, mode="w", encoding="utf-8") as f: with open(md_gt_path, mode="w", encoding="utf-8") as f:
f.write(f"{act_data}\n") f.write(f"{act_data}\n")

232
tests/test_backend_vtt.py Normal file
View File

@@ -0,0 +1,232 @@
# Assisted by watsonx Code Assistant
from pathlib import Path
import pytest
from docling_core.types.doc import DoclingDocument
from pydantic import ValidationError
from docling.backend.webvtt_backend import (
_WebVTTCueItalicSpan,
_WebVTTCueTextSpan,
_WebVTTCueTimings,
_WebVTTCueVoiceSpan,
_WebVTTFile,
_WebVTTTimestamp,
)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter
from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document, verify_export
# Module-level alias for the shared GEN_TEST_DATA flag; it is forwarded to the
# verify_export / verify_document helpers in the end-to-end test below.
# NOTE(review): presumably a truthy value makes those helpers (re)write the
# ground-truth files instead of comparing against them -- confirm in verify_utils.
GENERATE = GEN_TEST_DATA
def test_vtt_cue_commponents():
    """Exercise validation and string rendering of the WebVTT cue models.

    Sections, in order: timestamps, cue timings, plain text spans and voice
    spans. Every rejected input is expected to raise ``ValidationError``.
    """
    # --- Timestamps: accepted raw strings paired with their value in seconds
    accepted_timestamps = [
        ("00:01:02.345", 1 * 60 + 2.345),
        ("12:34:56.789", 12 * 3600 + 34 * 60 + 56.789),
        ("02:34.567", 2 * 60 + 34.567),
        ("00:00:00.000", 0.0),
    ]
    for raw_ts, expected_seconds in accepted_timestamps:
        parsed = _WebVTTTimestamp(raw=raw_ts)
        assert parsed.seconds == expected_seconds

    # --- Timestamps: malformed raw strings must be rejected
    rejected_timestamps = (
        "00:60:02.345",  # minutes > 59
        "00:01:60.345",  # seconds > 59
        "00:01:02.1000",  # milliseconds > 999
        "01:02:03",  # missing milliseconds
        "01:02",  # missing milliseconds
        ":01:02.345",  # extra : for missing hours
        "abc:01:02.345",  # invalid format
    )
    for raw_ts in rejected_timestamps:
        with pytest.raises(ValidationError):
            _WebVTTTimestamp(raw=raw_ts)

    # --- Timestamps: str() gives back the raw text
    parsed = _WebVTTTimestamp(raw="00:01:02.345")
    assert str(parsed) == "00:01:02.345"

    # --- Cue timings: a valid start/end pair
    begin = _WebVTTTimestamp(raw="00:10.005")
    finish = _WebVTTTimestamp(raw="00:14.007")
    timings = _WebVTTCueTimings(start=begin, end=finish)
    assert timings.start == begin
    assert timings.end == finish
    assert str(timings) == "00:10.005 --> 00:14.007"

    # --- Cue timings: end before start is rejected with a dedicated message
    begin = _WebVTTTimestamp(raw="00:10.700")
    finish = _WebVTTTimestamp(raw="00:10.500")
    with pytest.raises(ValidationError) as caught:
        _WebVTTCueTimings(start=begin, end=finish)
    assert "End timestamp must be greater than start timestamp" in str(caught.value)

    # --- Cue timings: the end field is mandatory
    begin = _WebVTTTimestamp(raw="00:10.500")
    with pytest.raises(ValidationError) as caught:
        _WebVTTCueTimings(start=begin)
    assert "Field required" in str(caught.value)

    # --- Cue timings: the start field is mandatory
    finish = _WebVTTTimestamp(raw="00:10.500")
    with pytest.raises(ValidationError) as caught:
        _WebVTTCueTimings(end=finish)
    assert "Field required" in str(caught.value)

    # --- Text span: valid text is stored and rendered unchanged
    plain_text = "This is a valid cue text span."
    text_span = _WebVTTCueTextSpan(text=plain_text)
    assert text_span.text == plain_text
    assert str(text_span) == plain_text

    # --- Text span: newline, ampersand, less-than sign and empty text are rejected
    rejected_texts = (
        "This cue text span\ncontains a newline.",
        "This cue text span contains &.",
        "This cue text span contains <.",
        "",
    )
    for bad_text in rejected_texts:
        with pytest.raises(ValidationError):
            _WebVTTCueTextSpan(text=bad_text)

    # --- Voice span: annotation validation
    good_annotation = "valid-annotation"
    bad_annotation = "invalid\nannotation"
    with pytest.raises(ValidationError):
        _WebVTTCueVoiceSpan(annotation=bad_annotation)
    assert _WebVTTCueVoiceSpan(annotation=good_annotation)

    # --- Voice span: classes validation
    speaker = "speaker name"
    good_classes = ["class1", "class2"]
    bad_classes = ["class\nwith\nnewlines", ""]
    with pytest.raises(ValidationError):
        _WebVTTCueVoiceSpan(annotation=speaker, classes=bad_classes)
    assert _WebVTTCueVoiceSpan(annotation=speaker, classes=good_classes)

    # --- Voice span: components validation
    speaker = "speaker name"
    good_components = [_WebVTTCueTextSpan(text="random text")]
    bad_components = [123, "not a component"]
    with pytest.raises(ValidationError):
        _WebVTTCueVoiceSpan(annotation=speaker, components=bad_components)
    assert _WebVTTCueVoiceSpan(annotation=speaker, components=good_components)

    # --- Voice span: rendering with classes
    voice_span = _WebVTTCueVoiceSpan(
        annotation="speaker",
        classes=["loud", "clear"],
        components=[_WebVTTCueTextSpan(text="random text")],
    )
    assert str(voice_span) == "<v.loud.clear speaker>random text</v>"

    # --- Voice span: rendering without classes
    voice_span = _WebVTTCueVoiceSpan(
        annotation="speaker",
        components=[_WebVTTCueTextSpan(text="random text")],
    )
    assert str(voice_span) == "<v speaker>random text</v>"
def test_webvtt_file():
    """Parse the three sample WebVTT files and check the resulting cue blocks.

    The sample paths are relative (``./tests/data/webvtt/``), so they are
    resolved against the current working directory.
    """
    # Example 01: the cue at index 11 is a voice span wrapping an italic span
    # that in turn wraps a single text span.
    with open("./tests/data/webvtt/webvtt_example_01.vtt", encoding="utf-8") as f:
        content = f.read()
    vtt = _WebVTTFile.parse(content)
    assert len(vtt) == 13
    block = vtt.cue_blocks[11]
    assert str(block.timings) == "00:32.500 --> 00:33.500"
    assert len(block.payload) == 1
    cue_span = block.payload[0]
    assert isinstance(cue_span, _WebVTTCueVoiceSpan)
    assert cue_span.annotation == "Neil deGrasse Tyson"
    assert not cue_span.classes
    assert len(cue_span.components) == 1
    comp = cue_span.components[0]
    assert isinstance(comp, _WebVTTCueItalicSpan)
    assert len(comp.components) == 1
    comp2 = comp.components[0]
    assert isinstance(comp2, _WebVTTCueTextSpan)
    assert comp2.text == "Laughs"

    # Example 02: re-serialising the parsed cue blocks behind the fixed
    # header/NOTE prefix must reproduce the file content exactly.
    with open("./tests/data/webvtt/webvtt_example_02.vtt", encoding="utf-8") as f:
        content = f.read()
    vtt = _WebVTTFile.parse(content)
    assert len(vtt) == 4
    reverse = (
        "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
        "https://www.w3.org/TR/webvtt1/\n\n"
    )
    reverse += "\n\n".join([str(block) for block in vtt.cue_blocks])
    assert content == reverse

    # Example 03: every cue block carries an identifier.
    with open("./tests/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f:
        content = f.read()
    vtt = _WebVTTFile.parse(content)
    assert len(vtt) == 13
    for block in vtt:
        assert block.identifier

    # First cue: payload is a single voice span.
    block = vtt.cue_blocks[0]
    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
    assert str(block.timings) == "00:00:04.963 --> 00:00:08.571"
    assert len(block.payload) == 1
    assert isinstance(block.payload[0], _WebVTTCueVoiceSpan)

    # Third cue: payload is a single plain text span. Only properties of this
    # block are asserted here; variables bound while checking example 01 are
    # deliberately not re-checked.
    block = vtt.cue_blocks[2]
    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
    assert str(block.timings) == "00:00:10.683 --> 00:00:11.563"
    assert len(block.payload) == 1
    assert isinstance(block.payload[0], _WebVTTCueTextSpan)
    assert block.payload[0].text == "Good."
def test_e2e_vtt_conversions():
    """Convert every ``*.vtt`` sample and compare its exports to ground truth.

    For each sample the Markdown export, the indented-text export and the
    document itself are checked, in that order, against files that share one
    ground-truth path prefix and differ only in suffix.
    """
    samples = sorted(Path("./tests/data/webvtt/").rglob("*.vtt"))
    vtt_converter = DocumentConverter(allowed_formats=[InputFormat.VTT])

    for sample in samples:
        # Ground truth sits next to the sample directory, under
        # groundtruth/docling_v2/<sample file name><suffix>.
        gt_prefix = str(
            sample.parent.parent / "groundtruth" / "docling_v2" / sample.name
        )

        result: ConversionResult = vtt_converter.convert(sample)
        document: DoclingDocument = result.document

        markdown: str = document.export_to_markdown(escape_html=False)
        md_matches = verify_export(markdown, f"{gt_prefix}.md", generate=GENERATE)
        assert md_matches, "export to md"

        indented: str = document._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        itxt_matches = verify_export(
            indented, f"{gt_prefix}.itxt", generate=GENERATE
        )
        assert itxt_matches, "export to indented-text"

        assert verify_document(document, f"{gt_prefix}.json", GENERATE)

View File

@@ -206,6 +206,11 @@ def test_guess_format(tmp_path):
doc_path.write_text("xyz", encoding="utf-8") doc_path.write_text("xyz", encoding="utf-8")
assert dci._guess_format(doc_path) is None assert dci._guess_format(doc_path) is None
# Valid WebVTT
buf = BytesIO(Path("./tests/data/webvtt/webvtt_example_01.vtt").open("rb").read())
stream = DocumentStream(name="webvtt_example_01.vtt", stream=buf)
assert dci._guess_format(stream) == InputFormat.VTT
# Valid Docling JSON # Valid Docling JSON
test_str = '{"name": ""}' test_str = '{"name": ""}'
stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode())) stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode()))

10
uv.lock generated
View File

@@ -1049,7 +1049,7 @@ wheels = [
[[package]] [[package]]
name = "docling" name = "docling"
version = "2.53.0" version = "2.54.0"
source = { editable = "." } source = { editable = "." }
dependencies = [ dependencies = [
{ name = "accelerate" }, { name = "accelerate" },
@@ -1154,7 +1154,7 @@ requires-dist = [
{ name = "accelerate", marker = "extra == 'vlm'", specifier = ">=1.2.1,<2.0.0" }, { name = "accelerate", marker = "extra == 'vlm'", specifier = ">=1.2.1,<2.0.0" },
{ name = "beautifulsoup4", specifier = ">=4.12.3,<5.0.0" }, { name = "beautifulsoup4", specifier = ">=4.12.3,<5.0.0" },
{ name = "certifi", specifier = ">=2024.7.4" }, { name = "certifi", specifier = ">=2024.7.4" },
{ name = "docling-core", extras = ["chunking"], specifier = ">=2.48.0,<3.0.0" }, { name = "docling-core", extras = ["chunking"], specifier = ">=2.48.2,<3.0.0" },
{ name = "docling-ibm-models", specifier = ">=3.9.1,<4" }, { name = "docling-ibm-models", specifier = ">=3.9.1,<4" },
{ name = "docling-parse", specifier = ">=4.4.0,<5.0.0" }, { name = "docling-parse", specifier = ">=4.4.0,<5.0.0" },
{ name = "easyocr", specifier = ">=1.7,<2.0" }, { name = "easyocr", specifier = ">=1.7,<2.0" },
@@ -1233,7 +1233,7 @@ examples = [
[[package]] [[package]]
name = "docling-core" name = "docling-core"
version = "2.48.1" version = "2.48.2"
source = { registry = "https://pypi.org/simple" } source = { registry = "https://pypi.org/simple" }
dependencies = [ dependencies = [
{ name = "jsonref" }, { name = "jsonref" },
@@ -1247,9 +1247,9 @@ dependencies = [
{ name = "typer" }, { name = "typer" },
{ name = "typing-extensions" }, { name = "typing-extensions" },
] ]
sdist = { url = "https://files.pythonhosted.org/packages/f9/0c/dce7f80e99e56570d143885fc40536107e8a39ef4de2888959e055b39607/docling_core-2.48.1.tar.gz", hash = "sha256:48cb77575dfd020a51413957e96b165e45f6d1027c641710fddb389dcb9b189c", size = 161311, upload-time = "2025-09-11T12:33:22.46Z" } sdist = { url = "https://files.pythonhosted.org/packages/dd/e6/922de61f2a7b7d337ffc781f8e85f5581b12801fe193827066ccd6c5ba04/docling_core-2.48.2.tar.gz", hash = "sha256:01c12a1d3c9877c6658d0d6adf5cdcefd56cb814d8083860ba2d77ab882ac2d0", size = 161344, upload-time = "2025-09-22T08:39:41.431Z" }
wheels = [ wheels = [
{ url = "https://files.pythonhosted.org/packages/90/fe/1b96120c9d94c97016716ccf46ad2708a2e76157e52dfcca4101db70fc21/docling_core-2.48.1-py3-none-any.whl", hash = "sha256:a3985999ac2067e15e589ef0f11ccde264deacaea403c0f94049242f10a6189a", size = 164330, upload-time = "2025-09-11T12:33:20.935Z" }, { url = "https://files.pythonhosted.org/packages/97/bc/a77739cc31d7de2be9d6682f880761083a2038355e513e813a73a041c644/docling_core-2.48.2-py3-none-any.whl", hash = "sha256:d1f2fe9be9a9f7e7a2fb6ddcc9d9fcbf437bfb02e0c6005cdec1ece1cf4aed44", size = 164376, upload-time = "2025-09-22T08:39:39.704Z" },
] ]
[package.optional-dependencies] [package.optional-dependencies]