This commit is contained in:
utsavMongoDB
2025-09-29 12:56:11 +05:30
59 changed files with 5859 additions and 828 deletions

View File

@@ -1,3 +1,19 @@
## [v2.54.0](https://github.com/docling-project/docling/releases/tag/v2.54.0) - 2025-09-22
### Feature
* Rich tables for MSWord backend ([#2291](https://github.com/docling-project/docling/issues/2291)) ([`e2482a2`](https://github.com/docling-project/docling/commit/e2482a2ada52b2b8a41c4402b27e125adbe4385f))
* Add a backend parser for WebVTT files ([#2288](https://github.com/docling-project/docling/issues/2288)) ([`46efaae`](https://github.com/docling-project/docling/commit/46efaaefee17a6b83e02a050f9f3c8a51afbbd53))
### Fix
* Correct y-axis scaling in draw_table_cells ([#2287](https://github.com/docling-project/docling/issues/2287)) ([`b5628f1`](https://github.com/docling-project/docling/commit/b5628f12273297d9db1393f4b734cfa337caa8c9))
### Documentation
* Update API VLM example with granite-docling ([#2294](https://github.com/docling-project/docling/issues/2294)) ([`8b7e83a`](https://github.com/docling-project/docling/commit/8b7e83a8c7b9e333c31d5ae0b96213e3c70c6bf3))
* Fix examples rendering ([#2281](https://github.com/docling-project/docling/issues/2281)) ([`8322c2e`](https://github.com/docling-project/docling/commit/8322c2ea9b4fbb1625bcbf1ec1b3dea6c1cd3ed0))
## [v2.53.0](https://github.com/docling-project/docling/releases/tag/v2.53.0) - 2025-09-17
### Feature

View File

@@ -29,7 +29,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
## Features
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
@@ -45,13 +45,13 @@ Docling simplifies document processing, parsing diverse formats — including ad
* 📤 Structured [information extraction][extraction] \[🧪 beta\]
* 📑 New layout model (**Heron**) by default, for faster PDF parsing
* 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
* 💬 Parsing of Web Video Text Tracks (WebVTT) files
### Coming soon
* 📝 Metadata extraction, including title, authors, references & language
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
* 📝 Complex chemistry understanding (Molecular structures)
* 📝 Parsing of Web Video Text Tracks (WebVTT) files
## Installation

View File

@@ -3,6 +3,7 @@ import re
import warnings
from copy import deepcopy
from enum import Enum
from html import unescape
from io import BytesIO
from pathlib import Path
from typing import Literal, Optional, Union, cast
@@ -321,9 +322,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
fig_caption: Optional[TextItem] = None
if element.title is not None and element.title != "":
title = unescape(element.title)
fig_caption = doc.add_text(
label=DocItemLabel.CAPTION,
text=element.title,
text=title,
formatting=formatting,
hyperlink=hyperlink,
)
@@ -351,6 +353,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
snippet_text = (
element.children.strip() if isinstance(element.children, str) else ""
)
snippet_text = unescape(snippet_text)
# Detect start of the table:
if "|" in snippet_text or self.in_table:
# most likely part of the markdown table

View File

@@ -12,8 +12,11 @@ from docling_core.types.doc import (
ImageRef,
ListGroup,
NodeItem,
RefItem,
RichTableCell,
TableCell,
TableData,
TextItem,
)
from docling_core.types.doc.document import Formatting
from docx import Document
@@ -128,7 +131,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
if self.is_valid():
assert self.docx_obj is not None
doc = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
doc, _ = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
# doc, _ = doc_info
return doc
else:
raise RuntimeError(
@@ -172,7 +176,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
body: BaseOxmlElement,
docx_obj: DocxDocument,
doc: DoclingDocument,
) -> DoclingDocument:
# parent:
) -> tuple[DoclingDocument, list[RefItem]]:
added_elements = []
for element in body:
tag_name = etree.QName(element).localname
# Check for Inline Images (blip elements)
@@ -230,8 +236,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
parent=self.parents[level - 1],
name="shape-text",
)
added_elements.append(shape_group.get_ref())
doc.add_text(
label=DocItemLabel.PARAGRAPH,
label=DocItemLabel.TEXT,
parent=shape_group,
text=text_content,
)
@@ -246,23 +253,27 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
_log.debug(
f"Found textbox content with {len(textbox_elements)} elements"
)
self._handle_textbox_content(textbox_elements, docx_obj, doc)
tbc = self._handle_textbox_content(textbox_elements, docx_obj, doc)
added_elements.extend(tbc)
# Check for Tables
if element.tag.endswith("tbl"):
try:
self._handle_tables(element, docx_obj, doc)
t = self._handle_tables(element, docx_obj, doc)
added_elements.extend(t)
except Exception:
_log.debug("could not parse a table, broken docx table")
# Check for Image
elif drawing_blip:
self._handle_pictures(docx_obj, drawing_blip, doc)
pics = self._handle_pictures(docx_obj, drawing_blip, doc)
added_elements.extend(pics)
# Check for Text after the Image
if (
tag_name in ["p"]
and element.find(".//w:t", namespaces=namespaces) is not None
):
self._handle_text_elements(element, docx_obj, doc)
te1 = self._handle_text_elements(element, docx_obj, doc)
added_elements.extend(te1)
# Check for the sdt containers, like table of contents
elif tag_name in ["sdt"]:
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@@ -270,15 +281,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Iterate paragraphs, runs, or text inside <w:sdtContent>.
paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
for p in paragraphs:
self._handle_text_elements(p, docx_obj, doc)
te = self._handle_text_elements(p, docx_obj, doc)
added_elements.extend(te)
# Check for Text
elif tag_name in ["p"]:
# "tcPr", "sectPr"
self._handle_text_elements(element, docx_obj, doc)
te = self._handle_text_elements(element, docx_obj, doc)
added_elements.extend(te)
else:
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
return doc
return doc, added_elements
def _str_to_int(
self, s: Optional[str], default: Optional[int] = 0
@@ -674,14 +687,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
textbox_elements: list,
docx_obj: DocxDocument,
doc: DoclingDocument,
) -> None:
) -> List[RefItem]:
elem_ref: List[RefItem] = []
"""Process textbox content and add it to the document structure."""
level = self._get_level()
# Create a textbox group to contain all text from the textbox
textbox_group = doc.add_group(
label=GroupLabel.SECTION, parent=self.parents[level - 1], name="textbox"
)
elem_ref.append(textbox_group.get_ref())
# Set this as the current parent to ensure textbox content
# is properly nested in document structure
original_parent = self.parents[level]
@@ -729,11 +743,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Mark this paragraph as processed
processed_paragraphs.add(paragraph_id)
self._handle_text_elements(p, docx_obj, doc)
elem_ref.extend(self._handle_text_elements(p, docx_obj, doc))
# Restore original parent
self.parents[level] = original_parent
return
return elem_ref
def _handle_equations_in_text(self, element, text):
only_texts = []
@@ -803,7 +817,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
element: BaseOxmlElement,
docx_obj: DocxDocument,
doc: DoclingDocument,
) -> None:
) -> List[RefItem]:
elem_ref: List[RefItem] = []
paragraph = Paragraph(element, docx_obj)
paragraph_elements = self._get_paragraph_elements(paragraph)
text, equations = self._handle_equations_in_text(
@@ -811,7 +826,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
)
if text is None:
return
return elem_ref
text = text.strip()
# Common styles for bullet and numbered lists.
@@ -832,15 +847,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Check if this is actually a numbered list by examining the numFmt
is_numbered = self._is_numbered_list(docx_obj, numid, ilevel)
self._add_list_item(
li = self._add_list_item(
doc=doc,
numid=numid,
ilevel=ilevel,
elements=paragraph_elements,
is_numbered=is_numbered,
)
elem_ref.extend(li) # MUST BE REF!!!
self._update_history(p_style_id, p_level, numid, ilevel)
return
return elem_ref
elif (
numid is None
and self._prev_numid() is not None
@@ -860,9 +876,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if p_style_id in ["Title"]:
for key in range(len(self.parents)):
self.parents[key] = None
self.parents[0] = doc.add_text(
parent=None, label=DocItemLabel.TITLE, text=text
)
te = doc.add_text(parent=None, label=DocItemLabel.TITLE, text=text)
self.parents[0] = te
elem_ref.append(te.get_ref())
elif "Heading" in p_style_id:
style_element = getattr(paragraph.style, "element", None)
if style_element is not None:
@@ -871,7 +887,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
)
else:
is_numbered_style = False
self._add_header(doc, p_level, text, is_numbered_style)
h1 = self._add_header(doc, p_level, text, is_numbered_style)
elem_ref.extend(h1)
elif len(equations) > 0:
if (paragraph.text is None or len(paragraph.text.strip()) == 0) and len(
@@ -879,15 +896,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
) > 0:
# Standalone equation
level = self._get_level()
doc.add_text(
t1 = doc.add_text(
label=DocItemLabel.FORMULA,
parent=self.parents[level - 1],
text=text.replace("<eq>", "").replace("</eq>", ""),
)
elem_ref.append(t1.get_ref())
else:
# Inline equation
level = self._get_level()
inline_equation = doc.add_inline_group(parent=self.parents[level - 1])
elem_ref.append(inline_equation.get_ref())
text_tmp = text
for eq in equations:
if len(text_tmp) == 0:
@@ -899,23 +918,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
text_tmp = "" if len(split_text_tmp) == 1 else split_text_tmp[1]
if len(pre_eq_text) > 0:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
e1 = doc.add_text(
label=DocItemLabel.TEXT,
parent=inline_equation,
text=pre_eq_text,
)
doc.add_text(
elem_ref.append(e1.get_ref())
e2 = doc.add_text(
label=DocItemLabel.FORMULA,
parent=inline_equation,
text=eq.replace("<eq>", "").replace("</eq>", ""),
)
elem_ref.append(e2.get_ref())
if len(text_tmp) > 0:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
e3 = doc.add_text(
label=DocItemLabel.TEXT,
parent=inline_equation,
text=text_tmp.strip(),
)
elem_ref.append(e3.get_ref())
elif p_style_id in [
"Paragraph",
@@ -934,13 +956,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
paragraph_elements=paragraph_elements,
)
for text, format, hyperlink in paragraph_elements:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
t2 = doc.add_text(
label=DocItemLabel.TEXT,
parent=parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
elem_ref.append(t2.get_ref())
else:
# Text style names can, and will have, not only default values but user values too
@@ -952,16 +975,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
paragraph_elements=paragraph_elements,
)
for text, format, hyperlink in paragraph_elements:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
t3 = doc.add_text(
label=DocItemLabel.TEXT,
parent=parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
elem_ref.append(t3.get_ref())
self._update_history(p_style_id, p_level, numid, ilevel)
return
return elem_ref
def _add_header(
self,
@@ -969,17 +993,21 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
curr_level: Optional[int],
text: str,
is_numbered_style: bool = False,
) -> None:
) -> List[RefItem]:
elem_ref: List[RefItem] = []
level = self._get_level()
if isinstance(curr_level, int):
if curr_level > level:
# add invisible group
for i in range(level, curr_level):
self.parents[i] = doc.add_group(
gr1 = doc.add_group(
parent=self.parents[i - 1],
label=GroupLabel.SECTION,
name=f"header-{i}",
)
elem_ref.append(gr1.get_ref())
self.parents[i] = gr1
elif curr_level < level:
# remove the tail
for key in range(len(self.parents)):
@@ -1019,12 +1047,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
text = f"{self.numbered_headers[previous_level]}.{text}"
previous_level -= 1
self.parents[current_level] = doc.add_heading(
hd = doc.add_heading(
parent=self.parents[parent_level],
text=text,
level=add_level,
)
return
self.parents[current_level] = hd
elem_ref.append(hd.get_ref())
return elem_ref
def _add_formatted_list_item(
self,
@@ -1033,12 +1063,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
marker: str,
enumerated: bool,
level: int,
) -> None:
) -> List[RefItem]:
elem_ref: List[RefItem] = []
# This should not happen by construction
if not isinstance(self.parents[level], ListGroup):
return
return elem_ref
if not elements:
return
return elem_ref
if len(elements) == 1:
text, format, hyperlink = elements[0]
@@ -1068,6 +1099,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
formatting=format,
hyperlink=hyperlink,
)
return elem_ref
def _add_list_item(
self,
@@ -1077,10 +1109,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
ilevel: int,
elements: list,
is_numbered: bool = False,
) -> None:
# TODO: this method is always called with is_numbered. Numbered lists should be properly addressed.
) -> List[RefItem]:
elem_ref: List[RefItem] = []
# this method is always called with is_numbered. Numbered lists should be properly addressed.
if not elements:
return None
return elem_ref
enum_marker = ""
level = self._get_level()
@@ -1091,9 +1124,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Reset counters for the new numbering sequence
self._reset_list_counters_for_new_sequence(numid)
self.parents[level] = doc.add_list_group(
name="list", parent=self.parents[level - 1]
)
list_gr = doc.add_list_group(name="list", parent=self.parents[level - 1])
self.parents[level] = list_gr
elem_ref.append(list_gr.get_ref())
# Set marker and enumerated arguments if this is an enumeration element.
if is_numbered:
@@ -1114,9 +1147,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.level_at_new_list + prev_indent + 1,
self.level_at_new_list + ilevel + 1,
):
self.parents[i] = doc.add_list_group(
name="list", parent=self.parents[i - 1]
)
list_gr1 = doc.add_list_group(name="list", parent=self.parents[i - 1])
self.parents[i] = list_gr1
elem_ref.append(list_gr1.get_ref())
# TODO: Set marker and enumerated arguments if this is an enumeration element.
if is_numbered:
@@ -1156,7 +1189,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
)
elif self._prev_numid() == numid or prev_indent == ilevel:
# TODO: Set marker and enumerated arguments if this is an enumeration element.
# Set marker and enumerated arguments if this is an enumeration element.
if is_numbered:
counter = self._get_list_counter(numid, ilevel)
enum_marker = str(counter) + "."
@@ -1165,15 +1198,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self._add_formatted_list_item(
doc, elements, enum_marker, is_numbered, level - 1
)
return
return elem_ref
def _handle_tables(
self,
element: BaseOxmlElement,
docx_obj: DocxDocument,
doc: DoclingDocument,
) -> None:
) -> List[RefItem]:
elem_ref: List[RefItem] = []
table: Table = Table(element, docx_obj)
num_rows = len(table.rows)
num_cols = len(table.columns)
@@ -1184,9 +1217,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# In case we have a table of only 1 cell, we consider it furniture
# And proceed processing the content of the cell as though it's in the document body
self._walk_linear(cell_element._element, docx_obj, doc)
return
return elem_ref
data = TableData(num_rows=num_rows, num_cols=num_cols)
level = self._get_level()
docling_table = doc.add_table(data=data, parent=self.parents[level - 1])
elem_ref.append(docling_table.get_ref())
cell_set: set[CT_Tc] = set()
for row_idx, row in enumerate(table.rows):
_log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells")
@@ -1223,7 +1260,70 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
else:
text = text.replace("<eq>", "$").replace("</eq>", "$")
table_cell = TableCell(
provs_in_cell: List[RefItem] = []
_, provs_in_cell = self._walk_linear(cell._element, docx_obj, doc)
ref_for_rich_cell = provs_in_cell[0]
rich_table_cell = False
def group_cell_elements(
group_name: str, doc: DoclingDocument, provs_in_cell: List[RefItem]
) -> RefItem:
group_element = doc.add_group(
label=GroupLabel.UNSPECIFIED,
name=group_name,
parent=docling_table,
)
for prov in provs_in_cell:
group_element.children.append(prov)
pr_item = prov.resolve(doc)
item_parent = pr_item.parent.resolve(doc)
if pr_item.get_ref() in item_parent.children:
item_parent.children.remove(pr_item.get_ref())
pr_item.parent = group_element.get_ref()
ref_for_rich_cell = group_element.get_ref()
return ref_for_rich_cell
if len(provs_in_cell) > 1:
# Cell has multiple elements, we need to group them
rich_table_cell = True
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}"
ref_for_rich_cell = group_cell_elements(
group_name, doc, provs_in_cell
)
elif len(provs_in_cell) == 1:
item_ref = provs_in_cell[0]
pr_item = item_ref.resolve(doc)
if isinstance(pr_item, TextItem):
# Cell has only one element and it's just a text
rich_table_cell = False
doc.delete_items(node_items=[pr_item])
else:
rich_table_cell = True
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}"
ref_for_rich_cell = group_cell_elements(
group_name, doc, provs_in_cell
)
else:
rich_table_cell = False
if rich_table_cell:
rich_cell = RichTableCell(
text=text,
row_span=spanned_idx - row_idx,
col_span=cell.grid_span,
start_row_offset_idx=row.grid_cols_before + row_idx,
end_row_offset_idx=row.grid_cols_before + spanned_idx,
start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + cell.grid_span,
column_header=row.grid_cols_before + row_idx == 0,
row_header=False,
ref=ref_for_rich_cell, # points to an artificial group around children
)
doc.add_table_cell(table_item=docling_table, cell=rich_cell)
col_idx += cell.grid_span
else:
simple_cell = TableCell(
text=text,
row_span=spanned_idx - row_idx,
col_span=cell.grid_span,
@@ -1234,16 +1334,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
column_header=row.grid_cols_before + row_idx == 0,
row_header=False,
)
data.table_cells.append(table_cell)
doc.add_table_cell(table_item=docling_table, cell=simple_cell)
col_idx += cell.grid_span
level = self._get_level()
doc.add_table(data=data, parent=self.parents[level - 1])
return
return elem_ref
def _handle_pictures(
self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
) -> None:
) -> List[RefItem]:
def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
image_data: Optional[bytes] = None
rId = drawing_blip[0].get(
@@ -1255,28 +1352,32 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
image_data = image_part.blob # Get the binary image data
return image_data
elem_ref: List[RefItem] = []
level = self._get_level()
# Open the BytesIO object with PIL to create an Image
image_data: Optional[bytes] = get_docx_image(drawing_blip)
if image_data is None:
_log.warning("Warning: image cannot be found")
doc.add_picture(
p1 = doc.add_picture(
parent=self.parents[level - 1],
caption=None,
)
elem_ref.append(p1.get_ref())
else:
try:
image_bytes = BytesIO(image_data)
pil_image = Image.open(image_bytes)
doc.add_picture(
p2 = doc.add_picture(
parent=self.parents[level - 1],
image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None,
)
elem_ref.append(p2.get_ref())
except (UnidentifiedImageError, OSError):
_log.warning("Warning: image cannot be loaded by Pillow")
doc.add_picture(
p3 = doc.add_picture(
parent=self.parents[level - 1],
caption=None,
)
return
elem_ref.append(p3.get_ref())
return elem_ref

View File

@@ -0,0 +1,572 @@
import logging
import re
from io import BytesIO
from pathlib import Path
from typing import Annotated, ClassVar, Literal, Optional, Union, cast
from docling_core.types.doc import (
ContentLayer,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
Formatting,
GroupLabel,
NodeItem,
)
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
from pydantic.types import StringConstraints
from typing_extensions import Self, override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class _WebVTTTimestamp(BaseModel):
    """Model representing a WebVTT timestamp.

    A WebVTT timestamp is always interpreted relative to the current playback position
    of the media data that the WebVTT file is to be synchronized with.

    The accepted textual form is ``[hh:]mm:ss.ttt`` where the hours part is optional
    and has at least two digits, minutes and seconds are in the range 00-59 and the
    fraction has exactly three digits.
    """

    model_config = ConfigDict(regex_engine="python-re")

    raw: Annotated[
        str,
        Field(
            description="A representation of the WebVTT Timestamp as a single string"
        ),
    ]

    _pattern: ClassVar[re.Pattern] = re.compile(
        r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$"
    )

    # Components parsed out of ``raw`` by ``validate_raw``.
    _hours: int
    _minutes: int
    _seconds: int
    _millis: int

    @model_validator(mode="after")
    def validate_raw(self) -> Self:
        """Parse ``raw`` into its numeric components.

        Raises:
            ValueError: If ``raw`` is not a well-formed WebVTT timestamp.
        """
        # fullmatch (rather than match) is required here: with match, the "$"
        # anchor also succeeds just before a trailing newline, so a value such
        # as "00:01.000\n" would be accepted and stored verbatim in ``raw``.
        m = self._pattern.fullmatch(self.raw)
        if not m:
            raise ValueError(f"Invalid WebVTT timestamp format: {self.raw}")
        self._hours = int(m.group(1)) if m.group(1) else 0
        # The pattern only admits [0-5] as the leading digit of minutes and
        # seconds, so both values are guaranteed to be within 0-59 here.
        self._minutes = int(m.group(2))
        self._seconds = int(m.group(3))
        self._millis = int(m.group(4))

        return self

    @property
    def seconds(self) -> float:
        """A representation of the WebVTT Timestamp in seconds"""
        return (
            self._hours * 3600
            + self._minutes * 60
            + self._seconds
            + self._millis / 1000.0
        )

    @override
    def __str__(self) -> str:
        """Return the timestamp exactly as it was provided."""
        return self.raw
# Type of a WebVTT cue identifier: a non-empty, single-line string (no LF/CR)
# that must not contain the "-->" timing arrow anywhere, which is what the
# negative lookahead at the start of the pattern enforces.
_WebVTTCueIdentifier = Annotated[
    str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")
]
class _WebVTTCueTimings(BaseModel):
    """Model representing WebVTT cue timings.

    Holds the start and end offsets of a cue; the end must be strictly later
    than the start.
    """

    start: Annotated[
        _WebVTTTimestamp, Field(description="Start time offset of the cue")
    ]
    end: Annotated[_WebVTTTimestamp, Field(description="End time offset of the cue")]

    @model_validator(mode="after")
    def check_order(self) -> Self:
        """Reject timings whose end does not come after their start.

        Raises:
            ValueError: If the end offset is less than or equal to the start offset.
        """
        both_present = bool(self.start and self.end)
        if both_present and self.end.seconds <= self.start.seconds:
            raise ValueError("End timestamp must be greater than start timestamp")
        return self

    @override
    def __str__(self):
        """Render the timings as a WebVTT timing line (``start --> end``)."""
        return " --> ".join((str(self.start), str(self.end)))
class _WebVTTCueTextSpan(BaseModel):
    """Model representing a WebVTT cue text span (plain text without markup)."""

    text: str
    span_type: Literal["text"] = "text"

    @field_validator("text", mode="after")
    @classmethod
    def validate_text(cls, value: str) -> str:
        """Ensure the text is non-empty and free of line breaks, "&" and "<".

        Raises:
            ValueError: If the text is empty or contains a forbidden character.
        """
        # An empty string cannot contain a forbidden character, so testing for
        # emptiness first yields the same error for every input.
        if not value:
            raise ValueError("Cue text span cannot be empty")
        forbidden = {"\n", "\r", "&", "<"}
        if not forbidden.isdisjoint(value):
            raise ValueError("Cue text span contains invalid characters")
        return value

    @override
    def __str__(self):
        """Return the span text unchanged."""
        return self.text
class _WebVTTCueVoiceSpan(BaseModel):
    """Model representing a WebVTT cue voice span.

    A voice span is serialized as ``<v[.class...] annotation>components</v>``.
    """

    annotation: Annotated[
        str,
        Field(
            description=(
                "Cue span start tag annotation text representing the name of the voice"
            )
        ),
    ]
    classes: Annotated[
        list[str],
        Field(description="List of classes representing the cue span's significance"),
    ] = []
    components: Annotated[
        list["_WebVTTCueComponent"],
        Field(description="The components representing the cue internal text"),
    ] = []
    span_type: Literal["v"] = "v"

    @field_validator("annotation", mode="after")
    @classmethod
    def validate_annotation(cls, value: str) -> str:
        """Ensure the annotation is non-empty and has no line breaks, "&" or ">".

        Raises:
            ValueError: If the annotation is empty or contains a forbidden character.
        """
        if any(ch in value for ch in {"\n", "\r", "&", ">"}):
            raise ValueError(
                "Cue span start tag annotation contains invalid characters"
            )
        if not value:
            # This message previously talked about a "cue text span", which
            # pointed at the wrong field when reported in a validation error.
            raise ValueError("Cue span start tag annotation cannot be empty")
        return value

    @field_validator("classes", mode="after")
    @classmethod
    def validate_classes(cls, value: list[str]) -> list[str]:
        """Ensure every class name is non-empty and free of separators and markup.

        Raises:
            ValueError: If a class is empty or contains whitespace, "&", "<", ">"
                or ".".
        """
        for item in value:
            if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}):
                raise ValueError(
                    "A cue span start tag class contains invalid characters"
                )
            if not item:
                raise ValueError("Cue span start tag classes cannot be empty")
        return value

    @override
    def __str__(self):
        """Render the span as a ``<v ...>`` start tag, its components and ``</v>``."""
        tag = f"v.{'.'.join(self.classes)}" if self.classes else "v"
        inner = "".join(str(span) for span in self.components)
        return f"<{tag} {self.annotation}>{inner}</v>"
class _WebVTTCueClassSpan(BaseModel):
    """Model representing a WebVTT cue class span (``<c>...</c>``)."""

    span_type: Literal["c"] = "c"
    components: list["_WebVTTCueComponent"]

    @override
    def __str__(self):
        """Render the nested components wrapped in a ``<c>`` tag pair."""
        body = "".join(map(str, self.components))
        return "<c>" + body + "</c>"
class _WebVTTCueItalicSpan(BaseModel):
    """Model representing a WebVTT cue italic span (``<i>...</i>``)."""

    span_type: Literal["i"] = "i"
    components: list["_WebVTTCueComponent"]

    @override
    def __str__(self):
        """Render the nested components wrapped in an ``<i>`` tag pair."""
        body = "".join(map(str, self.components))
        return "<i>" + body + "</i>"
class _WebVTTCueBoldSpan(BaseModel):
    """Model representing a WebVTT cue bold span (``<b>...</b>``)."""

    span_type: Literal["b"] = "b"
    components: list["_WebVTTCueComponent"]

    @override
    def __str__(self):
        """Render the nested components wrapped in a ``<b>`` tag pair."""
        body = "".join(map(str, self.components))
        return "<b>" + body + "</b>"
class _WebVTTCueUnderlineSpan(BaseModel):
    """Model representing a WebVTT cue underline span (``<u>...</u>``)."""

    span_type: Literal["u"] = "u"
    components: list["_WebVTTCueComponent"]

    @override
    def __str__(self):
        """Render the nested components wrapped in a ``<u>`` tag pair."""
        body = "".join(map(str, self.components))
        return "<u>" + body + "</u>"
# Any component that may appear in a cue payload. The union is discriminated
# on each model's literal ``span_type`` field ("text", "c", "i", "b", "u", "v").
_WebVTTCueComponent = Annotated[
    Union[
        _WebVTTCueTextSpan,
        _WebVTTCueClassSpan,
        _WebVTTCueItalicSpan,
        _WebVTTCueBoldSpan,
        _WebVTTCueUnderlineSpan,
        _WebVTTCueVoiceSpan,
    ],
    Field(discriminator="span_type", description="The WebVTT cue component"),
]
class _WebVTTCueBlock(BaseModel):
    """Model representing a WebVTT cue block.

    The optional WebVTT cue settings list is not supported.
    The cue payload is limited to the following spans: text, class, italic, bold,
    underline, and voice.
    """

    model_config = ConfigDict(regex_engine="python-re")

    identifier: Optional[_WebVTTCueIdentifier] = Field(
        None, description="The WebVTT cue identifier"
    )
    timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")]
    payload: Annotated[list[_WebVTTCueComponent], Field(description="The cue payload")]

    # Matches any supported start or end tag inside a cue payload.
    _pattern_block: ClassVar[re.Pattern] = re.compile(
        r"<(/?)(i|b|c|u|v(?:\.[^\t\n\r &<>.]+)*)(?:\s+([^>]*))?>"
    )
    # Splits a voice start tag into its optional classes and its annotation.
    _pattern_voice_tag: ClassVar[re.Pattern] = re.compile(
        r"^<v(?P<class>\.[^\t\n\r &<>]+)?"  # zero or more classes
        r"[ \t]+(?P<annotation>[^\n\r&>]+)>"  # required space and annotation
    )

    @field_validator("payload", mode="after")
    @classmethod
    def validate_payload(cls, payload):
        """Reject payloads whose serialized form contains the ``-->`` arrow.

        Raises:
            ValueError: If any payload component renders text containing "-->".
        """
        for component in payload:
            if "-->" in str(component):
                raise ValueError("Cue payload must not contain '-->'")
        return payload

    @classmethod
    def parse(cls, raw: str) -> "_WebVTTCueBlock":
        """Build a cue block from its raw text.

        The first line is taken as the cue identifier when it does not contain
        "-->" and at least one more line follows; the next line must be the
        timing line and all remaining lines form the payload.

        Args:
            raw: The text of a single cue block.

        Returns:
            The parsed cue block.

        Raises:
            ValueError: If the block is empty, has no timing line, has invalid
                timings, or its payload contains an end tag without a matching
                start tag.
        """
        lines = raw.strip().splitlines()
        if not lines:
            raise ValueError("Cue block must have at least one line")

        identifier: Optional[_WebVTTCueIdentifier] = None
        timing_line = lines[0]
        if "-->" not in timing_line and len(lines) > 1:
            identifier = timing_line
            timing_line = lines[1]
            cue_lines = lines[2:]
        else:
            cue_lines = lines[1:]

        if "-->" not in timing_line:
            raise ValueError("Cue block must contain WebVTT cue timings")

        start, end = [t.strip() for t in timing_line.split("-->")]
        end = re.split(" |\t", end)[0]  # ignore the cue settings list
        timings: _WebVTTCueTimings = _WebVTTCueTimings(
            start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end)
        )

        cue_text = " ".join(cue_lines).strip()
        if cue_text.startswith("<v") and "</v>" not in cue_text:
            # adding close tag for cue voice spans without end tag
            cue_text += "</v>"

        # End tags of the simple formatting spans and the model each one builds.
        closing_spans: dict[str, type[BaseModel]] = {
            "</i>": _WebVTTCueItalicSpan,
            "</b>": _WebVTTCueBoldSpan,
            "</u>": _WebVTTCueUnderlineSpan,
            "</c>": _WebVTTCueClassSpan,
        }

        # stack[0] collects the top-level payload; every open span pushes a new
        # list that receives its children until the span is closed.
        stack: list[list[_WebVTTCueComponent]] = [[]]
        tag_stack: list[Union[str, tuple]] = []

        pos = 0
        for match in cls._pattern_block.finditer(cue_text):
            if match.start() > pos:
                stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
            tag = match.group(0)

            if tag.startswith(("<i>", "<b>", "<u>", "<c>")):
                tag_stack.append(tag[1:2])
                stack.append([])
            elif tag in closing_spans:
                if len(stack) < 2:
                    # Without this guard the root list would be popped and the
                    # next access would raise IndexError, which callers that
                    # only handle ValueError would not catch.
                    raise ValueError(
                        f"Unexpected closing tag '{tag}' in cue payload"
                    )
                children = stack.pop()
                stack[-1].append(closing_spans[tag](components=children))
                tag_stack.pop()
            elif tag.startswith("<v"):
                tag_stack.append(("v", tag))
                stack.append([])
            elif tag.startswith("</v"):
                if len(stack) < 2:
                    raise ValueError(
                        f"Unexpected closing tag '{tag}' in cue payload"
                    )
                children = stack.pop()
                if (
                    tag_stack
                    and isinstance(tag_stack[-1], tuple)
                    and tag_stack[-1][0] == "v"
                ):
                    _, voice = cast(tuple, tag_stack.pop())
                    voice_match = cls._pattern_voice_tag.match(voice)
                    if voice_match:
                        class_string = voice_match.group("class")
                        annotation = voice_match.group("annotation")
                        if annotation:
                            classes: list[str] = []
                            if class_string:
                                classes = [c for c in class_string.split(".") if c]
                            stack[-1].append(
                                _WebVTTCueVoiceSpan(
                                    annotation=annotation.strip(),
                                    classes=classes,
                                    components=children,
                                )
                            )
                # NOTE(review): when the innermost open tag is not a voice tag,
                # or the voice start tag has no annotation, the collected
                # children are discarded, as before.

            pos = match.end()

        if pos < len(cue_text):
            stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos:]))

        # NOTE(review): content of spans that are still open here is not part
        # of stack[0] and is therefore left out of the payload, as before.
        return cls(
            identifier=identifier,
            timings=timings,
            payload=stack[0],
        )

    def __str__(self):
        """Serialize the cue block back to WebVTT text."""
        parts = []
        if self.identifier:
            parts.append(f"{self.identifier}\n")

        timings_line = str(self.timings)
        parts.append(timings_line + "\n")
        for idx, span in enumerate(self.payload):
            if idx == 0 and len(self.payload) == 1 and span.span_type == "v":
                # the end tag may be omitted for brevity
                parts.append(str(span).removesuffix("</v>"))
            else:
                parts.append(str(span))

        return "".join(parts)
class _WebVTTFile(BaseModel):
    """A model representing a WebVTT file."""

    cue_blocks: list[_WebVTTCueBlock]

    # First line of a block that is not a cue: a NOTE comment (optionally
    # followed by a space or tab and text) or a STYLE/REGION definition.
    _pattern_non_cue: ClassVar[re.Pattern] = re.compile(
        r"(?:NOTE(?:[ \t][^\n]*)?|(?:STYLE|REGION)[ \t]*)"
    )

    @staticmethod
    def verify_signature(content: str) -> bool:
        """Check that the content starts with a valid WebVTT signature.

        The signature is the string "WEBVTT", optionally preceded by a byte
        order mark, followed either by the end of the content or by a space,
        a tab or a line terminator (LF or CR).

        Args:
            content: The text content to check.

        Returns:
            True if the content starts with a valid signature, False otherwise.
        """
        if not content:
            return False
        # A leading BOM survives a plain UTF-8 decode and is allowed by the format.
        content = content.removeprefix("\ufeff")
        if not content.startswith("WEBVTT"):
            return False
        # "\r" is accepted so that content with CRLF or CR line endings is
        # recognized even when newlines have not been normalized yet.
        return len(content) == 6 or content[6] in (" ", "\t", "\n", "\r")

    @classmethod
    def parse(cls, raw: str) -> "_WebVTTFile":
        """Parse the text of a WebVTT file into its cue blocks.

        NOTE, STYLE and REGION blocks are skipped. Blocks that cannot be parsed
        as cues are logged as warnings and left out of the result.

        Args:
            raw: The full text content of the WebVTT file.

        Returns:
            The parsed file with all cue blocks that could be parsed.

        Raises:
            ValueError: If the content does not start with a WebVTT signature.
        """
        # Normalize newlines to LF
        raw = raw.replace("\r\n", "\n").replace("\r", "\n")

        # Check WebVTT signature
        if not cls.verify_signature(raw):
            raise ValueError("Invalid WebVTT file signature")

        # Strip "WEBVTT" header line
        _, _, body = raw.partition("\n")

        cues: list[_WebVTTCueBlock] = []
        # Split into blocks first and classify each block by its first line,
        # so that a whole NOTE/STYLE/REGION block is dropped (not just its
        # header line) and cue payload lines are never mistaken for comments.
        for block in re.split(r"\n\s*\n", body.strip()):
            block = block.strip()
            if not block:
                # e.g. a file that consists of the signature line only
                continue
            first_line = block.split("\n", 1)[0]
            if cls._pattern_non_cue.fullmatch(first_line):
                continue
            try:
                cues.append(_WebVTTCueBlock.parse(block))
            except ValueError as e:
                _log.warning(f"Failed to parse cue block:\n{block}\n{e}")

        return cls(cue_blocks=cues)

    def __iter__(self):
        """Iterate over the cue blocks."""
        return iter(self.cue_blocks)

    def __getitem__(self, idx):
        """Return the cue block(s) at the given index or slice."""
        return self.cue_blocks[idx]

    def __len__(self):
        """Return the number of cue blocks."""
        return len(self.cue_blocks)
class WebVTTDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend for WebVTT (.vtt) files.

    The backend reads the text content of a WebVTT file, parses it following
    the W3C specs on https://www.w3.org/TR/webvtt1 and converts it to a
    DoclingDocument. Every cue block becomes a group in the document body that
    holds text items for the cue identifier (if present), the cue timings and
    the cue payload. Groups are added in the order of the parsed cue blocks.
    """

    @override
    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
        """Load the whole file content as UTF-8 text.

        Raises:
            RuntimeError: If the content cannot be read or decoded.
        """
        super().__init__(in_doc, path_or_stream)

        self.content: str = ""
        try:
            source = self.path_or_stream
            if isinstance(source, BytesIO):
                self.content = source.getvalue().decode("utf-8")
            elif isinstance(source, Path):
                self.content = source.read_text(encoding="utf-8")
        except Exception as e:
            raise RuntimeError(
                "Could not initialize the WebVTT backend for file with hash "
                f"{self.document_hash}."
            ) from e

    @override
    def is_valid(self) -> bool:
        """Tell whether the loaded content starts with a WebVTT signature."""
        return _WebVTTFile.verify_signature(self.content)

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        """WebVTT documents are not paginated."""
        return False

    @override
    def unload(self):
        """Close the input stream, if any, and drop the reference to the input."""
        stream = self.path_or_stream
        if isinstance(stream, BytesIO):
            stream.close()
        self.path_or_stream = None

    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        """Return the input formats handled by this backend."""
        return {InputFormat.VTT}

    @staticmethod
    def _add_text_from_component(
        doc: DoclingDocument, item: _WebVTTCueComponent, parent: Optional[NodeItem]
    ) -> None:
        """Adds a TextItem to a document by extracting text from a cue span component.

        Italic, bold and underline spans set the matching formatting flag. For
        a component that is not a plain text span, only the text of its direct
        text span children is collected. Nothing is added when the resulting
        text is empty after stripping surrounding whitespace.

        TODO: address nesting
        """
        formatting = Formatting()
        # Only the first matching span type sets its flag.
        flag_by_span_type = (
            (_WebVTTCueItalicSpan, "italic"),
            (_WebVTTCueBoldSpan, "bold"),
            (_WebVTTCueUnderlineSpan, "underline"),
        )
        for span_type, flag in flag_by_span_type:
            if isinstance(item, span_type):
                setattr(formatting, flag, True)
                break

        if isinstance(item, _WebVTTCueTextSpan):
            raw_text = item.text
        else:
            # TODO: address nesting
            raw_text = "".join(
                [
                    child.text
                    for child in item.components
                    if isinstance(child, _WebVTTCueTextSpan)
                ]
            )

        text = raw_text.strip()
        if not text:
            return
        doc.add_text(
            label=DocItemLabel.TEXT,
            text=text,
            parent=parent,
            content_layer=ContentLayer.BODY,
            formatting=formatting,
        )

    @override
    def convert(self) -> DoclingDocument:
        """Convert the loaded WebVTT content into a DoclingDocument.

        Raises:
            RuntimeError: If the content does not have a WebVTT signature.
        """
        _log.debug("Starting WebVTT conversion...")
        if not self.is_valid():
            raise RuntimeError("Invalid WebVTT document.")

        origin = DocumentOrigin(
            filename=self.file.name or "file",
            mimetype="text/vtt",
            binary_hash=self.document_hash,
        )
        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)

        def add_plain_text(text: str, parent: NodeItem) -> None:
            # Unformatted text item in the body layer
            doc.add_text(
                label=DocItemLabel.TEXT,
                text=text,
                parent=parent,
                content_layer=ContentLayer.BODY,
            )

        add_component = WebVTTDocumentBackend._add_text_from_component

        vtt: _WebVTTFile = _WebVTTFile.parse(self.content)
        for block in vtt.cue_blocks:
            block_group = doc.add_group(
                label=GroupLabel.SECTION,
                name="WebVTT cue block",
                parent=None,
                content_layer=ContentLayer.BODY,
            )
            if block.identifier:
                add_plain_text(str(block.identifier), block_group)
            add_plain_text(str(block.timings), block_group)

            for cue_span in block.payload:
                if not isinstance(cue_span, _WebVTTCueVoiceSpan):
                    add_component(doc, cue_span, block_group)
                    continue

                # A voice span gets its own inline group: first the speaker
                # label, then the spoken components.
                voice_group = doc.add_group(
                    label=GroupLabel.INLINE,
                    name="WebVTT cue voice span",
                    parent=block_group,
                    content_layer=ContentLayer.BODY,
                )
                speaker = cue_span.annotation
                classes = cue_span.classes
                if classes:
                    speaker = speaker + f" ({', '.join(classes)})"
                speaker = speaker + ": "
                add_plain_text(speaker, voice_group)
                for item in cue_span.components:
                    add_component(doc, item, voice_group)

        return doc

View File

@@ -1,7 +1,6 @@
import math
from collections import defaultdict
from enum import Enum
from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union
from typing import TYPE_CHECKING, Optional, Type, Union
import numpy as np
from docling_core.types.doc import (
@@ -14,9 +13,7 @@ from docling_core.types.doc import (
)
from docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
from docling_core.types.io import (
DocumentStream,
)
from docling_core.types.io import DocumentStream
# DO NOT REMOVE; explicitly exposed from this location
from PIL.Image import Image
@@ -71,6 +68,7 @@ class InputFormat(str, Enum):
METS_GBS = "mets_gbs"
JSON_DOCLING = "json_docling"
AUDIO = "audio"
VTT = "vtt"
class OutputFormat(str, Enum):
@@ -82,7 +80,7 @@ class OutputFormat(str, Enum):
DOCTAGS = "doctags"
FormatToExtensions: Dict[InputFormat, List[str]] = {
FormatToExtensions: dict[InputFormat, list[str]] = {
InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
InputFormat.PDF: ["pdf"],
@@ -97,9 +95,10 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.METS_GBS: ["tar.gz"],
InputFormat.JSON_DOCLING: ["json"],
InputFormat.AUDIO: ["wav", "mp3"],
InputFormat.VTT: ["vtt"],
}
FormatToMimeType: Dict[InputFormat, List[str]] = {
FormatToMimeType: dict[InputFormat, list[str]] = {
InputFormat.DOCX: [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
@@ -130,6 +129,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
InputFormat.METS_GBS: ["application/mets+xml"],
InputFormat.JSON_DOCLING: ["application/json"],
InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
InputFormat.VTT: ["text/vtt"],
}
MimeTypeToFormat: dict[str, list[InputFormat]] = {
@@ -162,8 +162,8 @@ class Cluster(BaseModel):
label: DocItemLabel
bbox: BoundingBox
confidence: float = 1.0
cells: List[TextCell] = []
children: List["Cluster"] = [] # Add child cluster support
cells: list[TextCell] = []
children: list["Cluster"] = [] # Add child cluster support
@field_serializer("confidence")
def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
@@ -179,7 +179,7 @@ class BasePageElement(BaseModel):
class LayoutPrediction(BaseModel):
clusters: List[Cluster] = []
clusters: list[Cluster] = []
class VlmPredictionToken(BaseModel):
@@ -201,14 +201,14 @@ class ContainerElement(
class Table(BasePageElement):
otsl_seq: List[str]
otsl_seq: list[str]
num_rows: int = 0
num_cols: int = 0
table_cells: List[TableCell]
table_cells: list[TableCell]
class TableStructurePrediction(BaseModel):
table_map: Dict[int, Table] = {}
table_map: dict[int, Table] = {}
class TextElement(BasePageElement):
@@ -216,7 +216,7 @@ class TextElement(BasePageElement):
class FigureElement(BasePageElement):
annotations: List[PictureDataType] = []
annotations: list[PictureDataType] = []
provenance: Optional[str] = None
predicted_class: Optional[str] = None
confidence: Optional[float] = None
@@ -234,12 +234,12 @@ class FigureElement(BasePageElement):
class FigureClassificationPrediction(BaseModel):
figure_count: int = 0
figure_map: Dict[int, FigureElement] = {}
figure_map: dict[int, FigureElement] = {}
class EquationPrediction(BaseModel):
equation_count: int = 0
equation_map: Dict[int, TextElement] = {}
equation_map: dict[int, TextElement] = {}
class PagePredictions(BaseModel):
@@ -254,9 +254,9 @@ PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
class AssembledUnit(BaseModel):
elements: List[PageElement] = []
body: List[PageElement] = []
headers: List[PageElement] = []
elements: list[PageElement] = []
body: list[PageElement] = []
headers: list[PageElement] = []
class ItemAndImageEnrichmentElement(BaseModel):
@@ -280,12 +280,12 @@ class Page(BaseModel):
None # Internal PDF backend. By default it is cleared during assembling.
)
_default_image_scale: float = 1.0 # Default image scale for external usage.
_image_cache: Dict[
_image_cache: dict[
float, Image
] = {} # Cache of images in different scales. By default it is cleared during assembling.
@property
def cells(self) -> List[TextCell]:
def cells(self) -> list[TextCell]:
"""Return text cells as a read-only view of parsed_page.textline_cells."""
if self.parsed_page is not None:
return self.parsed_page.textline_cells
@@ -354,7 +354,7 @@ class OpenAiApiResponse(BaseModel):
id: str
model: Optional[str] = None # returned by openai
choices: List[OpenAiResponseChoice]
choices: list[OpenAiResponseChoice]
created: int
usage: OpenAiResponseUsage
@@ -430,7 +430,7 @@ class PageConfidenceScores(BaseModel):
class ConfidenceReport(PageConfidenceScores):
pages: Dict[int, PageConfidenceScores] = Field(
pages: dict[int, PageConfidenceScores] = Field(
default_factory=lambda: defaultdict(PageConfidenceScores)
)

View File

@@ -394,6 +394,8 @@ class _DocumentConversionInput(BaseModel):
mime = FormatToMimeType[InputFormat.PPTX][0]
elif ext in FormatToExtensions[InputFormat.XLSX]:
mime = FormatToMimeType[InputFormat.XLSX][0]
elif ext in FormatToExtensions[InputFormat.VTT]:
mime = FormatToMimeType[InputFormat.VTT][0]
return mime

View File

@@ -25,6 +25,7 @@ from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.noop_backend import NoOpBackend
from docling.backend.webvtt_backend import WebVTTDocumentBackend
from docling.backend.xml.jats_backend import JatsDocumentBackend
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
from docling.datamodel.base_models import (
@@ -170,6 +171,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
),
InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
InputFormat.VTT: FormatOption(
pipeline_cls=SimplePipeline, backend=WebVTTDocumentBackend
),
}
if (options := format_to_default_options.get(format)) is not None:
return options

View File

@@ -3,7 +3,7 @@
#
# What this example does
# - Runs the VLM-powered pipeline on a PDF (by URL) and prints Markdown output.
# - Shows two setups: default (Transformers/SmolDocling) and macOS MPS/MLX.
# - Shows two setups: default (Transformers/GraniteDocling) and macOS MPS/MLX.
#
# Prerequisites
# - Install Docling with VLM extras and the appropriate backend (Transformers or MLX).
@@ -15,7 +15,7 @@
#
# Notes
# - `source` may be a local path or a URL to a PDF.
# - The second section demonstrates macOS MPS acceleration via MLX (`vlm_model_specs.SMOLDOCLING_MLX`).
# - The second section demonstrates macOS MPS acceleration via MLX (`vlm_model_specs.GRANITEDOCLING_MLX`).
# - For more configurations and model comparisons, see `docs/examples/compare_vlm_models.py`.
# %%

4
docs/index.md vendored
View File

@@ -21,7 +21,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
## Features
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
@@ -37,13 +37,13 @@ Docling simplifies document processing, parsing diverse formats — including ad
* 📤 Structured [information extraction][extraction] \[🧪 beta\]
* 📑 New layout model (**Heron**) by default, for faster PDF parsing
* 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
* 💬 Parsing of Web Video Text Tracks (WebVTT) files
### Coming soon
* 📝 Metadata extraction, including title, authors, references & language
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
* 📝 Complex chemistry understanding (Molecular structures)
* 📝 Parsing of Web Video Text Tracks (WebVTT) files
## Get started

View File

@@ -11,10 +11,11 @@ Below you can find a listing of all supported input and output formats.
| PDF | |
| DOCX, XLSX, PPTX | Default formats in MS Office 2007+, based on Office Open XML |
| Markdown | |
| AsciiDoc | |
| AsciiDoc | Human-readable, plain-text markup language for structured technical content |
| HTML, XHTML | |
| CSV | |
| PNG, JPEG, TIFF, BMP, WEBP | Image formats |
| WebVTT | Web Video Text Tracks format for displaying timed text |
Schema-specific support:
@@ -32,4 +33,4 @@ Schema-specific support:
| Markdown | |
| JSON | Lossless serialization of Docling Document |
| Text | Plain text, i.e. without Markdown markers |
| Doctags | |
| [Doctags](https://arxiv.org/pdf/2503.11576) | Markup format for efficiently representing the full content and layout characteristics of a document |

View File

@@ -1,6 +1,6 @@
[project]
name = "docling"
version = "2.53.0" # DO NOT EDIT, updated automatically
version = "2.54.0" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
license = "MIT"
keywords = [
@@ -44,7 +44,7 @@ authors = [
requires-python = '>=3.9,<4.0'
dependencies = [
'pydantic (>=2.0.0,<3.0.0)',
'docling-core[chunking] (>=2.48.0,<3.0.0)',
'docling-core[chunking] (>=2.48.2,<3.0.0)',
'docling-parse (>=4.4.0,<5.0.0)',
"docling-ibm-models>=3.9.1,<4",
'filetype (>=1.2.0,<2.0.0)',

View File

@@ -1,40 +1,40 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: inline: group group
item-2 at level 2: paragraph: This is a word document and this is an inline equation:
item-2 at level 2: text: This is a word document and this is an inline equation:
item-3 at level 2: formula: A= \pi r^{2}
item-4 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
item-5 at level 1: paragraph:
item-4 at level 2: text: . If instead, I want an equation by line, I can do this:
item-5 at level 1: text:
item-6 at level 1: formula: a^{2}+b^{2}=c^{2} \text{ \texttimes } 23
item-7 at level 1: paragraph: And that is an equation by itself. Cheers!
item-8 at level 1: paragraph:
item-9 at level 1: paragraph: This is another equation:
item-7 at level 1: text: And that is an equation by itself. Cheers!
item-8 at level 1: text:
item-9 at level 1: text: This is another equation:
item-10 at level 1: formula: f\left(x\right)=a_{0}+\sum_{n=1} ... })+b_{n}\sin(\frac{n \pi x}{L})\right)
item-11 at level 1: paragraph:
item-12 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text.
item-13 at level 1: paragraph:
item-14 at level 1: paragraph:
item-11 at level 1: text:
item-12 at level 1: text: This is text. This is text. This ... s is text. This is text. This is text.
item-13 at level 1: text:
item-14 at level 1: text:
item-15 at level 1: inline: group group
item-16 at level 2: paragraph: This is a word document and this is an inline equation:
item-16 at level 2: text: This is a word document and this is an inline equation:
item-17 at level 2: formula: A= \pi r^{2}
item-18 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
item-19 at level 1: paragraph:
item-18 at level 2: text: . If instead, I want an equation by line, I can do this:
item-19 at level 1: text:
item-20 at level 1: formula: \left(x+a\right)^{n}=\sum_{k=0}^ ... ac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}
item-21 at level 1: paragraph:
item-22 at level 1: paragraph: And that is an equation by itself. Cheers!
item-23 at level 1: paragraph:
item-24 at level 1: paragraph: This is another equation:
item-25 at level 1: paragraph:
item-21 at level 1: text:
item-22 at level 1: text: And that is an equation by itself. Cheers!
item-23 at level 1: text:
item-24 at level 1: text: This is another equation:
item-25 at level 1: text:
item-26 at level 1: formula: \left(1+x\right)^{n}=1+\frac{nx} ... ght)x^{2}}{2!}+ \text{ \textellipsis }
item-27 at level 1: paragraph:
item-28 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text.
item-29 at level 1: paragraph:
item-30 at level 1: paragraph:
item-27 at level 1: text:
item-28 at level 1: text: This is text. This is text. This ... s is text. This is text. This is text.
item-29 at level 1: text:
item-30 at level 1: text:
item-31 at level 1: inline: group group
item-32 at level 2: paragraph: This is a word document and this is an inline equation:
item-32 at level 2: text: This is a word document and this is an inline equation:
item-33 at level 2: formula: A= \pi r^{2}
item-34 at level 2: paragraph: . If instead, I want an equation by line, I can do this:
item-35 at level 1: paragraph:
item-34 at level 2: text: . If instead, I want an equation by line, I can do this:
item-35 at level 1: text:
item-36 at level 1: formula: e^{x}=1+\frac{x}{1!}+\frac{x^{2} ... xtellipsis } , - \infty < x < \infty
item-37 at level 1: paragraph:
item-38 at level 1: paragraph: And that is an equation by itself. Cheers!
item-39 at level 1: paragraph:
item-37 at level 1: text:
item-38 at level 1: text: And that is an equation by itself. Cheers!
item-39 at level 1: text:

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "equations",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -182,7 +182,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "This is a word document and this is an inline equation: ",
"text": "This is a word document and this is an inline equation: "
@@ -206,7 +206,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": ". If instead, I want an equation by line, I can do this:",
"text": ". If instead, I want an equation by line, I can do this:"
@@ -218,7 +218,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -242,7 +242,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "And that is an equation by itself. Cheers!",
"text": "And that is an equation by itself. Cheers!",
@@ -261,7 +261,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -273,7 +273,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "This is another equation:",
"text": "This is another equation:",
@@ -304,7 +304,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -316,7 +316,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
"text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
@@ -335,7 +335,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -347,7 +347,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -359,7 +359,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "This is a word document and this is an inline equation: ",
"text": "This is a word document and this is an inline equation: "
@@ -383,7 +383,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": ". If instead, I want an equation by line, I can do this:",
"text": ". If instead, I want an equation by line, I can do this:"
@@ -395,7 +395,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -419,7 +419,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -431,7 +431,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "And that is an equation by itself. Cheers!",
"text": "And that is an equation by itself. Cheers!",
@@ -450,7 +450,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -462,7 +462,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "This is another equation:",
"text": "This is another equation:",
@@ -481,7 +481,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -505,7 +505,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -517,7 +517,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
"text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.",
@@ -536,7 +536,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -548,7 +548,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -560,7 +560,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "This is a word document and this is an inline equation: ",
"text": "This is a word document and this is an inline equation: "
@@ -584,7 +584,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": ". If instead, I want an equation by line, I can do this:",
"text": ". If instead, I want an equation by line, I can do this:"
@@ -596,7 +596,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -620,7 +620,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -632,7 +632,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "And that is an equation by itself. Cheers!",
"text": "And that is an equation by itself. Cheers!",
@@ -651,7 +651,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""

View File

@@ -0,0 +1,675 @@
{
"schema_name": "DoclingDocument",
"version": "1.7.0",
"name": "escaped_characters",
"origin": {
"mimetype": "text/html",
"binary_hash": 10682185258371912110,
"filename": "escaped_characters.md"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/1"
},
{
"$ref": "#/texts/4"
},
{
"$ref": "#/texts/7"
},
{
"$ref": "#/texts/9"
},
{
"$ref": "#/texts/11"
},
{
"$ref": "#/texts/12"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/texts/4"
},
"children": [
{
"$ref": "#/texts/5"
}
],
"content_layer": "body",
"name": "ordered list",
"label": "list"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/texts/4"
},
"children": [
{
"$ref": "#/texts/6"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "furniture",
"label": "title",
"prov": [],
"orig": "escaped_characters",
"text": "escaped_characters"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/2"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Headers:",
"text": "Headers:"
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/texts/3"
}
],
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "& < > \" '",
"text": "& < > \" '",
"level": 1
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/texts/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Text: 00:16.000 ----> 00:18.000 & < > \" '",
"text": "Text: 00:16.000 ----> 00:18.000 & < > \" '"
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/groups/0"
},
{
"$ref": "#/groups/1"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Lists",
"text": "Lists"
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "& < > \" '",
"text": "& < > \" '",
"enumerated": true,
"marker": ""
},
{
"self_ref": "#/texts/6",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "& < > \" '",
"text": "& < > \" '",
"enumerated": false,
"marker": ""
},
{
"self_ref": "#/texts/7",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/8"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Inline code",
"text": "Inline code"
},
{
"self_ref": "#/texts/8",
"parent": {
"$ref": "#/texts/7"
},
"children": [],
"content_layer": "body",
"label": "code",
"prov": [],
"orig": "& < > \" '",
"text": "& < > \" '",
"captions": [],
"references": [],
"footnotes": [],
"code_language": "unknown"
},
{
"self_ref": "#/texts/9",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/10"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Code block",
"text": "Code block"
},
{
"self_ref": "#/texts/10",
"parent": {
"$ref": "#/texts/9"
},
"children": [],
"content_layer": "body",
"label": "code",
"prov": [],
"orig": "& < > \" '",
"text": "& < > \" '",
"captions": [],
"references": [],
"footnotes": [],
"code_language": "unknown"
},
{
"self_ref": "#/texts/11",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/tables/0"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Table",
"text": "Table"
},
{
"self_ref": "#/texts/12",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/13"
},
{
"$ref": "#/texts/14"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "Raw HTML",
"text": "Raw HTML"
},
{
"self_ref": "#/texts/13",
"parent": {
"$ref": "#/texts/12"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "& < > \" '/div>",
"text": "& < > \" '/div>"
},
{
"self_ref": "#/texts/14",
"parent": {
"$ref": "#/texts/12"
},
"children": [
{
"$ref": "#/texts/15"
}
],
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "Link",
"text": "Link",
"level": 1
},
{
"self_ref": "#/texts/15",
"parent": {
"$ref": "#/texts/14"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "& < > \" '",
"text": "& < > \" '",
"hyperlink": "https://en.wikipedia.org/wiki/Albert_Einstein"
}
],
"pictures": [],
"tables": [
{
"self_ref": "#/tables/0",
"parent": {
"$ref": "#/texts/11"
},
"children": [],
"content_layer": "body",
"label": "table",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Key",
"column_header": true,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Example",
"column_header": true,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Ampersand",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "&",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Less-than",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "<",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Greater-than",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": ">",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Quotes",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "\"",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 5,
"end_row_offset_idx": 6,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Apostrophes",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 5,
"end_row_offset_idx": 6,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "'",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
"num_rows": 6,
"num_cols": 2,
"grid": [
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Key",
"column_header": true,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "Example",
"column_header": true,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Ampersand",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "&",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Less-than",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "<",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Greater-than",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": ">",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Quotes",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "\"",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 5,
"end_row_offset_idx": 6,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "Apostrophes",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 5,
"end_row_offset_idx": 6,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "'",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
]
]
},
"annotations": []
}
],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@@ -0,0 +1,41 @@
# Headers:
## &amp; &lt; &gt; " '
Text: 00:16.000 ----&gt; 00:18.000 &amp; &lt; &gt; " '
# Lists
1. &amp; &lt; &gt; " '
- &amp; &lt; &gt; " '
# Inline code
```
& < > " '
```
# Code block
```
& < > " '
```
# Table
| Key | Example |
|--------------|-----------|
| Ampersand | & |
| Less-than | < |
| Greater-than | > |
| Quotes | " |
| Apostrophes | ' |
# Raw HTML
&amp; &lt; &gt; " '/div&gt;
## Link
[&amp; &lt; &gt; " '](https://en.wikipedia.org/wiki/Albert_Einstein)

View File

@@ -186,6 +186,7 @@ tables:
column_header: true
end_col_offset_idx: 1
end_row_offset_idx: 1
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -196,6 +197,7 @@ tables:
column_header: true
end_col_offset_idx: 2
end_row_offset_idx: 1
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -206,6 +208,7 @@ tables:
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -216,6 +219,7 @@ tables:
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -229,6 +233,7 @@ tables:
column_header: true
end_col_offset_idx: 1
end_row_offset_idx: 1
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -239,6 +244,7 @@ tables:
column_header: true
end_col_offset_idx: 2
end_row_offset_idx: 1
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -249,6 +255,7 @@ tables:
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -259,6 +266,7 @@ tables:
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -269,6 +277,7 @@ tables:
column_header: true
end_col_offset_idx: 1
end_row_offset_idx: 1
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -279,6 +288,7 @@ tables:
column_header: true
end_col_offset_idx: 2
end_row_offset_idx: 1
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -289,6 +299,7 @@ tables:
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -299,6 +310,7 @@ tables:
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
fillable: false
row_header: false
row_section: false
row_span: 1
@@ -878,4 +890,4 @@ texts:
prov: []
self_ref: '#/texts/48'
text: Table Heading
version: 1.6.0
version: 1.7.0

View File

@@ -1,10 +1,10 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: Lorem ipsum dolor sit amet, cons ... quam non, sodales sem. Nulla facilisi.
item-2 at level 1: paragraph:
item-3 at level 1: paragraph: Duis condimentum dui eget ullamc ... cus tempor, et tristique ante aliquet.
item-4 at level 1: paragraph:
item-5 at level 1: paragraph: Maecenas id neque pharetra, elei ... ulla faucibus eu. Donec ut nisl metus.
item-6 at level 1: paragraph:
item-7 at level 1: paragraph: Duis ac tellus sed turpis feugia ... pellentesque rhoncus, blandit eu nisl.
item-8 at level 1: paragraph:
item-9 at level 1: paragraph: Nunc vehicula mattis erat ac con ... udin, vehicula turpis eu, tempus nibh.
item-1 at level 1: text: Lorem ipsum dolor sit amet, cons ... quam non, sodales sem. Nulla facilisi.
item-2 at level 1: text:
item-3 at level 1: text: Duis condimentum dui eget ullamc ... cus tempor, et tristique ante aliquet.
item-4 at level 1: text:
item-5 at level 1: text: Maecenas id neque pharetra, elei ... ulla faucibus eu. Donec ut nisl metus.
item-6 at level 1: text:
item-7 at level 1: text: Duis ac tellus sed turpis feugia ... pellentesque rhoncus, blandit eu nisl.
item-8 at level 1: text:
item-9 at level 1: text: Nunc vehicula mattis erat ac con ... udin, vehicula turpis eu, tempus nibh.

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "lorem_ipsum",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -58,7 +58,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin elit mi, fermentum vitae dolor facilisis, porttitor mollis quam. Cras quam massa, venenatis faucibus libero vel, euismod sollicitudin ipsum. Aliquam semper sapien leo, ac ultrices nibh mollis congue. Cras luctus ultrices est, ut scelerisque eros euismod ut. Curabitur ac tincidunt felis, non scelerisque lectus. Praesent sollicitudin vulputate est id consequat. Vestibulum pharetra ligula sit amet varius porttitor. Sed eros diam, gravida non varius at, scelerisque in libero. Ut auctor finibus mauris sit amet ornare. Sed facilisis leo at urna rhoncus, in facilisis arcu eleifend. Sed tincidunt lacinia fermentum. Cras non purus fringilla, semper quam non, sodales sem. Nulla facilisi.",
"text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin elit mi, fermentum vitae dolor facilisis, porttitor mollis quam. Cras quam massa, venenatis faucibus libero vel, euismod sollicitudin ipsum. Aliquam semper sapien leo, ac ultrices nibh mollis congue. Cras luctus ultrices est, ut scelerisque eros euismod ut. Curabitur ac tincidunt felis, non scelerisque lectus. Praesent sollicitudin vulputate est id consequat. Vestibulum pharetra ligula sit amet varius porttitor. Sed eros diam, gravida non varius at, scelerisque in libero. Ut auctor finibus mauris sit amet ornare. Sed facilisis leo at urna rhoncus, in facilisis arcu eleifend. Sed tincidunt lacinia fermentum. Cras non purus fringilla, semper quam non, sodales sem. Nulla facilisi.",
@@ -77,7 +77,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -89,7 +89,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Duis condimentum dui eget ullamcorper maximus. Nulla tortor lectus, hendrerit at diam fermentum, euismod ornare orci. Integer ac mauris sed augue ultricies pellentesque. Etiam condimentum turpis a risus dictum, sed tempor arcu vestibulum. Quisque at venenatis tellus. Morbi id lobortis elit. In gravida metus at ornare suscipit. Donec euismod nibh sit amet commodo porttitor. Integer commodo sit amet nisi vel accumsan. Donec lacinia posuere porta. Pellentesque vulputate porta risus, vel consectetur nisl gravida sit amet. Nam scelerisque enim sodales lacus tempor, et tristique ante aliquet.",
"text": "Duis condimentum dui eget ullamcorper maximus. Nulla tortor lectus, hendrerit at diam fermentum, euismod ornare orci. Integer ac mauris sed augue ultricies pellentesque. Etiam condimentum turpis a risus dictum, sed tempor arcu vestibulum. Quisque at venenatis tellus. Morbi id lobortis elit. In gravida metus at ornare suscipit. Donec euismod nibh sit amet commodo porttitor. Integer commodo sit amet nisi vel accumsan. Donec lacinia posuere porta. Pellentesque vulputate porta risus, vel consectetur nisl gravida sit amet. Nam scelerisque enim sodales lacus tempor, et tristique ante aliquet.",
@@ -108,7 +108,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -120,7 +120,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Maecenas id neque pharetra, eleifend lectus a, vehicula sapien. Aliquam erat volutpat. Ut arcu erat, blandit id elementum at, aliquet pretium mauris. Nulla at semper orci. Nunc sed maximus metus. Duis eget tristique arcu. Phasellus fringilla augue est, ut bibendum est bibendum vitae. Nam et urna interdum, egestas velit a, consectetur metus. Pellentesque facilisis vehicula orci, eu posuere justo imperdiet non. Vestibulum tincidunt orci ac lorem consequat semper. Fusce semper sollicitudin orci, id lacinia nulla faucibus eu. Donec ut nisl metus.",
"text": "Maecenas id neque pharetra, eleifend lectus a, vehicula sapien. Aliquam erat volutpat. Ut arcu erat, blandit id elementum at, aliquet pretium mauris. Nulla at semper orci. Nunc sed maximus metus. Duis eget tristique arcu. Phasellus fringilla augue est, ut bibendum est bibendum vitae. Nam et urna interdum, egestas velit a, consectetur metus. Pellentesque facilisis vehicula orci, eu posuere justo imperdiet non. Vestibulum tincidunt orci ac lorem consequat semper. Fusce semper sollicitudin orci, id lacinia nulla faucibus eu. Donec ut nisl metus.",
@@ -139,7 +139,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -151,7 +151,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Duis ac tellus sed turpis feugiat aliquam sed vel justo. Fusce sit amet volutpat massa. Duis tristique finibus metus quis tincidunt. Etiam dapibus fringilla diam at pharetra. Vivamus dolor est, hendrerit ac ligula nec, pharetra lacinia sapien. Phasellus at malesuada orci. Maecenas est justo, mollis non ultrices ut, sagittis commodo odio. Integer viverra mauris pellentesque bibendum vestibulum. Sed eu felis mattis, efficitur justo non, finibus lorem. Phasellus viverra diam et sapien imperdiet interdum. Cras a convallis libero. Integer maximus dui vel lorem hendrerit, sit amet convallis ligula lobortis. Duis eu lacus elementum, scelerisque nunc eget, dignissim libero. Suspendisse mi quam, vehicula sit amet pellentesque rhoncus, blandit eu nisl.",
"text": "Duis ac tellus sed turpis feugiat aliquam sed vel justo. Fusce sit amet volutpat massa. Duis tristique finibus metus quis tincidunt. Etiam dapibus fringilla diam at pharetra. Vivamus dolor est, hendrerit ac ligula nec, pharetra lacinia sapien. Phasellus at malesuada orci. Maecenas est justo, mollis non ultrices ut, sagittis commodo odio. Integer viverra mauris pellentesque bibendum vestibulum. Sed eu felis mattis, efficitur justo non, finibus lorem. Phasellus viverra diam et sapien imperdiet interdum. Cras a convallis libero. Integer maximus dui vel lorem hendrerit, sit amet convallis ligula lobortis. Duis eu lacus elementum, scelerisque nunc eget, dignissim libero. Suspendisse mi quam, vehicula sit amet pellentesque rhoncus, blandit eu nisl.",
@@ -170,7 +170,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -182,7 +182,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Nunc vehicula mattis erat ac consectetur. Etiam pharetra mauris ut tempor pellentesque. Sed vel libero vitae ante tempus sagittis vel sit amet dolor. Etiam faucibus viverra sodales. Pellentesque ullamcorper magna libero, non malesuada dui bibendum quis. Donec sed dolor non sem luctus volutpat. Morbi vel diam ut urna euismod gravida a id lectus. Vestibulum vel mauris eu tellus hendrerit dapibus. Etiam scelerisque lacus vel ante ultricies vulputate. In ullamcorper malesuada justo, vel scelerisque nisl lacinia at. Donec sodales interdum ipsum, ac bibendum ipsum pharetra interdum. Vivamus condimentum ac ante vel aliquam. Ut consectetur eu nibh nec gravida. Vestibulum accumsan, purus at mollis rutrum, sapien tortor accumsan purus, vitae fermentum urna mauris ut lacus. Fusce vitae leo sollicitudin, vehicula turpis eu, tempus nibh.",
"text": "Nunc vehicula mattis erat ac consectetur. Etiam pharetra mauris ut tempor pellentesque. Sed vel libero vitae ante tempus sagittis vel sit amet dolor. Etiam faucibus viverra sodales. Pellentesque ullamcorper magna libero, non malesuada dui bibendum quis. Donec sed dolor non sem luctus volutpat. Morbi vel diam ut urna euismod gravida a id lectus. Vestibulum vel mauris eu tellus hendrerit dapibus. Etiam scelerisque lacus vel ante ultricies vulputate. In ullamcorper malesuada justo, vel scelerisque nisl lacinia at. Donec sodales interdum ipsum, ac bibendum ipsum pharetra interdum. Vivamus condimentum ac ante vel aliquam. Ut consectetur eu nibh nec gravida. Vestibulum accumsan, purus at mollis rutrum, sapien tortor accumsan purus, vitae fermentum urna mauris ut lacus. Fusce vitae leo sollicitudin, vehicula turpis eu, tempus nibh.",

View File

@@ -136,4 +136,4 @@ texts:
prov: []
self_ref: '#/texts/7'
text: The end!
version: 1.6.0
version: 1.7.0

View File

@@ -1,3 +1,3 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: table with [2x2]
item-2 at level 1: paragraph:
item-2 at level 1: text:

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "table_with_equations",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -37,7 +37,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -69,7 +69,8 @@
"text": "The next cell has an equation",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -81,7 +82,8 @@
"text": "$A= \\pi r^{2}$",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -93,7 +95,8 @@
"text": "The next cell has another equation",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -105,7 +108,8 @@
"text": "$x=\\frac{-b \\pm \\sqrt{b^{2}-4ac}}{2a}$",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
"num_rows": 2,
@@ -122,7 +126,8 @@
"text": "The next cell has an equation",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -134,7 +139,8 @@
"text": "$A= \\pi r^{2}$",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
[
@@ -148,7 +154,8 @@
"text": "The next cell has another equation",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -160,7 +167,8 @@
"text": "$x=\\frac{-b \\pm \\sqrt{b^{2}-4ac}}{2a}$",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
]
]

View File

@@ -2,9 +2,9 @@ item-0 at level 0: unspecified: group _root_
item-1 at level 1: list: group list
item-2 at level 2: list_item: Hello world1
item-3 at level 2: list_item: Hello2
item-4 at level 1: paragraph:
item-5 at level 1: paragraph: Some text before
item-4 at level 1: text:
item-5 at level 1: text: Some text before
item-6 at level 1: table with [3x3]
item-7 at level 1: paragraph:
item-8 at level 1: paragraph:
item-9 at level 1: paragraph: Some text after
item-7 at level 1: text:
item-8 at level 1: text:
item-9 at level 1: text: Some text after

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "tablecell",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -112,7 +112,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -124,7 +124,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Some text before",
"text": "Some text before",
@@ -143,7 +143,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -155,7 +155,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -167,7 +167,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Some text after",
"text": "Some text after",
@@ -206,7 +206,8 @@
"text": "Tab1",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -218,7 +219,8 @@
"text": "Tab2",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -230,7 +232,8 @@
"text": "Tab3",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -242,7 +245,8 @@
"text": "A",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -254,7 +258,8 @@
"text": "B",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -266,7 +271,8 @@
"text": "C",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -278,7 +284,8 @@
"text": "D",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -290,7 +297,8 @@
"text": "E",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -302,7 +310,8 @@
"text": "F",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
"num_rows": 3,
@@ -319,7 +328,8 @@
"text": "Tab1",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -331,7 +341,8 @@
"text": "Tab2",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -343,7 +354,8 @@
"text": "Tab3",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
[
@@ -357,7 +369,8 @@
"text": "A",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -369,7 +382,8 @@
"text": "B",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -381,7 +395,8 @@
"text": "C",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
[
@@ -395,7 +410,8 @@
"text": "D",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -407,7 +423,8 @@
"text": "E",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -419,7 +436,8 @@
"text": "F",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
]
]

View File

@@ -1,8 +1,8 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: Test with three images in unusual formats
item-2 at level 1: paragraph: Raster in emf:
item-1 at level 1: text: Test with three images in unusual formats
item-2 at level 1: text: Raster in emf:
item-3 at level 1: picture
item-4 at level 1: paragraph: Vector in emf:
item-4 at level 1: text: Vector in emf:
item-5 at level 1: picture
item-6 at level 1: paragraph: Raster in webp:
item-6 at level 1: text: Raster in webp:
item-7 at level 1: picture

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "test_emf_docx",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -52,7 +52,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Test with three images in unusual formats",
"text": "Test with three images in unusual formats",
@@ -71,7 +71,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Raster in emf:",
"text": "Raster in emf:",
@@ -90,7 +90,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Vector in emf:",
"text": "Vector in emf:",
@@ -109,7 +109,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Raster in webp:",
"text": "Raster in webp:",

View File

@@ -1,90 +1,90 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: Chiayi County Shuishang Township ... mentary School Affiliated Kindergarten
item-2 at level 1: paragraph: Infectious Disease Reporting Pro ... r the 113th Academic Year Kindergarten
item-3 at level 1: paragraph:
item-1 at level 1: text: Chiayi County Shuishang Township ... mentary School Affiliated Kindergarten
item-2 at level 1: text: Infectious Disease Reporting Pro ... r the 113th Academic Year Kindergarten
item-3 at level 1: text:
item-4 at level 1: section: group textbox
item-5 at level 2: paragraph: Student falls ill
item-6 at level 2: paragraph:
item-5 at level 2: text: Student falls ill
item-6 at level 2: text:
item-7 at level 2: list: group list
item-8 at level 3: list_item: Suggested Reportable Symptoms:
... sh
Blisters
Headache
Sore throat
item-9 at level 1: paragraph:
item-10 at level 1: paragraph:
item-9 at level 1: text:
item-10 at level 1: text:
item-11 at level 1: section: group textbox
item-12 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms
item-13 at level 1: paragraph:
item-14 at level 1: paragraph:
item-15 at level 1: paragraph:
item-16 at level 1: paragraph:
item-12 at level 2: text: If a caregiver suspects that wit ... the same suggested reportable symptoms
item-13 at level 1: text:
item-14 at level 1: text:
item-15 at level 1: text:
item-16 at level 1: text:
item-17 at level 1: section: group textbox
item-18 at level 2: paragraph: Yes
item-19 at level 1: paragraph:
item-20 at level 1: paragraph:
item-18 at level 2: text: Yes
item-19 at level 1: text:
item-20 at level 1: text:
item-21 at level 1: section: group textbox
item-22 at level 2: list: group list
item-23 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network.
item-24 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System.
item-25 at level 2: paragraph:
item-25 at level 2: text:
item-26 at level 1: list: group list
item-27 at level 1: paragraph:
item-28 at level 1: paragraph:
item-29 at level 1: paragraph:
item-30 at level 1: paragraph:
item-31 at level 1: paragraph:
item-27 at level 1: text:
item-28 at level 1: text:
item-29 at level 1: text:
item-30 at level 1: text:
item-31 at level 1: text:
item-32 at level 1: section: group textbox
item-33 at level 2: paragraph: Health Bureau:
item-34 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
item-33 at level 2: text: Health Bureau:
item-34 at level 2: text: Upon receiving a report from the ... rt to the Centers for Disease Control.
item-35 at level 2: list: group list
item-36 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
item-37 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act.
item-38 at level 2: paragraph:
item-38 at level 2: text:
item-39 at level 1: list: group list
item-40 at level 1: paragraph:
item-40 at level 1: text:
item-41 at level 1: section: group textbox
item-42 at level 2: paragraph: Department of Education:
item-42 at level 2: text: Department of Education:
Collabo ... vention measures at all school levels.
item-43 at level 1: paragraph:
item-44 at level 1: paragraph:
item-45 at level 1: paragraph:
item-46 at level 1: paragraph:
item-47 at level 1: paragraph:
item-48 at level 1: paragraph:
item-49 at level 1: paragraph:
item-43 at level 1: text:
item-44 at level 1: text:
item-45 at level 1: text:
item-46 at level 1: text:
item-47 at level 1: text:
item-48 at level 1: text:
item-49 at level 1: text:
item-50 at level 1: section: group textbox
item-51 at level 2: inline: group group
item-52 at level 3: paragraph: The Health Bureau will handle
item-53 at level 3: paragraph: reporting and specimen collection
item-54 at level 3: paragraph: .
item-55 at level 2: paragraph:
item-56 at level 1: paragraph:
item-57 at level 1: paragraph:
item-58 at level 1: paragraph:
item-52 at level 3: text: The Health Bureau will handle
item-53 at level 3: text: reporting and specimen collection
item-54 at level 3: text: .
item-55 at level 2: text:
item-56 at level 1: text:
item-57 at level 1: text:
item-58 at level 1: text:
item-59 at level 1: section: group textbox
item-60 at level 2: paragraph: Whether the epidemic has eased.
item-61 at level 2: paragraph:
item-62 at level 1: paragraph:
item-60 at level 2: text: Whether the epidemic has eased.
item-61 at level 2: text:
item-62 at level 1: text:
item-63 at level 1: section: group textbox
item-64 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
item-65 at level 2: paragraph: No
item-66 at level 1: paragraph:
item-67 at level 1: paragraph:
item-64 at level 2: text: Whether the test results are pos ... legally designated infectious disease.
item-65 at level 2: text: No
item-66 at level 1: text:
item-67 at level 1: text:
item-68 at level 1: section: group textbox
item-69 at level 2: paragraph: Yes
item-70 at level 1: paragraph:
item-69 at level 2: text: Yes
item-70 at level 1: text:
item-71 at level 1: section: group textbox
item-72 at level 2: paragraph: Yes
item-73 at level 1: paragraph:
item-74 at level 1: paragraph:
item-72 at level 2: text: Yes
item-73 at level 1: text:
item-74 at level 1: text:
item-75 at level 1: section: group textbox
item-76 at level 2: paragraph: Case closed.
item-77 at level 2: paragraph:
item-78 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
item-79 at level 1: paragraph:
item-76 at level 2: text: Case closed.
item-77 at level 2: text:
item-78 at level 2: text: The Health Bureau will carry out ... ters for Disease Control if necessary.
item-79 at level 1: text:
item-80 at level 1: section: group textbox
item-81 at level 2: paragraph: No
item-82 at level 1: paragraph:
item-83 at level 1: paragraph:
item-84 at level 1: paragraph:
item-81 at level 2: text: No
item-82 at level 1: text:
item-83 at level 1: text:
item-84 at level 1: text:

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "textbox",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -491,7 +491,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Chiayi County Shuishang Township Nanjing Elementary School Affiliated Kindergarten",
"text": "Chiayi County Shuishang Township Nanjing Elementary School Affiliated Kindergarten",
@@ -510,7 +510,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Infectious Disease Reporting Procedure for the 113th Academic Year Kindergarten",
"text": "Infectious Disease Reporting Procedure for the 113th Academic Year Kindergarten",
@@ -529,7 +529,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -541,7 +541,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Student falls ill",
"text": "Student falls ill",
@@ -560,7 +560,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -593,7 +593,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -605,7 +605,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -617,7 +617,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "If a caregiver suspects that within one week, a fifth of the class (for classes with more than 15 students) or more than three students (for classes with 15 or fewer students)\nshow the same suggested reportable symptoms",
"text": "If a caregiver suspects that within one week, a fifth of the class (for classes with more than 15 students) or more than three students (for classes with 15 or fewer students)\nshow the same suggested reportable symptoms",
@@ -636,7 +636,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -648,7 +648,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -660,7 +660,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -672,7 +672,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -684,7 +684,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Yes",
"text": "Yes",
@@ -703,7 +703,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -715,7 +715,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -769,7 +769,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -781,7 +781,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -793,7 +793,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -805,7 +805,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -817,7 +817,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -829,7 +829,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -841,7 +841,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Health Bureau:",
"text": "Health Bureau:",
@@ -860,7 +860,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Upon receiving a report from the kindergarten, conduct a preliminary assessment of the case, and depending on the situation and type of illness, carry out an epidemiological investigation and report to the Centers for Disease Control.",
"text": "Upon receiving a report from the kindergarten, conduct a preliminary assessment of the case, and depending on the situation and type of illness, carry out an epidemiological investigation and report to the Centers for Disease Control.",
@@ -921,7 +921,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -933,7 +933,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -945,7 +945,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.",
"text": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.",
@@ -964,7 +964,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -976,7 +976,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -988,7 +988,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -1000,7 +1000,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -1012,7 +1012,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -1024,7 +1024,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -1036,7 +1036,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -1048,7 +1048,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "The Health Bureau will handle",
"text": "The Health Bureau will handle",
@@ -1067,7 +1067,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "reporting and specimen collection",
"text": "reporting and specimen collection",
@@ -1086,7 +1086,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": ".",
"text": ".",
@@ -1105,7 +1105,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -1117,7 +1117,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -1129,7 +1129,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -1141,7 +1141,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -1153,7 +1153,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Whether the epidemic has eased.",
"text": "Whether the epidemic has eased.",
@@ -1172,7 +1172,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -1184,7 +1184,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -1196,7 +1196,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Whether the test results are positive for a legally designated infectious disease.",
"text": "Whether the test results are positive for a legally designated infectious disease.",
@@ -1215,7 +1215,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "No",
"text": "No",
@@ -1234,7 +1234,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -1246,7 +1246,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -1258,7 +1258,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Yes",
"text": "Yes",
@@ -1277,7 +1277,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -1289,7 +1289,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Yes",
"text": "Yes",
@@ -1308,7 +1308,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -1320,7 +1320,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -1332,7 +1332,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Case closed.",
"text": "Case closed.",
@@ -1351,7 +1351,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -1363,7 +1363,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary.",
"text": "The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary.",
@@ -1382,7 +1382,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -1394,7 +1394,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "No",
"text": "No",
@@ -1413,7 +1413,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -1425,7 +1425,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -1437,7 +1437,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""

View File

@@ -1,18 +1,18 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: italic
item-2 at level 1: paragraph: bold
item-3 at level 1: paragraph: underline
item-4 at level 1: paragraph: hyperlink
item-5 at level 1: paragraph: italic and bold hyperlink
item-1 at level 1: text: italic
item-2 at level 1: text: bold
item-3 at level 1: text: underline
item-4 at level 1: text: hyperlink
item-5 at level 1: text: italic and bold hyperlink
item-6 at level 1: inline: group group
item-7 at level 2: paragraph: Normal
item-8 at level 2: paragraph: italic
item-9 at level 2: paragraph: bold
item-10 at level 2: paragraph: underline
item-11 at level 2: paragraph: and
item-12 at level 2: paragraph: hyperlink
item-13 at level 2: paragraph: on the same line
item-14 at level 1: paragraph:
item-7 at level 2: text: Normal
item-8 at level 2: text: italic
item-9 at level 2: text: bold
item-10 at level 2: text: underline
item-11 at level 2: text: and
item-12 at level 2: text: hyperlink
item-13 at level 2: text: on the same line
item-14 at level 1: text:
item-15 at level 1: list: group list
item-16 at level 2: list_item: Italic bullet 1
item-17 at level 2: list_item: Bold bullet 2
@@ -29,4 +29,4 @@ item-0 at level 0: unspecified: group _root_
item-28 at level 5: text: Nested
item-29 at level 5: text: italic
item-30 at level 5: text: bold
item-31 at level 1: paragraph:
item-31 at level 1: text:

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "unit_test_formatting",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -174,7 +174,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "italic",
"text": "italic",
@@ -193,7 +193,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "bold",
"text": "bold",
@@ -212,7 +212,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "underline",
"text": "underline",
@@ -231,7 +231,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "hyperlink",
"text": "hyperlink",
@@ -251,7 +251,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "italic and bold hyperlink",
"text": "italic and bold hyperlink",
@@ -271,7 +271,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Normal",
"text": "Normal",
@@ -290,7 +290,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "italic",
"text": "italic",
@@ -309,7 +309,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "bold",
"text": "bold",
@@ -328,7 +328,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "underline",
"text": "underline",
@@ -347,7 +347,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "and",
"text": "and",
@@ -366,7 +366,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "hyperlink",
"text": "hyperlink",
@@ -386,7 +386,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "on the same line",
"text": "on the same line",
@@ -405,7 +405,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -649,7 +649,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""

View File

@@ -1,48 +1,48 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Test Document
item-2 at level 2: paragraph:
item-2 at level 2: text:
item-3 at level 2: section_header: Section 1
item-4 at level 3: paragraph:
item-5 at level 3: paragraph: Paragraph 1.1
item-6 at level 3: paragraph:
item-7 at level 3: paragraph: Paragraph 1.2
item-8 at level 3: paragraph:
item-4 at level 3: text:
item-5 at level 3: text: Paragraph 1.1
item-6 at level 3: text:
item-7 at level 3: text: Paragraph 1.2
item-8 at level 3: text:
item-9 at level 3: section_header: Section 1.1
item-10 at level 4: paragraph:
item-11 at level 4: paragraph: Paragraph 1.1.1
item-12 at level 4: paragraph:
item-13 at level 4: paragraph: Paragraph 1.1.2
item-14 at level 4: paragraph:
item-10 at level 4: text:
item-11 at level 4: text: Paragraph 1.1.1
item-12 at level 4: text:
item-13 at level 4: text: Paragraph 1.1.2
item-14 at level 4: text:
item-15 at level 3: section_header: Section 1.2
item-16 at level 4: paragraph:
item-17 at level 4: paragraph: Paragraph 1.1.1
item-18 at level 4: paragraph:
item-19 at level 4: paragraph: Paragraph 1.1.2
item-20 at level 4: paragraph:
item-16 at level 4: text:
item-17 at level 4: text: Paragraph 1.1.1
item-18 at level 4: text:
item-19 at level 4: text: Paragraph 1.1.2
item-20 at level 4: text:
item-21 at level 4: section_header: Section 1.2.3
item-22 at level 5: paragraph:
item-23 at level 5: paragraph: Paragraph 1.2.3.1
item-24 at level 5: paragraph:
item-25 at level 5: paragraph: Paragraph 1.2.3.1
item-26 at level 5: paragraph:
item-27 at level 5: paragraph:
item-22 at level 5: text:
item-23 at level 5: text: Paragraph 1.2.3.1
item-24 at level 5: text:
item-25 at level 5: text: Paragraph 1.2.3.1
item-26 at level 5: text:
item-27 at level 5: text:
item-28 at level 2: section_header: Section 2
item-29 at level 3: paragraph:
item-30 at level 3: paragraph: Paragraph 2.1
item-31 at level 3: paragraph:
item-32 at level 3: paragraph: Paragraph 2.2
item-33 at level 3: paragraph:
item-29 at level 3: text:
item-30 at level 3: text: Paragraph 2.1
item-31 at level 3: text:
item-32 at level 3: text: Paragraph 2.2
item-33 at level 3: text:
item-34 at level 3: section: group header-2
item-35 at level 4: section_header: Section 2.1.1
item-36 at level 5: paragraph:
item-37 at level 5: paragraph: Paragraph 2.1.1.1
item-38 at level 5: paragraph:
item-39 at level 5: paragraph: Paragraph 2.1.1.1
item-40 at level 5: paragraph:
item-36 at level 5: text:
item-37 at level 5: text: Paragraph 2.1.1.1
item-38 at level 5: text:
item-39 at level 5: text: Paragraph 2.1.1.1
item-40 at level 5: text:
item-41 at level 3: section_header: Section 2.1
item-42 at level 4: paragraph:
item-43 at level 4: paragraph: Paragraph 2.1.1
item-44 at level 4: paragraph:
item-45 at level 4: paragraph: Paragraph 2.1.2
item-46 at level 4: paragraph:
item-47 at level 4: paragraph:
item-42 at level 4: text:
item-43 at level 4: text: Paragraph 2.1.1
item-44 at level 4: text:
item-45 at level 4: text: Paragraph 2.1.2
item-46 at level 4: text:
item-47 at level 4: text:

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "unit_test_headers",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -71,7 +71,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -118,7 +118,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -130,7 +130,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 1.1",
"text": "Paragraph 1.1",
@@ -149,7 +149,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -161,7 +161,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 1.2",
"text": "Paragraph 1.2",
@@ -180,7 +180,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -221,7 +221,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -233,7 +233,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 1.1.1",
"text": "Paragraph 1.1.1",
@@ -252,7 +252,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -264,7 +264,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 1.1.2",
"text": "Paragraph 1.1.2",
@@ -283,7 +283,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -327,7 +327,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -339,7 +339,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 1.1.1",
"text": "Paragraph 1.1.1",
@@ -358,7 +358,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -370,7 +370,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 1.1.2",
"text": "Paragraph 1.1.2",
@@ -389,7 +389,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -433,7 +433,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -445,7 +445,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 1.2.3.1",
"text": "Paragraph 1.2.3.1",
@@ -464,7 +464,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -476,7 +476,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 1.2.3.1",
"text": "Paragraph 1.2.3.1",
@@ -495,7 +495,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -507,7 +507,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -554,7 +554,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -566,7 +566,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 2.1",
"text": "Paragraph 2.1",
@@ -585,7 +585,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -597,7 +597,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 2.2",
"text": "Paragraph 2.2",
@@ -616,7 +616,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -657,7 +657,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -669,7 +669,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 2.1.1.1",
"text": "Paragraph 2.1.1.1",
@@ -688,7 +688,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -700,7 +700,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 2.1.1.1",
"text": "Paragraph 2.1.1.1",
@@ -719,7 +719,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -763,7 +763,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -775,7 +775,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 2.1.1",
"text": "Paragraph 2.1.1",
@@ -794,7 +794,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -806,7 +806,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 2.1.2",
"text": "Paragraph 2.1.2",
@@ -825,7 +825,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -837,7 +837,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""

View File

@@ -1,52 +1,52 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Test Document
item-2 at level 2: paragraph:
item-2 at level 2: text:
item-3 at level 2: section_header: 1 Section 1
item-4 at level 1: paragraph:
item-5 at level 1: paragraph: Paragraph 1.1
item-6 at level 1: paragraph:
item-7 at level 1: paragraph: Paragraph 1.2
item-8 at level 1: paragraph:
item-4 at level 1: text:
item-5 at level 1: text: Paragraph 1.1
item-6 at level 1: text:
item-7 at level 1: text: Paragraph 1.2
item-8 at level 1: text:
item-9 at level 1: section: group header-0
item-10 at level 2: section: group header-1
item-11 at level 3: section_header: 1.1 Section 1.1
item-12 at level 4: paragraph:
item-13 at level 4: paragraph: Paragraph 1.1.1
item-14 at level 4: paragraph:
item-15 at level 4: paragraph: Paragraph 1.1.2
item-16 at level 4: paragraph:
item-12 at level 4: text:
item-13 at level 4: text: Paragraph 1.1.1
item-14 at level 4: text:
item-15 at level 4: text: Paragraph 1.1.2
item-16 at level 4: text:
item-17 at level 3: section_header: 1.2 Section 1.2
item-18 at level 4: paragraph:
item-19 at level 4: paragraph: Paragraph 1.1.1
item-20 at level 4: paragraph:
item-21 at level 4: paragraph: Paragraph 1.1.2
item-22 at level 4: paragraph:
item-18 at level 4: text:
item-19 at level 4: text: Paragraph 1.1.1
item-20 at level 4: text:
item-21 at level 4: text: Paragraph 1.1.2
item-22 at level 4: text:
item-23 at level 4: section_header: 1.2.1 Section 1.2.3
item-24 at level 5: paragraph:
item-25 at level 5: paragraph: Paragraph 1.2.3.1
item-26 at level 5: paragraph:
item-27 at level 5: paragraph: Paragraph 1.2.3.1
item-28 at level 5: paragraph:
item-29 at level 5: paragraph:
item-24 at level 5: text:
item-25 at level 5: text: Paragraph 1.2.3.1
item-26 at level 5: text:
item-27 at level 5: text: Paragraph 1.2.3.1
item-28 at level 5: text:
item-29 at level 5: text:
item-30 at level 2: section_header: 2 Section 2
item-31 at level 1: paragraph:
item-32 at level 1: paragraph: Paragraph 2.1
item-33 at level 1: paragraph:
item-34 at level 1: paragraph: Paragraph 2.2
item-35 at level 1: paragraph:
item-31 at level 1: text:
item-32 at level 1: text: Paragraph 2.1
item-33 at level 1: text:
item-34 at level 1: text: Paragraph 2.2
item-35 at level 1: text:
item-36 at level 1: section: group header-0
item-37 at level 2: section: group header-1
item-38 at level 3: section: group header-2
item-39 at level 4: section_header: 2.1.1 Section 2.1.1
item-40 at level 5: paragraph:
item-41 at level 5: paragraph: Paragraph 2.1.1.1
item-42 at level 5: paragraph:
item-43 at level 5: paragraph: Paragraph 2.1.1.1
item-44 at level 5: paragraph:
item-40 at level 5: text:
item-41 at level 5: text: Paragraph 2.1.1.1
item-42 at level 5: text:
item-43 at level 5: text: Paragraph 2.1.1.1
item-44 at level 5: text:
item-45 at level 3: section_header: 2.2 Section 2.1
item-46 at level 4: paragraph:
item-47 at level 4: paragraph: Paragraph 2.1.1
item-48 at level 4: paragraph:
item-49 at level 4: paragraph: Paragraph 2.1.2
item-50 at level 4: paragraph:
item-51 at level 4: paragraph:
item-46 at level 4: text:
item-47 at level 4: text: Paragraph 2.1.1
item-48 at level 4: text:
item-49 at level 4: text: Paragraph 2.1.2
item-50 at level 4: text:
item-51 at level 4: text:

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "unit_test_headers_numbered",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -169,7 +169,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -194,7 +194,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -206,7 +206,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 1.1",
"text": "Paragraph 1.1",
@@ -225,7 +225,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -237,7 +237,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 1.2",
"text": "Paragraph 1.2",
@@ -256,7 +256,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -297,7 +297,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -309,7 +309,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 1.1.1",
"text": "Paragraph 1.1.1",
@@ -328,7 +328,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -340,7 +340,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 1.1.2",
"text": "Paragraph 1.1.2",
@@ -359,7 +359,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -403,7 +403,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -415,7 +415,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 1.1.1",
"text": "Paragraph 1.1.1",
@@ -434,7 +434,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -446,7 +446,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 1.1.2",
"text": "Paragraph 1.1.2",
@@ -465,7 +465,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -509,7 +509,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -521,7 +521,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 1.2.3.1",
"text": "Paragraph 1.2.3.1",
@@ -540,7 +540,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -552,7 +552,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 1.2.3.1",
"text": "Paragraph 1.2.3.1",
@@ -571,7 +571,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -583,7 +583,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -608,7 +608,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -620,7 +620,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 2.1",
"text": "Paragraph 2.1",
@@ -639,7 +639,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -651,7 +651,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 2.2",
"text": "Paragraph 2.2",
@@ -670,7 +670,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -711,7 +711,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -723,7 +723,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 2.1.1.1",
"text": "Paragraph 2.1.1.1",
@@ -742,7 +742,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -754,7 +754,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 2.1.1.1",
"text": "Paragraph 2.1.1.1",
@@ -773,7 +773,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -817,7 +817,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -829,7 +829,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 2.1.1",
"text": "Paragraph 2.1.1",
@@ -848,7 +848,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -860,7 +860,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 2.1.2",
"text": "Paragraph 2.1.2",
@@ -879,7 +879,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -891,7 +891,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""

View File

@@ -1,25 +1,25 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group header-0
item-2 at level 2: section_header: Test Document
item-3 at level 3: paragraph:
item-4 at level 3: paragraph:
item-5 at level 3: paragraph: Paragraph 2.1.1
item-6 at level 3: paragraph:
item-7 at level 3: paragraph: Paragraph 2.1.2
item-8 at level 3: paragraph:
item-3 at level 3: text:
item-4 at level 3: text:
item-5 at level 3: text: Paragraph 2.1.1
item-6 at level 3: text:
item-7 at level 3: text: Paragraph 2.1.2
item-8 at level 3: text:
item-9 at level 3: section: group header-2
item-10 at level 4: section_header: Test 1:
item-11 at level 5: list: group list
item-12 at level 6: list_item: List item 1
item-13 at level 6: list_item: List item 2
item-14 at level 6: list_item: List item 3
item-15 at level 5: paragraph:
item-15 at level 5: text:
item-16 at level 4: section_header: Test 2:
item-17 at level 5: list: group list
item-18 at level 6: list_item: List item a
item-19 at level 6: list_item: List item b
item-20 at level 6: list_item: List item c
item-21 at level 5: paragraph:
item-21 at level 5: text:
item-22 at level 4: section_header: Test 3:
item-23 at level 5: list: group list
item-24 at level 6: list_item: List item 1
@@ -29,14 +29,14 @@ item-0 at level 0: unspecified: group _root_
item-28 at level 7: list_item: List item 1.2
item-29 at level 7: list_item: List item 1.3
item-30 at level 6: list_item: List item 3
item-31 at level 5: paragraph:
item-31 at level 5: text:
item-32 at level 4: section_header: Test 4:
item-33 at level 5: list: group list
item-34 at level 6: list_item: List item 1
item-35 at level 6: list: group list
item-36 at level 7: list_item: List item 1.1
item-37 at level 6: list_item: List item 2
item-38 at level 5: paragraph:
item-38 at level 5: text:
item-39 at level 4: section_header: Test 5:
item-40 at level 5: list: group list
item-41 at level 6: list_item: List item 1
@@ -45,7 +45,7 @@ item-0 at level 0: unspecified: group _root_
item-44 at level 7: list: group list
item-45 at level 8: list_item: List item 1.1.1
item-46 at level 6: list_item: List item 3
item-47 at level 5: paragraph:
item-47 at level 5: text:
item-48 at level 4: section_header: Test 6:
item-49 at level 5: list: group list
item-50 at level 6: list_item: List item 1
@@ -56,6 +56,6 @@ item-0 at level 0: unspecified: group _root_
item-55 at level 7: list: group list
item-56 at level 8: list_item: List item 1.2.1
item-57 at level 6: list_item: List item 3
item-58 at level 5: paragraph:
item-59 at level 5: paragraph:
item-60 at level 5: paragraph:
item-58 at level 5: text:
item-59 at level 5: text:
item-60 at level 5: text:

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "unit_test_lists",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -338,7 +338,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -350,7 +350,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -362,7 +362,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 2.1.1",
"text": "Paragraph 2.1.1",
@@ -381,7 +381,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -393,7 +393,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Paragraph 2.1.2",
"text": "Paragraph 2.1.2",
@@ -412,7 +412,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -507,7 +507,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -602,7 +602,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -760,7 +760,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -855,7 +855,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -971,7 +971,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -1135,7 +1135,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -1147,7 +1147,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -1159,7 +1159,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""

View File

@@ -0,0 +1,66 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group WebVTT cue block
item-2 at level 2: text: 00:11.000 --> 00:13.000
item-3 at level 2: inline: group WebVTT cue voice span
item-4 at level 3: text: Roger Bingham:
item-5 at level 3: text: We are in New York City
item-6 at level 1: section: group WebVTT cue block
item-7 at level 2: text: 00:13.000 --> 00:16.000
item-8 at level 2: inline: group WebVTT cue voice span
item-9 at level 3: text: Roger Bingham:
item-10 at level 3: text: Were actually at the Lucern Hotel, just down the street
item-11 at level 1: section: group WebVTT cue block
item-12 at level 2: text: 00:16.000 --> 00:18.000
item-13 at level 2: inline: group WebVTT cue voice span
item-14 at level 3: text: Roger Bingham:
item-15 at level 3: text: from the American Museum of Natural History
item-16 at level 1: section: group WebVTT cue block
item-17 at level 2: text: 00:18.000 --> 00:20.000
item-18 at level 2: inline: group WebVTT cue voice span
item-19 at level 3: text: Roger Bingham:
item-20 at level 3: text: And with me is Neil deGrasse Tyson
item-21 at level 1: section: group WebVTT cue block
item-22 at level 2: text: 00:20.000 --> 00:22.000
item-23 at level 2: inline: group WebVTT cue voice span
item-24 at level 3: text: Roger Bingham:
item-25 at level 3: text: Astrophysicist, Director of the Hayden Planetarium
item-26 at level 1: section: group WebVTT cue block
item-27 at level 2: text: 00:22.000 --> 00:24.000
item-28 at level 2: inline: group WebVTT cue voice span
item-29 at level 3: text: Roger Bingham:
item-30 at level 3: text: at the AMNH.
item-31 at level 1: section: group WebVTT cue block
item-32 at level 2: text: 00:24.000 --> 00:26.000
item-33 at level 2: inline: group WebVTT cue voice span
item-34 at level 3: text: Roger Bingham:
item-35 at level 3: text: Thank you for walking down here.
item-36 at level 1: section: group WebVTT cue block
item-37 at level 2: text: 00:27.000 --> 00:30.000
item-38 at level 2: inline: group WebVTT cue voice span
item-39 at level 3: text: Roger Bingham:
item-40 at level 3: text: And I want to do a follow-up on the last conversation we did.
item-41 at level 1: section: group WebVTT cue block
item-42 at level 2: text: 00:30.000 --> 00:31.500
item-43 at level 2: inline: group WebVTT cue voice span
item-44 at level 3: text: Roger Bingham:
item-45 at level 3: text: When we e-mailed—
item-46 at level 1: section: group WebVTT cue block
item-47 at level 2: text: 00:30.500 --> 00:32.500
item-48 at level 2: inline: group WebVTT cue voice span
item-49 at level 3: text: Neil deGrasse Tyson:
item-50 at level 3: text: Didnt we talk about enough in that conversation?
item-51 at level 1: section: group WebVTT cue block
item-52 at level 2: text: 00:32.000 --> 00:35.500
item-53 at level 2: inline: group WebVTT cue voice span
item-54 at level 3: text: Roger Bingham:
item-55 at level 3: text: No! No no no no; 'cos 'cos obviously 'cos
item-56 at level 1: section: group WebVTT cue block
item-57 at level 2: text: 00:32.500 --> 00:33.500
item-58 at level 2: inline: group WebVTT cue voice span
item-59 at level 3: text: Neil deGrasse Tyson:
item-60 at level 3: text: Laughs
item-61 at level 1: section: group WebVTT cue block
item-62 at level 2: text: 00:35.500 --> 00:38.000
item-63 at level 2: inline: group WebVTT cue voice span
item-64 at level 3: text: Roger Bingham:
item-65 at level 3: text: You know Im so excited my glasses are falling off here.

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,51 @@
00:11.000 --> 00:13.000
Roger Bingham: We are in New York City
00:13.000 --> 00:16.000
Roger Bingham: Were actually at the Lucern Hotel, just down the street
00:16.000 --> 00:18.000
Roger Bingham: from the American Museum of Natural History
00:18.000 --> 00:20.000
Roger Bingham: And with me is Neil deGrasse Tyson
00:20.000 --> 00:22.000
Roger Bingham: Astrophysicist, Director of the Hayden Planetarium
00:22.000 --> 00:24.000
Roger Bingham: at the AMNH.
00:24.000 --> 00:26.000
Roger Bingham: Thank you for walking down here.
00:27.000 --> 00:30.000
Roger Bingham: And I want to do a follow-up on the last conversation we did.
00:30.000 --> 00:31.500
Roger Bingham: When we e-mailed—
00:30.500 --> 00:32.500
Neil deGrasse Tyson: Didnt we talk about enough in that conversation?
00:32.000 --> 00:35.500
Roger Bingham: No! No no no no; 'cos 'cos obviously 'cos
00:32.500 --> 00:33.500
Neil deGrasse Tyson: *Laughs*
00:35.500 --> 00:38.000
Roger Bingham: You know Im so excited my glasses are falling off here.

View File

@@ -0,0 +1,22 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group WebVTT cue block
item-2 at level 2: text: 00:00.000 --> 00:02.000
item-3 at level 2: inline: group WebVTT cue voice span
item-4 at level 3: text: Esme (first, loud):
item-5 at level 3: text: Its a blue apple tree!
item-6 at level 1: section: group WebVTT cue block
item-7 at level 2: text: 00:02.000 --> 00:04.000
item-8 at level 2: inline: group WebVTT cue voice span
item-9 at level 3: text: Mary:
item-10 at level 3: text: No way!
item-11 at level 1: section: group WebVTT cue block
item-12 at level 2: text: 00:04.000 --> 00:06.000
item-13 at level 2: inline: group WebVTT cue voice span
item-14 at level 3: text: Esme:
item-15 at level 3: text: Hee!
item-16 at level 2: text: laughter
item-17 at level 1: section: group WebVTT cue block
item-18 at level 2: text: 00:06.000 --> 00:08.000
item-19 at level 2: inline: group WebVTT cue voice span
item-20 at level 3: text: Mary (loud):
item-21 at level 3: text: Thats awesome!

View File

@@ -0,0 +1,376 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"name": "webvtt_example_02",
"origin": {
"mimetype": "text/vtt",
"binary_hash": 12867774546881601731,
"filename": "webvtt_example_02.vtt"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/groups/0"
},
{
"$ref": "#/groups/2"
},
{
"$ref": "#/groups/4"
},
{
"$ref": "#/groups/6"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/groups/1"
}
],
"content_layer": "body",
"name": "WebVTT cue block",
"label": "section"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/groups/0"
},
"children": [
{
"$ref": "#/texts/1"
},
{
"$ref": "#/texts/2"
}
],
"content_layer": "body",
"name": "WebVTT cue voice span",
"label": "inline"
},
{
"self_ref": "#/groups/2",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/3"
},
{
"$ref": "#/groups/3"
}
],
"content_layer": "body",
"name": "WebVTT cue block",
"label": "section"
},
{
"self_ref": "#/groups/3",
"parent": {
"$ref": "#/groups/2"
},
"children": [
{
"$ref": "#/texts/4"
},
{
"$ref": "#/texts/5"
}
],
"content_layer": "body",
"name": "WebVTT cue voice span",
"label": "inline"
},
{
"self_ref": "#/groups/4",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/6"
},
{
"$ref": "#/groups/5"
},
{
"$ref": "#/texts/9"
}
],
"content_layer": "body",
"name": "WebVTT cue block",
"label": "section"
},
{
"self_ref": "#/groups/5",
"parent": {
"$ref": "#/groups/4"
},
"children": [
{
"$ref": "#/texts/7"
},
{
"$ref": "#/texts/8"
}
],
"content_layer": "body",
"name": "WebVTT cue voice span",
"label": "inline"
},
{
"self_ref": "#/groups/6",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/10"
},
{
"$ref": "#/groups/7"
}
],
"content_layer": "body",
"name": "WebVTT cue block",
"label": "section"
},
{
"self_ref": "#/groups/7",
"parent": {
"$ref": "#/groups/6"
},
"children": [
{
"$ref": "#/texts/11"
},
{
"$ref": "#/texts/12"
}
],
"content_layer": "body",
"name": "WebVTT cue voice span",
"label": "inline"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "00:00.000 --> 00:02.000",
"text": "00:00.000 --> 00:02.000"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Esme (first, loud): ",
"text": "Esme (first, loud): "
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Its a blue apple tree!",
"text": "Its a blue apple tree!",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "00:02.000 --> 00:04.000",
"text": "00:02.000 --> 00:04.000"
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/groups/3"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Mary: ",
"text": "Mary: "
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/groups/3"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "No way!",
"text": "No way!",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/6",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "00:04.000 --> 00:06.000",
"text": "00:04.000 --> 00:06.000"
},
{
"self_ref": "#/texts/7",
"parent": {
"$ref": "#/groups/5"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Esme: ",
"text": "Esme: "
},
{
"self_ref": "#/texts/8",
"parent": {
"$ref": "#/groups/5"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Hee!",
"text": "Hee!",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/9",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "laughter",
"text": "laughter",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/10",
"parent": {
"$ref": "#/groups/6"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "00:06.000 --> 00:08.000",
"text": "00:06.000 --> 00:08.000"
},
{
"self_ref": "#/texts/11",
"parent": {
"$ref": "#/groups/7"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Mary (loud): ",
"text": "Mary (loud): "
},
{
"self_ref": "#/texts/12",
"parent": {
"$ref": "#/groups/7"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "Thats awesome!",
"text": "Thats awesome!",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@@ -0,0 +1,17 @@
00:00.000 --> 00:02.000
Esme (first, loud): Its a blue apple tree!
00:02.000 --> 00:04.000
Mary: No way!
00:04.000 --> 00:06.000
Esme: Hee!
*laughter*
00:06.000 --> 00:08.000
Mary (loud): Thats awesome!

View File

@@ -0,0 +1,77 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group WebVTT cue block
item-2 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
item-3 at level 2: text: 00:00:04.963 --> 00:00:08.571
item-4 at level 2: inline: group WebVTT cue voice span
item-5 at level 3: text: Speaker A:
item-6 at level 3: text: OK, I think now we should be recording
item-7 at level 1: section: group WebVTT cue block
item-8 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
item-9 at level 2: text: 00:00:08.571 --> 00:00:09.403
item-10 at level 2: inline: group WebVTT cue voice span
item-11 at level 3: text: Speaker A:
item-12 at level 3: text: properly.
item-13 at level 1: section: group WebVTT cue block
item-14 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
item-15 at level 2: text: 00:00:10.683 --> 00:00:11.563
item-16 at level 2: text: Good.
item-17 at level 1: section: group WebVTT cue block
item-18 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
item-19 at level 2: text: 00:00:13.363 --> 00:00:13.803
item-20 at level 2: inline: group WebVTT cue voice span
item-21 at level 3: text: Speaker A:
item-22 at level 3: text: Yeah.
item-23 at level 1: section: group WebVTT cue block
item-24 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
item-25 at level 2: text: 00:00:49.603 --> 00:00:53.363
item-26 at level 2: inline: group WebVTT cue voice span
item-27 at level 3: text: Speaker B:
item-28 at level 3: text: I was also thinking.
item-29 at level 1: section: group WebVTT cue block
item-30 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
item-31 at level 2: text: 00:00:54.963 --> 00:01:02.072
item-32 at level 2: inline: group WebVTT cue voice span
item-33 at level 3: text: Speaker B:
item-34 at level 3: text: Would be maybe good to create items,
item-35 at level 1: section: group WebVTT cue block
item-36 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
item-37 at level 2: text: 00:01:02.072 --> 00:01:06.811
item-38 at level 2: inline: group WebVTT cue voice span
item-39 at level 3: text: Speaker B:
item-40 at level 3: text: some metadata, some options that can be specific.
item-41 at level 1: section: group WebVTT cue block
item-42 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
item-43 at level 2: text: 00:01:10.243 --> 00:01:13.014
item-44 at level 2: inline: group WebVTT cue voice span
item-45 at level 3: text: Speaker A:
item-46 at level 3: text: Yeah, I mean I think you went even more than
item-47 at level 1: section: group WebVTT cue block
item-48 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
item-49 at level 2: text: 00:01:10.563 --> 00:01:12.643
item-50 at level 2: inline: group WebVTT cue voice span
item-51 at level 3: text: Speaker B:
item-52 at level 3: text: But we preserved the atoms.
item-53 at level 1: section: group WebVTT cue block
item-54 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
item-55 at level 2: text: 00:01:13.014 --> 00:01:15.907
item-56 at level 2: inline: group WebVTT cue voice span
item-57 at level 3: text: Speaker A:
item-58 at level 3: text: than me. I just opened the format.
item-59 at level 1: section: group WebVTT cue block
item-60 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
item-61 at level 2: text: 00:01:50.222 --> 00:01:51.643
item-62 at level 2: inline: group WebVTT cue voice span
item-63 at level 3: text: Speaker A:
item-64 at level 3: text: give it a try, yeah.
item-65 at level 1: section: group WebVTT cue block
item-66 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
item-67 at level 2: text: 00:01:52.043 --> 00:01:55.043
item-68 at level 2: inline: group WebVTT cue voice span
item-69 at level 3: text: Speaker B:
item-70 at level 3: text: Okay, talk to you later.
item-71 at level 1: section: group WebVTT cue block
item-72 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
item-73 at level 2: text: 00:01:54.603 --> 00:01:55.283
item-74 at level 2: inline: group WebVTT cue voice span
item-75 at level 3: text: Speaker A:
item-76 at level 3: text: See you.

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,77 @@
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
00:00:04.963 --> 00:00:08.571
Speaker A: OK, I think now we should be recording
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
00:00:08.571 --> 00:00:09.403
Speaker A: properly.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
00:00:10.683 --> 00:00:11.563
Good.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
00:00:13.363 --> 00:00:13.803
Speaker A: Yeah.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
00:00:49.603 --> 00:00:53.363
Speaker B: I was also thinking.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
00:00:54.963 --> 00:01:02.072
Speaker B: Would be maybe good to create items,
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
00:01:02.072 --> 00:01:06.811
Speaker B: some metadata, some options that can be specific.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
00:01:10.243 --> 00:01:13.014
Speaker A: Yeah, I mean I think you went even more than
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
00:01:10.563 --> 00:01:12.643
Speaker B: But we preserved the atoms.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
00:01:13.014 --> 00:01:15.907
Speaker A: than me. I just opened the format.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
00:01:50.222 --> 00:01:51.643
Speaker A: give it a try, yeah.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
00:01:52.043 --> 00:01:55.043
Speaker B: Okay, talk to you later.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
00:01:54.603 --> 00:01:55.283
Speaker A: See you.

View File

@@ -1,16 +1,16 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: Transcript
item-2 at level 1: paragraph: February 20, 2025, 8:32PM
item-1 at level 1: text: Transcript
item-2 at level 1: text: February 20, 2025, 8:32PM
item-3 at level 1: picture
item-4 at level 1: inline: group group
item-5 at level 2: paragraph: This is test 1
item-6 at level 2: paragraph: 0:08
item-5 at level 2: text: This is test 1
item-6 at level 2: text: 0:08
Correct, he is not.
item-7 at level 1: paragraph:
item-7 at level 1: text:
item-8 at level 1: picture
item-9 at level 1: inline: group group
item-10 at level 2: paragraph: This is test 2
item-11 at level 2: paragraph: 0:16
item-10 at level 2: text: This is test 2
item-11 at level 2: text: 0:16
Yeah, exactly.
item-12 at level 1: paragraph:
item-13 at level 1: paragraph:
item-12 at level 1: text:
item-13 at level 1: text:

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "word_image_anchors",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -93,7 +93,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Transcript",
"text": "Transcript",
@@ -112,7 +112,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "February 20, 2025, 8:32PM",
"text": "February 20, 2025, 8:32PM",
@@ -131,7 +131,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "This is test 1",
"text": "This is test 1",
@@ -150,7 +150,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "0:08\nCorrect, he is not.",
"text": "0:08\nCorrect, he is not.",
@@ -169,7 +169,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -181,7 +181,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "This is test 2",
"text": "This is test 2",
@@ -200,7 +200,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "0:16\nYeah, exactly.",
"text": "0:16\nYeah, exactly.",
@@ -219,7 +219,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -231,7 +231,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""

View File

@@ -1,28 +1,28 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: Summer activities
item-1 at level 1: text: Summer activities
item-2 at level 1: title: Swimming in the lake
item-3 at level 2: paragraph: Duck
item-3 at level 2: text: Duck
item-4 at level 2: picture
item-5 at level 2: paragraph: Figure 1: This is a cute duckling
item-5 at level 2: text: Figure 1: This is a cute duckling
item-6 at level 2: section_header: Lets swim!
item-7 at level 3: paragraph: To get started with swimming, fi ... down in a water and try not to drown:
item-7 at level 3: text: To get started with swimming, fi ... down in a water and try not to drown:
item-8 at level 3: list: group list
item-9 at level 4: list_item: You can relax and look around
item-10 at level 4: list_item: Paddle about
item-11 at level 4: list_item: Enjoy summer warmth
item-12 at level 3: paragraph: Also, dont forget:
item-12 at level 3: text: Also, dont forget:
item-13 at level 3: list: group list
item-14 at level 4: list_item: Wear sunglasses
item-15 at level 4: list_item: Dont forget to drink water
item-16 at level 4: list_item: Use sun cream
item-17 at level 3: paragraph: Hmm, what else…
item-17 at level 3: text: Hmm, what else…
item-18 at level 3: section_header: Lets eat
item-19 at level 4: paragraph: After we had a good day of swimm ... , its important to eat something nice
item-20 at level 4: paragraph: I like to eat leaves
item-21 at level 4: paragraph: Here are some interesting things a respectful duck could eat:
item-19 at level 4: text: After we had a good day of swimm ... , its important to eat something nice
item-20 at level 4: text: I like to eat leaves
item-21 at level 4: text: Here are some interesting things a respectful duck could eat:
item-22 at level 4: table with [4x3]
item-23 at level 4: paragraph:
item-24 at level 4: paragraph: And lets add another list in the end:
item-23 at level 4: text:
item-24 at level 4: text: And lets add another list in the end:
item-25 at level 4: list: group list
item-26 at level 5: list_item: Leaves
item-27 at level 5: list_item: Berries

View File

@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.6.0",
"version": "1.7.0",
"name": "word_sample",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -98,7 +98,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Summer activities",
"text": "Summer activities",
@@ -142,7 +142,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Duck",
"text": "Duck",
@@ -161,7 +161,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Figure 1: This is a cute duckling",
"text": "Figure 1: This is a cute duckling",
@@ -212,7 +212,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "To get started with swimming, first lay down in a water and try not to drown:",
"text": "To get started with swimming, first lay down in a water and try not to drown:",
@@ -294,7 +294,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Also, dont forget:",
"text": "Also, dont forget:",
@@ -376,7 +376,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Hmm, what else…",
"text": "Hmm, what else…",
@@ -430,7 +430,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "After we had a good day of swimming in the lake, its important to eat something nice",
"text": "After we had a good day of swimming in the lake, its important to eat something nice",
@@ -449,7 +449,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "I like to eat leaves",
"text": "I like to eat leaves",
@@ -468,7 +468,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Here are some interesting things a respectful duck could eat:",
"text": "Here are some interesting things a respectful duck could eat:",
@@ -487,7 +487,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "",
"text": ""
@@ -499,7 +499,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "And lets add another list in the end:",
"text": "And lets add another list in the end:",
@@ -625,7 +625,8 @@
"text": "",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -637,7 +638,8 @@
"text": "Food",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -649,7 +651,8 @@
"text": "Calories per portion",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -661,7 +664,8 @@
"text": "Leaves",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -673,7 +677,8 @@
"text": "Ash, Elm, Maple",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -685,7 +690,8 @@
"text": "50",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -697,7 +703,8 @@
"text": "Berries",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -709,7 +716,8 @@
"text": "Blueberry, Strawberry, Cranberry",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -721,7 +729,8 @@
"text": "150",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -733,7 +742,8 @@
"text": "Grain",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -745,7 +755,8 @@
"text": "Corn, Buckwheat, Barley",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -757,7 +768,8 @@
"text": "200",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
"num_rows": 4,
@@ -774,7 +786,8 @@
"text": "",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -786,7 +799,8 @@
"text": "Food",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -798,7 +812,8 @@
"text": "Calories per portion",
"column_header": true,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
[
@@ -812,7 +827,8 @@
"text": "Leaves",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -824,7 +840,8 @@
"text": "Ash, Elm, Maple",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -836,7 +853,8 @@
"text": "50",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
[
@@ -850,7 +868,8 @@
"text": "Berries",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -862,7 +881,8 @@
"text": "Blueberry, Strawberry, Cranberry",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -874,7 +894,8 @@
"text": "150",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
],
[
@@ -888,7 +909,8 @@
"text": "Grain",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -900,7 +922,8 @@
"text": "Corn, Buckwheat, Barley",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
},
{
"row_span": 1,
@@ -912,7 +935,8 @@
"text": "200",
"column_header": false,
"row_header": false,
"row_section": false
"row_section": false,
"fillable": false
}
]
]

View File

@@ -1,19 +1,19 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group header-0
item-2 at level 2: section_header: Test with tables
item-3 at level 3: paragraph: A uniform table
item-3 at level 3: text: A uniform table
item-4 at level 3: table with [3x3]
item-5 at level 3: paragraph:
item-6 at level 3: paragraph: A non-uniform table with horizontal spans
item-5 at level 3: text:
item-6 at level 3: text: A non-uniform table with horizontal spans
item-7 at level 3: table with [3x3]
item-8 at level 3: paragraph:
item-9 at level 3: paragraph: A non-uniform table with horizontal spans in inner columns
item-8 at level 3: text:
item-9 at level 3: text: A non-uniform table with horizontal spans in inner columns
item-10 at level 3: table with [3x4]
item-11 at level 3: paragraph:
item-12 at level 3: paragraph: A non-uniform table with vertical spans
item-11 at level 3: text:
item-12 at level 3: text: A non-uniform table with vertical spans
item-13 at level 3: table with [5x3]
item-14 at level 3: paragraph:
item-15 at level 3: paragraph: A non-uniform table with all kinds of spans and empty cells
item-14 at level 3: text:
item-15 at level 3: text: A non-uniform table with all kinds of spans and empty cells
item-16 at level 3: table with [9x5]
item-17 at level 3: paragraph:
item-18 at level 3: paragraph:
item-17 at level 3: text:
item-18 at level 3: text:

File diff suppressed because it is too large Load Diff

33
tests/data/md/escaped_characters.md vendored Normal file
View File

@@ -0,0 +1,33 @@
# Headers:
## &amp; &lt; &gt; &quot; &#39;
Text:
00:16.000 ----&gt; 00:18.000
&amp; &lt; &gt; &quot; &#39;
# Lists
1. &amp; &lt; &gt; &quot; &#39;
- &amp; &lt; &gt; &quot; &#39;
# Inline code
`&amp; &lt; &gt; &quot; &#39; `
# Code block
```
&amp; &lt; &gt; &quot; &#39;
```
# Table
| Key | Example |
| ------------------- | ----------------- |
| Ampersand | &amp; |
| Less-than | &lt; |
| Greater-than | &gt; |
| Quotes | &quot; |
| Apostrophes | &#39; |
# Raw HTML
<div title="">&amp; &lt; &gt; &quot; &#39;/div>
## Link
[&amp; &lt; &gt; &quot; &#39;](https://en.wikipedia.org/wiki/Albert_Einstein)

42
tests/data/webvtt/webvtt_example_01.vtt vendored Normal file
View File

@@ -0,0 +1,42 @@
WEBVTT
NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
00:11.000 --> 00:13.000
<v Roger Bingham>We are in New York City
00:13.000 --> 00:16.000
<v Roger Bingham>Were actually at the Lucern Hotel, just down the street
00:16.000 --> 00:18.000
<v Roger Bingham>from the American Museum of Natural History
00:18.000 --> 00:20.000
<v Roger Bingham>And with me is Neil deGrasse Tyson
00:20.000 --> 00:22.000
<v Roger Bingham>Astrophysicist, Director of the Hayden Planetarium
00:22.000 --> 00:24.000
<v Roger Bingham>at the AMNH.
00:24.000 --> 00:26.000
<v Roger Bingham>Thank you for walking down here.
00:27.000 --> 00:30.000
<v Roger Bingham>And I want to do a follow-up on the last conversation we did.
00:30.000 --> 00:31.500 align:right size:50%
<v Roger Bingham>When we e-mailed—
00:30.500 --> 00:32.500 align:left size:50%
<v Neil deGrasse Tyson>Didnt we talk about enough in that conversation?
00:32.000 --> 00:35.500 align:right size:50%
<v Roger Bingham>No! No no no no; 'cos 'cos obviously 'cos
00:32.500 --> 00:33.500 align:left size:50%
<v Neil deGrasse Tyson><i>Laughs</i>
00:35.500 --> 00:38.000
<v Roger Bingham>You know Im so excited my glasses are falling off here.

15
tests/data/webvtt/webvtt_example_02.vtt vendored Normal file
View File

@@ -0,0 +1,15 @@
WEBVTT
NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
00:00.000 --> 00:02.000
<v.first.loud Esme>Its a blue apple tree!
00:02.000 --> 00:04.000
<v Mary>No way!
00:04.000 --> 00:06.000
<v Esme>Hee!</v> <i>laughter</i>
00:06.000 --> 00:08.000
<v.loud Mary>Thats awesome!

57
tests/data/webvtt/webvtt_example_03.vtt vendored Normal file
View File

@@ -0,0 +1,57 @@
WEBVTT
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
00:00:04.963 --> 00:00:08.571
<v Speaker A>OK,
I think now we should be recording</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
00:00:08.571 --> 00:00:09.403
<v Speaker A>properly.</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
00:00:10.683 --> 00:00:11.563
Good.
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
00:00:13.363 --> 00:00:13.803
<v Speaker A>Yeah.</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
00:00:49.603 --> 00:00:53.363
<v Speaker B>I was also thinking.</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
00:00:54.963 --> 00:01:02.072
<v Speaker B>Would be maybe good to create items,</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
00:01:02.072 --> 00:01:06.811
<v Speaker B>some metadata,
some options that can be specific.</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
00:01:10.243 --> 00:01:13.014
<v Speaker A>Yeah,
I mean I think you went even more than</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
00:01:10.563 --> 00:01:12.643
<v Speaker B>But we preserved the atoms.</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
00:01:13.014 --> 00:01:15.907
<v Speaker A>than me.
I just opened the format.</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
00:01:50.222 --> 00:01:51.643
<v Speaker A>give it a try, yeah.</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
00:01:52.043 --> 00:01:55.043
<v Speaker B>Okay, talk to you later.</v>
62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
00:01:54.603 --> 00:01:55.283
<v Speaker A>See you.</v>

View File

@@ -26,10 +26,12 @@ def test_convert_valid():
assert len(relevant_paths) > 0
yaml_filter = ["inline_and_formatting", "mixed_without_h1"]
json_filter = ["escaped_characters"]
for in_path in relevant_paths:
md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
yaml_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.yaml"
json_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.json"
in_doc = InputDocument(
path_or_stream=in_path,
@@ -45,6 +47,9 @@ def test_convert_valid():
act_doc = backend.convert()
act_data = act_doc.export_to_markdown()
if in_path.stem in json_filter:
assert verify_document(act_doc, json_gt_path, GENERATE), "export to json"
if GEN_TEST_DATA:
with open(md_gt_path, mode="w", encoding="utf-8") as f:
f.write(f"{act_data}\n")

232
tests/test_backend_vtt.py Normal file
View File

@@ -0,0 +1,232 @@
# Assisted by watsonx Code Assistant
from pathlib import Path
import pytest
from docling_core.types.doc import DoclingDocument
from pydantic import ValidationError
from docling.backend.webvtt_backend import (
_WebVTTCueItalicSpan,
_WebVTTCueTextSpan,
_WebVTTCueTimings,
_WebVTTCueVoiceSpan,
_WebVTTFile,
_WebVTTTimestamp,
)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter
from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document, verify_export
GENERATE = GEN_TEST_DATA
def test_vtt_cue_commponents():
    """Check validation and string rendering of the WebVTT cue models.

    Exercises ``_WebVTTTimestamp``, ``_WebVTTCueTimings``,
    ``_WebVTTCueTextSpan`` and ``_WebVTTCueVoiceSpan``: valid inputs must
    round-trip through ``str()``, invalid ones must raise ``ValidationError``.
    """
    # --- Timestamps: valid values and the seconds they encode ---------------
    valid_timestamps = [
        "00:01:02.345",
        "12:34:56.789",
        "02:34.567",
        "00:00:00.000",
    ]
    valid_total_seconds = [
        1 * 60 + 2.345,
        12 * 3600 + 34 * 60 + 56.789,
        2 * 60 + 34.567,
        0.0,
    ]
    for ts, expected_seconds in zip(valid_timestamps, valid_total_seconds):
        model = _WebVTTTimestamp(raw=ts)
        assert model.seconds == expected_seconds

    # --- Timestamps: malformed values must be rejected ----------------------
    invalid_timestamps = [
        "00:60:02.345",  # minutes > 59
        "00:01:60.345",  # seconds > 59
        "00:01:02.1000",  # milliseconds > 999
        "01:02:03",  # missing milliseconds
        "01:02",  # missing milliseconds
        ":01:02.345",  # extra : for missing hours
        "abc:01:02.345",  # invalid format
    ]
    for ts in invalid_timestamps:
        with pytest.raises(ValidationError):
            _WebVTTTimestamp(raw=ts)

    # --- Timestamps: __str__ returns the raw representation -----------------
    model = _WebVTTTimestamp(raw="00:01:02.345")
    assert str(model) == "00:01:02.345"

    # --- Cue timings: valid start/end pair ----------------------------------
    start = _WebVTTTimestamp(raw="00:10.005")
    end = _WebVTTTimestamp(raw="00:14.007")
    cue_timings = _WebVTTCueTimings(start=start, end=end)
    assert cue_timings.start == start
    assert cue_timings.end == end
    assert str(cue_timings) == "00:10.005 --> 00:14.007"

    # --- Cue timings: end timestamp before start ----------------------------
    start = _WebVTTTimestamp(raw="00:10.700")
    end = _WebVTTTimestamp(raw="00:10.500")
    with pytest.raises(ValidationError) as excinfo:
        _WebVTTCueTimings(start=start, end=end)
    # excinfo.value is only populated once the ``with`` block has exited.
    assert "End timestamp must be greater than start timestamp" in str(excinfo.value)

    # --- Cue timings: missing end -------------------------------------------
    start = _WebVTTTimestamp(raw="00:10.500")
    with pytest.raises(ValidationError) as excinfo:
        _WebVTTCueTimings(start=start)
    assert "Field required" in str(excinfo.value)

    # --- Cue timings: missing start -----------------------------------------
    end = _WebVTTTimestamp(raw="00:10.500")
    with pytest.raises(ValidationError) as excinfo:
        _WebVTTCueTimings(end=end)
    assert "Field required" in str(excinfo.value)

    # --- Text span: valid text ----------------------------------------------
    valid_text = "This is a valid cue text span."
    span = _WebVTTCueTextSpan(text=valid_text)
    assert span.text == valid_text
    assert str(span) == valid_text

    # --- Text span: newline characters are rejected -------------------------
    invalid_text = "This cue text span\ncontains a newline."
    with pytest.raises(ValidationError):
        _WebVTTCueTextSpan(text=invalid_text)

    # --- Text span: ampersand is rejected -----------------------------------
    invalid_text = "This cue text span contains &."
    with pytest.raises(ValidationError):
        _WebVTTCueTextSpan(text=invalid_text)

    # --- Text span: less-than sign is rejected ------------------------------
    invalid_text = "This cue text span contains <."
    with pytest.raises(ValidationError):
        _WebVTTCueTextSpan(text=invalid_text)

    # --- Text span: empty text is rejected ----------------------------------
    with pytest.raises(ValidationError):
        _WebVTTCueTextSpan(text="")

    # --- Voice span: annotation validation ----------------------------------
    valid_annotation = "valid-annotation"
    invalid_annotation = "invalid\nannotation"
    with pytest.raises(ValidationError):
        _WebVTTCueVoiceSpan(annotation=invalid_annotation)
    assert _WebVTTCueVoiceSpan(annotation=valid_annotation)

    # --- Voice span: classes validation -------------------------------------
    annotation = "speaker name"
    valid_classes = ["class1", "class2"]
    invalid_classes = ["class\nwith\nnewlines", ""]
    with pytest.raises(ValidationError):
        _WebVTTCueVoiceSpan(annotation=annotation, classes=invalid_classes)
    assert _WebVTTCueVoiceSpan(annotation=annotation, classes=valid_classes)

    # --- Voice span: components validation ----------------------------------
    annotation = "speaker name"
    valid_components = [_WebVTTCueTextSpan(text="random text")]
    invalid_components = [123, "not a component"]
    with pytest.raises(ValidationError):
        _WebVTTCueVoiceSpan(annotation=annotation, components=invalid_components)
    assert _WebVTTCueVoiceSpan(annotation=annotation, components=valid_components)

    # --- Voice span: string rendering with and without classes --------------
    cue_span = _WebVTTCueVoiceSpan(
        annotation="speaker",
        classes=["loud", "clear"],
        components=[_WebVTTCueTextSpan(text="random text")],
    )
    expected_str = "<v.loud.clear speaker>random text</v>"
    assert str(cue_span) == expected_str

    cue_span = _WebVTTCueVoiceSpan(
        annotation="speaker",
        components=[_WebVTTCueTextSpan(text="random text")],
    )
    expected_str = "<v speaker>random text</v>"
    assert str(cue_span) == expected_str
def test_webvtt_file():
    """Parse the sample WebVTT files and inspect the resulting cue blocks.

    The fixture paths are relative, so the test expects to be run from the
    repository root.
    """
    # --- Example 01: voice spans, one of them wrapping an italic span -------
    with open("./tests/data/webvtt/webvtt_example_01.vtt", encoding="utf-8") as f:
        content = f.read()
    vtt = _WebVTTFile.parse(content)
    assert len(vtt) == 13
    block = vtt.cue_blocks[11]
    assert str(block.timings) == "00:32.500 --> 00:33.500"
    assert len(block.payload) == 1
    cue_span = block.payload[0]
    assert isinstance(cue_span, _WebVTTCueVoiceSpan)
    assert cue_span.annotation == "Neil deGrasse Tyson"
    assert not cue_span.classes
    assert len(cue_span.components) == 1
    comp = cue_span.components[0]
    assert isinstance(comp, _WebVTTCueItalicSpan)
    assert len(comp.components) == 1
    comp2 = comp.components[0]
    assert isinstance(comp2, _WebVTTCueTextSpan)
    assert comp2.text == "Laughs"

    # --- Example 02: serialising the cue blocks reproduces the file ---------
    with open("./tests/data/webvtt/webvtt_example_02.vtt", encoding="utf-8") as f:
        content = f.read()
    vtt = _WebVTTFile.parse(content)
    assert len(vtt) == 4
    # The header and NOTE block are prepended by hand; only the cue blocks
    # are rendered from the parsed model.
    reverse = (
        "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
        "https://www.w3.org/TR/webvtt1/\n\n"
    )
    reverse += "\n\n".join([str(block) for block in vtt.cue_blocks])
    assert content == reverse

    # --- Example 03: every cue block carries an identifier ------------------
    with open("./tests/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f:
        content = f.read()
    vtt = _WebVTTFile.parse(content)
    assert len(vtt) == 13
    for block in vtt:
        assert block.identifier
    block = vtt.cue_blocks[0]
    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
    assert str(block.timings) == "00:00:04.963 --> 00:00:08.571"
    assert len(block.payload) == 1
    assert isinstance(block.payload[0], _WebVTTCueVoiceSpan)

    # The third cue ("Good.") has no voice tag, so its payload is a plain
    # text span. A previous isinstance check here targeted a stale
    # ``cue_span`` from the example-01 section and verified nothing about
    # this block, so it has been dropped.
    block = vtt.cue_blocks[2]
    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
    assert str(block.timings) == "00:00:10.683 --> 00:00:11.563"
    assert len(block.payload) == 1
    assert isinstance(block.payload[0], _WebVTTCueTextSpan)
    assert block.payload[0].text == "Good."
def test_e2e_vtt_conversions():
    """Convert each sample ``.vtt`` file and compare exports to ground truth.

    For every file the markdown export, the indented-text export and the
    document itself are checked against ``groundtruth/docling_v2``.
    """
    data_dir = Path("./tests/data/webvtt/")
    sample_files = sorted(data_dir.rglob("*.vtt"))
    converter = DocumentConverter(allowed_formats=[InputFormat.VTT])

    for sample in sample_files:
        # Shared prefix of the ground-truth files; only the suffix differs.
        gt_prefix = str(
            sample.parent.parent / "groundtruth" / "docling_v2" / sample.name
        )

        result: ConversionResult = converter.convert(sample)
        document: DoclingDocument = result.document

        markdown: str = document.export_to_markdown(escape_html=False)
        md_matches = verify_export(markdown, gt_prefix + ".md", generate=GENERATE)
        assert md_matches, "export to md"

        indented: str = document._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        itxt_matches = verify_export(indented, gt_prefix + ".itxt", generate=GENERATE)
        assert itxt_matches, "export to indented-text"

        assert verify_document(document, gt_prefix + ".json", GENERATE)

View File

@@ -206,6 +206,11 @@ def test_guess_format(tmp_path):
doc_path.write_text("xyz", encoding="utf-8")
assert dci._guess_format(doc_path) is None
# Valid WebVTT
buf = BytesIO(Path("./tests/data/webvtt/webvtt_example_01.vtt").open("rb").read())
stream = DocumentStream(name="webvtt_example_01.vtt", stream=buf)
assert dci._guess_format(stream) == InputFormat.VTT
# Valid Docling JSON
test_str = '{"name": ""}'
stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode()))

10
uv.lock generated
View File

@@ -1049,7 +1049,7 @@ wheels = [
[[package]]
name = "docling"
version = "2.53.0"
version = "2.54.0"
source = { editable = "." }
dependencies = [
{ name = "accelerate" },
@@ -1154,7 +1154,7 @@ requires-dist = [
{ name = "accelerate", marker = "extra == 'vlm'", specifier = ">=1.2.1,<2.0.0" },
{ name = "beautifulsoup4", specifier = ">=4.12.3,<5.0.0" },
{ name = "certifi", specifier = ">=2024.7.4" },
{ name = "docling-core", extras = ["chunking"], specifier = ">=2.48.0,<3.0.0" },
{ name = "docling-core", extras = ["chunking"], specifier = ">=2.48.2,<3.0.0" },
{ name = "docling-ibm-models", specifier = ">=3.9.1,<4" },
{ name = "docling-parse", specifier = ">=4.4.0,<5.0.0" },
{ name = "easyocr", specifier = ">=1.7,<2.0" },
@@ -1233,7 +1233,7 @@ examples = [
[[package]]
name = "docling-core"
version = "2.48.1"
version = "2.48.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "jsonref" },
@@ -1247,9 +1247,9 @@ dependencies = [
{ name = "typer" },
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/f9/0c/dce7f80e99e56570d143885fc40536107e8a39ef4de2888959e055b39607/docling_core-2.48.1.tar.gz", hash = "sha256:48cb77575dfd020a51413957e96b165e45f6d1027c641710fddb389dcb9b189c", size = 161311, upload-time = "2025-09-11T12:33:22.46Z" }
sdist = { url = "https://files.pythonhosted.org/packages/dd/e6/922de61f2a7b7d337ffc781f8e85f5581b12801fe193827066ccd6c5ba04/docling_core-2.48.2.tar.gz", hash = "sha256:01c12a1d3c9877c6658d0d6adf5cdcefd56cb814d8083860ba2d77ab882ac2d0", size = 161344, upload-time = "2025-09-22T08:39:41.431Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/90/fe/1b96120c9d94c97016716ccf46ad2708a2e76157e52dfcca4101db70fc21/docling_core-2.48.1-py3-none-any.whl", hash = "sha256:a3985999ac2067e15e589ef0f11ccde264deacaea403c0f94049242f10a6189a", size = 164330, upload-time = "2025-09-11T12:33:20.935Z" },
{ url = "https://files.pythonhosted.org/packages/97/bc/a77739cc31d7de2be9d6682f880761083a2038355e513e813a73a041c644/docling_core-2.48.2-py3-none-any.whl", hash = "sha256:d1f2fe9be9a9f7e7a2fb6ddcc9d9fcbf437bfb02e0c6005cdec1ece1cf4aed44", size = 164376, upload-time = "2025-09-22T08:39:39.704Z" },
]
[package.optional-dependencies]