Merge from main

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-31 14:34:40 +00:00 · 2025-02-18 11:24:53 +01:00 · 2025-02-18 11:24:53 +01:00 · 8606b598dc
commit 8606b598dc
parent 48777b17fa 75db61127c
125 changed files with 25130 additions and 32303 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,32 @@
+## [v2.23.0](https://github.com/DS4SD/docling/releases/tag/v2.23.0) - 2025-02-17
+
+### Feature
+
+* Support cuda:n GPU device allocation ([#694](https://github.com/DS4SD/docling/issues/694)) ([`77eb77b`](https://github.com/DS4SD/docling/commit/77eb77bdc2c07b632a1d171826d1855a5218399e))
+* **xml-jats:** Parse XML JATS documents ([#967](https://github.com/DS4SD/docling/issues/967)) ([`428b656`](https://github.com/DS4SD/docling/commit/428b656793cb75d108c69f20c254be7c198cee5c))
+
+### Fix
+
+* Revise DocTags, fix iterate_items to output content_layer in items ([#965](https://github.com/DS4SD/docling/issues/965)) ([`6e75f0b`](https://github.com/DS4SD/docling/commit/6e75f0b5d3ee42738a80049d4cf2fa6d34e8ab97))
+
+## [v2.22.0](https://github.com/DS4SD/docling/releases/tag/v2.22.0) - 2025-02-14
+
+### Feature
+
+* Add support for CSV input with new backend to transform CSV files to DoclingDocument ([#945](https://github.com/DS4SD/docling/issues/945)) ([`00d9405`](https://github.com/DS4SD/docling/commit/00d9405b0ac519d321ae54e8150f5facbaabbe14))
+* Introduce the enable_remote_services option to allow remote connections while processing ([#941](https://github.com/DS4SD/docling/issues/941)) ([`2716c7d`](https://github.com/DS4SD/docling/commit/2716c7d4ffb836664178178d3f8d01b7f9112595))
+* Allow artifacts_path to be defined as ENV ([#940](https://github.com/DS4SD/docling/issues/940)) ([`5101e25`](https://github.com/DS4SD/docling/commit/5101e2519e7a5bb727531b1412b1131a7cfbda52))
+
+### Fix
+
+* Update Pillow constraints ([#958](https://github.com/DS4SD/docling/issues/958)) ([`af19c03`](https://github.com/DS4SD/docling/commit/af19c03f6e5e0b24e12d6a3baac6c46a4c8b10d1))
+* Fix the initialization of the TesseractOcrModel ([#935](https://github.com/DS4SD/docling/issues/935)) ([`c47ae70`](https://github.com/DS4SD/docling/commit/c47ae700ece2ea4efee17f82e4667c1ce9a0ed2a))
+
+### Documentation
+
+* Update example Dockerfile with download CLI ([#929](https://github.com/DS4SD/docling/issues/929)) ([`7493d5b`](https://github.com/DS4SD/docling/commit/7493d5b01f8be60294afeffdfb54a62bb74bcc92))
+* Examples for picture descriptions ([#951](https://github.com/DS4SD/docling/issues/951)) ([`2d66e99`](https://github.com/DS4SD/docling/commit/2d66e99b69f39a282109c366fae3679f41c6e081))
+
 ## [v2.21.0](https://github.com/DS4SD/docling/releases/tag/v2.21.0) - 2025-02-10

 ### Feature
--- a/6
+++ b/6
@ -16,8 +16,7 @@ ENV TORCH_HOME=/tmp/

 COPY docs/examples/minimal.py /root/minimal.py

-RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
-RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; StandardPdfPipeline.download_models_hf(force=True);'
+RUN docling-tools models download

 # On container environments, always set a thread budget to avoid undesired thread congestion.
 ENV OMP_NUM_THREADS=4
@ -25,3 +24,6 @@ ENV OMP_NUM_THREADS=4
 # On container shell:
 # > cd /root/
 # > python minimal.py
+
+# Running as `docker run -e DOCLING_ARTIFACTS_PATH=/root/.cache/docling/models` will use the
+# model weights included in the container image.
--- a/docling/backend/csv_backend.py
+++ b/docling/backend/csv_backend.py
@ -0,0 +1,125 @@
+import csv
+import logging
+import warnings
+from io import BytesIO, StringIO
+from pathlib import Path
+from typing import Set, Union
+
+from docling_core.types.doc import DoclingDocument, DocumentOrigin, TableCell, TableData
+
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class CsvDocumentBackend(DeclarativeDocumentBackend):
+    """Declarative backend that converts a CSV file into a single-table document.
+
+    The input is decoded as UTF-8 in the constructor and kept in ``content``.
+    ``convert`` sniffs the delimiter from the first line, parses all rows and
+    emits one table whose first row is flagged as column header.
+    """
+
+    content: StringIO
+
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        """Load and decode the CSV source.
+
+        Args:
+            in_doc: The input document descriptor handed to the base class.
+            path_or_stream: In-memory bytes or a filesystem path of the CSV file.
+
+        Raises:
+            RuntimeError: If the source cannot be read or decoded as UTF-8.
+        """
+        super().__init__(in_doc, path_or_stream)
+
+        # Start out invalid so that the flag is only raised once content has
+        # actually been loaded (e.g. not for an unsupported source type).
+        self.valid = False
+
+        # Load content
+        try:
+            if isinstance(self.path_or_stream, BytesIO):
+                text = self.path_or_stream.getvalue().decode("utf-8")
+            elif isinstance(self.path_or_stream, Path):
+                text = self.path_or_stream.read_text("utf-8")
+            else:
+                text = None
+            if text is not None:
+                self.content = StringIO(text)
+                self.valid = True
+        except Exception as e:
+            raise RuntimeError(
+                f"CsvDocumentBackend could not load document with hash {self.document_hash}"
+            ) from e
+        return
+
+    def is_valid(self) -> bool:
+        """Return whether the constructor managed to load the CSV content."""
+        return self.valid
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        """Report that this backend has no notion of pages."""
+        return False
+
+    def unload(self):
+        """Close the input stream (if it is a ``BytesIO``) and drop the reference."""
+        if isinstance(self.path_or_stream, BytesIO):
+            self.path_or_stream.close()
+        self.path_or_stream = None
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        """Return the set of input formats handled by this backend."""
+        return {InputFormat.CSV}
+
+    def convert(self) -> DoclingDocument:
+        """
+        Parses the CSV data into a structured document model.
+
+        Returns:
+            A document holding one table; the cells of the first CSV row are
+            marked as column headers.
+
+        Raises:
+            RuntimeError: If the backend failed to initialize or the sniffed
+                delimiter is not one of ``, ; \\t | :``.
+        """
+        # Check validity before touching `self.content`, which only exists
+        # when the constructor loaded the source successfully.
+        if not self.is_valid():
+            raise RuntimeError(
+                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
+            )
+
+        # Detect CSV dialect (rewind first so that convert() can be called
+        # more than once on the same backend)
+        self.content.seek(0)
+        head = self.content.readline()
+        dialect = csv.Sniffer().sniff(head, ",;\t|:")
+        _log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
+        if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
+            raise RuntimeError(
+                f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
+            )
+
+        # Parse CSV
+        self.content.seek(0)
+        result = csv.reader(self.content, dialect=dialect, strict=True)
+        self.csv_data = list(result)
+        _log.info(f"Detected {len(self.csv_data)} lines")
+
+        # Ensure uniform column length (only meaningful when rows exist)
+        if self.csv_data:
+            expected_length = len(self.csv_data[0])
+            is_uniform = all(len(row) == expected_length for row in self.csv_data)
+            if not is_uniform:
+                warnings.warn(
+                    f"Inconsistent column lengths detected in CSV data. "
+                    f"Expected {expected_length} columns, but found rows with varying lengths. "
+                    f"Ensure all rows have the same number of columns."
+                )
+
+        # Parse the CSV into a structured document model
+        origin = DocumentOrigin(
+            filename=self.file.name or "file.csv",
+            mimetype="text/csv",
+            binary_hash=self.document_hash,
+        )
+
+        doc = DoclingDocument(name=self.file.stem or "file.csv", origin=origin)
+
+        # Nothing to tabulate: return the empty document
+        if not self.csv_data:
+            return doc
+
+        # Convert CSV data to table
+        num_rows = len(self.csv_data)
+        num_cols = max(len(row) for row in self.csv_data)
+
+        table_data = TableData(
+            num_rows=num_rows,
+            num_cols=num_cols,
+            table_cells=[],
+        )
+
+        # Convert each cell to TableCell
+        for row_idx, row in enumerate(self.csv_data):
+            for col_idx, cell_value in enumerate(row):
+                cell = TableCell(
+                    text=str(cell_value),
+                    row_span=1,  # CSV doesn't support merged cells
+                    col_span=1,
+                    start_row_offset_idx=row_idx,
+                    end_row_offset_idx=row_idx + 1,
+                    start_col_offset_idx=col_idx,
+                    end_col_offset_idx=col_idx + 1,
+                    col_header=row_idx == 0,  # First row as header
+                    row_header=False,
+                )
+                table_data.table_cells.append(cell)
+
+        doc.add_table(data=table_data)
+
+        return doc
--- a/docling/backend/xml/jats_backend.py
+++ b/docling/backend/xml/jats_backend.py
@ -0,0 +1,772 @@
+import logging
+import traceback
+from io import BytesIO
+from pathlib import Path
+from typing import Final, Optional, Union
+
+from bs4 import BeautifulSoup
+from docling_core.types.doc import (
+    DocItemLabel,
+    DoclingDocument,
+    DocumentOrigin,
+    GroupItem,
+    GroupLabel,
+    NodeItem,
+    TableCell,
+    TableData,
+    TextItem,
+)
+from lxml import etree
+from typing_extensions import TypedDict, override
+
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+JATS_DTD_URL: Final = ["JATS-journalpublishing", "JATS-archive"]
+DEFAULT_HEADER_ACKNOWLEDGMENTS: Final = "Acknowledgments"
+DEFAULT_HEADER_ABSTRACT: Final = "Abstract"
+DEFAULT_HEADER_REFERENCES: Final = "References"
+DEFAULT_TEXT_ETAL: Final = "et al."
+
+
+class Abstract(TypedDict):
+    # Heading of the abstract (left empty when the XML provides none).
+    label: str
+    # Concatenated text of the abstract paragraphs.
+    content: str
+
+
+class Author(TypedDict):
+    # Display name of the author.
+    name: str
+    # Names of the affiliations referenced by the author.
+    affiliation_names: list[str]
+
+
+class Citation(TypedDict):
+    # Fields collected from a citation element before it is flattened to a
+    # single reference string; a field that is not found stays empty.
+    author_names: str
+    title: str
+    source: str
+    year: str
+    volume: str
+    page: str
+    pub_id: str
+    publisher_name: str
+    publisher_loc: str
+
+
+class Table(TypedDict):
+    # Table label text.
+    label: str
+    # Table caption text.
+    caption: str
+    # Serialized markup of the table element.
+    content: str
+
+
+class XMLComponents(TypedDict):
+    # Metadata parsed from the document: title, authors and abstracts.
+    title: str
+    authors: list[Author]
+    abstract: list[Abstract]
+
+
+class JatsDocumentBackend(DeclarativeDocumentBackend):
+    """Backend to parse articles in XML format tagged according to JATS definition.
+
+    The Journal Article Tag Suite (JATS) is a definition standard for the
+    representation of journal articles in XML format. Several publishers and journal
+    archives provide content in JATS format, including PubMed Central® (PMC), bioRxiv,
+    medRxiv, or Springer Nature.
+
+    Refer to https://jats.nlm.nih.gov for more details on JATS.
+
+    The code from this document backend has been developed by modifying parts of the
+    PubMed Parser library (version 0.5.0, released on 12.08.2024):
+    Achakulvisut et al., (2020).
+    Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML
+      Dataset XML Dataset.
+    Journal of Open Source Software, 5(46), 1979,
+    https://doi.org/10.21105/joss.01979
+    """
+
+    @override
+    def __init__(
+        self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
+    ) -> None:
+        """Parse the XML source and check whether it declares a JATS DTD.
+
+        The backend is marked valid when the system URL of the document type
+        declaration, or of any entity of the internal DTD subset, contains one
+        of the ``JATS_DTD_URL`` keywords.
+
+        Args:
+            in_doc: The input document descriptor handed to the base class.
+            path_or_stream: In-memory bytes or a filesystem path of the XML file.
+
+        Raises:
+            RuntimeError: If the source cannot be parsed as XML.
+        """
+        super().__init__(in_doc, path_or_stream)
+        self.path_or_stream = path_or_stream
+
+        # Initialize the root of the document hierarchy
+        self.root: Optional[NodeItem] = None
+
+        def declares_jats(url: Optional[str]) -> bool:
+            """Tell whether a system URL refers to one of the JATS DTDs."""
+            return bool(url) and any(kwd in url for kwd in JATS_DTD_URL)
+
+        self.valid = False
+        try:
+            if isinstance(self.path_or_stream, BytesIO):
+                self.path_or_stream.seek(0)
+            self.tree: etree._ElementTree = etree.parse(self.path_or_stream)
+
+            doc_info: etree.DocInfo = self.tree.docinfo
+            if declares_jats(doc_info.system_url):
+                self.valid = True
+                return
+            # A document without internal DTD subset has no entities to
+            # inspect; it is simply not recognized as JATS.
+            internal_dtd = doc_info.internalDTD
+            if internal_dtd is None:
+                return
+            for ent in internal_dtd.iterentities():
+                if declares_jats(ent.system_url):
+                    self.valid = True
+                    return
+        except Exception as exc:
+            raise RuntimeError(
+                f"Could not initialize JATS backend for file with hash {self.document_hash}."
+            ) from exc
+
+    @override
+    def is_valid(self) -> bool:
+        """Return the validity flag computed by the constructor."""
+        return self.valid
+
+    @classmethod
+    @override
+    def supports_pagination(cls) -> bool:
+        """Report that this backend has no notion of pages."""
+        return False
+
+    @override
+    def unload(self):
+        """Close the input stream (if it is a ``BytesIO``) and drop the reference."""
+        if isinstance(self.path_or_stream, BytesIO):
+            self.path_or_stream.close()
+        self.path_or_stream = None
+
+    @classmethod
+    @override
+    def supported_formats(cls) -> set[InputFormat]:
+        """Return the set of input formats handled by this backend."""
+        return {InputFormat.XML_JATS}
+
+    @override
+    def convert(self) -> DoclingDocument:
+        """Build a document from the parsed JATS tree.
+
+        Metadata (title, authors, abstracts) is added first, followed by the
+        content of the ``body`` and ``back`` elements. Parsing is best-effort:
+        any exception raised while filling the document is logged and the
+        document built so far is returned.
+
+        Returns:
+            The (possibly partially filled) document.
+        """
+        # Create the empty document outside of the try block, so that `doc` is
+        # always bound when the best-effort parsing below fails.
+        origin = DocumentOrigin(
+            filename=self.file.name or "file",
+            mimetype="application/xml",
+            binary_hash=self.document_hash,
+        )
+        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
+
+        try:
+            # Get metadata XML components
+            xml_components: XMLComponents = self._parse_metadata()
+
+            # Add metadata to the document
+            self._add_metadata(doc, xml_components)
+
+            # walk over the XML body
+            body = self.tree.xpath("//body")
+            if self.root and len(body) > 0:
+                self._walk_linear(doc, self.root, body[0])
+
+            # walk over the XML back matter
+            back = self.tree.xpath("//back")
+            if self.root and len(back) > 0:
+                self._walk_linear(doc, self.root, back[0])
+        except Exception:
+            _log.error(traceback.format_exc())
+
+        return doc
+
+    @staticmethod
+    def _get_text(node: etree._Element, sep: Optional[str] = None) -> str:
+        """Recursively collect the text of a node, with newlines turned into spaces.
+
+        The content of ``term``, ``disp-formula`` and ``inline-formula``
+        elements is left out, while the text following them is kept. When
+        ``sep`` is given, it is placed after each child element.
+        """
+        excluded = ("term", "disp-formula", "inline-formula")
+
+        collected: str = ""
+        if node.text and node.tag not in excluded:
+            collected = node.text.replace("\n", " ")
+
+        for element in node:
+            if element.tag not in excluded:
+                # TODO: apply styling according to child.tag when supported by docling-core
+                collected += JatsDocumentBackend._get_text(element, sep)
+            if sep:
+                collected = collected.rstrip(sep) + sep
+            if element.tail:
+                collected += element.tail.replace("\n", " ")
+
+        return collected
+
+    def _find_metadata(self) -> Optional[etree._Element]:
+        """Return the first ``article-meta`` (preferred) or ``book-part-meta`` node.
+
+        ``None`` is returned when the tree contains neither of them.
+        """
+        for tag in ("article-meta", "book-part-meta"):
+            matches = self.tree.xpath(f".//{tag}")
+            if matches:
+                return matches[0]
+
+        return None
+
+    def _parse_abstract(self) -> list[Abstract]:
+        """Collect label and paragraph text of every ``abstract`` element.
+
+        Returns:
+            One entry per abstract; the label stays empty when the abstract
+            has no ``title``/``label`` child or that child carries no text.
+        """
+        # TODO: address cases with multiple sections
+        abs_list: list[Abstract] = []
+
+        for abs_node in self.tree.xpath(".//abstract"):
+            abstract: Abstract = dict(label="", content="")
+            texts = [
+                JatsDocumentBackend._get_text(abs_par).strip()
+                for abs_par in abs_node.xpath("p")
+            ]
+            abstract["content"] = " ".join(texts)
+
+            label_node = abs_node.xpath("title|label")
+            # The text of an element is None when it is empty or starts with
+            # a child element, so guard before stripping.
+            if len(label_node) > 0 and label_node[0].text:
+                abstract["label"] = label_node[0].text.strip()
+
+            abs_list.append(abstract)
+
+        return abs_list
+
+    def _parse_authors(self) -> list[Author]:
+        """Extract author names and their affiliations from the metadata node.
+
+        Returns:
+            One entry per ``contrib`` element of type ``author`` that carries
+            a given name and/or a surname. Contributors without either are
+            skipped instead of aborting the parsing. The list is empty when
+            the document has no metadata node.
+        """
+        authors: list[Author] = []
+        meta: Optional[etree._Element] = self._find_metadata()
+        if meta is None:
+            return authors
+
+        # Get mapping between affiliation ids and names
+        affiliation_ids_names: dict[str, str] = {}
+        for affiliation_node in meta.xpath(".//aff[@id]"):
+            aff = ", ".join([t for t in affiliation_node.itertext() if t.strip()])
+            aff = aff.replace("\n", " ")
+            label = affiliation_node.xpath("label")
+            if label:
+                # TODO: once superscript is supported, add label with formatting
+                aff = aff.removeprefix(f"{label[0].text}, ")
+            affiliation_ids_names[affiliation_node.get("id")] = aff
+
+        # Get author names and affiliation names
+        for author_node in meta.xpath(
+            './/contrib-group/contrib[@contrib-type="author"]'
+        ):
+            # Name: use whichever of given names and surname is available
+            name_parts: list[str] = []
+            for path in ("name/given-names", "name/surname"):
+                found = author_node.xpath(path)
+                if found and found[0].text:
+                    name_parts.append(found[0].text)
+            if not name_parts:
+                _log.debug("Skipping author without given names and surname")
+                continue
+
+            author: Author = {
+                "name": " ".join(name_parts),
+                "affiliation_names": [],
+            }
+
+            # Affiliation names
+            affiliation_ids = [
+                a.attrib["rid"] for a in author_node.xpath('xref[@ref-type="aff"]')
+            ]
+            for aff_id in affiliation_ids:
+                if aff_id in affiliation_ids_names:
+                    author["affiliation_names"].append(affiliation_ids_names[aff_id])
+
+            authors.append(author)
+
+        return authors
+
+    def _parse_title(self) -> str:
+        """Build the document title from the ``title-group`` metadata elements.
+
+        The title parts of each title group are joined with a space, and the
+        title groups are joined with `` - ``. The parts are read with
+        ``_get_text`` so that titles containing inline markup are neither
+        truncated nor cause a failure on a missing leading text.
+        """
+        meta_names: list[str] = [
+            "article-meta",
+            "collection-meta",
+            "book-meta",
+            "book-part-meta",
+        ]
+        title_names: list[str] = ["article-title", "subtitle", "title", "label"]
+        titles: list[str] = [
+            " ".join(
+                JatsDocumentBackend._get_text(elem).strip()
+                for elem in list(title_node)
+                if elem.tag in title_names
+            ).strip()
+            for title_node in self.tree.xpath(
+                "|".join([f".//{item}/title-group" for item in meta_names])
+            )
+        ]
+
+        text = " - ".join(titles)
+
+        return text
+
+    def _parse_metadata(self) -> XMLComponents:
+        """Gather title, authors and abstracts of the JATS document."""
+        title = self._parse_title()
+        authors = self._parse_authors()
+        abstract = self._parse_abstract()
+        return {"title": title, "authors": authors, "abstract": abstract}
+
+    def _add_abstract(
+        self, doc: DoclingDocument, xml_components: XMLComponents
+    ) -> None:
+        """Add a heading plus a text item for every abstract that has content.
+
+        Abstracts without label get the default abstract heading.
+        """
+        for entry in xml_components["abstract"]:
+            body: str = entry["content"]
+            if not body:
+                continue
+            heading = doc.add_heading(
+                parent=self.root, text=entry["label"] or DEFAULT_HEADER_ABSTRACT
+            )
+            doc.add_text(parent=heading, text=body, label=DocItemLabel.TEXT)
+
+    def _add_authors(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
+        """Add one paragraph with the author names and one with their affiliations.
+
+        Names are separated by a comma; affiliations are de-duplicated in
+        order of first appearance and separated by a semicolon. Empty
+        paragraphs are not added.
+        """
+        # TODO: once docling supports text formatting, add affiliation reference to
+        # author names through superscripts
+        names: list = []
+        unique_affiliations: dict = {}
+        for entry in xml_components["authors"]:
+            names.append(entry["name"])
+            for affiliation in entry["affiliation_names"]:
+                unique_affiliations.setdefault(affiliation, None)
+
+        for paragraph in (", ".join(names), "; ".join(unique_affiliations)):
+            if paragraph:
+                doc.add_text(
+                    parent=self.root,
+                    text=paragraph,
+                    label=DocItemLabel.PARAGRAPH,
+                )
+
+    def _add_citation(self, doc: DoclingDocument, parent: NodeItem, text: str) -> None:
+        """Add a citation as list item under a list group, as plain text otherwise."""
+        inside_list = isinstance(parent, GroupItem) and parent.label == GroupLabel.LIST
+        if not inside_list:
+            doc.add_text(text=text, label=DocItemLabel.TEXT, parent=parent)
+            return
+
+        doc.add_list_item(text=text, enumerated=False, parent=parent)
+
+    def _parse_element_citation(self, node: etree._Element) -> str:
+        """Flatten a structured citation element into a single reference string.
+
+        Author names, title, source, publisher, volume, pages, year and
+        publication identifiers are collected from the children of ``node``
+        and concatenated in that order. Every piece is optional: a child that
+        is absent or carries no text is skipped.
+
+        Args:
+            node: The citation element to parse.
+
+        Returns:
+            The flattened citation text.
+        """
+
+        def clean(value: Optional[str]) -> str:
+            """Replace newlines by spaces and trim; a missing text becomes empty."""
+            return value.replace("\n", " ").strip() if value else ""
+
+        def first_text(scope: etree._Element, path: str) -> str:
+            """Return the cleaned text of the first match of path, if any."""
+            found = scope.xpath(path)
+            return clean(found[0].text) if len(found) > 0 else ""
+
+        citation: Citation = {
+            "author_names": "",
+            "title": "",
+            "source": "",
+            "year": "",
+            "volume": "",
+            "page": "",
+            "pub_id": "",
+            "publisher_name": "",
+            "publisher_loc": "",
+        }
+
+        _log.debug("Citation parsing started")
+
+        # Author names
+        names = []
+        for name_node in node.xpath(".//name"):
+            name_parts = [
+                first_text(name_node, "surname"),
+                first_text(name_node, "given-names"),
+            ]
+            name_str = " ".join(part for part in name_parts if part)
+            if name_str:
+                names.append(name_str)
+        etal_node = node.xpath(".//etal")
+        if len(etal_node) > 0:
+            etal_text = etal_node[0].text or DEFAULT_TEXT_ETAL
+            names.append(etal_text)
+        citation["author_names"] = ", ".join(names)
+
+        titles: list[str] = [
+            "article-title",
+            "chapter-title",
+            "data-title",
+            "issue-title",
+            "part-title",
+            "trans-title",
+        ]
+        title_node: Optional[etree._Element] = None
+        for name in titles:
+            name_node = node.xpath(name)
+            if len(name_node) > 0:
+                title_node = name_node[0]
+                break
+        # Without a title element, fall back to the leading text of the node
+        citation["title"] = (
+            JatsDocumentBackend._get_text(title_node)
+            if title_node is not None
+            else clean(node.text)
+        )
+
+        # Journal, year, publisher name, publisher location, volume, elocation
+        fields: list[str] = [
+            "source",
+            "year",
+            "publisher-name",
+            "publisher-loc",
+            "volume",
+        ]
+        for item in fields:
+            citation[item.replace("-", "_")] = first_text(node, item)  # type: ignore[literal-required]
+
+        # Publication identifier
+        pub_id: list[str] = []
+        for id_node in node.xpath("pub-id"):
+            id_type = id_node.get("assigning-authority") or id_node.get("pub-id-type")
+            id_text = id_node.text
+            if id_type and id_text:
+                pub_id.append(clean(id_type).upper() + ": " + clean(id_text))
+        if pub_id:
+            citation["pub_id"] = ", ".join(pub_id)
+
+        # Pages
+        if len(node.xpath("elocation-id")) > 0:
+            citation["page"] = first_text(node, "elocation-id")
+        elif len(node.xpath("fpage")) > 0:
+            citation["page"] = first_text(node, "fpage")
+            last_page = first_text(node, "lpage")
+            if last_page:
+                citation["page"] += "–" + last_page
+
+        # Flatten the citation to string
+
+        text = ""
+        if citation["author_names"]:
+            text += citation["author_names"].rstrip(".") + ". "
+        if citation["title"]:
+            text += citation["title"] + ". "
+        if citation["source"]:
+            text += citation["source"] + ". "
+        if citation["publisher_name"]:
+            if citation["publisher_loc"]:
+                text += f"{citation['publisher_loc']}: "
+            text += citation["publisher_name"] + ". "
+        if citation["volume"]:
+            text = text.rstrip(". ")
+            text += f" {citation['volume']}. "
+        if citation["page"]:
+            text = text.rstrip(". ")
+            if citation["volume"]:
+                text += ":"
+            text += citation["page"] + ". "
+        if citation["year"]:
+            text = text.rstrip(". ")
+            text += f" ({citation['year']})."
+        if citation["pub_id"]:
+            text = text.rstrip(".") + ". "
+            text += citation["pub_id"]
+
+        _log.debug("Citation flattened")
+
+        return text
+
+    def _add_equation(
+        self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
+    ) -> None:
+        """Add the formula enclosed in ``$$ ... $$`` in the node text, if any.
+
+        Nodes without text, or whose text does not consist of exactly one
+        ``$$``-delimited formula, are ignored.
+        """
+        math_text = node.text
+        # The text is None when the node is empty or starts with a child
+        # element; there is no formula to extract then.
+        if not math_text:
+            return
+
+        math_parts = math_text.split("$$")
+        if len(math_parts) == 3:
+            math_formula = math_parts[1]
+            doc.add_text(label=DocItemLabel.FORMULA, text=math_formula, parent=parent)
+
+        return
+
+    def _add_figure_captions(
+        self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
+    ) -> None:
+        """Add a picture item with a caption built from the figure label and caption.
+
+        Caption children that contain supplementary material are left out.
+        The caption item is only created when label or caption provide text.
+        """
+        label_node = node.xpath("label")
+        label: str = (
+            JatsDocumentBackend._get_text(label_node[0]).strip() if label_node else ""
+        )
+
+        # Default to an empty caption: formatting a missing caption as None
+        # would put the literal "None" into the caption text.
+        caption: str = ""
+        caption_node = node.xpath("caption")
+        if len(caption_node) > 0:
+            caption_parts = [
+                JatsDocumentBackend._get_text(caption_par).strip()
+                for caption_par in list(caption_node[0])
+                if not caption_par.xpath(".//supplementary-material")
+            ]
+            caption = " ".join(caption_parts).strip()
+
+        # TODO: format label vs caption once styling is supported
+        fig_text: str = f"{label}{' ' if label and caption else ''}{caption}"
+        fig_caption: Optional[TextItem] = (
+            doc.add_text(label=DocItemLabel.CAPTION, text=fig_text)
+            if fig_text
+            else None
+        )
+
+        doc.add_picture(parent=parent, caption=fig_caption)
+
+        return
+
+    # TODO: add footnotes when DocItemLabel.FOOTNOTE and styling are supported
+    # def _add_footnote_group(self, doc: DoclingDocument, parent: NodeItem, node: etree._Element) -> None:
+    #     new_parent = doc.add_group(label=GroupLabel.LIST, name="footnotes", parent=parent)
+    #     for child in node.iterchildren(tag="fn"):
+    #         text = JatsDocumentBackend._get_text(child)
+    #         doc.add_list_item(text=text, parent=new_parent)
+
+    def _add_metadata(
+        self, doc: DoclingDocument, xml_components: XMLComponents
+    ) -> None:
+        """Add title, authors and abstracts to the document, in this order."""
+        for add_part in (self._add_title, self._add_authors, self._add_abstract):
+            add_part(doc, xml_components)
+
+    def _add_table(
+        self, doc: DoclingDocument, parent: NodeItem, table_xml_component: Table
+    ) -> None:
+        """Convert the serialized table markup into a table item with caption.
+
+        Content without a ``table`` element and tables containing nested
+        tables are skipped with a warning. A row consisting only of ``th``
+        cells is treated as column header; a ``th`` cell in any other row is
+        treated as row header.
+
+        Args:
+            doc: The document to extend.
+            parent: The node under which the table is added.
+            table_xml_component: Label, caption and serialized table markup.
+        """
+        soup = BeautifulSoup(table_xml_component["content"], "html.parser")
+        table_tag = soup.find("table")
+        if table_tag is None:
+            _log.warning(f"Skipping table without content in {str(self.file)}")
+            return
+
+        nested_tables = table_tag.find("table")
+        if nested_tables:
+            _log.warning(f"Skipping nested table in {str(self.file)}")
+            return
+
+        # Count the number of rows (number of <tr> elements)
+        rows = table_tag.find_all("tr")
+        num_rows = len(rows)
+
+        # Find the number of columns (taking into account colspan)
+        num_cols = 0
+        for row in rows:
+            col_count = sum(
+                int(cell.get("colspan", 1)) for cell in row.find_all(["td", "th"])
+            )
+            num_cols = max(num_cols, col_count)
+
+        # Occupancy grid to find the next free column in presence of spans
+        grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
+
+        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
+
+        # Iterate over the rows in the table
+        for row_idx, row in enumerate(rows):
+            # For each row, find all the column cells (both <td> and <th>)
+            cells = row.find_all(["td", "th"])
+
+            # Check if each cell in the row is a header -> means it is a column header
+            col_header = all(html_cell.name != "td" for html_cell in cells)
+
+            # Extract the text content of each cell
+            col_idx = 0
+            for html_cell in cells:
+                # extract inline formulas
+                for formula in html_cell.find_all("inline-formula"):
+                    math_parts = formula.text.split("$$")
+                    if len(math_parts) == 3:
+                        math_formula = f"$${math_parts[1]}$$"
+                        formula.replace_with(math_formula)
+                text = html_cell.text
+
+                col_span = int(html_cell.get("colspan", 1))
+                row_span = int(html_cell.get("rowspan", 1))
+
+                # Skip the columns taken by cells spanning from previous rows
+                while grid[row_idx][col_idx] is not None:
+                    col_idx += 1
+                for r in range(row_span):
+                    for c in range(col_span):
+                        grid[row_idx + r][col_idx + c] = text
+
+                cell = TableCell(
+                    text=text,
+                    row_span=row_span,
+                    col_span=col_span,
+                    start_row_offset_idx=row_idx,
+                    end_row_offset_idx=row_idx + row_span,
+                    start_col_offset_idx=col_idx,
+                    end_col_offset_idx=col_idx + col_span,
+                    col_header=col_header,
+                    row_header=((not col_header) and html_cell.name == "th"),
+                )
+                data.table_cells.append(cell)
+
+        # TODO: format label vs caption once styling is supported
+        # Missing values must not show up as the literal "None" in the caption
+        label = table_xml_component["label"] or ""
+        caption = table_xml_component["caption"] or ""
+        table_text: str = f"{label}{' ' if label and caption else ''}{caption}"
+        table_caption: Optional[TextItem] = (
+            doc.add_text(label=DocItemLabel.CAPTION, text=table_text)
+            if table_text
+            else None
+        )
+
+        doc.add_table(data=data, parent=parent, caption=table_caption)
+
+        return
+
+    def _add_tables(
+        self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
+    ) -> None:
+        """Collect label, caption and markup of a table node and add the table.
+
+        The table markup is taken from a direct ``table`` child or, failing
+        that, from ``alternatives/table``. Caption children that contain
+        supplementary material are left out. A table that cannot be converted
+        is skipped with a warning.
+        """
+        table: Table = {"label": "", "caption": "", "content": ""}
+
+        # Content
+        table_content_nodes = node.xpath("table") or node.xpath("alternatives/table")
+        if table_content_nodes:
+            table["content"] = etree.tostring(table_content_nodes[0]).decode("utf-8")
+
+        # Caption
+        caption_node = node.xpath("caption")
+        if caption_node:
+            caption_parts = [
+                JatsDocumentBackend._get_text(caption_par).strip()
+                for caption_par in list(caption_node[0])
+                if not caption_par.xpath(".//supplementary-material")
+            ]
+            table["caption"] = " ".join(caption_parts).strip()
+
+        # Label: read it like the figure labels, so that a label with inline
+        # markup does not end up as None
+        label_node = node.xpath("label")
+        if len(label_node) > 0:
+            table["label"] = JatsDocumentBackend._get_text(label_node[0]).strip()
+
+        try:
+            self._add_table(doc, parent, table)
+        except Exception:
+            _log.warning(f"Skipping unsupported table in {str(self.file)}")
+
+        return
+
+    def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
+        """Add the title as top-level item and keep it as root of the hierarchy."""
+        title_item = doc.add_text(
+            label=DocItemLabel.TITLE,
+            text=xml_components["title"],
+            parent=None,
+        )
+        self.root = title_item
+
+    def _walk_linear(
+        self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
+    ) -> str:
+        """Walk ``node`` depth-first and add its content to ``doc``.
+
+        Text is accumulated while descending. A ``p`` node with non-blank text
+        emits it as a TextItem; any other node returns its text so the caller
+        can merge it into its own.
+
+        Args:
+            doc: Document being populated.
+            parent: Item under which the items created at this level are attached.
+            node: XML element whose children are visited.
+
+        Returns:
+            The text collected for ``node`` that has not been added to ``doc``,
+            or an empty string when ``node`` is a paragraph whose text was emitted.
+        """
+        skip_tags = ("term",)
+        flush_tags = ("ack", "sec", "list", "boxed-text", "disp-formula", "fig")
+        new_parent: NodeItem = parent
+        node_text: str = (
+            node.text.replace("\n", " ")
+            if (node.tag not in skip_tags and node.text)
+            else ""
+        )
+
+        # Text returned by the children of a flush-tag node nested directly in a
+        # paragraph is not merged into this node's text. A parentless element
+        # (getparent() is None) never qualifies.
+        parent_node = node.getparent()
+        drop_child_text: bool = (
+            parent_node is not None
+            and parent_node.tag == "p"
+            and node.tag in flush_tags
+        )
+
+        for child in list(node):
+            stop_walk: bool = False
+
+            # flush text into TextItem for some tags in paragraph nodes
+            if node.tag == "p" and node_text.strip() and child.tag in flush_tags:
+                doc.add_text(
+                    label=DocItemLabel.TEXT, text=node_text.strip(), parent=parent
+                )
+                node_text = ""
+
+            # add elements and decide whether to stop walking
+            # NOTE(review): new_parent is not reset per child, so a parent set
+            # by an earlier sibling stays in effect for later ones - confirm
+            # this is intended.
+            if child.tag in ("sec", "ack"):
+                header = child.xpath("title|label")
+                text: Optional[str] = None
+                if len(header) > 0:
+                    text = JatsDocumentBackend._get_text(header[0])
+                elif child.tag == "ack":
+                    text = DEFAULT_HEADER_ACKNOWLEDGMENTS
+                if text:
+                    new_parent = doc.add_heading(text=text, parent=parent)
+            elif child.tag == "list":
+                new_parent = doc.add_group(
+                    label=GroupLabel.LIST, name="list", parent=parent
+                )
+            elif child.tag == "list-item":
+                # TODO: address any type of content (another list, formula,...)
+                # TODO: address list type and item label
+                text = JatsDocumentBackend._get_text(child).strip()
+                new_parent = doc.add_list_item(text=text, parent=parent)
+                stop_walk = True
+            elif child.tag == "fig":
+                self._add_figure_captions(doc, parent, child)
+                stop_walk = True
+            elif child.tag == "table-wrap":
+                self._add_tables(doc, parent, child)
+                stop_walk = True
+            elif child.tag == "supplementary-material":
+                # supplementary material is not converted
+                stop_walk = True
+            elif child.tag == "fn-group":
+                # TODO: footnote groups are skipped until they are supported
+                stop_walk = True
+            elif child.tag == "ref-list" and node.tag != "ref-list":
+                header = child.xpath("title|label")
+                text = (
+                    JatsDocumentBackend._get_text(header[0])
+                    if len(header) > 0
+                    else DEFAULT_HEADER_REFERENCES
+                )
+                # references go into a list group placed under their heading
+                new_parent = doc.add_heading(text=text, parent=parent)
+                new_parent = doc.add_group(
+                    parent=new_parent, label=GroupLabel.LIST, name="list"
+                )
+            elif child.tag == "element-citation":
+                text = self._parse_element_citation(child)
+                self._add_citation(doc, parent, text)
+                stop_walk = True
+            elif child.tag == "mixed-citation":
+                text = JatsDocumentBackend._get_text(child).strip()
+                self._add_citation(doc, parent, text)
+                stop_walk = True
+            elif child.tag == "tex-math":
+                self._add_equation(doc, parent, child)
+                stop_walk = True
+            elif child.tag == "inline-formula":
+                # TODO: address inline formulas when supported by docling-core
+                stop_walk = True
+
+            # step into child
+            if not stop_walk:
+                new_text = self._walk_linear(doc, new_parent, child)
+                if not drop_child_text:
+                    node_text += new_text
+
+            # pick up the tail text
+            node_text += child.tail.replace("\n", " ") if child.tail else ""
+
+        # create paragraph
+        if node.tag == "p" and node_text.strip():
+            doc.add_text(label=DocItemLabel.TEXT, text=node_text.strip(), parent=parent)
+            return ""
+
+        # backpropagate the text
+        return node_text
--- a/docling/backend/xml/pubmed_backend.py
+++ b/docling/backend/xml/pubmed_backend.py
@ -1,592 +0,0 @@
-import logging
-from io import BytesIO
-from pathlib import Path
-from typing import Any, Set, Union
-
-import lxml
-from bs4 import BeautifulSoup
-from docling_core.types.doc import (
-    DocItemLabel,
-    DoclingDocument,
-    DocumentOrigin,
-    GroupLabel,
-    TableCell,
-    TableData,
-)
-from lxml import etree
-from typing_extensions import TypedDict, override
-
-from docling.backend.abstract_backend import DeclarativeDocumentBackend
-from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import InputDocument
-
-_log = logging.getLogger(__name__)
-
-
-class Paragraph(TypedDict):
-    text: str
-    headers: list[str]
-
-
-class Author(TypedDict):
-    name: str
-    affiliation_names: list[str]
-
-
-class Table(TypedDict):
-    label: str
-    caption: str
-    content: str
-
-
-class FigureCaption(TypedDict):
-    label: str
-    caption: str
-
-
-class Reference(TypedDict):
-    author_names: str
-    title: str
-    journal: str
-    year: str
-
-
-class XMLComponents(TypedDict):
-    title: str
-    authors: list[Author]
-    abstract: str
-    paragraphs: list[Paragraph]
-    tables: list[Table]
-    figure_captions: list[FigureCaption]
-    references: list[Reference]
-
-
-class PubMedDocumentBackend(DeclarativeDocumentBackend):
-    """
-    The code from this document backend has been developed by modifying parts of the PubMed Parser library (version 0.5.0, released on 12.08.2024):
-    Achakulvisut et al., (2020).
-    Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML Dataset XML Dataset.
-    Journal of Open Source Software, 5(46), 1979,
-    https://doi.org/10.21105/joss.01979
-    """
-
-    @override
-    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
-        super().__init__(in_doc, path_or_stream)
-        self.path_or_stream = path_or_stream
-
-        # Initialize parents for the document hierarchy
-        self.parents: dict = {}
-
-        self.valid = False
-        try:
-            if isinstance(self.path_or_stream, BytesIO):
-                self.path_or_stream.seek(0)
-            self.tree: lxml.etree._ElementTree = etree.parse(self.path_or_stream)
-            if "/NLM//DTD JATS" in self.tree.docinfo.public_id:
-                self.valid = True
-        except Exception as exc:
-            raise RuntimeError(
-                f"Could not initialize PubMed backend for file with hash {self.document_hash}."
-            ) from exc
-
-    @override
-    def is_valid(self) -> bool:
-        return self.valid
-
-    @classmethod
-    @override
-    def supports_pagination(cls) -> bool:
-        return False
-
-    @override
-    def unload(self):
-        if isinstance(self.path_or_stream, BytesIO):
-            self.path_or_stream.close()
-        self.path_or_stream = None
-
-    @classmethod
-    @override
-    def supported_formats(cls) -> Set[InputFormat]:
-        return {InputFormat.XML_PUBMED}
-
-    @override
-    def convert(self) -> DoclingDocument:
-        # Create empty document
-        origin = DocumentOrigin(
-            filename=self.file.name or "file",
-            mimetype="application/xml",
-            binary_hash=self.document_hash,
-        )
-        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
-
-        _log.debug("Trying to convert PubMed XML document...")
-
-        # Get parsed XML components
-        xml_components: XMLComponents = self._parse()
-
-        # Add XML components to the document
-        doc = self._populate_document(doc, xml_components)
-        return doc
-
-    def _parse_title(self) -> str:
-        title: str = " ".join(
-            [
-                t.replace("\n", "")
-                for t in self.tree.xpath(".//title-group/article-title")[0].itertext()
-            ]
-        )
-        return title
-
-    def _parse_authors(self) -> list[Author]:
-        # Get mapping between affiliation ids and names
-        affiliation_names = []
-        for affiliation_node in self.tree.xpath(".//aff[@id]"):
-            affiliation_names.append(
-                ": ".join([t for t in affiliation_node.itertext() if t != "\n"])
-            )
-        affiliation_ids_names = {
-            id: name
-            for id, name in zip(self.tree.xpath(".//aff[@id]/@id"), affiliation_names)
-        }
-
-        # Get author names and affiliation names
-        authors: list[Author] = []
-        for author_node in self.tree.xpath(
-            './/contrib-group/contrib[@contrib-type="author"]'
-        ):
-            author: Author = {
-                "name": "",
-                "affiliation_names": [],
-            }
-
-            # Affiliation names
-            affiliation_ids = [
-                a.attrib["rid"] for a in author_node.xpath('xref[@ref-type="aff"]')
-            ]
-            for id in affiliation_ids:
-                if id in affiliation_ids_names:
-                    author["affiliation_names"].append(affiliation_ids_names[id])
-
-            # Name
-            author["name"] = (
-                author_node.xpath("name/surname")[0].text
-                + " "
-                + author_node.xpath("name/given-names")[0].text
-            )
-
-            authors.append(author)
-        return authors
-
-    def _parse_abstract(self) -> str:
-        texts = []
-        for abstract_node in self.tree.xpath(".//abstract"):
-            for text in abstract_node.itertext():
-                texts.append(text.replace("\n", ""))
-        abstract: str = "".join(texts)
-        return abstract
-
-    def _parse_main_text(self) -> list[Paragraph]:
-        paragraphs: list[Paragraph] = []
-        for paragraph_node in self.tree.xpath("//body//p"):
-            # Skip captions
-            if "/caption" in paragraph_node.getroottree().getpath(paragraph_node):
-                continue
-
-            paragraph: Paragraph = {"text": "", "headers": []}
-
-            # Text
-            paragraph["text"] = "".join(
-                [t.replace("\n", "") for t in paragraph_node.itertext()]
-            )
-
-            # Header
-            path = "../title"
-            while len(paragraph_node.xpath(path)) > 0:
-                paragraph["headers"].append(
-                    "".join(
-                        [
-                            t.replace("\n", "")
-                            for t in paragraph_node.xpath(path)[0].itertext()
-                        ]
-                    )
-                )
-                path = "../" + path
-
-            paragraphs.append(paragraph)
-
-        return paragraphs
-
-    def _parse_tables(self) -> list[Table]:
-        tables: list[Table] = []
-        for table_node in self.tree.xpath(".//body//table-wrap"):
-            table: Table = {"label": "", "caption": "", "content": ""}
-
-            # Content
-            if len(table_node.xpath("table")) > 0:
-                table_content_node = table_node.xpath("table")[0]
-            elif len(table_node.xpath("alternatives/table")) > 0:
-                table_content_node = table_node.xpath("alternatives/table")[0]
-            else:
-                table_content_node = None
-            if table_content_node != None:
-                table["content"] = etree.tostring(table_content_node).decode("utf-8")
-
-            # Caption
-            if len(table_node.xpath("caption/p")) > 0:
-                caption_node = table_node.xpath("caption/p")[0]
-            elif len(table_node.xpath("caption/title")) > 0:
-                caption_node = table_node.xpath("caption/title")[0]
-            else:
-                caption_node = None
-            if caption_node != None:
-                table["caption"] = "".join(
-                    [t.replace("\n", "") for t in caption_node.itertext()]
-                )
-
-            # Label
-            if len(table_node.xpath("label")) > 0:
-                table["label"] = table_node.xpath("label")[0].text
-
-            tables.append(table)
-        return tables
-
-    def _parse_figure_captions(self) -> list[FigureCaption]:
-        figure_captions: list[FigureCaption] = []
-
-        if not (self.tree.xpath(".//fig")):
-            return figure_captions
-
-        for figure_node in self.tree.xpath(".//fig"):
-            figure_caption: FigureCaption = {
-                "caption": "",
-                "label": "",
-            }
-
-            # Label
-            if figure_node.xpath("label"):
-                figure_caption["label"] = "".join(
-                    [
-                        t.replace("\n", "")
-                        for t in figure_node.xpath("label")[0].itertext()
-                    ]
-                )
-
-            # Caption
-            if figure_node.xpath("caption"):
-                caption = ""
-                for caption_node in figure_node.xpath("caption")[0].getchildren():
-                    caption += (
-                        "".join([t.replace("\n", "") for t in caption_node.itertext()])
-                        + "\n"
-                    )
-                figure_caption["caption"] = caption
-
-            figure_captions.append(figure_caption)
-
-        return figure_captions
-
-    def _parse_references(self) -> list[Reference]:
-        references: list[Reference] = []
-        for reference_node_abs in self.tree.xpath(".//ref-list/ref"):
-            reference: Reference = {
-                "author_names": "",
-                "title": "",
-                "journal": "",
-                "year": "",
-            }
-            reference_node: Any = None
-            for tag in ["mixed-citation", "element-citation", "citation"]:
-                if len(reference_node_abs.xpath(tag)) > 0:
-                    reference_node = reference_node_abs.xpath(tag)[0]
-                    break
-
-            if reference_node is None:
-                continue
-
-            if all(
-                not (ref_type in ["citation-type", "publication-type"])
-                for ref_type in reference_node.attrib.keys()
-            ):
-                continue
-
-            # Author names
-            names = []
-            if len(reference_node.xpath("name")) > 0:
-                for name_node in reference_node.xpath("name"):
-                    name_str = " ".join(
-                        [t.text for t in name_node.getchildren() if (t.text != None)]
-                    )
-                    names.append(name_str)
-            elif len(reference_node.xpath("person-group")) > 0:
-                for name_node in reference_node.xpath("person-group")[0]:
-                    name_str = (
-                        name_node.xpath("given-names")[0].text
-                        + " "
-                        + name_node.xpath("surname")[0].text
-                    )
-                    names.append(name_str)
-            reference["author_names"] = "; ".join(names)
-
-            # Title
-            if len(reference_node.xpath("article-title")) > 0:
-                reference["title"] = " ".join(
-                    [
-                        t.replace("\n", " ")
-                        for t in reference_node.xpath("article-title")[0].itertext()
-                    ]
-                )
-
-            # Journal
-            if len(reference_node.xpath("source")) > 0:
-                reference["journal"] = reference_node.xpath("source")[0].text
-
-            # Year
-            if len(reference_node.xpath("year")) > 0:
-                reference["year"] = reference_node.xpath("year")[0].text
-
-            if (
-                not (reference_node.xpath("article-title"))
-                and not (reference_node.xpath("journal"))
-                and not (reference_node.xpath("year"))
-            ):
-                reference["title"] = reference_node.text
-
-            references.append(reference)
-        return references
-
-    def _parse(self) -> XMLComponents:
-        """Parsing PubMed document."""
-        xml_components: XMLComponents = {
-            "title": self._parse_title(),
-            "authors": self._parse_authors(),
-            "abstract": self._parse_abstract(),
-            "paragraphs": self._parse_main_text(),
-            "tables": self._parse_tables(),
-            "figure_captions": self._parse_figure_captions(),
-            "references": self._parse_references(),
-        }
-        return xml_components
-
-    def _populate_document(
-        self, doc: DoclingDocument, xml_components: XMLComponents
-    ) -> DoclingDocument:
-        self._add_title(doc, xml_components)
-        self._add_authors(doc, xml_components)
-        self._add_abstract(doc, xml_components)
-        self._add_main_text(doc, xml_components)
-
-        if xml_components["tables"]:
-            self._add_tables(doc, xml_components)
-
-        if xml_components["figure_captions"]:
-            self._add_figure_captions(doc, xml_components)
-
-        self._add_references(doc, xml_components)
-        return doc
-
-    def _add_figure_captions(
-        self, doc: DoclingDocument, xml_components: XMLComponents
-    ) -> None:
-        self.parents["Figures"] = doc.add_heading(
-            parent=self.parents["Title"], text="Figures"
-        )
-        for figure_caption_xml_component in xml_components["figure_captions"]:
-            figure_caption_text = (
-                figure_caption_xml_component["label"]
-                + ": "
-                + figure_caption_xml_component["caption"].strip()
-            )
-            fig_caption = doc.add_text(
-                label=DocItemLabel.CAPTION, text=figure_caption_text
-            )
-            doc.add_picture(
-                parent=self.parents["Figures"],
-                caption=fig_caption,
-            )
-        return
-
-    def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
-        self.parents["Title"] = doc.add_text(
-            parent=None,
-            text=xml_components["title"],
-            label=DocItemLabel.TITLE,
-        )
-        return
-
-    def _add_authors(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
-        authors_affiliations: list = []
-        for author in xml_components["authors"]:
-            authors_affiliations.append(author["name"])
-            authors_affiliations.append(", ".join(author["affiliation_names"]))
-        authors_affiliations_str = "; ".join(authors_affiliations)
-
-        doc.add_text(
-            parent=self.parents["Title"],
-            text=authors_affiliations_str,
-            label=DocItemLabel.PARAGRAPH,
-        )
-        return
-
-    def _add_abstract(
-        self, doc: DoclingDocument, xml_components: XMLComponents
-    ) -> None:
-        abstract_text: str = xml_components["abstract"]
-        self.parents["Abstract"] = doc.add_heading(
-            parent=self.parents["Title"], text="Abstract"
-        )
-        doc.add_text(
-            parent=self.parents["Abstract"],
-            text=abstract_text,
-            label=DocItemLabel.TEXT,
-        )
-        return
-
-    def _add_main_text(
-        self, doc: DoclingDocument, xml_components: XMLComponents
-    ) -> None:
-        added_headers: list = []
-        for paragraph in xml_components["paragraphs"]:
-            if not (paragraph["headers"]):
-                continue
-
-            # Header
-            for i, header in enumerate(reversed(paragraph["headers"])):
-                if header in added_headers:
-                    continue
-                added_headers.append(header)
-
-                if ((i - 1) >= 0) and list(reversed(paragraph["headers"]))[
-                    i - 1
-                ] in self.parents:
-                    parent = self.parents[list(reversed(paragraph["headers"]))[i - 1]]
-                else:
-                    parent = self.parents["Title"]
-
-                self.parents[header] = doc.add_heading(parent=parent, text=header)
-
-            # Paragraph text
-            if paragraph["headers"][0] in self.parents:
-                parent = self.parents[paragraph["headers"][0]]
-            else:
-                parent = self.parents["Title"]
-
-            doc.add_text(parent=parent, label=DocItemLabel.TEXT, text=paragraph["text"])
-        return
-
-    def _add_references(
-        self, doc: DoclingDocument, xml_components: XMLComponents
-    ) -> None:
-        self.parents["References"] = doc.add_heading(
-            parent=self.parents["Title"], text="References"
-        )
-        current_list = doc.add_group(
-            parent=self.parents["References"], label=GroupLabel.LIST, name="list"
-        )
-        for reference in xml_components["references"]:
-            reference_text: str = ""
-            if reference["author_names"]:
-                reference_text += reference["author_names"] + ". "
-
-            if reference["title"]:
-                reference_text += reference["title"]
-                if reference["title"][-1] != ".":
-                    reference_text += "."
-                reference_text += " "
-
-            if reference["journal"]:
-                reference_text += reference["journal"]
-
-            if reference["year"]:
-                reference_text += " (" + reference["year"] + ")"
-
-            if not (reference_text):
-                _log.debug(f"Skipping reference for: {str(self.file)}")
-                continue
-
-            doc.add_list_item(
-                text=reference_text, enumerated=False, parent=current_list
-            )
-        return
-
-    def _add_tables(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
-        self.parents["Tables"] = doc.add_heading(
-            parent=self.parents["Title"], text="Tables"
-        )
-        for table_xml_component in xml_components["tables"]:
-            try:
-                self._add_table(doc, table_xml_component)
-            except Exception as e:
-                _log.debug(f"Skipping unsupported table for: {str(self.file)}")
-                pass
-        return
-
-    def _add_table(self, doc: DoclingDocument, table_xml_component: Table) -> None:
-        soup = BeautifulSoup(table_xml_component["content"], "html.parser")
-        table_tag = soup.find("table")
-
-        nested_tables = table_tag.find("table")
-        if nested_tables:
-            _log.debug(f"Skipping nested table for: {str(self.file)}")
-            return
-
-        # Count the number of rows (number of <tr> elements)
-        num_rows = len(table_tag.find_all("tr"))
-
-        # Find the number of columns (taking into account colspan)
-        num_cols = 0
-        for row in table_tag.find_all("tr"):
-            col_count = 0
-            for cell in row.find_all(["td", "th"]):
-                colspan = int(cell.get("colspan", 1))
-                col_count += colspan
-            num_cols = max(num_cols, col_count)
-
-        grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
-
-        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
-
-        # Iterate over the rows in the table
-        for row_idx, row in enumerate(table_tag.find_all("tr")):
-            # For each row, find all the column cells (both <td> and <th>)
-            cells = row.find_all(["td", "th"])
-
-            # Check if each cell in the row is a header -> means it is a column header
-            col_header = True
-            for j, html_cell in enumerate(cells):
-                if html_cell.name == "td":
-                    col_header = False
-
-            # Extract and print the text content of each cell
-            col_idx = 0
-            for _, html_cell in enumerate(cells):
-                text = html_cell.text
-
-                col_span = int(html_cell.get("colspan", 1))
-                row_span = int(html_cell.get("rowspan", 1))
-
-                while grid[row_idx][col_idx] != None:
-                    col_idx += 1
-                for r in range(row_span):
-                    for c in range(col_span):
-                        grid[row_idx + r][col_idx + c] = text
-
-                cell = TableCell(
-                    text=text,
-                    row_span=row_span,
-                    col_span=col_span,
-                    start_row_offset_idx=row_idx,
-                    end_row_offset_idx=row_idx + row_span,
-                    start_col_offset_idx=col_idx,
-                    end_col_offset_idx=col_idx + col_span,
-                    col_header=col_header,
-                    row_header=((not col_header) and html_cell.name == "th"),
-                )
-                data.table_cells.append(cell)
-
-        table_caption = doc.add_text(
-            label=DocItemLabel.CAPTION,
-            text=table_xml_component["label"] + ": " + table_xml_component["caption"],
-        )
-        doc.add_table(data=data, parent=self.parents["Tables"], caption=table_caption)
-        return
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@ -234,6 +234,12 @@ def convert(
        Optional[Path],
        typer.Option(..., help="If provided, the location of the model artifacts."),
    ] = None,
+    enable_remote_services: Annotated[
+        bool,
+        typer.Option(
+            ..., help="Must be enabled when using models connecting to remote services."
+        ),
+    ] = False,
    abort_on_error: Annotated[
        bool,
        typer.Option(
@ -380,6 +386,7 @@ def convert(

        accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
        pipeline_options = PdfPipelineOptions(
+            enable_remote_services=enable_remote_services,
            accelerator_options=accelerator_options,
            do_ocr=ocr,
            ocr_options=ocr_options,
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@ -34,13 +34,14 @@ class InputFormat(str, Enum):
    DOCX = "docx"
    PPTX = "pptx"
    HTML = "html"
-    XML_PUBMED = "xml_pubmed"
    IMAGE = "image"
    PDF = "pdf"
    ASCIIDOC = "asciidoc"
    MD = "md"
+    CSV = "csv"
    XLSX = "xlsx"
    XML_USPTO = "xml_uspto"
+    XML_JATS = "xml_jats"
    JSON_DOCLING = "json_docling"


@ -58,9 +59,10 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
    InputFormat.PDF: ["pdf"],
    InputFormat.MD: ["md"],
    InputFormat.HTML: ["html", "htm", "xhtml"],
-    InputFormat.XML_PUBMED: ["xml", "nxml"],
+    InputFormat.XML_JATS: ["xml", "nxml"],
    InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
    InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
+    InputFormat.CSV: ["csv"],
    InputFormat.XLSX: ["xlsx"],
    InputFormat.XML_USPTO: ["xml", "txt"],
    InputFormat.JSON_DOCLING: ["json"],
@ -77,7 +79,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    ],
    InputFormat.HTML: ["text/html", "application/xhtml+xml"],
-    InputFormat.XML_PUBMED: ["application/xml"],
+    InputFormat.XML_JATS: ["application/xml"],
    InputFormat.IMAGE: [
        "image/png",
        "image/jpeg",
@ -88,6 +90,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
    InputFormat.PDF: ["application/pdf"],
    InputFormat.ASCIIDOC: ["text/asciidoc"],
    InputFormat.MD: ["text/markdown", "text/x-markdown"],
+    InputFormat.CSV: ["text/csv"],
    InputFormat.XLSX: [
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    ],
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@ -1,3 +1,4 @@
+import csv
 import logging
 import re
 from enum import Enum
@ -296,6 +297,7 @@ class _DocumentConversionInput(BaseModel):
                mime = _DocumentConversionInput._mime_from_extension(ext)

        mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
+        mime = mime or _DocumentConversionInput._detect_csv(content)
        mime = mime or "text/plain"
        formats = MimeTypeToFormat.get(mime, [])
        if formats:
@ -331,11 +333,11 @@ class _DocumentConversionInput(BaseModel):
                ):
                    input_format = InputFormat.XML_USPTO

-                if (
-                    InputFormat.XML_PUBMED in formats
-                    and "/NLM//DTD JATS" in xml_doctype
+                if InputFormat.XML_JATS in formats and (
+                    "JATS-journalpublishing" in xml_doctype
+                    or "JATS-archive" in xml_doctype
                ):
-                    input_format = InputFormat.XML_PUBMED
+                    input_format = InputFormat.XML_JATS

        elif mime == "text/plain":
            if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
@ -352,6 +354,8 @@ class _DocumentConversionInput(BaseModel):
            mime = FormatToMimeType[InputFormat.HTML][0]
        elif ext in FormatToExtensions[InputFormat.MD]:
            mime = FormatToMimeType[InputFormat.MD][0]
+        elif ext in FormatToExtensions[InputFormat.CSV]:
+            mime = FormatToMimeType[InputFormat.CSV][0]
        elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
            mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
        elif ext in FormatToExtensions[InputFormat.PDF]:
@ -392,3 +396,32 @@ class _DocumentConversionInput(BaseModel):
            return "application/xml"

        return None
+
+    @staticmethod
+    def _detect_csv(
+        content: bytes,
+    ) -> Optional[Literal["text/csv"]]:
+        """Guess whether a piece of content is CSV.
+
+        Args:
+            content: A short piece of a document, taken from its beginning.
+
+        Returns:
+            ``"text/csv"`` when the content spans several lines and the sniffed
+              delimiter is a comma, semicolon, tab or pipe; None otherwise.
+        """
+        # Non-ASCII bytes are dropped before sniffing
+        text = content.decode("ascii", errors="ignore").strip()
+
+        # A single line is never reported as CSV
+        if "\n" not in text:
+            return None
+
+        # csv.Sniffer raises csv.Error when it cannot work out a dialect
+        try:
+            delimiter = csv.Sniffer().sniff(text).delimiter
+        except csv.Error:
+            return None
+
+        # Only the common delimiters count as CSV
+        return "text/csv" if delimiter in (",", ";", "\t", "|") else None
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -1,11 +1,26 @@
 import logging
 import os
+import re
+import warnings
 from enum import Enum
 from pathlib import Path
 from typing import Annotated, Any, Dict, List, Literal, Optional, Union

-from pydantic import AnyUrl, BaseModel, ConfigDict, Field, model_validator
-from pydantic_settings import BaseSettings, SettingsConfigDict
+from pydantic import (
+    AnyUrl,
+    BaseModel,
+    ConfigDict,
+    Field,
+    field_validator,
+    model_validator,
+    validator,
+)
+from pydantic_settings import (
+    BaseSettings,
+    PydanticBaseSettingsSource,
+    SettingsConfigDict,
+)
+from typing_extensions import deprecated

 _log = logging.getLogger(__name__)

@ -25,7 +40,18 @@ class AcceleratorOptions(BaseSettings):
    )

    num_threads: int = 4
-    device: AcceleratorDevice = AcceleratorDevice.AUTO
+    device: Union[str, AcceleratorDevice] = "auto"
+
+    @field_validator("device")
+    def validate_device(cls, value):
+        # "auto", "cpu", "cuda", "mps", or "cuda:N"
+        if value in {d.value for d in AcceleratorDevice} or re.match(
+            r"^cuda(:\d+)?$", value
+        ):
+            return value
+        raise ValueError(
+            "Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
+        )

    @model_validator(mode="before")
    @classmethod
@ -41,7 +67,6 @@ class AcceleratorOptions(BaseSettings):
        """
        if isinstance(data, dict):
            input_num_threads = data.get("num_threads")
-
            # Check if to set the num_threads from the alternative envvar
            if input_num_threads is None:
                docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
@ -257,6 +282,7 @@ class PipelineOptions(BaseModel):
    )
    document_timeout: Optional[float] = None
    accelerator_options: AcceleratorOptions = AcceleratorOptions()
+    enable_remote_services: bool = False


 class PdfPipelineOptions(PipelineOptions):
--- a/docling/datamodel/settings.py
+++ b/docling/datamodel/settings.py
@ -1,6 +1,6 @@
 import sys
 from pathlib import Path
-from typing import Annotated, Tuple
+from typing import Annotated, Optional, Tuple

 from pydantic import BaseModel, PlainValidator
 from pydantic_settings import BaseSettings, SettingsConfigDict
@ -62,6 +62,7 @@ class AppSettings(BaseSettings):
    debug: DebugSettings

    cache_dir: Path = Path.home() / ".cache" / "docling"
+    artifacts_path: Optional[Path] = None


 settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@ -10,6 +10,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call

 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.asciidoc_backend import AsciiDocBackend
+from docling.backend.csv_backend import CsvDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.json.docling_json_backend import DoclingJSONBackend
@ -17,7 +18,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
-from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
+from docling.backend.xml.jats_backend import JatsDocumentBackend
 from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
 from docling.datamodel.base_models import (
    ConversionStatus,
@ -61,6 +62,11 @@ class FormatOption(BaseModel):
        return self


+# Format option for CSV input: pairs SimplePipeline with CsvDocumentBackend,
+# the same combination registered for InputFormat.CSV in _get_default_option.
+class CsvFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = CsvDocumentBackend
+
+
 class ExcelFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
@ -96,9 +102,9 @@ class PatentUsptoFormatOption(FormatOption):
    backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend


-class XMLPubMedFormatOption(FormatOption):
+class XMLJatsFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
-    backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend
+    backend: Type[AbstractDocumentBackend] = JatsDocumentBackend


 class ImageFormatOption(FormatOption):
@ -113,6 +119,9 @@ class PdfFormatOption(FormatOption):

 def _get_default_option(format: InputFormat) -> FormatOption:
    format_to_default_options = {
+        InputFormat.CSV: FormatOption(
+            pipeline_cls=SimplePipeline, backend=CsvDocumentBackend
+        ),
        InputFormat.XLSX: FormatOption(
            pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
        ),
@ -134,8 +143,8 @@ def _get_default_option(format: InputFormat) -> FormatOption:
        InputFormat.XML_USPTO: FormatOption(
            pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
        ),
-        InputFormat.XML_PUBMED: FormatOption(
-            pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
+        InputFormat.XML_JATS: FormatOption(
+            pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
        ),
        InputFormat.IMAGE: FormatOption(
            pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
--- a/docling/exceptions.py
+++ b/docling/exceptions.py
@ -4,3 +4,7 @@ class BaseError(RuntimeError):

 class ConversionError(BaseError):
    pass
+
+
+class OperationNotAllowed(BaseError):
+    """Raised when an operation is attempted without being explicitly enabled."""
+
+    pass
--- a/docling/models/picture_description_api_model.py
+++ b/docling/models/picture_description_api_model.py
@ -8,6 +8,7 @@ from PIL import Image
 from pydantic import BaseModel, ConfigDict

 from docling.datamodel.pipeline_options import PictureDescriptionApiOptions
+from docling.exceptions import OperationNotAllowed
 from docling.models.picture_description_base_model import PictureDescriptionBaseModel

 _log = logging.getLogger(__name__)
@ -45,14 +46,20 @@ class ApiResponse(BaseModel):
 class PictureDescriptionApiModel(PictureDescriptionBaseModel):
    # elements_batch_size = 4

-    def __init__(self, enabled: bool, options: PictureDescriptionApiOptions):
+    def __init__(
+        self,
+        enabled: bool,
+        enable_remote_services: bool,
+        options: PictureDescriptionApiOptions,
+    ):
        super().__init__(enabled=enabled, options=options)
        self.options: PictureDescriptionApiOptions

        if self.enabled:
-            if options.url.host != "localhost":
-                raise NotImplementedError(
-                    "The options try to connect to remote APIs which are not yet allowed."
+            if not enable_remote_services:
+                raise OperationNotAllowed(
+                    "Connections to remote services is only allowed when set explicitly. "
+                    "pipeline_options.enable_remote_services=True."
                )

    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@ -22,6 +22,7 @@ class TesseractOcrModel(BaseOcrModel):
        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
        self.reader = None
        self.osd_reader = None
+        self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}

        if self.enabled:
            install_errmsg = (
@ -57,8 +58,6 @@ class TesseractOcrModel(BaseOcrModel):
            _log.debug("Initializing TesserOCR: %s", tesseract_version)
            lang = "+".join(self.options.lang)

-            self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
-
            if any([l.startswith("script/") for l in self._tesserocr_languages]):
                self.script_prefix = "script/"
            else:
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@ -61,6 +61,14 @@ class StandardPdfPipeline(PaginatedPipeline):
        artifacts_path: Optional[Path] = None
        if pipeline_options.artifacts_path is not None:
            artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
+        elif settings.artifacts_path is not None:
+            artifacts_path = Path(settings.artifacts_path).expanduser()
+
+        if artifacts_path is not None and not artifacts_path.is_dir():
+            raise RuntimeError(
+                f"The value of {artifacts_path=} is not valid. "
+                "When defined, it must point to a folder containing all models required by the pipeline."
+            )

        self.keep_images = (
            self.pipeline_options.generate_page_images
@ -201,6 +209,7 @@ class StandardPdfPipeline(PaginatedPipeline):
        ):
            return PictureDescriptionApiModel(
                enabled=self.pipeline_options.do_picture_description,
+                enable_remote_services=self.pipeline_options.enable_remote_services,
                options=self.pipeline_options.picture_description_options,
            )
        elif isinstance(
--- a/docling/utils/accelerator_utils.py
+++ b/docling/utils/accelerator_utils.py
@ -7,36 +7,62 @@ from docling.datamodel.pipeline_options import AcceleratorDevice
 _log = logging.getLogger(__name__)


-def decide_device(accelerator_device: AcceleratorDevice) -> str:
+def decide_device(accelerator_device: str) -> str:
    r"""
-    Resolve the device based on the acceleration options and the available devices in the system
+    Resolve the device based on the acceleration options and the available devices in the system.
+
    Rules:
    1. AUTO: Check for the best available device on the system.
    2. User-defined: Check if the device actually exists, otherwise fall-back to CPU
    """
-    cuda_index = 0
    device = "cpu"

    has_cuda = torch.backends.cuda.is_built() and torch.cuda.is_available()
    has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()

-    if accelerator_device == AcceleratorDevice.AUTO:
+    if accelerator_device == AcceleratorDevice.AUTO.value:  # Handle 'auto'
        if has_cuda:
-            device = f"cuda:{cuda_index}"
+            device = "cuda:0"
        elif has_mps:
            device = "mps"

+    elif accelerator_device.startswith("cuda"):
+        if has_cuda:
+            # if cuda device index specified extract device id
+            parts = accelerator_device.split(":")
+            if len(parts) == 2 and parts[1].isdigit():
+                # select cuda device's id
+                cuda_index = int(parts[1])
+                if cuda_index < torch.cuda.device_count():
+                    device = f"cuda:{cuda_index}"
+                else:
+                    _log.warning(
+                        "CUDA device 'cuda:%d' is not available. Fall back to 'CPU'.",
+                        cuda_index,
+                    )
+            elif len(parts) == 1:  # just "cuda"
+                device = "cuda:0"
+            else:
+                _log.warning(
+                    "Invalid CUDA device format '%s'. Fall back to 'CPU'",
+                    accelerator_device,
+                )
+        else:
+            _log.warning("CUDA is not available in the system. Fall back to 'CPU'")
+
+    elif accelerator_device == AcceleratorDevice.MPS.value:
+        if has_mps:
+            device = "mps"
+        else:
+            _log.warning("MPS is not available in the system. Fall back to 'CPU'")
+
+    elif accelerator_device == AcceleratorDevice.CPU.value:
+        device = "cpu"
+
    else:
-        if accelerator_device == AcceleratorDevice.CUDA:
-            if has_cuda:
-                device = f"cuda:{cuda_index}"
-            else:
-                _log.warning("CUDA is not available in the system. Fall back to 'CPU'")
-        elif accelerator_device == AcceleratorDevice.MPS:
-            if has_mps:
-                device = "mps"
-            else:
-                _log.warning("MPS is not available in the system. Fall back to 'CPU'")
+        _log.warning(
+            "Unknown device option '%s'. Fall back to 'CPU'", accelerator_device
+        )

    _log.info("Accelerator device: '%s'", device)
    return device
--- a/docs/examples/backend_csv.ipynb
+++ b/docs/examples/backend_csv.ipynb
@ -0,0 +1,80 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Conversion of CSV files\n",
+    "\n",
+    "This example shows how to convert CSV files to a structured Docling Document.\n",
+    "\n",
+    "* Multiple delimiters are supported: `,` `;` `|` `[tab]`\n",
+    "* Additional CSV dialect settings are detected automatically (e.g. quotes, line separator, escape character)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Example Code"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "from docling.document_converter import DocumentConverter\n",
+    "\n",
+    "# Convert CSV to Docling document\n",
+    "converter = DocumentConverter()\n",
+    "result = converter.convert(Path(\"../../tests/data/csv/csv-comma.csv\"))\n",
+    "output = result.document.export_to_markdown()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This code generates the following output:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "|   Index | Customer Id     | First Name   | Last Name   | Company                         | City              | Country                    | Phone 1                | Phone 2               | Email                       | Subscription Date   | Website                     |\n",
+    "|---------|-----------------|--------------|-------------|---------------------------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|\n",
+    "|       1 | DD37Cf93aecA6Dc | Sheryl       | Baxter      | Rasmussen Group                 | East Leonard      | Chile                      | 229.077.5154           | 397.884.0519x718      | zunigavanessa@smith.info    | 2020-08-24          | http://www.stephenson.com/  |\n",
+    "|       2 | 1Ef7b82A4CAAD10 | Preston      | Lozano, Dr  | Vega-Gentry                     | East Jimmychester | Djibouti                   | 5153435776             | 686-620-1820x944      | vmata@colon.com             | 2021-04-23          | http://www.hobbs.com/       |\n",
+    "|       3 | 6F94879bDAfE5a6 | Roy          | Berry       | Murillo-Perry                   | Isabelborough     | Antigua and Barbuda        | +1-539-402-0259        | (496)978-3969x58947   | beckycarr@hogan.com         | 2020-03-25          | http://www.lawrence.com/    |\n",
+    "|       4 | 5Cef8BFA16c5e3c | Linda        | Olsen       | Dominguez, Mcmillan and Donovan | Bensonview        | Dominican Republic         | 001-808-617-6467x12895 | +1-813-324-8756       | stanleyblackwell@benson.org | 2020-06-02          | http://www.good-lyons.com/  |\n",
+    "|       5 | 053d585Ab6b3159 | Joanna       | Bender      | Martin, Lang and Andrade        | West Priscilla    | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net     | 2021-04-17          | https://goodwin-ingram.com/ |"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "docling-TtEIaPrw-py3.12",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/docs/examples/backend_xml_rag.ipynb
+++ b/docs/examples/backend_xml_rag.ipynb
@ -82,7 +82,7 @@
    "from docling.document_converter import DocumentConverter\n",
    "\n",
    "# a sample PMC article:\n",
-    "source = \"../../tests/data/pubmed/elife-56337.nxml\"\n",
+    "source = \"../../tests/data/jats/elife-56337.nxml\"\n",
    "converter = DocumentConverter()\n",
    "result = converter.convert(source)\n",
    "print(result.status)"
@ -97,7 +97,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
@ -106,11 +106,11 @@
     "text": [
      "# KRAB-zinc finger protein gene expansion in response to active retrotransposons in the murine lineage\n",
      "\n",
-      "Wolf Gernot; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; de Iaco Alberto; 2: School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL): Lausanne: Switzerland; Sun Ming-An; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Bruno Melania; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Tinkham Matthew; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Hoang Don; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Mitra Apratim; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Ralls Sherry; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Trono Didier; 2: School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL): Lausanne: Switzerland; Macfarlan Todd S; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States\n",
+      "Gernot Wolf, Alberto de Iaco, Ming-An Sun, Melania Bruno, Matthew Tinkham, Don Hoang, Apratim Mitra, Sherry Ralls, Didier Trono, Todd S Macfarlan\n",
+      "\n",
+      "The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health, Bethesda, United States; School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL), Lausanne, Switzerland\n",
      "\n",
      "## Abstract\n",
-      "\n",
-      "The Krüppel-associated box zinc finger protein (KRAB-ZFP) family diversified in mammals. The majority of human KRAB-ZFPs bind transposable elements (TEs), however, since most TEs are inactive in humans it is unclear whether KRAB-ZFPs emerged to suppress TEs. We demonstrate that many recently emerged murine KRAB-ZFPs also bind to TEs, including the active ETn, IAP, and L1 families. Using a CRISPR/Cas9-based engineering approach, we genetically deleted five large clusters of KRAB-ZFPs and demonstrate that target TEs are de-repressed, unleashing TE-encoded enhancers. Homozygous knockout mice lacking one of two KRAB-ZFP gene clusters on chromosome 2 and chromosome 4 were nonetheless viable. In pedigrees of chromosome 4 cluster KRAB-ZFP mutants, we identified numerous novel ETn insertions with a modest increase in mutants. Our data strongly support the current model that recent waves of retrotransposon activity drove the expansion of KRAB-ZFP genes in mice and that many KRAB-ZFPs play a redundant role restricting TE activity.\n",
      "\n"
     ]
    }
@ -131,7 +131,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
@ -198,7 +198,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
@ -224,7 +224,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
@ -261,7 +261,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
@ -313,7 +313,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
@ -359,9 +359,18 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/2024/ipg241217.zip...\n",
+      "Parsing zip file, splitting into XML sections, and exporting to files...\n"
+     ]
+    }
+   ],
   "source": [
    "import zipfile\n",
    "\n",
@ -407,7 +416,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
@ -435,7 +444,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
@ -449,7 +458,7 @@
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "3964d1ff30f74588a2f6b53ca8865a9f",
+       "model_id": "316241ca89a843bda3170f2a5c76c639",
       "version_major": 2,
       "version_minor": 0
      },
@ -471,7 +480,7 @@
   "source": [
    "from tqdm.notebook import tqdm\n",
    "\n",
-    "from docling.backend.xml.pubmed_backend import PubMedDocumentBackend\n",
+    "from docling.backend.xml.jats_backend import JatsDocumentBackend\n",
    "from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend\n",
    "from docling.datamodel.base_models import InputFormat\n",
    "from docling.datamodel.document import InputDocument\n",
@ -479,10 +488,10 @@
    "# check PMC\n",
    "in_doc = InputDocument(\n",
    "    path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\",\n",
-    "    format=InputFormat.XML_PUBMED,\n",
-    "    backend=PubMedDocumentBackend,\n",
+    "    format=InputFormat.XML_JATS,\n",
+    "    backend=JatsDocumentBackend,\n",
    ")\n",
-    "backend = PubMedDocumentBackend(\n",
+    "backend = JatsDocumentBackend(\n",
    "    in_doc=in_doc, path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\"\n",
    ")\n",
    "print(f\"Document {in_doc.file.name} is a valid PMC article? {backend.is_valid()}\")\n",
@ -521,7 +530,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
@ -543,7 +552,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "✏️ **Tip**: in general, there is no need to use the backend converters to parse USPTO or PubMed XML files. The generic `DocumentConverter` object tries to guess the input document format and applies the corresponding backend parser. The conversion shown in [Simple Conversion](#simple-conversion) is the recommended usage for the supported XML files."
+    "✏️ **Tip**: in general, there is no need to use the backend converters to parse USPTO or JATS (PubMed) XML files. The generic `DocumentConverter` object tries to guess the input document format and applies the corresponding backend parser. The conversion shown in [Simple Conversion](#simple-conversion) is the recommended usage for the supported XML files."
   ]
  },
  {
@ -579,7 +588,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
@ -607,7 +616,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
@ -625,144 +634,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2025-01-24 16:49:57,108 [DEBUG][_create_connection]: Created new connection using: 2d58fad6c63448a486c0c0ffe3b7b28c (async_milvus_client.py:600)\n",
-      "Loading files:  51%|█████     | 51/100 [00:00<00:00, 67.88file/s]Input document ipg241217-1050.xml does not match any allowed format.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Failed to load file /var/folders/2r/b2sdj1512g1_0m7wzzy7sftr0000gn/T/tmp11rjcdj8/ipg241217-1050.xml with error: File format not allowed: /var/folders/2r/b2sdj1512g1_0m7wzzy7sftr0000gn/T/tmp11rjcdj8/ipg241217-1050.xml. Skipping...\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Loading files: 100%|██████████| 100/100 [00:01<00:00, 58.05file/s]\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "e9208639f1a4418d97267a28305d18fa",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Parsing nodes:   0%|          | 0/99 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "88026613f6f44f0c8476dceaa1cb78cd",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "7522b8b434b54616b4cfc3d71e9556d7",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "5879d8161c2041f5b100959e69ff9017",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "557912b5e3c741f3a06127156bc46379",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "843bb145942b449aa55fc5b8208da734",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "c7dba09a4aed422998e9b9c2c3a70317",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "0bd031356c7e4e879dcbe1d04e6c4a4e",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating embeddings:   0%|          | 0/425 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
   "source": [
    "from llama_index.core import StorageContext, VectorStoreIndex\n",
    "from llama_index.vector_stores.milvus import MilvusVectorStore\n",
--- a/docs/examples/pictures_description_api.py
+++ b/docs/examples/pictures_description_api.py
@ -1,7 +1,10 @@
 import logging
+import os
 from pathlib import Path

+import requests
 from docling_core.types.doc import PictureItem
+from dotenv import load_dotenv

 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
@ -11,27 +14,87 @@ from docling.datamodel.pipeline_options import (
 from docling.document_converter import DocumentConverter, PdfFormatOption


-def main():
-    logging.basicConfig(level=logging.INFO)
-
-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
-
-    # This is using a local API server to do picture description.
-    # For example, you can launch it locally with:
-    # $ vllm serve "HuggingFaceTB/SmolVLM-256M-Instruct"
-
-    pipeline_options = PdfPipelineOptions()
-    pipeline_options.do_picture_description = True
-    pipeline_options.picture_description_options = PictureDescriptionApiOptions(
+def vllm_local_options(model: str):
+    options = PictureDescriptionApiOptions(
        url="http://localhost:8000/v1/chat/completions",
        params=dict(
-            model="HuggingFaceTB/SmolVLM-256M-Instruct",
+            model=model,
            seed=42,
            max_completion_tokens=200,
        ),
        prompt="Describe the image in three sentences. Be consise and accurate.",
        timeout=90,
    )
+    return options
+
+
+def watsonx_vlm_options():
+    load_dotenv()
+    api_key = os.environ.get("WX_API_KEY")
+    project_id = os.environ.get("WX_PROJECT_ID")
+
+    def _get_iam_access_token(api_key: str) -> str:
+        res = requests.post(
+            url="https://iam.cloud.ibm.com/identity/token",
+            headers={
+                "Content-Type": "application/x-www-form-urlencoded",
+            },
+            data=f"grant_type=urn:ibm:params:oauth:grant-type:apikey&apikey={api_key}",
+        )
+        res.raise_for_status()
+        api_out = res.json()
+        print(f"{api_out=}")
+        return api_out["access_token"]
+
+    options = PictureDescriptionApiOptions(
+        url="https://us-south.ml.cloud.ibm.com/ml/v1/text/chat?version=2023-05-29",
+        params=dict(
+            model_id="meta-llama/llama-3-2-11b-vision-instruct",
+            project_id=project_id,
+            parameters=dict(
+                max_new_tokens=400,
+            ),
+        ),
+        headers={
+            "Authorization": "Bearer " + _get_iam_access_token(api_key=api_key),
+        },
+        prompt="Describe the image in three sentences. Be consise and accurate.",
+        timeout=60,
+    )
+    return options
+
+
+def main():
+    logging.basicConfig(level=logging.INFO)
+
+    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+
+    pipeline_options = PdfPipelineOptions(
+        enable_remote_services=True  # <-- this is required!
+    )
+    pipeline_options.do_picture_description = True
+
+    # PictureDescriptionApiOptions allows interfacing with APIs that support
+    # the multi-modal chat interface. Here follow a few examples of how to configure it.
+    #
+    # One possibility is self-hosting a model, e.g. via vLLM.
+    # $ vllm serve MODEL_NAME
+    # Then PictureDescriptionApiOptions can point to the localhost endpoint.
+    #
+    # Example for the Granite Vision model: (uncomment the following lines)
+    # pipeline_options.picture_description_options = vllm_local_options(
+    #     model="ibm-granite/granite-vision-3.1-2b-preview"
+    # )
+    #
+    # Example for the SmolVLM model: (uncomment the following lines)
+    pipeline_options.picture_description_options = vllm_local_options(
+        model="HuggingFaceTB/SmolVLM-256M-Instruct"
+    )
+    #
+    # Another possibility is using online services, e.g. watsonx.ai.
+    # This requires setting the env variables WX_API_KEY and WX_PROJECT_ID.
+    # Uncomment the following line for this option:
+    # pipeline_options.picture_description_options = watsonx_vlm_options()

    doc_converter = DocumentConverter(
        format_options={
--- a/docs/examples/run_with_accelerator.py
+++ b/docs/examples/run_with_accelerator.py
@ -30,6 +30,9 @@ def main():
    #     num_threads=8, device=AcceleratorDevice.CUDA
    # )

+    # easyocr doesn't support cuda:N allocation, defaults to cuda:0
+    # accelerator_options = AcceleratorOptions(num_threads=8, device="cuda:1")
+
    pipeline_options = PdfPipelineOptions()
    pipeline_options.accelerator_options = accelerator_options
    pipeline_options.do_ocr = True
--- a/docs/examples/run_with_formats.py
+++ b/docs/examples/run_with_formats.py
@ -43,6 +43,7 @@ def main():
                InputFormat.HTML,
                InputFormat.PPTX,
                InputFormat.ASCIIDOC,
+                InputFormat.CSV,
                InputFormat.MD,
            ],  # whitelist formats, non-matching files are ignored.
            format_options={
--- a/docs/supported_formats.md
+++ b/docs/supported_formats.md
@ -13,6 +13,7 @@ Below you can find a listing of all supported input and output formats.
 | Markdown | |
 | AsciiDoc | |
 | HTML, XHTML | |
+| CSV | |
 | PNG, JPEG, TIFF, BMP | Image formats |

 Schema-specific support:
@ -20,7 +21,7 @@ Schema-specific support:
 | Format | Description |
 |--------|-------------|
 | USPTO XML | XML format followed by [USPTO](https://www.uspto.gov/patents) patents |
-| PMC XML | XML format followed by [PubMed Central®](https://pmc.ncbi.nlm.nih.gov/) articles |
+| JATS XML | XML format followed by [JATS](https://jats.nlm.nih.gov/) articles |
 | Docling JSON | JSON-serialized [Docling Document](./concepts/docling_document.md) |

 ## Supported output formats
--- a/docs/usage.md
+++ b/docs/usage.md
@ -71,6 +71,37 @@ Or using the CLI:
 docling --artifacts-path="/local/path/to/models" FILE
 ```

+#### Using remote services
+
+The main purpose of Docling is to run local models that do not share any user data with remote services.
+However, there are valid use cases for processing part of the pipeline using remote services, for example invoking OCR engines from cloud vendors or using hosted LLMs.
+
+In Docling we decided to allow such models, but we require the user to explicitly opt in to communicating with external services.
+
+```py
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.document_converter import DocumentConverter, PdfFormatOption
+
+pipeline_options = PdfPipelineOptions(enable_remote_services=True)
+doc_converter = DocumentConverter(
+    format_options={
+        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
+    }
+)
+```
+
+When `enable_remote_services=True` is not set, the system will raise an `OperationNotAllowed()` exception.
+
+_Note: This option is only related to the system sending user data to remote services. Control of pulling data (e.g. model weights) follows the logic described in [Model prefetching and offline usage](#model-prefetching-and-offline-usage)._
+
+##### List of remote model services
+
+The options in this list require explicitly setting `enable_remote_services=True` when processing documents.
+
+- `PictureDescriptionApiOptions`: Using vision models via API calls.
+
+
 #### Adjust pipeline features

 The example file [custom_convert.py](./examples/custom_convert.py) contains multiple ways
--- a/mkdocs.yml
+++ b/mkdocs.yml
@ -75,11 +75,14 @@ nav:
      - "Figure enrichment": examples/develop_picture_enrichment.py
      - "Table export": examples/export_tables.py
      - "Multimodal export": examples/export_multimodal.py
+      - "Annotate picture with local vlm": examples/pictures_description.py
+      - "Annotate picture with remote vlm": examples/pictures_description_api.py
      - "Force full page OCR": examples/full_page_ocr.py
      - "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py
      - "RapidOCR with custom OCR models": examples/rapidocr_with_custom_models.py
      - "Accelerator options": examples/run_with_accelerator.py
      - "Simple translation": examples/translate.py
+      - examples/backend_csv.ipynb
      - examples/backend_xml_rag.ipynb
    - ✂️ Chunking:
      - examples/hybrid_chunking.ipynb
--- a/poetry.lock
+++ b/poetry.lock
@ -187,8 +187,8 @@ files = [
 lazy-object-proxy = ">=1.4.0"
 typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""}
 wrapt = [
-    {version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
    {version = ">=1.11,<2", markers = "python_version < \"3.11\""},
+    {version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
 ]

 [[package]]
@ -820,13 +820,13 @@ files = [

 [[package]]
 name = "docling-core"
-version = "2.18.0"
+version = "2.19.0"
 description = "A python library to define and validate data types in Docling."
 optional = false
 python-versions = "<4.0,>=3.9"
 files = [
-    {file = "docling_core-2.18.0-py3-none-any.whl", hash = "sha256:9dee0084cef3d6d742686629f538653e332ee8b7541ad7581c98c8ddc28149b3"},
-    {file = "docling_core-2.18.0.tar.gz", hash = "sha256:e8623b8cf4b1e19d5c05c4e3446ac7835afb178997b91c8d11ce8e504a09ec43"},
+    {file = "docling_core-2.19.0-py3-none-any.whl", hash = "sha256:caa1e13d98fa9a00608091c386609c75b3560c7291e842c252f0b6f8d5812dbd"},
+    {file = "docling_core-2.19.0.tar.gz", hash = "sha256:ebf3062e31155bb5f0e6132056a2d239a0e6e693a75c5758886909bb9fef461a"},
 ]

 [package.dependencies]
@ -834,7 +834,7 @@ jsonref = ">=1.1.0,<2.0.0"
 jsonschema = ">=4.16.0,<5.0.0"
 latex2mathml = ">=3.77.0,<4.0.0"
 pandas = ">=2.1.4,<3.0.0"
-pillow = ">=10.3.0,<11.0.0"
+pillow = ">=10.0.0,<12.0.0"
 pydantic = ">=2.6.0,<2.10.0 || >2.10.0,<2.10.1 || >2.10.1,<2.10.2 || >2.10.2,<3.0.0"
 pyyaml = ">=5.1,<7.0.0"
 semchunk = {version = ">=2.2.0,<3.0.0", optional = true, markers = "extra == \"chunking\""}
@ -883,45 +883,45 @@ resolved_reference = "6892adfa4fcf0878b938e8efc1407dec46e96bdd"

 [[package]]
 name = "docling-parse"
-version = "3.3.0"
+version = "3.3.1"
 description = "Simple package to extract text with coordinates from programmatic PDFs"
 optional = false
 python-versions = "<4.0,>=3.9"
 files = [
-    {file = "docling_parse-3.3.0-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:25ac1137787d01cc1d402cd7f3bca2c702d8ae6d38cedd042337d04b5444aedb"},
-    {file = "docling_parse-3.3.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:7d8dc58751bd6a3dcd371363152d98df8deb33ab0752c2fce6a7b380d6804958"},
-    {file = "docling_parse-3.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:780a12e6fa7729f76d12dd0f6431192ad38cd3621bfcad77d6dc37a7bca78ca3"},
-    {file = "docling_parse-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15db955169ddb57e5fd9e887d5467789d5748154bedbe3e861003318b8de0ca3"},
-    {file = "docling_parse-3.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:1104451aef07fdacb45341e1ce15f31193e32e46905d0703d0a3d62aeaa8632a"},
-    {file = "docling_parse-3.3.0-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:b1768fca5700384ea309af3a2714ad0b06a9be00da24d171e840b0d2098d5e52"},
-    {file = "docling_parse-3.3.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:b5647814c701c6199ed3da5577aac292f42e22d581df206a81f34fbb7e9f2fec"},
-    {file = "docling_parse-3.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ae4bea3441d8f358a9669a6d40c510c640ae45fa5c3464698de9c01fa773f14"},
-    {file = "docling_parse-3.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b2e1aae698a0a7fe81ab394cb8fa328a0f4eaa883962168e7dea5387ee30e76d"},
-    {file = "docling_parse-3.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:43757a1848dffe2262c10347115e4be38dd9565261659c3d8c29ce02770bbcfd"},
-    {file = "docling_parse-3.3.0-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:1498748fb3270ab8074ef48dcdcef1107c6e312e3b3793c71b1341bf6d706966"},
-    {file = "docling_parse-3.3.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:b9e4db7bf9736c46a6b5cae45242f2dd2cc478f661bade3d06200ba86ad2aa33"},
-    {file = "docling_parse-3.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8d73364e16383403b0828ad12c74dd8dbb2bd5d572f4b62c9a417f0cdea2138"},
-    {file = "docling_parse-3.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fe95708c21039216ff0620ff2c822de14dd70cd61ac9b7117fc30b3d7990f81"},
-    {file = "docling_parse-3.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:b23bc96c593979567ba18727a31adb1a4524d203feea9b12dac4c8774971d709"},
-    {file = "docling_parse-3.3.0-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:ab240ce8c923143ef125291ee15121f67a99034df110133cd7d481c5f0b4525a"},
-    {file = "docling_parse-3.3.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:916cdd6fd52c32e7026c1e70795c0a9d2d3299f200cc240c047dcc702fcf268f"},
-    {file = "docling_parse-3.3.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b34f0c5629db675dda7bd8992a80416a03b9b75e99f6d6db8869cdd935d63b7f"},
-    {file = "docling_parse-3.3.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d260306cbe52a3954cba9f9b9459547b7f66156a1e85821b2f7106a3f230586"},
-    {file = "docling_parse-3.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:c37be103a07f27fb2bd77350f324e55822d7126b534ebd52ec9de3d10feae72f"},
-    {file = "docling_parse-3.3.0-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:cfb921c4901e342c42d1754f0fe67e73577d6506143b75dfde44720efe89aae4"},
-    {file = "docling_parse-3.3.0-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:acc7b4929296f6c9fdc91851c999a353f785039c3f2eb591923564b42fc3d0d1"},
-    {file = "docling_parse-3.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51e7640aadc75d9a3efa2f591b583824999111cc1830868dd8409c7f6f8e19a6"},
-    {file = "docling_parse-3.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:118c69c81539bcac2b3b635edbabac29301e778894382688fedaa86f6dba5c13"},
-    {file = "docling_parse-3.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:556653428da5f2b9863f630222f5369c66a4ca0cadb42ea1d8db826cb4e6c4c6"},
-    {file = "docling_parse-3.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d6f56e7ed06dee6451d0552c9c2221491af7a7a0cb8c9b8b9c78c1b728b35d27"},
-    {file = "docling_parse-3.3.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:b83347e82dbb5bb58807cc605fd521803fc6a0852f0063b9f6d26b3f7426b662"},
-    {file = "docling_parse-3.3.0.tar.gz", hash = "sha256:38ed09e63c735b5e010e5a75af92da5d8b2fcab9e93d2d17873e9115290fd7fa"},
+    {file = "docling_parse-3.3.1-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:abf3a0c9ea35fc33fbd288031096826688d1e787f7c51e174cc9fea6a22d2f67"},
+    {file = "docling_parse-3.3.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:1e07cc6e3603ff246affa11bec25a82d90f79c6b92c370d993f2bd6318476b7c"},
+    {file = "docling_parse-3.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:338000deee0251f7e2ebdfde2bcd6392c388624206555410867cfc93608d84fe"},
+    {file = "docling_parse-3.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46f14a9871e840a021a642ea0ece4b675cc9584224eba3b85cd269feda892b76"},
+    {file = "docling_parse-3.3.1-cp310-cp310-win_amd64.whl", hash = "sha256:c0bad3db594e05bca2366d46e630a0b8050b6eb37fcae2cbcd5020b06ac0879a"},
+    {file = "docling_parse-3.3.1-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:db0cebe28b299f78f1da58b5567c22de6f5b30aa5b6fe4fb2daae9f372bd022c"},
+    {file = "docling_parse-3.3.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:72e901cc6dfac9e4f5e13ddd841f758b41484e61b7092b891c693e2c036461ac"},
+    {file = "docling_parse-3.3.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fcfc7309b46e9b0941cc5513560b06f0b1c221ff3a2d5e516eb752ae7f2ccb81"},
+    {file = "docling_parse-3.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76dc12fc510d08d1f76741dc429666a5791db003275067aa1f18da02b7a98925"},
+    {file = "docling_parse-3.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:4c096f5c2460a6eb308e046e3045bb0100b6b602ef4394924cfd4846cee5800e"},
+    {file = "docling_parse-3.3.1-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:3cc23f0d6aa91d015117b8962162bf4a482e2208d2068abfada34fda112ef077"},
+    {file = "docling_parse-3.3.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:7a62afe01b6f008f3f50a12e3d8feddb28d045bc2b96321d48933ab23ff1e201"},
+    {file = "docling_parse-3.3.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd93d2a23a22b84c61213fa62db906ae444201e4e404d7dc2b6152d64d69ec50"},
+    {file = "docling_parse-3.3.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87a8678b8e37a1f0bd41fb0227938a7bba0dc7e6f6ce69777b1ee947ef0e28ef"},
+    {file = "docling_parse-3.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:c6ae62864d3d0e1e3bfb467e217c90ae938b0773c671412ff3ca110081b024ea"},
+    {file = "docling_parse-3.3.1-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:79cc92e3b1d3d8957df11c8dfd5c8f89aaae06d5ac49f019a59a0aad301ba59c"},
+    {file = "docling_parse-3.3.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:8d8b988ca77dd88111fb8e5d961806fcb26b3ea146841d7d304d1d52b82ed27b"},
+    {file = "docling_parse-3.3.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:781a88c3a4eb27dd09598fe5956f8cd874acb49c102c2d35ccf0fbbeb3fc714d"},
+    {file = "docling_parse-3.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb803dd61d79a7f876ff7febb4ce2f43de19646f05196031f35b891a8c24b57a"},
+    {file = "docling_parse-3.3.1-cp313-cp313-win_amd64.whl", hash = "sha256:2052d3dc4711975fdcfa343947a1fbb9502c6a81ccc5834af41615868e61fb94"},
+    {file = "docling_parse-3.3.1-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:7ccb186369f706b5df8d6751c6cfff2a4355c3c843c68b0210e3f53a2bdf9bf6"},
+    {file = "docling_parse-3.3.1-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:5a00c37ed9923f7d7317044135d8ff81829474d1d47730dfc8bd2d2a3e3e60cd"},
+    {file = "docling_parse-3.3.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:72beb63683f5e581c15d1c3370480dbe4457031f447944342a09bd23a66b378e"},
+    {file = "docling_parse-3.3.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5ac8f3bb64e50cf58959ce591574cb0ba3d6ebe9bdbedfabbff1817aaf34664"},
+    {file = "docling_parse-3.3.1-cp39-cp39-win_amd64.whl", hash = "sha256:24ea10d7bda2ea35c6cc24b8db3fdea4a1e05182890ea44364fcd703e5090e54"},
+    {file = "docling_parse-3.3.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:048157bc8640e3c03c082b4296f9f8946516c624a5469e10c7c9a32dcb0dc5c8"},
+    {file = "docling_parse-3.3.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c6f293644856d05a1fced3e100dd052374da793e7e3a0e6023c69b9d3eb64881"},
+    {file = "docling_parse-3.3.1.tar.gz", hash = "sha256:536f581e7564cbfd37bff2e79d2cb17e7dbaa0d34d054cfdb28d648da31da85b"},
 ]

 [package.dependencies]
 docling-core = ">=2.14.0,<3.0.0"
-pillow = ">=10.4.0,<11.0.0"
-pydantic = ">=2.10.5,<3.0.0"
+pillow = ">=10.0.0,<12.0.0"
+pydantic = ">=2.0.0,<3.0.0"
 pywin32 = {version = ">=305", markers = "sys_platform == \"win32\""}
 tabulate = ">=0.9.0,<1.0.0"

@ -1100,13 +1100,13 @@ dev = ["pyTest", "pyTest-cov"]

 [[package]]
 name = "flatbuffers"
-version = "25.1.24"
+version = "25.2.10"
 description = "The FlatBuffers serialization format for Python"
 optional = true
 python-versions = "*"
 files = [
-    {file = "flatbuffers-25.1.24-py2.py3-none-any.whl", hash = "sha256:1abfebaf4083117225d0723087ea909896a34e3fec933beedb490d595ba24145"},
-    {file = "flatbuffers-25.1.24.tar.gz", hash = "sha256:e0f7b7d806c0abdf166275492663130af40c11f89445045fbef0aa3c9a8643ad"},
+    {file = "flatbuffers-25.2.10-py2.py3-none-any.whl", hash = "sha256:ebba5f4d5ea615af3f7fd70fc310636fbb2bbd1f566ac0a23d98dd412de50051"},
+    {file = "flatbuffers-25.2.10.tar.gz", hash = "sha256:97e451377a41262f8d9bd4295cc836133415cc03d8cb966410a4af92eb00d26e"},
 ]

 [[package]]
@ -1303,13 +1303,13 @@ test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit",

 [[package]]
 name = "griffe"
-version = "1.5.6"
+version = "1.5.7"
 description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API."
 optional = false
 python-versions = ">=3.9"
 files = [
-    {file = "griffe-1.5.6-py3-none-any.whl", hash = "sha256:b2a3afe497c6c1f952e54a23095ecc09435016293e77af8478ed65df1022a394"},
-    {file = "griffe-1.5.6.tar.gz", hash = "sha256:181f6666d5aceb6cd6e2da5a2b646cfb431e47a0da1fda283845734b67e10944"},
+    {file = "griffe-1.5.7-py3-none-any.whl", hash = "sha256:4af8ec834b64de954d447c7b6672426bb145e71605c74a4e22d510cc79fe7d8b"},
+    {file = "griffe-1.5.7.tar.gz", hash = "sha256:465238c86deaf1137761f700fb343edd8ffc846d72f6de43c3c345ccdfbebe92"},
 ]

 [package.dependencies]
@ -1983,13 +1983,13 @@ files = [

 [[package]]
 name = "jupytext"
-version = "1.16.6"
+version = "1.16.7"
 description = "Jupyter notebooks as Markdown documents, Julia, Python or R scripts"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "jupytext-1.16.6-py3-none-any.whl", hash = "sha256:900132031f73fee15a1c9ebd862e05eb5f51e1ad6ab3a2c6fdd97ce2f9c913b4"},
-    {file = "jupytext-1.16.6.tar.gz", hash = "sha256:dbd03f9263c34b737003f388fc069e9030834fb7136879c4c32c32473557baa0"},
+    {file = "jupytext-1.16.7-py3-none-any.whl", hash = "sha256:912f9d9af7bd3f15470105e5c5dddf1669b2d8c17f0c55772687fc5a4a73fe69"},
+    {file = "jupytext-1.16.7.tar.gz", hash = "sha256:fc4e97f0890e22062c4ef10313c7ca960b07b3767246a1fef7585888cc2afe5d"},
 ]

 [package.dependencies]
@ -2620,13 +2620,13 @@ min-versions = ["babel (==2.9.0)", "click (==7.0)", "colorama (==0.4)", "ghp-imp

 [[package]]
 name = "mkdocs-autorefs"
-version = "1.3.0"
+version = "1.3.1"
 description = "Automatically link across pages in MkDocs."
 optional = false
 python-versions = ">=3.9"
 files = [
-    {file = "mkdocs_autorefs-1.3.0-py3-none-any.whl", hash = "sha256:d180f9778a04e78b7134e31418f238bba56f56d6a8af97873946ff661befffb3"},
-    {file = "mkdocs_autorefs-1.3.0.tar.gz", hash = "sha256:6867764c099ace9025d6ac24fd07b85a98335fbd30107ef01053697c8f46db61"},
+    {file = "mkdocs_autorefs-1.3.1-py3-none-any.whl", hash = "sha256:18c504ae4d3ee7f344369bb26cb31d4105569ee252aab7d75ec2734c2c8b0474"},
+    {file = "mkdocs_autorefs-1.3.1.tar.gz", hash = "sha256:a6d30cbcccae336d622a66c2418a3c92a8196b69782774529ad441abb23c0902"},
 ]

 [package.dependencies]
@ -2687,13 +2687,13 @@ pygments = ">2.12.0"

 [[package]]
 name = "mkdocs-material"
-version = "9.6.3"
+version = "9.6.4"
 description = "Documentation that simply works"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "mkdocs_material-9.6.3-py3-none-any.whl", hash = "sha256:1125622067e26940806701219303b27c0933e04533560725d97ec26fd16a39cf"},
-    {file = "mkdocs_material-9.6.3.tar.gz", hash = "sha256:c87f7d1c39ce6326da5e10e232aed51bae46252e646755900f4b0fc9192fa832"},
+    {file = "mkdocs_material-9.6.4-py3-none-any.whl", hash = "sha256:414e8376551def6d644b8e6f77226022868532a792eb2c9accf52199009f568f"},
+    {file = "mkdocs_material-9.6.4.tar.gz", hash = "sha256:4d1d35e1c1d3e15294cb7fa5d02e0abaee70d408f75027dc7be6e30fb32e6867"},
 ]

 [package.dependencies]
@ -2794,8 +2794,8 @@ files = [

 [package.dependencies]
 multiprocess = [
-    {version = ">=0.70.15", optional = true, markers = "python_version >= \"3.11\" and extra == \"dill\""},
    {version = "*", optional = true, markers = "python_version < \"3.11\" and extra == \"dill\""},
+    {version = ">=0.70.15", optional = true, markers = "python_version >= \"3.11\" and extra == \"dill\""},
 ]
 pygments = ">=2.0"
 pywin32 = {version = ">=301", markers = "platform_system == \"Windows\""}
@ -3319,66 +3319,66 @@ files = [

 [[package]]
 name = "numpy"
-version = "2.2.2"
+version = "2.2.3"
 description = "Fundamental package for array computing in Python"
 optional = false
 python-versions = ">=3.10"
 files = [
-    {file = "numpy-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7079129b64cb78bdc8d611d1fd7e8002c0a2565da6a47c4df8062349fee90e3e"},
-    {file = "numpy-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2ec6c689c61df613b783aeb21f945c4cbe6c51c28cb70aae8430577ab39f163e"},
-    {file = "numpy-2.2.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:40c7ff5da22cd391944a28c6a9c638a5eef77fcf71d6e3a79e1d9d9e82752715"},
-    {file = "numpy-2.2.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:995f9e8181723852ca458e22de5d9b7d3ba4da3f11cc1cb113f093b271d7965a"},
-    {file = "numpy-2.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b78ea78450fd96a498f50ee096f69c75379af5138f7881a51355ab0e11286c97"},
-    {file = "numpy-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3fbe72d347fbc59f94124125e73fc4976a06927ebc503ec5afbfb35f193cd957"},
-    {file = "numpy-2.2.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:8e6da5cffbbe571f93588f562ed130ea63ee206d12851b60819512dd3e1ba50d"},
-    {file = "numpy-2.2.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:09d6a2032faf25e8d0cadde7fd6145118ac55d2740132c1d845f98721b5ebcfd"},
-    {file = "numpy-2.2.2-cp310-cp310-win32.whl", hash = "sha256:159ff6ee4c4a36a23fe01b7c3d07bd8c14cc433d9720f977fcd52c13c0098160"},
-    {file = "numpy-2.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:64bd6e1762cd7f0986a740fee4dff927b9ec2c5e4d9a28d056eb17d332158014"},
-    {file = "numpy-2.2.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:642199e98af1bd2b6aeb8ecf726972d238c9877b0f6e8221ee5ab945ec8a2189"},
-    {file = "numpy-2.2.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6d9fc9d812c81e6168b6d405bf00b8d6739a7f72ef22a9214c4241e0dc70b323"},
-    {file = "numpy-2.2.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:c7d1fd447e33ee20c1f33f2c8e6634211124a9aabde3c617687d8b739aa69eac"},
-    {file = "numpy-2.2.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:451e854cfae0febe723077bd0cf0a4302a5d84ff25f0bfece8f29206c7bed02e"},
-    {file = "numpy-2.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd249bc894af67cbd8bad2c22e7cbcd46cf87ddfca1f1289d1e7e54868cc785c"},
-    {file = "numpy-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:02935e2c3c0c6cbe9c7955a8efa8908dd4221d7755644c59d1bba28b94fd334f"},
-    {file = "numpy-2.2.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a972cec723e0563aa0823ee2ab1df0cb196ed0778f173b381c871a03719d4826"},
-    {file = "numpy-2.2.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d6d6a0910c3b4368d89dde073e630882cdb266755565155bc33520283b2d9df8"},
-    {file = "numpy-2.2.2-cp311-cp311-win32.whl", hash = "sha256:860fd59990c37c3ef913c3ae390b3929d005243acca1a86facb0773e2d8d9e50"},
-    {file = "numpy-2.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:da1eeb460ecce8d5b8608826595c777728cdf28ce7b5a5a8c8ac8d949beadcf2"},
-    {file = "numpy-2.2.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ac9bea18d6d58a995fac1b2cb4488e17eceeac413af014b1dd26170b766d8467"},
-    {file = "numpy-2.2.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:23ae9f0c2d889b7b2d88a3791f6c09e2ef827c2446f1c4a3e3e76328ee4afd9a"},
-    {file = "numpy-2.2.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:3074634ea4d6df66be04f6728ee1d173cfded75d002c75fac79503a880bf3825"},
-    {file = "numpy-2.2.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:8ec0636d3f7d68520afc6ac2dc4b8341ddb725039de042faf0e311599f54eb37"},
-    {file = "numpy-2.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ffbb1acd69fdf8e89dd60ef6182ca90a743620957afb7066385a7bbe88dc748"},
-    {file = "numpy-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0349b025e15ea9d05c3d63f9657707a4e1d471128a3b1d876c095f328f8ff7f0"},
-    {file = "numpy-2.2.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:463247edcee4a5537841d5350bc87fe8e92d7dd0e8c71c995d2c6eecb8208278"},
-    {file = "numpy-2.2.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:9dd47ff0cb2a656ad69c38da850df3454da88ee9a6fde0ba79acceee0e79daba"},
-    {file = "numpy-2.2.2-cp312-cp312-win32.whl", hash = "sha256:4525b88c11906d5ab1b0ec1f290996c0020dd318af8b49acaa46f198b1ffc283"},
-    {file = "numpy-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:5acea83b801e98541619af398cc0109ff48016955cc0818f478ee9ef1c5c3dcb"},
-    {file = "numpy-2.2.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b208cfd4f5fe34e1535c08983a1a6803fdbc7a1e86cf13dd0c61de0b51a0aadc"},
-    {file = "numpy-2.2.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d0bbe7dd86dca64854f4b6ce2ea5c60b51e36dfd597300057cf473d3615f2369"},
-    {file = "numpy-2.2.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:22ea3bb552ade325530e72a0c557cdf2dea8914d3a5e1fecf58fa5dbcc6f43cd"},
-    {file = "numpy-2.2.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:128c41c085cab8a85dc29e66ed88c05613dccf6bc28b3866cd16050a2f5448be"},
-    {file = "numpy-2.2.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:250c16b277e3b809ac20d1f590716597481061b514223c7badb7a0f9993c7f84"},
-    {file = "numpy-2.2.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0c8854b09bc4de7b041148d8550d3bd712b5c21ff6a8ed308085f190235d7ff"},
-    {file = "numpy-2.2.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:b6fb9c32a91ec32a689ec6410def76443e3c750e7cfc3fb2206b985ffb2b85f0"},
-    {file = "numpy-2.2.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:57b4012e04cc12b78590a334907e01b3a85efb2107df2b8733ff1ed05fce71de"},
-    {file = "numpy-2.2.2-cp313-cp313-win32.whl", hash = "sha256:4dbd80e453bd34bd003b16bd802fac70ad76bd463f81f0c518d1245b1c55e3d9"},
-    {file = "numpy-2.2.2-cp313-cp313-win_amd64.whl", hash = "sha256:5a8c863ceacae696aff37d1fd636121f1a512117652e5dfb86031c8d84836369"},
-    {file = "numpy-2.2.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:b3482cb7b3325faa5f6bc179649406058253d91ceda359c104dac0ad320e1391"},
-    {file = "numpy-2.2.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:9491100aba630910489c1d0158034e1c9a6546f0b1340f716d522dc103788e39"},
-    {file = "numpy-2.2.2-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:41184c416143defa34cc8eb9d070b0a5ba4f13a0fa96a709e20584638254b317"},
-    {file = "numpy-2.2.2-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:7dca87ca328f5ea7dafc907c5ec100d187911f94825f8700caac0b3f4c384b49"},
-    {file = "numpy-2.2.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0bc61b307655d1a7f9f4b043628b9f2b721e80839914ede634e3d485913e1fb2"},
-    {file = "numpy-2.2.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fad446ad0bc886855ddf5909cbf8cb5d0faa637aaa6277fb4b19ade134ab3c7"},
-    {file = "numpy-2.2.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:149d1113ac15005652e8d0d3f6fd599360e1a708a4f98e43c9c77834a28238cb"},
-    {file = "numpy-2.2.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:106397dbbb1896f99e044efc90360d098b3335060375c26aa89c0d8a97c5f648"},
-    {file = "numpy-2.2.2-cp313-cp313t-win32.whl", hash = "sha256:0eec19f8af947a61e968d5429f0bd92fec46d92b0008d0a6685b40d6adf8a4f4"},
-    {file = "numpy-2.2.2-cp313-cp313t-win_amd64.whl", hash = "sha256:97b974d3ba0fb4612b77ed35d7627490e8e3dff56ab41454d9e8b23448940576"},
-    {file = "numpy-2.2.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:b0531f0b0e07643eb089df4c509d30d72c9ef40defa53e41363eca8a8cc61495"},
-    {file = "numpy-2.2.2-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:e9e82dcb3f2ebbc8cb5ce1102d5f1c5ed236bf8a11730fb45ba82e2841ec21df"},
-    {file = "numpy-2.2.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0d4142eb40ca6f94539e4db929410f2a46052a0fe7a2c1c59f6179c39938d2a"},
-    {file = "numpy-2.2.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:356ca982c188acbfa6af0d694284d8cf20e95b1c3d0aefa8929376fea9146f60"},
-    {file = "numpy-2.2.2.tar.gz", hash = "sha256:ed6906f61834d687738d25988ae117683705636936cc605be0bb208b23df4d8f"},
+    {file = "numpy-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:cbc6472e01952d3d1b2772b720428f8b90e2deea8344e854df22b0618e9cce71"},
+    {file = "numpy-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cdfe0c22692a30cd830c0755746473ae66c4a8f2e7bd508b35fb3b6a0813d787"},
+    {file = "numpy-2.2.3-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:e37242f5324ffd9f7ba5acf96d774f9276aa62a966c0bad8dae692deebec7716"},
+    {file = "numpy-2.2.3-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:95172a21038c9b423e68be78fd0be6e1b97674cde269b76fe269a5dfa6fadf0b"},
+    {file = "numpy-2.2.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5b47c440210c5d1d67e1cf434124e0b5c395eee1f5806fdd89b553ed1acd0a3"},
+    {file = "numpy-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0391ea3622f5c51a2e29708877d56e3d276827ac5447d7f45e9bc4ade8923c52"},
+    {file = "numpy-2.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f6b3dfc7661f8842babd8ea07e9897fe3d9b69a1d7e5fbb743e4160f9387833b"},
+    {file = "numpy-2.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:1ad78ce7f18ce4e7df1b2ea4019b5817a2f6a8a16e34ff2775f646adce0a5027"},
+    {file = "numpy-2.2.3-cp310-cp310-win32.whl", hash = "sha256:5ebeb7ef54a7be11044c33a17b2624abe4307a75893c001a4800857956b41094"},
+    {file = "numpy-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:596140185c7fa113563c67c2e894eabe0daea18cf8e33851738c19f70ce86aeb"},
+    {file = "numpy-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:16372619ee728ed67a2a606a614f56d3eabc5b86f8b615c79d01957062826ca8"},
+    {file = "numpy-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5521a06a3148686d9269c53b09f7d399a5725c47bbb5b35747e1cb76326b714b"},
+    {file = "numpy-2.2.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:7c8dde0ca2f77828815fd1aedfdf52e59071a5bae30dac3b4da2a335c672149a"},
+    {file = "numpy-2.2.3-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:77974aba6c1bc26e3c205c2214f0d5b4305bdc719268b93e768ddb17e3fdd636"},
+    {file = "numpy-2.2.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d42f9c36d06440e34226e8bd65ff065ca0963aeecada587b937011efa02cdc9d"},
+    {file = "numpy-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2712c5179f40af9ddc8f6727f2bd910ea0eb50206daea75f58ddd9fa3f715bb"},
+    {file = "numpy-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c8b0451d2ec95010d1db8ca733afc41f659f425b7f608af569711097fd6014e2"},
+    {file = "numpy-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d9b4a8148c57ecac25a16b0e11798cbe88edf5237b0df99973687dd866f05e1b"},
+    {file = "numpy-2.2.3-cp311-cp311-win32.whl", hash = "sha256:1f45315b2dc58d8a3e7754fe4e38b6fce132dab284a92851e41b2b344f6441c5"},
+    {file = "numpy-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:9f48ba6f6c13e5e49f3d3efb1b51c8193215c42ac82610a04624906a9270be6f"},
+    {file = "numpy-2.2.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:12c045f43b1d2915eca6b880a7f4a256f59d62df4f044788c8ba67709412128d"},
+    {file = "numpy-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:87eed225fd415bbae787f93a457af7f5990b92a334e346f72070bf569b9c9c95"},
+    {file = "numpy-2.2.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:712a64103d97c404e87d4d7c47fb0c7ff9acccc625ca2002848e0d53288b90ea"},
+    {file = "numpy-2.2.3-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:a5ae282abe60a2db0fd407072aff4599c279bcd6e9a2475500fc35b00a57c532"},
+    {file = "numpy-2.2.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5266de33d4c3420973cf9ae3b98b54a2a6d53a559310e3236c4b2b06b9c07d4e"},
+    {file = "numpy-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b787adbf04b0db1967798dba8da1af07e387908ed1553a0d6e74c084d1ceafe"},
+    {file = "numpy-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:34c1b7e83f94f3b564b35f480f5652a47007dd91f7c839f404d03279cc8dd021"},
+    {file = "numpy-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4d8335b5f1b6e2bce120d55fb17064b0262ff29b459e8493d1785c18ae2553b8"},
+    {file = "numpy-2.2.3-cp312-cp312-win32.whl", hash = "sha256:4d9828d25fb246bedd31e04c9e75714a4087211ac348cb39c8c5f99dbb6683fe"},
+    {file = "numpy-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:83807d445817326b4bcdaaaf8e8e9f1753da04341eceec705c001ff342002e5d"},
+    {file = "numpy-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7bfdb06b395385ea9b91bf55c1adf1b297c9fdb531552845ff1d3ea6e40d5aba"},
+    {file = "numpy-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:23c9f4edbf4c065fddb10a4f6e8b6a244342d95966a48820c614891e5059bb50"},
+    {file = "numpy-2.2.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:a0c03b6be48aaf92525cccf393265e02773be8fd9551a2f9adbe7db1fa2b60f1"},
+    {file = "numpy-2.2.3-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:2376e317111daa0a6739e50f7ee2a6353f768489102308b0d98fcf4a04f7f3b5"},
+    {file = "numpy-2.2.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8fb62fe3d206d72fe1cfe31c4a1106ad2b136fcc1606093aeab314f02930fdf2"},
+    {file = "numpy-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52659ad2534427dffcc36aac76bebdd02b67e3b7a619ac67543bc9bfe6b7cdb1"},
+    {file = "numpy-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1b416af7d0ed3271cad0f0a0d0bee0911ed7eba23e66f8424d9f3dfcdcae1304"},
+    {file = "numpy-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:1402da8e0f435991983d0a9708b779f95a8c98c6b18a171b9f1be09005e64d9d"},
+    {file = "numpy-2.2.3-cp313-cp313-win32.whl", hash = "sha256:136553f123ee2951bfcfbc264acd34a2fc2f29d7cdf610ce7daf672b6fbaa693"},
+    {file = "numpy-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:5b732c8beef1d7bc2d9e476dbba20aaff6167bf205ad9aa8d30913859e82884b"},
+    {file = "numpy-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:435e7a933b9fda8126130b046975a968cc2d833b505475e588339e09f7672890"},
+    {file = "numpy-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:7678556eeb0152cbd1522b684dcd215250885993dd00adb93679ec3c0e6e091c"},
+    {file = "numpy-2.2.3-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:2e8da03bd561504d9b20e7a12340870dfc206c64ea59b4cfee9fceb95070ee94"},
+    {file = "numpy-2.2.3-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:c9aa4496fd0e17e3843399f533d62857cef5900facf93e735ef65aa4bbc90ef0"},
+    {file = "numpy-2.2.3-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4ca91d61a4bf61b0f2228f24bbfa6a9facd5f8af03759fe2a655c50ae2c6610"},
+    {file = "numpy-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:deaa09cd492e24fd9b15296844c0ad1b3c976da7907e1c1ed3a0ad21dded6f76"},
+    {file = "numpy-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:246535e2f7496b7ac85deffe932896a3577be7af8fb7eebe7146444680297e9a"},
+    {file = "numpy-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:daf43a3d1ea699402c5a850e5313680ac355b4adc9770cd5cfc2940e7861f1bf"},
+    {file = "numpy-2.2.3-cp313-cp313t-win32.whl", hash = "sha256:cf802eef1f0134afb81fef94020351be4fe1d6681aadf9c5e862af6602af64ef"},
+    {file = "numpy-2.2.3-cp313-cp313t-win_amd64.whl", hash = "sha256:aee2512827ceb6d7f517c8b85aa5d3923afe8fc7a57d028cffcd522f1c6fd082"},
+    {file = "numpy-2.2.3-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:3c2ec8a0f51d60f1e9c0c5ab116b7fc104b165ada3f6c58abf881cb2eb16044d"},
+    {file = "numpy-2.2.3-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:ed2cf9ed4e8ebc3b754d398cba12f24359f018b416c380f577bbae112ca52fc9"},
+    {file = "numpy-2.2.3-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39261798d208c3095ae4f7bc8eaeb3481ea8c6e03dc48028057d3cbdbdb8937e"},
+    {file = "numpy-2.2.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:783145835458e60fa97afac25d511d00a1eca94d4a8f3ace9fe2043003c678e4"},
+    {file = "numpy-2.2.3.tar.gz", hash = "sha256:dbdc15f0c81611925f382dfa97b3bd0bc2c1ce19d4fe50482cb0ddc12ba30020"},
 ]

 [[package]]
@ -3804,10 +3804,10 @@ files = [

 [package.dependencies]
 numpy = [
-    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
-    {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
    {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
    {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
+    {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
+    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
    {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
    {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
 ]
@ -3830,10 +3830,10 @@ files = [

 [package.dependencies]
 numpy = [
-    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
-    {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
    {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
    {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
+    {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
+    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
    {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
    {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
 ]
@ -4019,9 +4019,9 @@ files = [

 [package.dependencies]
 numpy = [
-    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
-    {version = ">=1.23.2", markers = "python_version == \"3.11\""},
    {version = ">=1.22.4", markers = "python_version < \"3.11\""},
+    {version = ">=1.23.2", markers = "python_version == \"3.11\""},
+    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
 ]
 python-dateutil = ">=2.8.2"
 pytz = ">=2020.1"
@ -4405,32 +4405,25 @@ files = [

 [[package]]
 name = "psutil"
-version = "6.1.1"
-description = "Cross-platform lib for process and system monitoring in Python."
+version = "7.0.0"
+description = "Cross-platform lib for process and system monitoring in Python.  NOTE: the syntax of this script MUST be kept compatible with Python 2.7."
 optional = false
-python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7"
+python-versions = ">=3.6"
 files = [
-    {file = "psutil-6.1.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:9ccc4316f24409159897799b83004cb1e24f9819b0dcf9c0b68bdcb6cefee6a8"},
-    {file = "psutil-6.1.1-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:ca9609c77ea3b8481ab005da74ed894035936223422dc591d6772b147421f777"},
-    {file = "psutil-6.1.1-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:8df0178ba8a9e5bc84fed9cfa61d54601b371fbec5c8eebad27575f1e105c0d4"},
-    {file = "psutil-6.1.1-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:1924e659d6c19c647e763e78670a05dbb7feaf44a0e9c94bf9e14dfc6ba50468"},
-    {file = "psutil-6.1.1-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:018aeae2af92d943fdf1da6b58665124897cfc94faa2ca92098838f83e1b1bca"},
-    {file = "psutil-6.1.1-cp27-none-win32.whl", hash = "sha256:6d4281f5bbca041e2292be3380ec56a9413b790579b8e593b1784499d0005dac"},
-    {file = "psutil-6.1.1-cp27-none-win_amd64.whl", hash = "sha256:c777eb75bb33c47377c9af68f30e9f11bc78e0f07fbf907be4a5d70b2fe5f030"},
-    {file = "psutil-6.1.1-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:fc0ed7fe2231a444fc219b9c42d0376e0a9a1a72f16c5cfa0f68d19f1a0663e8"},
-    {file = "psutil-6.1.1-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:0bdd4eab935276290ad3cb718e9809412895ca6b5b334f5a9111ee6d9aff9377"},
-    {file = "psutil-6.1.1-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b6e06c20c05fe95a3d7302d74e7097756d4ba1247975ad6905441ae1b5b66003"},
-    {file = "psutil-6.1.1-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:97f7cb9921fbec4904f522d972f0c0e1f4fabbdd4e0287813b21215074a0f160"},
-    {file = "psutil-6.1.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33431e84fee02bc84ea36d9e2c4a6d395d479c9dd9bba2376c1f6ee8f3a4e0b3"},
-    {file = "psutil-6.1.1-cp36-cp36m-win32.whl", hash = "sha256:384636b1a64b47814437d1173be1427a7c83681b17a450bfc309a1953e329603"},
-    {file = "psutil-6.1.1-cp36-cp36m-win_amd64.whl", hash = "sha256:8be07491f6ebe1a693f17d4f11e69d0dc1811fa082736500f649f79df7735303"},
-    {file = "psutil-6.1.1-cp37-abi3-win32.whl", hash = "sha256:eaa912e0b11848c4d9279a93d7e2783df352b082f40111e078388701fd479e53"},
-    {file = "psutil-6.1.1-cp37-abi3-win_amd64.whl", hash = "sha256:f35cfccb065fff93529d2afb4a2e89e363fe63ca1e4a5da22b603a85833c2649"},
-    {file = "psutil-6.1.1.tar.gz", hash = "sha256:cf8496728c18f2d0b45198f06895be52f36611711746b7f30c464b422b50e2f5"},
+    {file = "psutil-7.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:101d71dc322e3cffd7cea0650b09b3d08b8e7c4109dd6809fe452dfd00e58b25"},
+    {file = "psutil-7.0.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:39db632f6bb862eeccf56660871433e111b6ea58f2caea825571951d4b6aa3da"},
+    {file = "psutil-7.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1fcee592b4c6f146991ca55919ea3d1f8926497a713ed7faaf8225e174581e91"},
+    {file = "psutil-7.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b1388a4f6875d7e2aff5c4ca1cc16c545ed41dd8bb596cefea80111db353a34"},
+    {file = "psutil-7.0.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5f098451abc2828f7dc6b58d44b532b22f2088f4999a937557b603ce72b1993"},
+    {file = "psutil-7.0.0-cp36-cp36m-win32.whl", hash = "sha256:84df4eb63e16849689f76b1ffcb36db7b8de703d1bc1fe41773db487621b6c17"},
+    {file = "psutil-7.0.0-cp36-cp36m-win_amd64.whl", hash = "sha256:1e744154a6580bc968a0195fd25e80432d3afec619daf145b9e5ba16cc1d688e"},
+    {file = "psutil-7.0.0-cp37-abi3-win32.whl", hash = "sha256:ba3fcef7523064a6c9da440fc4d6bd07da93ac726b5733c29027d7dc95b39d99"},
+    {file = "psutil-7.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:4cf3d4eb1aa9b348dec30105c55cd9b7d4629285735a102beb4441e38db90553"},
+    {file = "psutil-7.0.0.tar.gz", hash = "sha256:7be9c3eba38beccb6495ea33afd982a44074b78f28c434a1f51cc07fd315c456"},
 ]

 [package.extras]
-dev = ["abi3audit", "black", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest-cov", "requests", "rstcheck", "ruff", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "virtualenv", "vulture", "wheel"]
+dev = ["abi3audit", "black (==24.10.0)", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest", "pytest-cov", "pytest-xdist", "requests", "rstcheck", "ruff", "setuptools", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "virtualenv", "vulture", "wheel"]
 test = ["pytest", "pytest-xdist", "setuptools"]

 [[package]]
@ -4785,8 +4778,8 @@ files = [
 astroid = ">=2.15.8,<=2.17.0-dev0"
 colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
 dill = [
-    {version = ">=0.3.6", markers = "python_version >= \"3.11\""},
    {version = ">=0.2", markers = "python_version < \"3.11\""},
+    {version = ">=0.3.6", markers = "python_version >= \"3.11\""},
 ]
 isort = ">=4.2.5,<6"
 mccabe = ">=0.6,<0.8"
@ -5288,29 +5281,29 @@ files = [

 [[package]]
 name = "pywin32"
-version = "308"
+version = "307"
 description = "Python for Window Extensions"
 optional = false
 python-versions = "*"
 files = [
-    {file = "pywin32-308-cp310-cp310-win32.whl", hash = "sha256:796ff4426437896550d2981b9c2ac0ffd75238ad9ea2d3bfa67a1abd546d262e"},
-    {file = "pywin32-308-cp310-cp310-win_amd64.whl", hash = "sha256:4fc888c59b3c0bef905ce7eb7e2106a07712015ea1c8234b703a088d46110e8e"},
-    {file = "pywin32-308-cp310-cp310-win_arm64.whl", hash = "sha256:a5ab5381813b40f264fa3495b98af850098f814a25a63589a8e9eb12560f450c"},
-    {file = "pywin32-308-cp311-cp311-win32.whl", hash = "sha256:5d8c8015b24a7d6855b1550d8e660d8daa09983c80e5daf89a273e5c6fb5095a"},
-    {file = "pywin32-308-cp311-cp311-win_amd64.whl", hash = "sha256:575621b90f0dc2695fec346b2d6302faebd4f0f45c05ea29404cefe35d89442b"},
-    {file = "pywin32-308-cp311-cp311-win_arm64.whl", hash = "sha256:100a5442b7332070983c4cd03f2e906a5648a5104b8a7f50175f7906efd16bb6"},
-    {file = "pywin32-308-cp312-cp312-win32.whl", hash = "sha256:587f3e19696f4bf96fde9d8a57cec74a57021ad5f204c9e627e15c33ff568897"},
-    {file = "pywin32-308-cp312-cp312-win_amd64.whl", hash = "sha256:00b3e11ef09ede56c6a43c71f2d31857cf7c54b0ab6e78ac659497abd2834f47"},
-    {file = "pywin32-308-cp312-cp312-win_arm64.whl", hash = "sha256:9b4de86c8d909aed15b7011182c8cab38c8850de36e6afb1f0db22b8959e3091"},
-    {file = "pywin32-308-cp313-cp313-win32.whl", hash = "sha256:1c44539a37a5b7b21d02ab34e6a4d314e0788f1690d65b48e9b0b89f31abbbed"},
-    {file = "pywin32-308-cp313-cp313-win_amd64.whl", hash = "sha256:fd380990e792eaf6827fcb7e187b2b4b1cede0585e3d0c9e84201ec27b9905e4"},
-    {file = "pywin32-308-cp313-cp313-win_arm64.whl", hash = "sha256:ef313c46d4c18dfb82a2431e3051ac8f112ccee1a34f29c263c583c568db63cd"},
-    {file = "pywin32-308-cp37-cp37m-win32.whl", hash = "sha256:1f696ab352a2ddd63bd07430080dd598e6369152ea13a25ebcdd2f503a38f1ff"},
-    {file = "pywin32-308-cp37-cp37m-win_amd64.whl", hash = "sha256:13dcb914ed4347019fbec6697a01a0aec61019c1046c2b905410d197856326a6"},
-    {file = "pywin32-308-cp38-cp38-win32.whl", hash = "sha256:5794e764ebcabf4ff08c555b31bd348c9025929371763b2183172ff4708152f0"},
-    {file = "pywin32-308-cp38-cp38-win_amd64.whl", hash = "sha256:3b92622e29d651c6b783e368ba7d6722b1634b8e70bd376fd7610fe1992e19de"},
-    {file = "pywin32-308-cp39-cp39-win32.whl", hash = "sha256:7873ca4dc60ab3287919881a7d4f88baee4a6e639aa6962de25a98ba6b193341"},
-    {file = "pywin32-308-cp39-cp39-win_amd64.whl", hash = "sha256:71b3322d949b4cc20776436a9c9ba0eeedcbc9c650daa536df63f0ff111bb920"},
+    {file = "pywin32-307-cp310-cp310-win32.whl", hash = "sha256:f8f25d893c1e1ce2d685ef6d0a481e87c6f510d0f3f117932781f412e0eba31b"},
+    {file = "pywin32-307-cp310-cp310-win_amd64.whl", hash = "sha256:36e650c5e5e6b29b5d317385b02d20803ddbac5d1031e1f88d20d76676dd103d"},
+    {file = "pywin32-307-cp310-cp310-win_arm64.whl", hash = "sha256:0c12d61e0274e0c62acee79e3e503c312426ddd0e8d4899c626cddc1cafe0ff4"},
+    {file = "pywin32-307-cp311-cp311-win32.whl", hash = "sha256:fec5d27cc893178fab299de911b8e4d12c5954e1baf83e8a664311e56a272b75"},
+    {file = "pywin32-307-cp311-cp311-win_amd64.whl", hash = "sha256:987a86971753ed7fdd52a7fb5747aba955b2c7fbbc3d8b76ec850358c1cc28c3"},
+    {file = "pywin32-307-cp311-cp311-win_arm64.whl", hash = "sha256:fd436897c186a2e693cd0437386ed79f989f4d13d6f353f8787ecbb0ae719398"},
+    {file = "pywin32-307-cp312-cp312-win32.whl", hash = "sha256:07649ec6b01712f36debf39fc94f3d696a46579e852f60157a729ac039df0815"},
+    {file = "pywin32-307-cp312-cp312-win_amd64.whl", hash = "sha256:00d047992bb5dcf79f8b9b7c81f72e0130f9fe4b22df613f755ab1cc021d8347"},
+    {file = "pywin32-307-cp312-cp312-win_arm64.whl", hash = "sha256:b53658acbfc6a8241d72cc09e9d1d666be4e6c99376bc59e26cdb6223c4554d2"},
+    {file = "pywin32-307-cp313-cp313-win32.whl", hash = "sha256:ea4d56e48dc1ab2aa0a5e3c0741ad6e926529510516db7a3b6981a1ae74405e5"},
+    {file = "pywin32-307-cp313-cp313-win_amd64.whl", hash = "sha256:576d09813eaf4c8168d0bfd66fb7cb3b15a61041cf41598c2db4a4583bf832d2"},
+    {file = "pywin32-307-cp313-cp313-win_arm64.whl", hash = "sha256:b30c9bdbffda6a260beb2919f918daced23d32c79109412c2085cbc513338a0a"},
+    {file = "pywin32-307-cp37-cp37m-win32.whl", hash = "sha256:5101472f5180c647d4525a0ed289ec723a26231550dbfd369ec19d5faf60e511"},
+    {file = "pywin32-307-cp37-cp37m-win_amd64.whl", hash = "sha256:05de55a7c110478dc4b202230e98af5e0720855360d2b31a44bb4e296d795fba"},
+    {file = "pywin32-307-cp38-cp38-win32.whl", hash = "sha256:13d059fb7f10792542082f5731d5d3d9645320fc38814759313e5ee97c3fac01"},
+    {file = "pywin32-307-cp38-cp38-win_amd64.whl", hash = "sha256:7e0b2f93769d450a98ac7a31a087e07b126b6d571e8b4386a5762eb85325270b"},
+    {file = "pywin32-307-cp39-cp39-win32.whl", hash = "sha256:55ee87f2f8c294e72ad9d4261ca423022310a6e79fb314a8ca76ab3f493854c6"},
+    {file = "pywin32-307-cp39-cp39-win_amd64.whl", hash = "sha256:e9d5202922e74985b037c9ef46778335c102b74b95cec70f629453dbe7235d87"},
 ]

 [[package]]
@ -6066,51 +6059,57 @@ test = ["array-api-strict", "asv", "gmpy2", "hypothesis (>=6.30)", "mpmath", "po

 [[package]]
 name = "scipy"
-version = "1.15.1"
+version = "1.15.2"
 description = "Fundamental algorithms for scientific computing in Python"
 optional = false
 python-versions = ">=3.10"
 files = [
-    {file = "scipy-1.15.1-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:c64ded12dcab08afff9e805a67ff4480f5e69993310e093434b10e85dc9d43e1"},
-    {file = "scipy-1.15.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:5b190b935e7db569960b48840e5bef71dc513314cc4e79a1b7d14664f57fd4ff"},
-    {file = "scipy-1.15.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:4b17d4220df99bacb63065c76b0d1126d82bbf00167d1730019d2a30d6ae01ea"},
-    {file = "scipy-1.15.1-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:63b9b6cd0333d0eb1a49de6f834e8aeaefe438df8f6372352084535ad095219e"},
-    {file = "scipy-1.15.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9f151e9fb60fbf8e52426132f473221a49362091ce7a5e72f8aa41f8e0da4f25"},
-    {file = "scipy-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21e10b1dd56ce92fba3e786007322542361984f8463c6d37f6f25935a5a6ef52"},
-    {file = "scipy-1.15.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:5dff14e75cdbcf07cdaa1c7707db6017d130f0af9ac41f6ce443a93318d6c6e0"},
-    {file = "scipy-1.15.1-cp310-cp310-win_amd64.whl", hash = "sha256:f82fcf4e5b377f819542fbc8541f7b5fbcf1c0017d0df0bc22c781bf60abc4d8"},
-    {file = "scipy-1.15.1-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:5bd8d27d44e2c13d0c1124e6a556454f52cd3f704742985f6b09e75e163d20d2"},
-    {file = "scipy-1.15.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:be3deeb32844c27599347faa077b359584ba96664c5c79d71a354b80a0ad0ce0"},
-    {file = "scipy-1.15.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:5eb0ca35d4b08e95da99a9f9c400dc9f6c21c424298a0ba876fdc69c7afacedf"},
-    {file = "scipy-1.15.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:74bb864ff7640dea310a1377d8567dc2cb7599c26a79ca852fc184cc851954ac"},
-    {file = "scipy-1.15.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:667f950bf8b7c3a23b4199db24cb9bf7512e27e86d0e3813f015b74ec2c6e3df"},
-    {file = "scipy-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:395be70220d1189756068b3173853029a013d8c8dd5fd3d1361d505b2aa58fa7"},
-    {file = "scipy-1.15.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ce3a000cd28b4430426db2ca44d96636f701ed12e2b3ca1f2b1dd7abdd84b39a"},
-    {file = "scipy-1.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:3fe1d95944f9cf6ba77aa28b82dd6bb2a5b52f2026beb39ecf05304b8392864b"},
-    {file = "scipy-1.15.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c09aa9d90f3500ea4c9b393ee96f96b0ccb27f2f350d09a47f533293c78ea776"},
-    {file = "scipy-1.15.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:0ac102ce99934b162914b1e4a6b94ca7da0f4058b6d6fd65b0cef330c0f3346f"},
-    {file = "scipy-1.15.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:09c52320c42d7f5c7748b69e9f0389266fd4f82cf34c38485c14ee976cb8cb04"},
-    {file = "scipy-1.15.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:cdde8414154054763b42b74fe8ce89d7f3d17a7ac5dd77204f0e142cdc9239e9"},
-    {file = "scipy-1.15.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4c9d8fc81d6a3b6844235e6fd175ee1d4c060163905a2becce8e74cb0d7554ce"},
-    {file = "scipy-1.15.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fb57b30f0017d4afa5fe5f5b150b8f807618819287c21cbe51130de7ccdaed2"},
-    {file = "scipy-1.15.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:491d57fe89927fa1aafbe260f4cfa5ffa20ab9f1435025045a5315006a91b8f5"},
-    {file = "scipy-1.15.1-cp312-cp312-win_amd64.whl", hash = "sha256:900f3fa3db87257510f011c292a5779eb627043dd89731b9c461cd16ef76ab3d"},
-    {file = "scipy-1.15.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:100193bb72fbff37dbd0bf14322314fc7cbe08b7ff3137f11a34d06dc0ee6b85"},
-    {file = "scipy-1.15.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:2114a08daec64980e4b4cbdf5bee90935af66d750146b1d2feb0d3ac30613692"},
-    {file = "scipy-1.15.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:6b3e71893c6687fc5e29208d518900c24ea372a862854c9888368c0b267387ab"},
-    {file = "scipy-1.15.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:837299eec3d19b7e042923448d17d95a86e43941104d33f00da7e31a0f715d3c"},
-    {file = "scipy-1.15.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82add84e8a9fb12af5c2c1a3a3f1cb51849d27a580cb9e6bd66226195142be6e"},
-    {file = "scipy-1.15.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:070d10654f0cb6abd295bc96c12656f948e623ec5f9a4eab0ddb1466c000716e"},
-    {file = "scipy-1.15.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:55cc79ce4085c702ac31e49b1e69b27ef41111f22beafb9b49fea67142b696c4"},
-    {file = "scipy-1.15.1-cp313-cp313-win_amd64.whl", hash = "sha256:c352c1b6d7cac452534517e022f8f7b8d139cd9f27e6fbd9f3cbd0bfd39f5bef"},
-    {file = "scipy-1.15.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0458839c9f873062db69a03de9a9765ae2e694352c76a16be44f93ea45c28d2b"},
-    {file = "scipy-1.15.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:af0b61c1de46d0565b4b39c6417373304c1d4f5220004058bdad3061c9fa8a95"},
-    {file = "scipy-1.15.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:71ba9a76c2390eca6e359be81a3e879614af3a71dfdabb96d1d7ab33da6f2364"},
-    {file = "scipy-1.15.1-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:14eaa373c89eaf553be73c3affb11ec6c37493b7eaaf31cf9ac5dffae700c2e0"},
-    {file = "scipy-1.15.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f735bc41bd1c792c96bc426dece66c8723283695f02df61dcc4d0a707a42fc54"},
-    {file = "scipy-1.15.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:2722a021a7929d21168830790202a75dbb20b468a8133c74a2c0230c72626b6c"},
-    {file = "scipy-1.15.1-cp313-cp313t-win_amd64.whl", hash = "sha256:bc7136626261ac1ed988dca56cfc4ab5180f75e0ee52e58f1e6aa74b5f3eacd5"},
-    {file = "scipy-1.15.1.tar.gz", hash = "sha256:033a75ddad1463970c96a88063a1df87ccfddd526437136b6ee81ff0312ebdf6"},
+    {file = "scipy-1.15.2-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:a2ec871edaa863e8213ea5df811cd600734f6400b4af272e1c011e69401218e9"},
+    {file = "scipy-1.15.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:6f223753c6ea76983af380787611ae1291e3ceb23917393079dcc746ba60cfb5"},
+    {file = "scipy-1.15.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:ecf797d2d798cf7c838c6d98321061eb3e72a74710e6c40540f0e8087e3b499e"},
+    {file = "scipy-1.15.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:9b18aa747da280664642997e65aab1dd19d0c3d17068a04b3fe34e2559196cb9"},
+    {file = "scipy-1.15.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87994da02e73549dfecaed9e09a4f9d58a045a053865679aeb8d6d43747d4df3"},
+    {file = "scipy-1.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69ea6e56d00977f355c0f84eba69877b6df084516c602d93a33812aa04d90a3d"},
+    {file = "scipy-1.15.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:888307125ea0c4466287191e5606a2c910963405ce9671448ff9c81c53f85f58"},
+    {file = "scipy-1.15.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:9412f5e408b397ff5641080ed1e798623dbe1ec0d78e72c9eca8992976fa65aa"},
+    {file = "scipy-1.15.2-cp310-cp310-win_amd64.whl", hash = "sha256:b5e025e903b4f166ea03b109bb241355b9c42c279ea694d8864d033727205e65"},
+    {file = "scipy-1.15.2-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:92233b2df6938147be6fa8824b8136f29a18f016ecde986666be5f4d686a91a4"},
+    {file = "scipy-1.15.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:62ca1ff3eb513e09ed17a5736929429189adf16d2d740f44e53270cc800ecff1"},
+    {file = "scipy-1.15.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:4c6676490ad76d1c2894d77f976144b41bd1a4052107902238047fb6a473e971"},
+    {file = "scipy-1.15.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:a8bf5cb4a25046ac61d38f8d3c3426ec11ebc350246a4642f2f315fe95bda655"},
+    {file = "scipy-1.15.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a8e34cf4c188b6dd004654f88586d78f95639e48a25dfae9c5e34a6dc34547e"},
+    {file = "scipy-1.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28a0d2c2075946346e4408b211240764759e0fabaeb08d871639b5f3b1aca8a0"},
+    {file = "scipy-1.15.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:42dabaaa798e987c425ed76062794e93a243be8f0f20fff6e7a89f4d61cb3d40"},
+    {file = "scipy-1.15.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6f5e296ec63c5da6ba6fa0343ea73fd51b8b3e1a300b0a8cae3ed4b1122c7462"},
+    {file = "scipy-1.15.2-cp311-cp311-win_amd64.whl", hash = "sha256:597a0c7008b21c035831c39927406c6181bcf8f60a73f36219b69d010aa04737"},
+    {file = "scipy-1.15.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c4697a10da8f8765bb7c83e24a470da5797e37041edfd77fd95ba3811a47c4fd"},
+    {file = "scipy-1.15.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:869269b767d5ee7ea6991ed7e22b3ca1f22de73ab9a49c44bad338b725603301"},
+    {file = "scipy-1.15.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:bad78d580270a4d32470563ea86c6590b465cb98f83d760ff5b0990cb5518a93"},
+    {file = "scipy-1.15.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:b09ae80010f52efddb15551025f9016c910296cf70adbf03ce2a8704f3a5ad20"},
+    {file = "scipy-1.15.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a6fd6eac1ce74a9f77a7fc724080d507c5812d61e72bd5e4c489b042455865e"},
+    {file = "scipy-1.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2b871df1fe1a3ba85d90e22742b93584f8d2b8e6124f8372ab15c71b73e428b8"},
+    {file = "scipy-1.15.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:03205d57a28e18dfd39f0377d5002725bf1f19a46f444108c29bdb246b6c8a11"},
+    {file = "scipy-1.15.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:601881dfb761311045b03114c5fe718a12634e5608c3b403737ae463c9885d53"},
+    {file = "scipy-1.15.2-cp312-cp312-win_amd64.whl", hash = "sha256:e7c68b6a43259ba0aab737237876e5c2c549a031ddb7abc28c7b47f22e202ded"},
+    {file = "scipy-1.15.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:01edfac9f0798ad6b46d9c4c9ca0e0ad23dbf0b1eb70e96adb9fa7f525eff0bf"},
+    {file = "scipy-1.15.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:08b57a9336b8e79b305a143c3655cc5bdbe6d5ece3378578888d2afbb51c4e37"},
+    {file = "scipy-1.15.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:54c462098484e7466362a9f1672d20888f724911a74c22ae35b61f9c5919183d"},
+    {file = "scipy-1.15.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:cf72ff559a53a6a6d77bd8eefd12a17995ffa44ad86c77a5df96f533d4e6c6bb"},
+    {file = "scipy-1.15.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9de9d1416b3d9e7df9923ab23cd2fe714244af10b763975bea9e4f2e81cebd27"},
+    {file = "scipy-1.15.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb530e4794fc8ea76a4a21ccb67dea33e5e0e60f07fc38a49e821e1eae3b71a0"},
+    {file = "scipy-1.15.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5ea7ed46d437fc52350b028b1d44e002646e28f3e8ddc714011aaf87330f2f32"},
+    {file = "scipy-1.15.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:11e7ad32cf184b74380f43d3c0a706f49358b904fa7d5345f16ddf993609184d"},
+    {file = "scipy-1.15.2-cp313-cp313-win_amd64.whl", hash = "sha256:a5080a79dfb9b78b768cebf3c9dcbc7b665c5875793569f48bf0e2b1d7f68f6f"},
+    {file = "scipy-1.15.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:447ce30cee6a9d5d1379087c9e474628dab3db4a67484be1b7dc3196bfb2fac9"},
+    {file = "scipy-1.15.2-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:c90ebe8aaa4397eaefa8455a8182b164a6cc1d59ad53f79943f266d99f68687f"},
+    {file = "scipy-1.15.2-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:def751dd08243934c884a3221156d63e15234a3155cf25978b0a668409d45eb6"},
+    {file = "scipy-1.15.2-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:302093e7dfb120e55515936cb55618ee0b895f8bcaf18ff81eca086c17bd80af"},
+    {file = "scipy-1.15.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7cd5b77413e1855351cdde594eca99c1f4a588c2d63711388b6a1f1c01f62274"},
+    {file = "scipy-1.15.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d0194c37037707b2afa7a2f2a924cf7bac3dc292d51b6a925e5fcb89bc5c776"},
+    {file = "scipy-1.15.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:bae43364d600fdc3ac327db99659dcb79e6e7ecd279a75fe1266669d9a652828"},
+    {file = "scipy-1.15.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f031846580d9acccd0044efd1a90e6f4df3a6e12b4b6bd694a7bc03a89892b28"},
+    {file = "scipy-1.15.2-cp313-cp313t-win_amd64.whl", hash = "sha256:fe8a9eb875d430d81755472c5ba75e84acc980e4a8f6204d402849234d3017db"},
+    {file = "scipy-1.15.2.tar.gz", hash = "sha256:cd58a314d92838f7e6f755c8a2167ead4f27e1fd5c1251fd54289569ef3495ec"},
 ]

 [package.dependencies]
@ -6406,37 +6405,37 @@ test = ["pytest", "tornado (>=4.5)", "typeguard"]

 [[package]]
 name = "tesserocr"
-version = "2.7.1"
+version = "2.8.0"
 description = "A simple, Pillow-friendly, Python wrapper around tesseract-ocr API using Cython"
 optional = true
 python-versions = "*"
 files = [
-    {file = "tesserocr-2.7.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1b8c4828f970af7bcfca83a1fb228aa68a2587299387bc875d0dfad8b6baf8ed"},
-    {file = "tesserocr-2.7.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3bb5d336ebf2cc47cd0d117cadc8b25b2e558f54fb9a2dedaa28a14cb5a6b437"},
-    {file = "tesserocr-2.7.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:3ff7f6d6b5c12dd31b80842eb0892b661a41ca3edf0e6cc1e54ec2c14552ceef"},
-    {file = "tesserocr-2.7.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:ae794c5434373f4afa4c7f8b59f19fde810f8caf096d8bb701a4b2f3a6739460"},
-    {file = "tesserocr-2.7.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0a0895a4d9ff6a34f5a6f203fe0c9899f31d6f2378ae99be80605637b622687b"},
-    {file = "tesserocr-2.7.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c3187d14b95c866aa1d34cc374a53d583e2168742eefe33347e4790af70338e"},
-    {file = "tesserocr-2.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ec52be3d82136430081427062ad0211a52fc38fa28fe58e216b89f840354f216"},
-    {file = "tesserocr-2.7.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:44e71b3e8da36b2567760309398689ea9785ee62db3ff21140a9ea6941a233c4"},
-    {file = "tesserocr-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e31a49d7784e7e52fe656719145c3a872856d67daa9bfb340c2990db00e023e9"},
-    {file = "tesserocr-2.7.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:37abde15c1c940d691305fd87836e4cad25a1434799729c324bbcd2277bcae44"},
-    {file = "tesserocr-2.7.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:1b6349d35d333d420d24acf1953ad6f1d5613ffcde462c62126b68bdfca12753"},
-    {file = "tesserocr-2.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:42f009cde8479f3b339da12a8e419fd9559b64b13bc08a248bd0833c6ae94331"},
-    {file = "tesserocr-2.7.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6e13204b3b92fac76ece6e33f55eba6335b30e379f4a7b75e285c2ad05762027"},
-    {file = "tesserocr-2.7.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:65afdec0c5dc09a4a23a62e65524989cd940af41be1603e251a64ac10de9babf"},
-    {file = "tesserocr-2.7.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4c5f59fb072c90bff8aa6a365fc82b747c2668b7b48233901728b155860d1ff9"},
-    {file = "tesserocr-2.7.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f62d662e3002868384e14e8cd620bdedf34ab9f9fc3ebbce527cfe032a7485ee"},
-    {file = "tesserocr-2.7.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e80051812685bd521bc17cb70cf1480ffbb3e54ccc2883e90d5bcda15f8278ea"},
-    {file = "tesserocr-2.7.1-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:2690cb2330fc9349d68ff027cbdac09693fdda36470836b196c04f16dcc99e9d"},
-    {file = "tesserocr-2.7.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d01ebd094103451ecb77b6510ade2f6bb064c51413ff35b135f649f3d6067a67"},
-    {file = "tesserocr-2.7.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f8069ae6cd9ea3c056b6a596bc99f501ee9f95d6fd2928fcaffb9777071c210d"},
-    {file = "tesserocr-2.7.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2d3d23223d0a448877fb91af83c46ce95ff0a497a82fa93e93068148c9712e5"},
-    {file = "tesserocr-2.7.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef8a09a44c2e96bab0f40dbf0633767d063680d86b79365b43fc4e1234219694"},
-    {file = "tesserocr-2.7.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:6e613213ea5b64db06f2cba0b93c3656b7e6aec2d9b2d2e929edf49da7143225"},
-    {file = "tesserocr-2.7.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:4a8888b765e26680a6e34b8ec09b7bb85a17e08cea76f0661eafe2a84254562a"},
-    {file = "tesserocr-2.7.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:64f25763e56c4c29b808e59b485c930cac46b6a1ac8eadd994086dc40a29d3a1"},
-    {file = "tesserocr-2.7.1.tar.gz", hash = "sha256:3744c5c8bbabf18172849c7731be00dc2e5e44f8c556d37c850e788794ae0af4"},
+    {file = "tesserocr-2.8.0-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:b5d5dcabe688bf7bb76f87eef05783aa1d305c9566b7f6f6735a12f224ca379b"},
+    {file = "tesserocr-2.8.0-cp310-cp310-macosx_15_0_arm64.whl", hash = "sha256:55d0e018d34054fa7f875cd126abaf423de4069fde49d638a399de530949055b"},
+    {file = "tesserocr-2.8.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:ad52bb2b1d48b7db6fed379a6805c2437432374fab98b0ab5071ff3fc81efaf2"},
+    {file = "tesserocr-2.8.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:4ac659c3207fd3c0e43081a51e486e3d42259abd20bbaed6cd2ee4cd332a78c0"},
+    {file = "tesserocr-2.8.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c47c69177e948f567f818dec308717a679bdd3941fd5d3fc6cd9ecf93fe165a4"},
+    {file = "tesserocr-2.8.0-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:88876546ddadc9590800df5dec7f2acbd35a423f0803ca2f17a93567aabbd877"},
+    {file = "tesserocr-2.8.0-cp311-cp311-macosx_15_0_arm64.whl", hash = "sha256:09d8c55838a0085662d2a07a40843a6bbbd6baf44b45eda01df307cdac17089c"},
+    {file = "tesserocr-2.8.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:e89b4928eefcea953ad70ed03fb344568d1a574347d1f0d18699d01a020a7c7e"},
+    {file = "tesserocr-2.8.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:4636a86269e97d60731a1edd16d29cb2c79a28cc91594d7f0af31ee65f72f4ae"},
+    {file = "tesserocr-2.8.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9dbe02605da205ce253524c4ca681a519a55258906ff8ca585f9df7bb1e78616"},
+    {file = "tesserocr-2.8.0-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:7a0b03d46a0ad2265b83f461ca305a6e5aaac2626853a82012c6198bb4105d66"},
+    {file = "tesserocr-2.8.0-cp312-cp312-macosx_15_0_arm64.whl", hash = "sha256:c9acde3d66d6ef40f95e4cef424b24acbf90e278396827fc064915c665c6548d"},
+    {file = "tesserocr-2.8.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:426dfff81bae757faa25477feaf783f6f5bcdb94ae6a95f4fe24eda97f4825c0"},
+    {file = "tesserocr-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:7cb74e1ce1bc038a5cc6db90e5a79cb55d6db1b7e6fe7a0d9eb30475fdfd9036"},
+    {file = "tesserocr-2.8.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:9ad1a2900424994ca5caa2470be04bd1c6ee3f0674b0050a34b556f6ba7d2ed5"},
+    {file = "tesserocr-2.8.0-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:44b3396d52379155fd838931b78b044129c7c77a8f02a92574cde626cff9b4a8"},
+    {file = "tesserocr-2.8.0-cp313-cp313-macosx_15_0_arm64.whl", hash = "sha256:1edd2302f4a91b5491a4ce3f63e612441adf92fd81b339b85cbedb3b5b40f206"},
+    {file = "tesserocr-2.8.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:b0dd849ce77373f9ac4b54d345b4d7115414e525e57a158e948887d744c6f909"},
+    {file = "tesserocr-2.8.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:9ce710a73308964f2ac53f94b4980d2791bb67a82863bb7ef0ca445c1b325aa4"},
+    {file = "tesserocr-2.8.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a7a36af39aaf29a152c629cf62457192944f8854fbdd28395ef92d283e800662"},
+    {file = "tesserocr-2.8.0-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:f83344e350062d7db8625aa21695d34949a25e1f144788996a0e1e91dc53ca45"},
+    {file = "tesserocr-2.8.0-cp39-cp39-macosx_15_0_arm64.whl", hash = "sha256:10fa0125d57c9edc93a7f35673f6b977e0fc0deb123d62b158c93fd8ca4c1c2c"},
+    {file = "tesserocr-2.8.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:efef77ed8702d56a3dc7ba5dba37ce13beecd24128042ad41cbc20c50bb5e23e"},
+    {file = "tesserocr-2.8.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:b41a78eaa35c90d61facd07dca96443e7dc1f0604ae955843be916e2f9a225af"},
+    {file = "tesserocr-2.8.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:317931096378a1dd056500d9c3a489aa0e4546e4d7792a6ffa1a31c0902ab365"},
+    {file = "tesserocr-2.8.0.tar.gz", hash = "sha256:be518d1b1b5ff54c11aada1e0fd12942509ea70581e0a8b39a2a473a0b2dbd36"},
 ]

 [[package]]
@ -7349,13 +7348,13 @@ zstd = ["zstandard (>=0.18.0)"]

 [[package]]
 name = "virtualenv"
-version = "20.29.1"
+version = "20.29.2"
 description = "Virtual Python Environment builder"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "virtualenv-20.29.1-py3-none-any.whl", hash = "sha256:4e4cb403c0b0da39e13b46b1b2476e505cb0046b25f242bee80f62bf990b2779"},
-    {file = "virtualenv-20.29.1.tar.gz", hash = "sha256:b8b8970138d32fb606192cb97f6cd4bb644fa486be9308fb9b63f81091b5dc35"},
+    {file = "virtualenv-20.29.2-py3-none-any.whl", hash = "sha256:febddfc3d1ea571bdb1dc0f98d7b45d24def7428214d4fb73cc486c9568cce6a"},
+    {file = "virtualenv-20.29.2.tar.gz", hash = "sha256:fdaabebf6d03b5ba83ae0a02cfe96f48a716f4fae556461d180825866f75b728"},
 ]

 [package.dependencies]
@ -7811,4 +7810,4 @@ vlm = ["transformers", "transformers"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "19ee67c2a10b5d377e6292699fcf0fb6ff351996a197d6fb747b1471ad7ab7da"
+content-hash = "b19c39233b5c7ca2a4feed4886542395492ed43f4957f9c6f097b03e8d5b6148"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "2.21.0"  # DO NOT EDIT, updated automatically
+version = "2.23.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"
@ -26,7 +26,7 @@ packages = [{include = "docling"}]
 ######################
 python = "^3.9"
 pydantic = "^2.0.0"
-docling-core = {extras = ["chunking"], version = "^2.18.0"}
+docling-core = {extras = ["chunking"], version = "^2.19.0"}
 docling-ibm-models = {git = "https://github.com/DS4SD/docling-ibm-models.git", rev = "dev/add-reading-order"}
 docling-parse = "^3.3.0"
 filetype = "^1.2.0"
@ -62,7 +62,7 @@ transformers = [
  {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true },
  {markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true }
 ]
-pillow = "^10.0.0"
+pillow = ">=10.0.0,<12.0.0"
 tqdm = "^4.65.0"

 [tool.poetry.group.dev.dependencies]
--- a/tests/data/csv/csv-comma-in-cell.csv
+++ b/tests/data/csv/csv-comma-in-cell.csv
@ -0,0 +1,5 @@
+1,2,3,4
+a,b,c,d
+a,",",c,d
+a,b,c,d
+a,b,c,d
--- a/tests/data/csv/csv-comma.csv
+++ b/tests/data/csv/csv-comma.csv
@ -0,0 +1,6 @@
+Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website
+1,DD37Cf93aecA6Dc,Sheryl,Baxter,Rasmussen Group,East Leonard,Chile,229.077.5154,397.884.0519x718,zunigavanessa@smith.info,2020-08-24,http://www.stephenson.com/
+2,1Ef7b82A4CAAD10,Preston,"Lozano, Dr",Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/
+3,6F94879bDAfE5a6,Roy,Berry,Murillo-Perry,Isabelborough,Antigua and Barbuda,+1-539-402-0259,(496)978-3969x58947,beckycarr@hogan.com,2020-03-25,http://www.lawrence.com/
+4,5Cef8BFA16c5e3c,Linda,Olsen,"Dominguez, Mcmillan and Donovan",Bensonview,Dominican Republic,001-808-617-6467x12895,+1-813-324-8756,stanleyblackwell@benson.org,2020-06-02,http://www.good-lyons.com/
+5,053d585Ab6b3159,Joanna,Bender,"Martin, Lang and Andrade",West Priscilla,Slovakia (Slovak Republic),001-234-203-0635x76146,001-199-446-3860x3486,colinalvarado@miles.net,2021-04-17,https://goodwin-ingram.com/
--- a/tests/data/csv/csv-inconsistent-header.csv
+++ b/tests/data/csv/csv-inconsistent-header.csv
@ -0,0 +1,5 @@
+1,2,3
+a,b,c,d
+a,b,c,d
+a,b,c,d
+a,b,c,d
--- a/tests/data/csv/csv-pipe.csv
+++ b/tests/data/csv/csv-pipe.csv
@ -0,0 +1,6 @@
+Index|Customer Id|First Name|Last Name|Company|City|Country|Phone 1|Phone 2|Email|Subscription Date|Website
+1|DD37Cf93aecA6Dc|Sheryl|Baxter|Rasmussen Group|East Leonard|Chile|229.077.5154|397.884.0519x718|zunigavanessa@smith.info|2020-08-24|http://www.stephenson.com/
+2|1Ef7b82A4CAAD10|Preston|Lozano|Vega-Gentry|East Jimmychester|Djibouti|5153435776|686-620-1820x944|vmata@colon.com|2021-04-23|http://www.hobbs.com/
+3|6F94879bDAfE5a6|Roy|Berry|Murillo-Perry|Isabelborough|Antigua and Barbuda|+1-539-402-0259|(496)978-3969x58947|beckycarr@hogan.com|2020-03-25|http://www.lawrence.com/
+4|5Cef8BFA16c5e3c|Linda|Olsen|"Dominguez|Mcmillan and Donovan"|Bensonview|Dominican Republic|001-808-617-6467x12895|+1-813-324-8756|stanleyblackwell@benson.org|2020-06-02|http://www.good-lyons.com/
+5|053d585Ab6b3159|Joanna|Bender|"Martin|Lang and Andrade"|West Priscilla|Slovakia (Slovak Republic)|001-234-203-0635x76146|001-199-446-3860x3486|colinalvarado@miles.net|2021-04-17|https://goodwin-ingram.com/
--- a/tests/data/csv/csv-semicolon.csv
+++ b/tests/data/csv/csv-semicolon.csv
@ -0,0 +1,6 @@
+Index;Customer Id;First Name;Last Name;Company;City;Country;Phone 1;Phone 2;Email;Subscription Date;Website
+1;DD37Cf93aecA6Dc;Sheryl;Baxter;Rasmussen Group;East Leonard;Chile;229.077.5154;397.884.0519x718;zunigavanessa@smith.info;2020-08-24;http://www.stephenson.com/
+2;1Ef7b82A4CAAD10;Preston;Lozano;Vega-Gentry;East Jimmychester;Djibouti;5153435776;686-620-1820x944;vmata@colon.com;2021-04-23;http://www.hobbs.com/
+3;6F94879bDAfE5a6;Roy;Berry;Murillo-Perry;Isabelborough;Antigua and Barbuda;+1-539-402-0259;(496)978-3969x58947;beckycarr@hogan.com;2020-03-25;http://www.lawrence.com/
+4;5Cef8BFA16c5e3c;Linda;Olsen;"Dominguez;Mcmillan and Donovan";Bensonview;Dominican Republic;001-808-617-6467x12895;+1-813-324-8756;stanleyblackwell@benson.org;2020-06-02;http://www.good-lyons.com/
+5;053d585Ab6b3159;Joanna;Bender;"Martin;Lang and Andrade";West Priscilla;Slovakia (Slovak Republic);001-234-203-0635x76146;001-199-446-3860x3486;colinalvarado@miles.net;2021-04-17;https://goodwin-ingram.com/
--- a/tests/data/csv/csv-tab.csv
+++ b/tests/data/csv/csv-tab.csv
@ -0,0 +1,6 @@
+Index	Customer Id	First Name	Last Name	Company	City	Country	Phone 1	Phone 2	Email	Subscription Date	Website
+1	DD37Cf93aecA6Dc	Sheryl	Baxter	Rasmussen Group	East Leonard	Chile	229.077.5154	397.884.0519x718	zunigavanessa@smith.info	2020-08-24	http://www.stephenson.com/
+2	1Ef7b82A4CAAD10	Preston	Lozano	Vega-Gentry	East Jimmychester	Djibouti	5153435776	686-620-1820x944	vmata@colon.com	2021-04-23	http://www.hobbs.com/
+3	6F94879bDAfE5a6	Roy	Berry	Murillo-Perry	Isabelborough	Antigua and Barbuda	+1-539-402-0259	(496)978-3969x58947	beckycarr@hogan.com	2020-03-25	http://www.lawrence.com/
+4	5Cef8BFA16c5e3c	Linda	Olsen	"Dominguez	Mcmillan and Donovan"	Bensonview	Dominican Republic	001-808-617-6467x12895	+1-813-324-8756	stanleyblackwell@benson.org	2020-06-02	http://www.good-lyons.com/
+5	053d585Ab6b3159	Joanna	Bender	"Martin	Lang and Andrade"	West Priscilla	Slovakia (Slovak Republic)	001-234-203-0635x76146	001-199-446-3860x3486	colinalvarado@miles.net	2021-04-17	https://goodwin-ingram.com/
--- a/tests/data/csv/csv-too-few-columns.csv
+++ b/tests/data/csv/csv-too-few-columns.csv
@ -0,0 +1,5 @@
+1,2,3,4
+a,'b',c,d
+a,b,c
+a,b,c,d
+a,b,c,d
--- a/tests/data/csv/csv-too-many-columns.csv
+++ b/tests/data/csv/csv-too-many-columns.csv
@ -0,0 +1,5 @@
+1,2,3,4
+a,b,c,d
+a,b,c,d,e
+a,b,c,d
+a,b,c,d
--- a/tests/data/groundtruth/docling_v1/2203.01017v2.json
+++ b/tests/data/groundtruth/docling_v1/2203.01017v2.json
--- a/tests/data/groundtruth/docling_v1/2206.01062.json
+++ b/tests/data/groundtruth/docling_v1/2206.01062.json
--- a/tests/data/groundtruth/docling_v1/2305.03393v1.json
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1.json
--- a/tests/data/groundtruth/docling_v1/code_and_formula.json
+++ b/tests/data/groundtruth/docling_v1/code_and_formula.json
--- a/tests/data/groundtruth/docling_v1/picture_classification.json
+++ b/tests/data/groundtruth/docling_v1/picture_classification.json
--- a/tests/data/groundtruth/docling_v1/redp5110_sampled.json
+++ b/tests/data/groundtruth/docling_v1/redp5110_sampled.json
--- a/tests/data/groundtruth/docling_v1/right_to_left_01.pages.json
+++ b/tests/data/groundtruth/docling_v1/right_to_left_01.pages.json
--- a/tests/data/groundtruth/docling_v1/right_to_left_02.pages.json
+++ b/tests/data/groundtruth/docling_v1/right_to_left_02.pages.json
--- a/tests/data/groundtruth/docling_v1/right_to_left_03.json
+++ b/tests/data/groundtruth/docling_v1/right_to_left_03.json
--- a/tests/data/groundtruth/docling_v1/right_to_left_03.pages.json
+++ b/tests/data/groundtruth/docling_v1/right_to_left_03.pages.json
--- a/tests/data/groundtruth/docling_v2/2203.01017v2.doctags.txt
+++ b/tests/data/groundtruth/docling_v2/2203.01017v2.doctags.txt
@ -1,465 +1,282 @@
-<document>
-<section_header_level_1><location><page_1><loc_16><loc_85><loc_82><loc_86></location>TableFormer: Table Structure Understanding with Transformers.</section_header_level_1>
-<section_header_level_1><location><page_1><loc_23><loc_78><loc_74><loc_81></location>Ahmed Nassar, Nikolaos Livathinos, Maksym Lysak, Peter Staar IBM Research</section_header_level_1>
-<text><location><page_1><loc_34><loc_77><loc_62><loc_78></location>{ ahn,nli,mly,taa } @zurich.ibm.com</text>
-<section_header_level_1><location><page_1><loc_24><loc_71><loc_31><loc_73></location>Abstract</section_header_level_1>
-<section_header_level_1><location><page_1><loc_52><loc_71><loc_67><loc_72></location>a. Picture of a table:</section_header_level_1>
-<section_header_level_1><location><page_1><loc_8><loc_30><loc_21><loc_32></location>1. Introduction</section_header_level_1>
-<text><location><page_1><loc_8><loc_10><loc_47><loc_29></location>The occurrence of tables in documents is ubiquitous. They often summarise quantitative or factual data, which is cumbersome to describe in verbose text but nevertheless extremely valuable. Unfortunately, this compact representation is often not easy to parse by machines. There are many implicit conventions used to obtain a compact table representation. For example, tables often have complex columnand row-headers in order to reduce duplicated cell content. Lines of different shapes and sizes are leveraged to separate content or indicate a tree structure. Additionally, tables can also have empty/missing table-entries or multi-row textual table-entries. Fig. 1 shows a table which presents all these issues.</text>
-<figure>
-<location><page_1><loc_52><loc_62><loc_88><loc_71></location>
-</figure>
-<table>
-<location><page_1><loc_52><loc_62><loc_88><loc_71></location>
-<caption>Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.</caption>
-<row_0><col_0><col_header>3</col_0><col_1><col_header>1</col_1></row_0>
-</table>
-<unordered_list>
-<list_item><location><page_1><loc_52><loc_58><loc_79><loc_60></location>b. Red-annotation of bounding boxes, Blue-predictions by TableFormer</list_item>
+<doctag><page_header><loc_15><loc_131><loc_30><loc_354>arXiv:2203.01017v2 [cs.CV] 11 Mar 2022</page_header>
+<section_header_level_1><loc_79><loc_68><loc_408><loc_76>TableFormer: Table Structure Understanding with Transformers.</section_header_level_1>
+<section_header_level_1><loc_116><loc_93><loc_370><loc_108>Ahmed Nassar, Nikolaos Livathinos, Maksym Lysak, Peter Staar IBM Research</section_header_level_1>
+<text><loc_170><loc_111><loc_309><loc_116>{ ahn,nli,mly,taa } @zurich.ibm.com</text>
+<section_header_level_1><loc_119><loc_136><loc_156><loc_143>Abstract</section_header_level_1>
+<section_header_level_1><loc_258><loc_138><loc_334><loc_143>a. Picture of a table:</section_header_level_1>
+<section_header_level_1><loc_41><loc_341><loc_104><loc_348>1. Introduction</section_header_level_1>
+<text><loc_41><loc_354><loc_234><loc_450>The occurrence of tables in documents is ubiquitous. They often summarise quantitative or factual data, which is cumbersome to describe in verbose text but nevertheless extremely valuable. Unfortunately, this compact representation is often not easy to parse by machines. There are many implicit conventions used to obtain a compact table representation. For example, tables often have complex columnand row-headers in order to reduce duplicated cell content. Lines of different shapes and sizes are leveraged to separate content or indicate a tree structure. Additionally, tables can also have empty/missing table-entries or multi-row textual table-entries. Fig. 1 shows a table which presents all these issues.</text>
+<picture><loc_258><loc_144><loc_439><loc_191></picture>
+<otsl><loc_258><loc_144><loc_439><loc_191><ched>3<ched>1<nl><caption><loc_41><loc_152><loc_234><loc_324>Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.</caption></otsl>
+<unordered_list><list_item><loc_258><loc_198><loc_397><loc_210>b. Red-annotation of bounding boxes, Blue-predictions by TableFormer</list_item>
 </unordered_list>
-<figure>
-<location><page_1><loc_51><loc_48><loc_88><loc_57></location>
-</figure>
-<unordered_list>
-<list_item><location><page_1><loc_52><loc_46><loc_80><loc_47></location>c. Structure predicted by TableFormer:</list_item>
+<picture><loc_257><loc_213><loc_441><loc_259></picture>
+<unordered_list><list_item><loc_258><loc_265><loc_401><loc_271>c. Structure predicted by TableFormer:</list_item>
 </unordered_list>
-<figure>
-<location><page_1><loc_52><loc_37><loc_88><loc_45></location>
-</figure>
-<table>
-<location><page_1><loc_52><loc_37><loc_88><loc_45></location>
-<caption>Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.</caption>
-<row_0><col_0><col_header>0</col_0><col_1><col_header>1</col_1><col_2><col_header>1</col_2><col_3><col_header>2 1</col_3><col_4><col_header>2 1</col_4><col_5><body></col_5></row_0>
-<row_1><col_0><body>3</col_0><col_1><body>4</col_1><col_2><body>5 3</col_2><col_3><body>6</col_3><col_4><body>7</col_4><col_5><body></col_5></row_1>
-<row_2><col_0><body>8</col_0><col_1><body>9</col_1><col_2><body>10</col_2><col_3><body>11</col_3><col_4><body>12</col_4><col_5><body>2</col_5></row_2>
-<row_3><col_0><body></col_0><col_1><body>13</col_1><col_2><body>14</col_2><col_3><body>15</col_3><col_4><body>16</col_4><col_5><body>2</col_5></row_3>
-<row_4><col_0><body></col_0><col_1><body>17</col_1><col_2><body>18</col_2><col_3><body>19</col_3><col_4><body>20</col_4><col_5><body>2</col_5></row_4>
-</table>
-<text><location><page_1><loc_50><loc_16><loc_89><loc_26></location>Recently, significant progress has been made with vision based approaches to extract tables in documents. For the sake of completeness, the issue of table extraction from documents is typically decomposed into two separate challenges, i.e. (1) finding the location of the table(s) on a document-page and (2) finding the structure of a given table in the document.</text>
-<text><location><page_1><loc_50><loc_10><loc_89><loc_16></location>The first problem is called table-location and has been previously addressed [30, 38, 19, 21, 23, 26, 8] with stateof-the-art object-detection networks (e.g. YOLO and later on Mask-RCNN [9]). For all practical purposes, it can be</text>
-<text><location><page_2><loc_8><loc_88><loc_47><loc_91></location>considered as a solved problem, given enough ground-truth data to train on.</text>
-<text><location><page_2><loc_8><loc_71><loc_47><loc_87></location>The second problem is called table-structure decomposition. The latter is a long standing problem in the community of document understanding [6, 4, 14]. Contrary to the table-location problem, there are no commonly used approaches that can easily be re-purposed to solve this problem. Lately, a set of new model-architectures has been proposed by the community to address table-structure decomposition [37, 36, 18, 20]. All these models have some weaknesses (see Sec. 2). The common denominator here is the reliance on textual features and/or the inability to provide the bounding box of each table-cell in the original image.</text>
-<text><location><page_2><loc_8><loc_53><loc_47><loc_71></location>In this paper, we want to address these weaknesses and present a robust table-structure decomposition algorithm. The design criteria for our model are the following. First, we want our algorithm to be language agnostic. In this way, we can obtain the structure of any table, irregardless of the language. Second, we want our algorithm to leverage as much data as possible from the original PDF document. For programmatic PDF documents, the text-cells can often be extracted much faster and with higher accuracy compared to OCR methods. Last but not least, we want to have a direct link between the table-cell and its bounding box in the image.</text>
-<text><location><page_2><loc_8><loc_45><loc_47><loc_53></location>To meet the design criteria listed above, we developed a new model called TableFormer and a synthetically generated table structure dataset called SynthTabNet $^{1}$. In particular, our contributions in this work can be summarised as follows:</text>
-<unordered_list>
-<list_item><location><page_2><loc_10><loc_38><loc_47><loc_44></location>· We propose TableFormer , a transformer based model that predicts tables structure and bounding boxes for the table content simultaneously in an end-to-end approach.</list_item>
-<list_item><location><page_2><loc_10><loc_31><loc_47><loc_37></location>· Across all benchmark datasets TableFormer significantly outperforms existing state-of-the-art metrics, while being much more efficient in training and inference to existing works.</list_item>
-<list_item><location><page_2><loc_10><loc_25><loc_47><loc_29></location>· We present SynthTabNet a synthetically generated dataset, with various appearance styles and complexity.</list_item>
-<list_item><location><page_2><loc_10><loc_19><loc_47><loc_24></location>· An augmented dataset based on PubTabNet [37], FinTabNet [36], and TableBank [17] with generated ground-truth for reproducibility.</list_item>
+<picture><loc_258><loc_274><loc_439><loc_313></picture>
+<otsl><loc_258><loc_274><loc_439><loc_313><ched>0<ched>1<lcel><ched>2 1<lcel><ecel><nl><fcel>3<fcel>4<fcel>5 3<fcel>6<fcel>7<ecel><nl><fcel>8<fcel>9<fcel>10<fcel>11<fcel>12<fcel>2<nl><ecel><fcel>13<fcel>14<fcel>15<fcel>16<ucel><nl><ecel><fcel>17<fcel>18<fcel>19<fcel>20<ucel><nl><caption><loc_252><loc_325><loc_445><loc_353>Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.</caption></otsl>
+<text><loc_252><loc_369><loc_445><loc_420>Recently, significant progress has been made with vision based approaches to extract tables in documents. For the sake of completeness, the issue of table extraction from documents is typically decomposed into two separate challenges, i.e. (1) finding the location of the table(s) on a document-page and (2) finding the structure of a given table in the document.</text>
+<text><loc_252><loc_422><loc_445><loc_450>The first problem is called table-location and has been previously addressed [30, 38, 19, 21, 23, 26, 8] with stateof-the-art object-detection networks (e.g. YOLO and later on Mask-RCNN [9]). For all practical purposes, it can be</text>
+<page_footer><loc_241><loc_463><loc_245><loc_469>1</page_footer>
+<page_break>
+<text><loc_41><loc_47><loc_234><loc_61>considered as a solved problem, given enough ground-truth data to train on.</text>
+<text><loc_41><loc_63><loc_234><loc_144>The second problem is called table-structure decomposition. The latter is a long standing problem in the community of document understanding [6, 4, 14]. Contrary to the table-location problem, there are no commonly used approaches that can easily be re-purposed to solve this problem. Lately, a set of new model-architectures has been proposed by the community to address table-structure decomposition [37, 36, 18, 20]. All these models have some weaknesses (see Sec. 2). The common denominator here is the reliance on textual features and/or the inability to provide the bounding box of each table-cell in the original image.</text>
+<text><loc_41><loc_146><loc_234><loc_235>In this paper, we want to address these weaknesses and present a robust table-structure decomposition algorithm. The design criteria for our model are the following. First, we want our algorithm to be language agnostic. In this way, we can obtain the structure of any table, irregardless of the language. Second, we want our algorithm to leverage as much data as possible from the original PDF document. For programmatic PDF documents, the text-cells can often be extracted much faster and with higher accuracy compared to OCR methods. Last but not least, we want to have a direct link between the table-cell and its bounding box in the image.</text>
+<text><loc_41><loc_237><loc_234><loc_273>To meet the design criteria listed above, we developed a new model called TableFormer and a synthetically generated table structure dataset called SynthTabNet $^{1}$. In particular, our contributions in this work can be summarised as follows:</text>
+<unordered_list><list_item><loc_50><loc_281><loc_234><loc_309>· We propose TableFormer , a transformer based model that predicts tables structure and bounding boxes for the table content simultaneously in an end-to-end approach.</list_item>
+<list_item><loc_50><loc_317><loc_234><loc_345>· Across all benchmark datasets TableFormer significantly outperforms existing state-of-the-art metrics, while being much more efficient in training and inference to existing works.</list_item>
+<list_item><loc_50><loc_353><loc_234><loc_374>· We present SynthTabNet a synthetically generated dataset, with various appearance styles and complexity.</list_item>
+<list_item><loc_50><loc_382><loc_234><loc_403>· An augmented dataset based on PubTabNet [37], FinTabNet [36], and TableBank [17] with generated ground-truth for reproducibility.</list_item>
 </unordered_list>
-<text><location><page_2><loc_8><loc_12><loc_47><loc_18></location>The paper is structured as follows. In Sec. 2, we give a brief overview of the current state-of-the-art. In Sec. 3, we describe the datasets on which we train. In Sec. 4, we introduce the TableFormer model-architecture and describe</text>
-<text><location><page_2><loc_50><loc_86><loc_89><loc_91></location>its results & performance in Sec. 5. As a conclusion, we describe how this new model-architecture can be re-purposed for other tasks in the computer-vision community.</text>
-<section_header_level_1><location><page_2><loc_50><loc_83><loc_81><loc_85></location>2. Previous work and State of the Art</section_header_level_1>
-<text><location><page_2><loc_50><loc_58><loc_89><loc_82></location>Identifying the structure of a table has been an outstanding problem in the document-parsing community, that motivates many organised public challenges [6, 4, 14]. The difficulty of the problem can be attributed to a number of factors. First, there is a large variety in the shapes and sizes of tables. Such large variety requires a flexible method. This is especially true for complex column- and row headers, which can be extremely intricate and demanding. A second factor of complexity is the lack of data with regard to table-structure. Until the publication of PubTabNet [37], there were no large datasets (i.e. > 100 K tables) that provided structure information. This happens primarily due to the fact that tables are notoriously time-consuming to annotate by hand. However, this has definitely changed in recent years with the deliverance of PubTabNet [37], FinTabNet [36], TableBank [17] etc.</text>
-<text><location><page_2><loc_50><loc_43><loc_89><loc_58></location>Before the rising popularity of deep neural networks, the community relied heavily on heuristic and/or statistical methods to do table structure identification [3, 7, 11, 5, 13, 28]. Although such methods work well on constrained tables [12], a more data-driven approach can be applied due to the advent of convolutional neural networks (CNNs) and the availability of large datasets. To the best of our knowledge, there are currently two different types of network architecture that are being pursued for state-of-the-art table-structure identification.</text>
-<text><location><page_2><loc_50><loc_10><loc_89><loc_43></location>Image-to-Text networks : In this type of network, one predicts a sequence of tokens starting from an encoded image. Such sequences of tokens can be HTML table tags [37, 17] or LaTeX symbols[10]. The choice of symbols is ultimately not very important, since one can be transformed into the other. There are however subtle variations in the Image-to-Text networks. The easiest network architectures are "image-encoder → text-decoder" (IETD), similar to network architectures that try to provide captions to images [32]. In these IETD networks, one expects as output the LaTeX/HTML string of the entire table, i.e. the symbols necessary for creating the table with the content of the table. Another approach is the "image-encoder → dual decoder" (IEDD) networks. In these type of networks, one has two consecutive decoders with different purposes. The first decoder is the tag-decoder , i.e. it only produces the HTML/LaTeX tags which construct an empty table. The second content-decoder uses the encoding of the image in combination with the output encoding of each cell-tag (from the tag-decoder ) to generate the textual content of each table cell. The network architecture of IEDD is certainly more elaborate, but it has the advantage that one can pre-train the</text>
-<text><location><page_3><loc_8><loc_89><loc_41><loc_91></location>tag-decoder which is constrained to the table-tags.</text>
-<text><location><page_3><loc_8><loc_65><loc_47><loc_89></location>In practice, both network architectures (IETD and IEDD) require an implicit, custom trained optical-character-recognition (OCR) to obtain the content of the table-cells. In the case of IETD, this OCR engine is implicit in the decoder similar to [24]. For the IEDD, the OCR is solely embedded in the content-decoder. This reliance on a custom, implicit OCR decoder is of course problematic. OCR is a well known and extremely tough problem, that often needs custom training for each individual language. However, the limited availability for non-English content in the current datasets makes it impractical to apply the IETD and IEDD methods on tables with other languages. Additionally, OCR can be completely omitted if the tables originate from programmatic PDF documents with known positions of each cell. The latter was the inspiration for the work of this paper.</text>
-<text><location><page_3><loc_8><loc_38><loc_47><loc_65></location>Graph Neural networks : Graph Neural networks (GNN's) take a radically different approach to table-structure extraction. Note that one table cell can consist of multiple text-cells. To obtain the table-structure, one creates an initial graph, where each of the text-cells becomes a node in the graph similar to [33, 34, 2]. Each node is then associated with an embedding vector coming from the encoded image, its coordinates and the encoded text. Furthermore, nodes that represent adjacent text-cells are linked. Graph Convolutional Networks (GCN's) based methods take the image as an input, but also the position of the text-cells and their content [18]. The purpose of a GCN is to transform the input graph into a new graph, which replaces the old links with new ones. The new links then represent the table-structure. With this approach, one can avoid the need to build custom OCR decoders. However, the quality of the reconstructed structure is not comparable to the current state-of-the-art [18].</text>
-<text><location><page_3><loc_8><loc_21><loc_47><loc_38></location>Hybrid Deep Learning-Rule-Based approach : A popular current model for table-structure identification is the use of a hybrid Deep Learning-Rule-Based approach similar to [27, 29]. In this approach, one first detects the position of the table-cells with object detection (e.g. YoloVx or MaskRCNN), then classifies the table into different types (from its images) and finally uses different rule-sets to obtain its table-structure. Currently, this approach achieves state-of-the-art results, but is not an end-to-end deep-learning method. As such, new rules need to be written if different types of tables are encountered.</text>
-<section_header_level_1><location><page_3><loc_8><loc_18><loc_17><loc_20></location>3. Datasets</section_header_level_1>
-<text><location><page_3><loc_8><loc_10><loc_47><loc_17></location>We rely on large-scale datasets such as PubTabNet [37], FinTabNet [36], and TableBank [17] datasets to train and evaluate our models. These datasets span over various appearance styles and content. We also introduce our own synthetically generated SynthTabNet dataset to fix an im-</text>
-<figure>
-<location><page_3><loc_51><loc_68><loc_90><loc_90></location>
-<caption>Figure 2: Distribution of the tables across different table dimensions in PubTabNet + FinTabNet datasets</caption>
-</figure>
-<text><location><page_3><loc_50><loc_59><loc_71><loc_60></location>balance in the previous datasets.</text>
-<text><location><page_3><loc_50><loc_21><loc_89><loc_58></location>The PubTabNet dataset contains 509k tables delivered as annotated PNG images. The annotations consist of the table structure represented in HTML format, the tokenized text and its bounding boxes per table cell. Fig. 1 shows the appearance style of PubTabNet. Depending on its complexity, a table is characterized as "simple" when it does not contain row spans or column spans, otherwise it is "complex". The dataset is divided into Train and Val splits (roughly 98% and 2%). The Train split consists of 54% simple and 46% complex tables and the Val split of 51% and 49% respectively. The FinTabNet dataset contains 112k tables delivered as single-page PDF documents with mixed table structures and text content. Similarly to the PubTabNet, the annotations of FinTabNet include the table structure in HTML, the tokenized text and the bounding boxes on a table cell basis. The dataset is divided into Train, Test and Val splits (81%, 9.5%, 9.5%), and each one is almost equally divided into simple and complex tables (Train: 48% simple, 52% complex, Test: 48% simple, 52% complex, Val: 53% simple, 47% complex). Finally the TableBank dataset consists of 145k tables provided as JPEG images. The latter has annotations for the table structure, but only few with bounding boxes of the table cells. The entire dataset consists of simple tables and it is divided into 90% Train, 3% Test and 7% Val splits.</text>
-<text><location><page_3><loc_50><loc_10><loc_89><loc_20></location>Due to the heterogeneity across the dataset formats, it was necessary to combine all available data into one homogenized dataset before we could train our models for practical purposes. Given the size of PubTabNet, we adopted its annotation format and we extracted and converted all tables as PNG images with a resolution of 72 dpi. Additionally, we have filtered out tables with extreme sizes due to small</text>
-<text><location><page_4><loc_8><loc_88><loc_47><loc_91></location>amount of such tables, and kept only those ones ranging between 1*1 and 20*10 (rows/columns).</text>
-<text><location><page_4><loc_8><loc_60><loc_47><loc_87></location>The availability of the bounding boxes for all table cells is essential to train our models. In order to distinguish between empty and non-empty bounding boxes, we have introduced a binary class in the annotation. Unfortunately, the original datasets either omit the bounding boxes for whole tables (e.g. TableBank) or they narrow their scope only to non-empty cells. Therefore, it was imperative to introduce a data pre-processing procedure that generates the missing bounding boxes out of the annotation information. This procedure first parses the provided table structure and calculates the dimensions of the most fine-grained grid that covers the table structure. Notice that each table cell may occupy multiple grid squares due to row or column spans. In case of PubTabNet we had to compute missing bounding boxes for 48% of the simple and 69% of the complex tables. Regarding FinTabNet, 68% of the simple and 98% of the complex tables require the generation of bounding boxes.</text>
-<text><location><page_4><loc_8><loc_45><loc_47><loc_60></location>As it is illustrated in Fig. 2, the table distributions from all datasets are skewed towards simpler structures with fewer number of rows/columns. Additionally, there is very limited variance in the table styles, which in case of PubTabNet and FinTabNet means one styling format for the majority of the tables. Similar limitations appear also in the type of table content, which in some cases (e.g. FinTabNet) is restricted to a certain domain. Ultimately, the lack of diversity in the training dataset damages the ability of the models to generalize well on unseen data.</text>
-<text><location><page_4><loc_8><loc_21><loc_47><loc_45></location>Motivated by those observations we aimed at generating a synthetic table dataset named SynthTabNet . This approach offers control over: 1) the size of the dataset, 2) the table structure, 3) the table style and 4) the type of content. The complexity of the table structure is described by the size of the table header and the table body, as well as the percentage of the table cells covered by row spans and column spans. A set of carefully designed styling templates provides the basis to build a wide range of table appearances. Lastly, the table content is generated out of a curated collection of text corpora. By controlling the size and scope of the synthetic datasets we are able to train and evaluate our models in a variety of different conditions. For example, we can first generate a highly diverse dataset to train our models and then evaluate their performance on other synthetic datasets which are focused on a specific domain.</text>
-<text><location><page_4><loc_8><loc_10><loc_47><loc_20></location>In this regard, we have prepared four synthetic datasets, each one containing 150k examples. The corpora to generate the table text consists of the most frequent terms appearing in PubTabNet and FinTabNet together with randomly generated text. The first two synthetic datasets have been fine-tuned to mimic the appearance of the original datasets but encompass more complicated table structures. The third</text>
-<table>
-<location><page_4><loc_51><loc_80><loc_89><loc_91></location>
-<caption>Table 1: Both "Combined-Tabnet" and "CombinedTabnet" are variations of the following: (*) The CombinedTabnet dataset is the processed combination of PubTabNet and Fintabnet. (**) The combined dataset is the processed combination of PubTabNet, Fintabnet and TableBank.</caption>
-<row_0><col_0><body></col_0><col_1><col_header>Tags</col_1><col_2><col_header>Bbox</col_2><col_3><col_header>Size</col_3><col_4><col_header>Format</col_4></row_0>
-<row_1><col_0><row_header>PubTabNet</col_0><col_1><body>3</col_1><col_2><body>3</col_2><col_3><body>509k</col_3><col_4><body>PNG</col_4></row_1>
-<row_2><col_0><row_header>FinTabNet</col_0><col_1><body>3</col_1><col_2><body>3</col_2><col_3><body>112k</col_3><col_4><body>PDF</col_4></row_2>
-<row_3><col_0><row_header>TableBank</col_0><col_1><body>3</col_1><col_2><body>7</col_2><col_3><body>145k</col_3><col_4><body>JPEG</col_4></row_3>
-<row_4><col_0><row_header>Combined-Tabnet(*)</col_0><col_1><body>3</col_1><col_2><body>3</col_2><col_3><body>400k</col_3><col_4><body>PNG</col_4></row_4>
-<row_5><col_0><row_header>Combined(**)</col_0><col_1><body>3</col_1><col_2><body>3</col_2><col_3><body>500k</col_3><col_4><body>PNG</col_4></row_5>
-<row_6><col_0><row_header>SynthTabNet</col_0><col_1><body>3</col_1><col_2><body>3</col_2><col_3><body>600k</col_3><col_4><body>PNG</col_4></row_6>
-</table>
-<text><location><page_4><loc_50><loc_63><loc_89><loc_68></location>one adopts a colorful appearance with high contrast and the last one contains tables with sparse content. Lastly, we have combined all synthetic datasets into one big unified synthetic dataset of 600k examples.</text>
-<text><location><page_4><loc_52><loc_61><loc_89><loc_62></location>Tab. 1 summarizes the various attributes of the datasets.</text>
-<section_header_level_1><location><page_4><loc_50><loc_58><loc_73><loc_59></location>4. The TableFormer model</section_header_level_1>
-<text><location><page_4><loc_50><loc_44><loc_89><loc_57></location>Given the image of a table, TableFormer is able to predict: 1) a sequence of tokens that represent the structure of a table, and 2) a bounding box coupled to a subset of those tokens. The conversion of an image into a sequence of tokens is a well-known task [35, 16]. While attention is often used as an implicit method to associate each token of the sequence with a position in the original image, an explicit association between the individual table-cells and the image bounding boxes is also required.</text>
-<section_header_level_1><location><page_4><loc_50><loc_41><loc_69><loc_42></location>4.1. Model architecture.</section_header_level_1>
-<text><location><page_4><loc_50><loc_16><loc_89><loc_40></location>We now describe in detail the proposed method, which is composed of three main components, see Fig. 4. Our CNN Backbone Network encodes the input as a feature vector of predefined length. The input feature vector of the encoded image is passed to the Structure Decoder to produce a sequence of HTML tags that represent the structure of the table. With each prediction of an HTML standard data cell (' < td > ') the hidden state of that cell is passed to the Cell BBox Decoder. As for spanning cells, such as row or column span, the tag is broken down to ' < ', 'rowspan=' or 'colspan=', with the number of spanning cells (attribute), and ' > '. The hidden state attached to ' < ' is passed to the Cell BBox Decoder. A shared feed forward network (FFN) receives the hidden states from the Structure Decoder, to provide the final detection predictions of the bounding box coordinates and their classification.</text>
-<text><location><page_4><loc_50><loc_10><loc_89><loc_16></location>CNN Backbone Network. A ResNet-18 CNN is the backbone that receives the table image and encodes it as a vector of predefined length. The network has been modified by removing the linear and pooling layer, as we are not per-</text>
-<figure>
-<location><page_5><loc_12><loc_77><loc_85><loc_90></location>
-<caption>Figure 3: TableFormer takes in an image of the PDF and creates bounding box and HTML structure predictions that are synchronized. The bounding boxes grab the content from the PDF and insert it in the structure.</caption>
-</figure>
-<figure>
-<location><page_5><loc_9><loc_36><loc_47><loc_67></location>
-<caption>Figure 4: Given an input image of a table, the Encoder produces fixed-length features that represent the input image. The features are then passed to both the Structure Decoder and Cell BBox Decoder . During training, the Structure Decoder receives 'tokenized tags' of the HTML code that represent the table structure. Afterwards, a transformer encoder and decoder architecture is employed to produce features that are received by a linear layer, and the Cell BBox Decoder. The linear layer is applied to the features to predict the tags. Simultaneously, the Cell BBox Decoder selects features referring to the data cells (' < td > ', ' < ') and passes them through an attention network, an MLP, and a linear layer to predict the bounding boxes.</caption>
-</figure>
-<text><location><page_5><loc_50><loc_63><loc_89><loc_68></location>forming classification, and adding an adaptive pooling layer of size 28*28. ResNet by default downsamples the image resolution by 32 and then the encoded image is provided to both the Structure Decoder , and Cell BBox Decoder .</text>
-<text><location><page_5><loc_50><loc_48><loc_89><loc_62></location>Structure Decoder. The transformer architecture of this component is based on the work proposed in [31]. After extensive experimentation, the Structure Decoder is modeled as a transformer encoder with two encoder layers and a transformer decoder made from a stack of 4 decoder layers that comprise mainly of multi-head attention and feed forward layers. This configuration uses fewer layers and heads in comparison to networks applied to other problems (e.g. "Scene Understanding", "Image Captioning"), something which we relate to the simplicity of table images.</text>
-<text><location><page_5><loc_50><loc_31><loc_89><loc_47></location>The transformer encoder receives an encoded image from the CNN Backbone Network and refines it through a multi-head dot-product attention layer, followed by a Feed Forward Network. During training, the transformer decoder receives as input the output feature produced by the transformer encoder, and the tokenized input of the HTML ground-truth tags. Using a stack of multi-head attention layers, different aspects of the tag sequence could be inferred. This is achieved by each attention head on a layer operating in a different subspace, and then combining altogether their attention score.</text>
-<text><location><page_5><loc_50><loc_18><loc_89><loc_31></location>Cell BBox Decoder. Our architecture allows to simultaneously predict HTML tags and bounding boxes for each table cell without the need of a separate object detector end to end. This approach is inspired by DETR [1] which employs a Transformer Encoder, and Decoder that looks for a specific number of object queries (potential object detections). As our model utilizes a transformer architecture, the hidden state of the ' < td > ' and ' < ' HTML structure tags becomes the object query.</text>
-<text><location><page_5><loc_50><loc_10><loc_89><loc_17></location>The encoding generated by the CNN Backbone Network along with the features acquired for every data cell from the Transformer Decoder are then passed to the attention network. The attention network takes both inputs and learns to provide an attention weighted encoding. This weighted at-</text>
-<text><location><page_6><loc_8><loc_80><loc_47><loc_91></location>tention encoding is then multiplied to the encoded image to produce a feature for each table cell. Notice that this is different than the typical object detection problem where imbalances between the number of detections and the amount of objects may exist. In our case, we know up front that the produced detections always match with the table cells in number and correspondence.</text>
-<text><location><page_6><loc_8><loc_70><loc_47><loc_80></location>The output features for each table cell are then fed into the feed-forward network (FFN). The FFN consists of a Multi-Layer Perceptron (3 layers with ReLU activation function) that predicts the normalized coordinates for the bounding box of each table cell. Finally, the predicted bounding boxes are classified based on whether they are empty or not using a linear layer.</text>
-<text><location><page_6><loc_8><loc_44><loc_47><loc_69></location>Loss Functions. We formulate a multi-task loss Eq. 2 to train our network. The Cross-Entropy loss (denoted as l$_{s}$ ) is used to train the Structure Decoder which predicts the structure tokens. As for the Cell BBox Decoder it is trained with a combination of losses denoted as l$_{box}$ . l$_{box}$ consists of the generally used l$_{1}$ loss for object detection and the IoU loss ( l$_{iou}$ ) to be scale invariant as explained in [25]. In comparison to DETR, we do not use the Hungarian algorithm [15] to match the predicted bounding boxes with the ground-truth boxes, as we have already achieved a one-to-one match through two steps: 1) Our token input sequence is naturally ordered, therefore the hidden states of the table data cells are also in order when they are provided as input to the Cell BBox Decoder , and 2) Our bounding boxes generation mechanism (see Sec. 3) ensures a one-to-one mapping between the cell content and its bounding box for all post-processed datasets.</text>
-<text><location><page_6><loc_8><loc_41><loc_47><loc_43></location>The loss used to train the TableFormer can be defined as following:</text>
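-<formula><location><page_6><loc_20><loc_35><loc_47><loc_38></location>$$l_{box} = \lambda_{iou} \, l_{iou} + \lambda_{l_{1}} \, l_{1}, \qquad l = \lambda \, l_{s} + (1 - \lambda) \, l_{box}$$</formula>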
-<text><location><page_6><loc_8><loc_32><loc_46><loc_33></location>where λ ∈ [0, 1], and λ$_{iou}$, λ$_{l_{1}}$ ∈ ℝ are hyper-parameters.</text>
-<section_header_level_1><location><page_6><loc_8><loc_28><loc_28><loc_30></location>5. Experimental Results</section_header_level_1>
-<section_header_level_1><location><page_6><loc_8><loc_26><loc_29><loc_27></location>5.1. Implementation Details</section_header_level_1>
-<text><location><page_6><loc_8><loc_19><loc_47><loc_25></location>TableFormer uses ResNet-18 as the CNN Backbone Network . The input images are resized to 448*448 pixels and the feature map has a dimension of 28*28. Additionally, we enforce the following input constraints:</text>
-<formula><location><page_6><loc_15><loc_14><loc_47><loc_17></location></formula>
-<text><location><page_6><loc_8><loc_10><loc_47><loc_13></location>Although input constraints are used also by other methods, such as EDD, ours are less restrictive due to the improved</text>
-<text><location><page_6><loc_50><loc_86><loc_89><loc_91></location>runtime performance and lower memory footprint of TableFormer. This allows to utilize input samples with longer sequences and images with larger dimensions.</text>
-<text><location><page_6><loc_50><loc_59><loc_89><loc_85></location>The Transformer Encoder consists of two "Transformer Encoder Layers", with an input feature size of 512, feed forward network of 1024, and 4 attention heads. As for the Transformer Decoder it is composed of four "Transformer Decoder Layers" with similar input and output dimensions as the "Transformer Encoder Layers". Even though our model uses fewer layers and heads than the default implementation parameters, our extensive experimentation has proved this setup to be more suitable for table images. We attribute this finding to the inherent design of table images, which contain mostly lines and text, unlike the more elaborate content present in other scopes (e.g. the COCO dataset). Moreover, we have added ResNet blocks to the inputs of the Structure Decoder and Cell BBox Decoder. This prevents a decoder having a stronger influence over the learned weights which would damage the other prediction task (structure vs bounding boxes), but learn task specific weights instead. Lastly our dropout layers are set to 0.5.</text>
-<text><location><page_6><loc_50><loc_46><loc_89><loc_58></location>For training, TableFormer is trained with 3 Adam optimizers, each one for the CNN Backbone Network , Structure Decoder , and Cell BBox Decoder . Taking the PubTabNet as an example for our parameter set up, the initializing learning rate is 0.001 for 12 epochs with a batch size of 24, and λ set to 0.5. Afterwards, we reduce the learning rate to 0.0001, the batch size to 18 and train for 12 more epochs or convergence.</text>
-<text><location><page_6><loc_50><loc_30><loc_89><loc_45></location>TableFormer is implemented with PyTorch and Torchvision libraries [22]. To speed up the inference, the image undergoes a single forward pass through the CNN Backbone Network and transformer encoder. This eliminates the overhead of generating the same features for each decoding step. Similarly, we employ a 'caching' technique to perform faster autoregressive decoding. This is achieved by storing the features of decoded tokens so we can reuse them for each time step. Therefore, we only compute the attention for each new tag.</text>
-<section_header_level_1><location><page_6><loc_50><loc_26><loc_65><loc_27></location>5.2. Generalization</section_header_level_1>
-<text><location><page_6><loc_50><loc_15><loc_89><loc_24></location>TableFormer is evaluated on three major publicly available datasets of different nature to prove the generalization and effectiveness of our model. The datasets used for evaluation are the PubTabNet, FinTabNet and TableBank which stem from the scientific, financial and general domains respectively.</text>
-<text><location><page_6><loc_50><loc_10><loc_89><loc_14></location>We also share our baseline results on the challenging SynthTabNet dataset. Throughout our experiments, the same parameters stated in Sec. 5.1 are utilized.</text>
-<section_header_level_1><location><page_7><loc_8><loc_89><loc_27><loc_91></location>5.3. Datasets and Metrics</section_header_level_1>
-<text><location><page_7><loc_8><loc_83><loc_47><loc_88></location>The Tree-Edit-Distance-Based Similarity (TEDS) metric was introduced in [37]. It represents the prediction, and ground-truth as a tree structure of HTML tags. This similarity is calculated as:</text>
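-<formula><location><page_7><loc_14><loc_78><loc_47><loc_81></location>$$\mathrm{TEDS}(T_{a}, T_{b}) = 1 - \frac{\mathrm{EditDist}(T_{a}, T_{b})}{\max(|T_{a}|, |T_{b}|)}$$</formula>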
-<text><location><page_7><loc_8><loc_73><loc_47><loc_77></location>where T$_{a}$ and T$_{b}$ represent tables in tree structure HTML format. EditDist denotes the tree-edit distance, and | T | represents the number of nodes in T .</text>
-<section_header_level_1><location><page_7><loc_8><loc_70><loc_28><loc_72></location>5.4. Quantitative Analysis</section_header_level_1>
-<text><location><page_7><loc_8><loc_50><loc_47><loc_69></location>Structure. As shown in Tab. 2, TableFormer outperforms all SOTA methods across different datasets by a large margin for predicting the table structure from an image. All the more, our model outperforms pre-trained methods. During the evaluation we do not apply any table filtering. We also provide our baseline results on the SynthTabNet dataset. It has been observed that large tables (e.g. tables that occupy half of the page or more) yield poor predictions. We attribute this issue to the image resizing during the preprocessing step, that produces downsampled images with indistinguishable features. This problem can be addressed by treating such big tables with a separate model which accepts a large input image size.</text>
-<table>
-<location><page_7><loc_9><loc_26><loc_46><loc_48></location>
-<caption>Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).</caption>
-<row_0><col_0><col_header>Model</col_0><col_1><col_header>Dataset</col_1><col_2><col_header>Simple</col_2><col_3><col_header>TEDS Complex</col_3><col_4><col_header>All</col_4></row_0>
-<row_1><col_0><row_header>EDD</col_0><col_1><body>PTN</col_1><col_2><body>91.1</col_2><col_3><body>88.7</col_3><col_4><body>89.9</col_4></row_1>
-<row_2><col_0><row_header>GTE</col_0><col_1><body>PTN</col_1><col_2><body>-</col_2><col_3><body>-</col_3><col_4><body>93.01</col_4></row_2>
-<row_3><col_0><row_header>TableFormer</col_0><col_1><body>PTN</col_1><col_2><body>98.5</col_2><col_3><body>95.0</col_3><col_4><body>96.75</col_4></row_3>
-<row_4><col_0><row_header>EDD</col_0><col_1><body>FTN</col_1><col_2><body>88.4</col_2><col_3><body>92.08</col_3><col_4><body>90.6</col_4></row_4>
-<row_5><col_0><row_header>GTE</col_0><col_1><body>FTN</col_1><col_2><body>-</col_2><col_3><body>-</col_3><col_4><body>87.14</col_4></row_5>
-<row_6><col_0><row_header>GTE (FT)</col_0><col_1><body>FTN</col_1><col_2><body>-</col_2><col_3><body>-</col_3><col_4><body>91.02</col_4></row_6>
-<row_7><col_0><row_header>TableFormer</col_0><col_1><body>FTN</col_1><col_2><body>97.5</col_2><col_3><body>96.0</col_3><col_4><body>96.8</col_4></row_7>
-<row_8><col_0><row_header>EDD</col_0><col_1><body>TB</col_1><col_2><body>86.0</col_2><col_3><body>-</col_3><col_4><body>86.0</col_4></row_8>
-<row_9><col_0><row_header>TableFormer</col_0><col_1><body>TB</col_1><col_2><body>89.6</col_2><col_3><body>-</col_3><col_4><body>89.6</col_4></row_9>
-<row_10><col_0><row_header>TableFormer</col_0><col_1><body>STN</col_1><col_2><body>96.9</col_2><col_3><body>95.7</col_3><col_4><body>96.7</col_4></row_10>
-</table>
-<text><location><page_7><loc_8><loc_21><loc_43><loc_22></location>FT: Model was trained on PubTabNet then finetuned.</text>
-<text><location><page_7><loc_8><loc_10><loc_47><loc_19></location>Cell Detection. Like any object detector, our Cell BBox Detector provides bounding boxes that can be improved with post-processing during inference. We make use of the grid-like structure of tables to refine the predictions. A detailed explanation on the post-processing is available in the supplementary material. As shown in Tab. 3, we evaluate</text>
-<text><location><page_7><loc_50><loc_71><loc_89><loc_91></location>our Cell BBox Decoder accuracy for cells with a class label of 'content' only using the PASCAL VOC mAP metric for pre-processing and post-processing. Note that we do not have post-processing results for SynthTabNet as images are only provided. To compare the performance of our proposed approach, we've integrated TableFormer's Cell BBox Decoder into EDD architecture. As mentioned previously, the Structure Decoder provides the Cell BBox Decoder with the features needed to predict the bounding box predictions. Therefore, the accuracy of the Structure Decoder directly influences the accuracy of the Cell BBox Decoder . If the Structure Decoder predicts an extra column, this will result in an extra column of predicted bounding boxes.</text>
-<table>
-<location><page_7><loc_50><loc_62><loc_87><loc_69></location>
-<caption>Table 3: Cell Bounding Box detection results on PubTabNet, and FinTabNet. PP: Post-processing.</caption>
-<row_0><col_0><col_header>Model</col_0><col_1><col_header>Dataset</col_1><col_2><col_header>mAP</col_2><col_3><col_header>mAP (PP)</col_3></row_0>
-<row_1><col_0><body>EDD+BBox</col_0><col_1><body>PubTabNet</col_1><col_2><body>79.2</col_2><col_3><body>82.7</col_3></row_1>
-<row_2><col_0><body>TableFormer</col_0><col_1><body>PubTabNet</col_1><col_2><body>82.1</col_2><col_3><body>86.8</col_3></row_2>
-<row_3><col_0><body>TableFormer</col_0><col_1><body>SynthTabNet</col_1><col_2><body>87.7</col_2><col_3><body>-</col_3></row_3>
-</table>
-<text><location><page_7><loc_50><loc_34><loc_89><loc_54></location>Cell Content. In this section, we evaluate the entire pipeline of recovering a table with content. Here we put our approach to test by capitalizing on extracting content from the PDF cells rather than decoding from images. Tab. 4 shows the TEDs score of HTML code representing the structure of the table along with the content inserted in the data cell and compared with the ground-truth. Our method achieved a 5.3% increase over the state-of-the-art, and commercial solutions. We believe our scores would be higher if the HTML ground-truth matched the extracted PDF cell content. Unfortunately, there are small discrepancies such as spacings around words or special characters with various unicode representations.</text>
-<table>
-<location><page_7><loc_54><loc_19><loc_85><loc_32></location>
-<caption>Table 4: Results of structure with content retrieved using cell detection on PubTabNet. In all cases the input is PDF documents with cropped tables.</caption>
-<row_0><col_0><body>Model</col_0><col_1><col_header>Simple</col_1><col_2><col_header>TEDS Complex</col_2><col_3><col_header>All</col_3></row_0>
-<row_1><col_0><row_header>Tabula</col_0><col_1><body>78.0</col_1><col_2><body>57.8</col_2><col_3><body>67.9</col_3></row_1>
-<row_2><col_0><row_header>Traprange</col_0><col_1><body>60.8</col_1><col_2><body>49.9</col_2><col_3><body>55.4</col_3></row_2>
-<row_3><col_0><row_header>Camelot</col_0><col_1><body>80.0</col_1><col_2><body>66.0</col_2><col_3><body>73.0</col_3></row_3>
-<row_4><col_0><row_header>Acrobat Pro</col_0><col_1><body>68.9</col_1><col_2><body>61.8</col_2><col_3><body>65.3</col_3></row_4>
-<row_5><col_0><row_header>EDD</col_0><col_1><body>91.2</col_1><col_2><body>85.4</col_2><col_3><body>88.3</col_3></row_5>
-<row_6><col_0><row_header>TableFormer</col_0><col_1><body>95.4</col_1><col_2><body>90.1</col_2><col_3><body>93.6</col_3></row_6>
-</table>
-<unordered_list>
-<list_item><location><page_8><loc_9><loc_89><loc_10><loc_90></location>a.</list_item>
-<list_item><location><page_8><loc_11><loc_89><loc_82><loc_90></location>Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells</list_item>
+<text><loc_41><loc_411><loc_234><loc_439>The paper is structured as follows. In Sec. 2, we give a brief overview of the current state-of-the-art. In Sec. 3, we describe the datasets on which we train. In Sec. 4, we introduce the TableFormer model-architecture and describe</text>
+<footnote><loc_50><loc_445><loc_150><loc_450>$^{1}$https://github.com/IBM/SynthTabNet</footnote>
+<page_footer><loc_241><loc_463><loc_245><loc_469>2</page_footer>
+<text><loc_252><loc_47><loc_445><loc_68>its results & performance in Sec. 5. As a conclusion, we describe how this new model-architecture can be re-purposed for other tasks in the computer-vision community.</text>
+<section_header_level_1><loc_252><loc_77><loc_407><loc_84>2. Previous work and State of the Art</section_header_level_1>
+<text><loc_252><loc_90><loc_445><loc_209>Identifying the structure of a table has been an outstanding problem in the document-parsing community, that motivates many organised public challenges [6, 4, 14]. The difficulty of the problem can be attributed to a number of factors. First, there is a large variety in the shapes and sizes of tables. Such large variety requires a flexible method. This is especially true for complex column- and row headers, which can be extremely intricate and demanding. A second factor of complexity is the lack of data with regard to table-structure. Until the publication of PubTabNet [37], there were no large datasets (i.e. > 100 K tables) that provided structure information. This happens primarily due to the fact that tables are notoriously time-consuming to annotate by hand. However, this has definitely changed in recent years with the deliverance of PubTabNet [37], FinTabNet [36], TableBank [17] etc.</text>
+<text><loc_252><loc_211><loc_445><loc_284>Before the rising popularity of deep neural networks, the community relied heavily on heuristic and/or statistical methods to do table structure identification [3, 7, 11, 5, 13, 28]. Although such methods work well on constrained tables [12], a more data-driven approach can be applied due to the advent of convolutional neural networks (CNNs) and the availability of large datasets. To the best of our knowledge, there are currently two different types of network architecture that are being pursued for state-of-the-art table-structure identification.</text>
+<text><loc_252><loc_286><loc_445><loc_450>Image-to-Text networks : In this type of network, one predicts a sequence of tokens starting from an encoded image. Such sequences of tokens can be HTML table tags [37, 17] or LaTeX symbols[10]. The choice of symbols is ultimately not very important, since one can be transformed into the other. There are however subtle variations in the Image-to-Text networks. The easiest network architectures are "image-encoder → text-decoder" (IETD), similar to network architectures that try to provide captions to images [32]. In these IETD networks, one expects as output the LaTeX/HTML string of the entire table, i.e. the symbols necessary for creating the table with the content of the table. Another approach is the "image-encoder → dual decoder" (IEDD) networks. In these type of networks, one has two consecutive decoders with different purposes. The first decoder is the tag-decoder , i.e. it only produces the HTML/LaTeX tags which construct an empty table. The second content-decoder uses the encoding of the image in combination with the output encoding of each cell-tag (from the tag-decoder ) to generate the textual content of each table cell. The network architecture of IEDD is certainly more elaborate, but it has the advantage that one can pre-train the</text>
+<page_break>
+<text><loc_41><loc_47><loc_204><loc_53>tag-decoder which is constrained to the table-tags.</text>
+<text><loc_41><loc_55><loc_234><loc_174>In practice, both network architectures (IETD and IEDD) require an implicit, custom trained optical-character-recognition (OCR) to obtain the content of the table-cells. In the case of IETD, this OCR engine is implicit in the decoder similar to [24]. For the IEDD, the OCR is solely embedded in the content-decoder. This reliance on a custom, implicit OCR decoder is of course problematic. OCR is a well known and extremely tough problem, that often needs custom training for each individual language. However, the limited availability for non-English content in the current datasets, makes it impractical to apply the IETD and IEDD methods on tables with other languages. Additionally, OCR can be completely omitted if the tables originate from programmatic PDF documents with known positions of each cell. The latter was the inspiration for the work of this paper.</text>
+<text><loc_41><loc_176><loc_234><loc_310>Graph Neural networks : Graph Neural networks (GNN's) take a radically different approach to table-structure extraction. Note that one table cell can consist of multiple text-cells. To obtain the table-structure, one creates an initial graph, where each of the text-cells becomes a node in the graph similar to [33, 34, 2]. Each node is then associated with an embedding vector coming from the encoded image, its coordinates and the encoded text. Furthermore, nodes that represent adjacent text-cells are linked. Graph Convolutional Networks (GCN's) based methods take the image as an input, but also the position of the text-cells and their content [18]. The purpose of a GCN is to transform the input graph into a new graph, which replaces the old links with new ones. The new links then represent the table-structure. With this approach, one can avoid the need to build custom OCR decoders. However, the quality of the reconstructed structure is not comparable to the current state-of-the-art [18].</text>
+<text><loc_41><loc_312><loc_234><loc_393>Hybrid Deep Learning-Rule-Based approach : A popular current model for table-structure identification is the use of a hybrid Deep Learning-Rule-Based approach similar to [27, 29]. In this approach, one first detects the position of the table-cells with object detection (e.g. YoloVx or MaskRCNN), then classifies the table into different types (from its images) and finally uses different rule-sets to obtain its table-structure. Currently, this approach achieves state-of-the-art results, but is not an end-to-end deep-learning method. As such, new rules need to be written if different types of tables are encountered.</text>
+<section_header_level_1><loc_41><loc_401><loc_86><loc_408>3. Datasets</section_header_level_1>
+<text><loc_41><loc_414><loc_234><loc_450>We rely on large-scale datasets such as PubTabNet [37], FinTabNet [36], and TableBank [17] datasets to train and evaluate our models. These datasets span over various appearance styles and content. We also introduce our own synthetically generated SynthTabNet dataset to fix an im-</text>
+<page_footer><loc_241><loc_463><loc_245><loc_469>3</page_footer>
+<picture><loc_255><loc_50><loc_450><loc_158><caption><loc_252><loc_169><loc_445><loc_182>Figure 2: Distribution of the tables across different table dimensions in PubTabNet + FinTabNet datasets</caption></picture>
+<text><loc_252><loc_200><loc_357><loc_206>balance in the previous datasets.</text>
+<text><loc_252><loc_209><loc_445><loc_396>The PubTabNet dataset contains 509k tables delivered as annotated PNG images. The annotations consist of the table structure represented in HTML format, the tokenized text and its bounding boxes per table cell. Fig. 1 shows the appearance style of PubTabNet. Depending on its complexity, a table is characterized as "simple" when it does not contain row spans or column spans, otherwise it is "complex". The dataset is divided into Train and Val splits (roughly 98% and 2%). The Train split consists of 54% simple and 46% complex tables and the Val split of 51% and 49% respectively. The FinTabNet dataset contains 112k tables delivered as single-page PDF documents with mixed table structures and text content. Similarly to the PubTabNet, the annotations of FinTabNet include the table structure in HTML, the tokenized text and the bounding boxes on a table cell basis. The dataset is divided into Train, Test and Val splits (81%, 9.5%, 9.5%), and each one is almost equally divided into simple and complex tables (Train: 48% simple, 52% complex, Test: 48% simple, 52% complex, Val: 53% simple, 47% complex). Finally the TableBank dataset consists of 145k tables provided as JPEG images. The latter has annotations for the table structure, but only few with bounding boxes of the table cells. The entire dataset consists of simple tables and it is divided into 90% Train, 3% Test and 7% Val splits.</text>
+<text><loc_252><loc_399><loc_445><loc_450>Due to the heterogeneity across the dataset formats, it was necessary to combine all available data into one homogenized dataset before we could train our models for practical purposes. Given the size of PubTabNet, we adopted its annotation format and we extracted and converted all tables as PNG images with a resolution of 72 dpi. Additionally, we have filtered out tables with extreme sizes due to small</text>
+<page_break>
+<text><loc_41><loc_47><loc_234><loc_61>amount of such tables, and kept only those ones ranging between 1*1 and 20*10 (rows/columns).</text>
+<text><loc_41><loc_64><loc_234><loc_198>The availability of the bounding boxes for all table cells is essential to train our models. In order to distinguish between empty and non-empty bounding boxes, we have introduced a binary class in the annotation. Unfortunately, the original datasets either omit the bounding boxes for whole tables (e.g. TableBank) or they narrow their scope only to non-empty cells. Therefore, it was imperative to introduce a data pre-processing procedure that generates the missing bounding boxes out of the annotation information. This procedure first parses the provided table structure and calculates the dimensions of the most fine-grained grid that covers the table structure. Notice that each table cell may occupy multiple grid squares due to row or column spans. In case of PubTabNet we had to compute missing bounding boxes for 48% of the simple and 69% of the complex tables. Regarding FinTabNet, 68% of the simple and 98% of the complex tables require the generation of bounding boxes.</text>
+<text><loc_41><loc_201><loc_234><loc_274>As it is illustrated in Fig. 2, the table distributions from all datasets are skewed towards simpler structures with fewer number of rows/columns. Additionally, there is very limited variance in the table styles, which in case of PubTabNet and FinTabNet means one styling format for the majority of the tables. Similar limitations appear also in the type of table content, which in some cases (e.g. FinTabNet) is restricted to a certain domain. Ultimately, the lack of diversity in the training dataset damages the ability of the models to generalize well on unseen data.</text>
+<text><loc_41><loc_277><loc_234><loc_396>Motivated by those observations we aimed at generating a synthetic table dataset named SynthTabNet . This approach offers control over: 1) the size of the dataset, 2) the table structure, 3) the table style and 4) the type of content. The complexity of the table structure is described by the size of the table header and the table body, as well as the percentage of the table cells covered by row spans and column spans. A set of carefully designed styling templates provides the basis to build a wide range of table appearances. Lastly, the table content is generated out of a curated collection of text corpora. By controlling the size and scope of the synthetic datasets we are able to train and evaluate our models in a variety of different conditions. For example, we can first generate a highly diverse dataset to train our models and then evaluate their performance on other synthetic datasets which are focused on a specific domain.</text>
+<text><loc_41><loc_399><loc_234><loc_450>In this regard, we have prepared four synthetic datasets, each one containing 150k examples. The corpora to generate the table text consists of the most frequent terms appearing in PubTabNet and FinTabNet together with randomly generated text. The first two synthetic datasets have been fine-tuned to mimic the appearance of the original datasets but encompass more complicated table structures. The third</text>
+<page_footer><loc_241><loc_463><loc_245><loc_469>4</page_footer>
+<otsl><loc_254><loc_46><loc_444><loc_98><ecel><ched>Tags<ched>Bbox<ched>Size<ched>Format<nl><rhed>PubTabNet<fcel>✓<fcel>✓<fcel>509k<fcel>PNG<nl><rhed>FinTabNet<fcel>✓<fcel>✓<fcel>112k<fcel>PDF<nl><rhed>TableBank<fcel>✓<fcel>✗<fcel>145k<fcel>JPEG<nl><rhed>Combined-Tabnet(*)<fcel>✓<fcel>✓<fcel>400k<fcel>PNG<nl><rhed>Combined(**)<fcel>✓<fcel>✓<fcel>500k<fcel>PNG<nl><rhed>SynthTabNet<fcel>✓<fcel>✓<fcel>600k<fcel>PNG<nl><caption><loc_252><loc_106><loc_445><loc_142>Table 1: Both "Combined-Tabnet" and "Combined" are variations of the following: (*) The Combined-Tabnet dataset is the processed combination of PubTabNet and Fintabnet. (**) The combined dataset is the processed combination of PubTabNet, Fintabnet and TableBank.</caption></otsl>
+<text><loc_252><loc_158><loc_445><loc_186>one adopts a colorful appearance with high contrast and the last one contains tables with sparse content. Lastly, we have combined all synthetic datasets into one big unified synthetic dataset of 600k examples.</text>
+<text><loc_262><loc_188><loc_443><loc_194>Tab. 1 summarizes the various attributes of the datasets.</text>
+<section_header_level_1><loc_252><loc_203><loc_364><loc_210>4. The TableFormer model</section_header_level_1>
+<text><loc_252><loc_216><loc_445><loc_282>Given the image of a table, TableFormer is able to predict: 1) a sequence of tokens that represent the structure of a table, and 2) a bounding box coupled to a subset of those tokens. The conversion of an image into a sequence of tokens is a well-known task [35, 16]. While attention is often used as an implicit method to associate each token of the sequence with a position in the original image, an explicit association between the individual table-cells and the image bounding boxes is also required.</text>
+<section_header_level_1><loc_252><loc_289><loc_343><loc_295>4.1. Model architecture.</section_header_level_1>
+<text><loc_252><loc_301><loc_445><loc_420>We now describe in detail the proposed method, which is composed of three main components, see Fig. 4. Our CNN Backbone Network encodes the input as a feature vector of predefined length. The input feature vector of the encoded image is passed to the Structure Decoder to produce a sequence of HTML tags that represent the structure of the table. With each prediction of an HTML standard data cell (' < td > ') the hidden state of that cell is passed to the Cell BBox Decoder. As for spanning cells, such as row or column span, the tag is broken down to ' < ', 'rowspan=' or 'colspan=', with the number of spanning cells (attribute), and ' > '. The hidden state attached to ' < ' is passed to the Cell BBox Decoder. A shared feed forward network (FFN) receives the hidden states from the Structure Decoder, to provide the final detection predictions of the bounding box coordinates and their classification.</text>
+<text><loc_252><loc_422><loc_445><loc_450>CNN Backbone Network. A ResNet-18 CNN is the backbone that receives the table image and encodes it as a vector of predefined length. The network has been modified by removing the linear and pooling layer, as we are not per-</text>
+<page_break>
+<picture><loc_61><loc_49><loc_425><loc_116><caption><loc_41><loc_129><loc_445><loc_142>Figure 3: TableFormer takes in an image of the PDF and creates bounding box and HTML structure predictions that are synchronized. The bounding boxes grabs the content from the PDF and inserts it in the structure.</caption></picture>
+<picture><loc_43><loc_163><loc_233><loc_320><caption><loc_41><loc_333><loc_234><loc_429>Figure 4: Given an input image of a table, the Encoder produces fixed-length features that represent the input image. The features are then passed to both the Structure Decoder and Cell BBox Decoder . During training, the Structure Decoder receives 'tokenized tags' of the HTML code that represent the table structure. Afterwards, a transformer encoder and decoder architecture is employed to produce features that are received by a linear layer, and the Cell BBox Decoder. The linear layer is applied to the features to predict the tags. Simultaneously, the Cell BBox Decoder selects features referring to the data cells (' < td > ', ' < ') and passes them through an attention network, an MLP, and a linear layer to predict the bounding boxes.</caption></picture>
+<text><loc_252><loc_158><loc_445><loc_186>forming classification, and adding an adaptive pooling layer of size 28*28. ResNet by default downsamples the image resolution by 32 and then the encoded image is provided to both the Structure Decoder , and Cell BBox Decoder .</text>
+<text><loc_252><loc_188><loc_445><loc_261>Structure Decoder. The transformer architecture of this component is based on the work proposed in [31]. After extensive experimentation, the Structure Decoder is modeled as a transformer encoder with two encoder layers and a transformer decoder made from a stack of 4 decoder layers that comprise mainly of multi-head attention and feed forward layers. This configuration uses fewer layers and heads in comparison to networks applied to other problems (e.g. "Scene Understanding", "Image Captioning"), something which we relate to the simplicity of table images.</text>
+<text><loc_252><loc_263><loc_445><loc_344>The transformer encoder receives an encoded image from the CNN Backbone Network and refines it through a multi-head dot-product attention layer, followed by a Feed Forward Network. During training, the transformer decoder receives as input the output feature produced by the transformer encoder, and the tokenized input of the HTML ground-truth tags. Using a stack of multi-head attention layers, different aspects of the tag sequence could be inferred. This is achieved by each attention head on a layer operating in a different subspace, and then combining altogether their attention score.</text>
+<text><loc_252><loc_346><loc_445><loc_412>Cell BBox Decoder. Our architecture allows to simultaneously predict HTML tags and bounding boxes for each table cell without the need of a separate object detector end to end. This approach is inspired by DETR [1] which employs a Transformer Encoder, and Decoder that looks for a specific number of object queries (potential object detections). As our model utilizes a transformer architecture, the hidden state of the ' < td > ' and ' < ' HTML structure tags become the object query.</text>
+<text><loc_252><loc_414><loc_445><loc_450>The encoding generated by the CNN Backbone Network along with the features acquired for every data cell from the Transformer Decoder are then passed to the attention network. The attention network takes both inputs and learns to provide an attention weighted encoding. This weighted at-</text>
+<page_footer><loc_241><loc_463><loc_245><loc_469>5</page_footer>
+<page_break>
+<text><loc_41><loc_47><loc_234><loc_98>tention encoding is then multiplied to the encoded image to produce a feature for each table cell. Notice that this is different than the typical object detection problem where imbalances between the number of detections and the amount of objects may exist. In our case, we know up front that the produced detections always match with the table cells in number and correspondence.</text>
+<text><loc_41><loc_101><loc_234><loc_152>The output features for each table cell are then fed into the feed-forward network (FFN). The FFN consists of a Multi-Layer Perceptron (3 layers with ReLU activation function) that predicts the normalized coordinates for the bounding box of each table cell. Finally, the predicted bounding boxes are classified based on whether they are empty or not using a linear layer.</text>
+<text><loc_41><loc_154><loc_234><loc_280>Loss Functions. We formulate a multi-task loss Eq. 2 to train our network. The Cross-Entropy loss (denoted as l$_{s}$ ) is used to train the Structure Decoder which predicts the structure tokens. As for the Cell BBox Decoder it is trained with a combination of losses denoted as l$_{box}$ . l$_{box}$ consists of the generally used l$_{1}$ loss for object detection and the IoU loss ( l$_{iou}$ ) to be scale invariant as explained in [25]. In comparison to DETR, we do not use the Hungarian algorithm [15] to match the predicted bounding boxes with the ground-truth boxes, as we have already achieved a one-to-one match through two steps: 1) Our token input sequence is naturally ordered, therefore the hidden states of the table data cells are also in order when they are provided as input to the Cell BBox Decoder , and 2) Our bounding boxes generation mechanism (see Sec. 3) ensures a one-to-one mapping between the cell content and its bounding box for all post-processed datasets.</text>
+<text><loc_41><loc_283><loc_234><loc_296>The loss used to train the TableFormer can be defined as following:</text>
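+<formula><loc_102><loc_311><loc_234><loc_326>l_{box} = \lambda_{iou} l_{iou} + \lambda_{l_{1}} l_{1} \qquad l = \lambda l_{s} + (1 - \lambda) l_{box}</formula>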
+<text><loc_41><loc_335><loc_230><loc_341>where λ ∈ [0, 1], and λ$_{iou}$, λ$_{l}$$_{1}$ ∈$_{R}$ are hyper-parameters.</text>
+<section_header_level_1><loc_41><loc_351><loc_141><loc_358>5. Experimental Results</section_header_level_1>
+<section_header_level_1><loc_41><loc_364><loc_146><loc_370>5.1. Implementation Details</section_header_level_1>
+<text><loc_41><loc_376><loc_234><loc_404>TableFormer uses ResNet-18 as the CNN Backbone Network . The input images are resized to 448*448 pixels and the feature map has a dimension of 28*28. Additionally, we enforce the following input constraints:</text>
+<formula><loc_75><loc_413><loc_234><loc_428></formula>
+<text><loc_41><loc_437><loc_234><loc_450>Although input constraints are used also by other methods, such as EDD, ours are less restrictive due to the improved</text>
+<page_footer><loc_241><loc_463><loc_245><loc_469>6</page_footer>
+<text><loc_252><loc_47><loc_445><loc_68>runtime performance and lower memory footprint of TableFormer. This allows to utilize input samples with longer sequences and images with larger dimensions.</text>
+<text><loc_252><loc_73><loc_445><loc_207>The Transformer Encoder consists of two "Transformer Encoder Layers", with an input feature size of 512, feed forward network of 1024, and 4 attention heads. As for the Transformer Decoder it is composed of four "Transformer Decoder Layers" with similar input and output dimensions as the "Transformer Encoder Layers". Even though our model uses fewer layers and heads than the default implementation parameters, our extensive experimentation has proved this setup to be more suitable for table images. We attribute this finding to the inherent design of table images, which contain mostly lines and text, unlike the more elaborate content present in other scopes (e.g. the COCO dataset). Moreover, we have added ResNet blocks to the inputs of the Structure Decoder and Cell BBox Decoder. This prevents a decoder having a stronger influence over the learned weights which would damage the other prediction task (structure vs bounding boxes), but learn task specific weights instead. Lastly our dropout layers are set to 0.5.</text>
+<text><loc_252><loc_212><loc_445><loc_271>For training, TableFormer is trained with 3 Adam optimizers, each one for the CNN Backbone Network , Structure Decoder , and Cell BBox Decoder . Taking the PubTabNet as an example for our parameter set up, the initializing learning rate is 0.001 for 12 epochs with a batch size of 24, and λ set to 0.5. Afterwards, we reduce the learning rate to 0.0001, the batch size to 18 and train for 12 more epochs or convergence.</text>
+<text><loc_252><loc_276><loc_445><loc_350>TableFormer is implemented with PyTorch and Torchvision libraries [22]. To speed up the inference, the image undergoes a single forward pass through the CNN Backbone Network and transformer encoder. This eliminates the overhead of generating the same features for each decoding step. Similarly, we employ a 'caching' technique to perform faster autoregressive decoding. This is achieved by storing the features of decoded tokens so we can reuse them for each time step. Therefore, we only compute the attention for each new tag.</text>
+<section_header_level_1><loc_252><loc_366><loc_325><loc_372>5.2. Generalization</section_header_level_1>
+<text><loc_252><loc_381><loc_445><loc_424>TableFormer is evaluated on three major publicly available datasets of different nature to prove the generalization and effectiveness of our model. The datasets used for evaluation are the PubTabNet, FinTabNet and TableBank which stem from the scientific, financial and general domains respectively.</text>
+<text><loc_252><loc_430><loc_445><loc_450>We also share our baseline results on the challenging SynthTabNet dataset. Throughout our experiments, the same parameters stated in Sec. 5.1 are utilized.</text>
+<page_break>
+<section_header_level_1><loc_41><loc_47><loc_137><loc_53>5.3. Datasets and Metrics</section_header_level_1>
+<text><loc_41><loc_59><loc_234><loc_87>The Tree-Edit-Distance-Based Similarity (TEDS) metric was introduced in [37]. It represents the prediction, and ground-truth as a tree structure of HTML tags. This similarity is calculated as:</text>
+<formula><loc_70><loc_95><loc_234><loc_109>TEDS(T_{a}, T_{b}) = 1 - \frac{EditDist(T_{a}, T_{b})}{\max(|T_{a}|, |T_{b}|)}</formula>
+<text><loc_41><loc_114><loc_234><loc_135>where T$_{a}$ and T$_{b}$ represent tables in tree structure HTML format. EditDist denotes the tree-edit distance, and | T | represents the number of nodes in T .</text>
+<section_header_level_1><loc_41><loc_142><loc_139><loc_148>5.4. Quantitative Analysis</section_header_level_1>
+<text><loc_41><loc_154><loc_234><loc_250>Structure. As shown in Tab. 2, TableFormer outperforms all SOTA methods across different datasets by a large margin for predicting the table structure from an image. All the more, our model outperforms pre-trained methods. During the evaluation we do not apply any table filtering. We also provide our baseline results on the SynthTabNet dataset. It has been observed that large tables (e.g. tables that occupy half of the page or more) yield poor predictions. We attribute this issue to the image resizing during the preprocessing step, that produces downsampled images with indistinguishable features. This problem can be addressed by treating such big tables with a separate model which accepts a large input image size.</text>
+<otsl><loc_44><loc_258><loc_231><loc_368><ched>Model<ched>Dataset<ched>Simple<ched>TEDS Complex<ched>All<nl><rhed>EDD<fcel>PTN<fcel>91.1<fcel>88.7<fcel>89.9<nl><rhed>GTE<fcel>PTN<fcel>-<fcel>-<fcel>93.01<nl><rhed>TableFormer<fcel>PTN<fcel>98.5<fcel>95.0<fcel>96.75<nl><rhed>EDD<fcel>FTN<fcel>88.4<fcel>92.08<fcel>90.6<nl><rhed>GTE<fcel>FTN<fcel>-<fcel>-<fcel>87.14<nl><rhed>GTE (FT)<fcel>FTN<fcel>-<fcel>-<fcel>91.02<nl><rhed>TableFormer<fcel>FTN<fcel>97.5<fcel>96.0<fcel>96.8<nl><rhed>EDD<fcel>TB<fcel>86.0<fcel>-<fcel>86.0<nl><rhed>TableFormer<fcel>TB<fcel>89.6<fcel>-<fcel>89.6<nl><rhed>TableFormer<fcel>STN<fcel>96.9<fcel>95.7<fcel>96.7<nl><caption><loc_41><loc_374><loc_234><loc_387>Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).</caption></otsl>
+<text><loc_41><loc_389><loc_214><loc_395>FT: Model was trained on PubTabNet then finetuned.</text>
+<text><loc_41><loc_407><loc_234><loc_450>Cell Detection. Like any object detector, our Cell BBox Detector provides bounding boxes that can be improved with post-processing during inference. We make use of the grid-like structure of tables to refine the predictions. A detailed explanation on the post-processing is available in the supplementary material. As shown in Tab. 3, we evaluate</text>
+<page_footer><loc_241><loc_463><loc_245><loc_469>7</page_footer>
+<text><loc_252><loc_47><loc_445><loc_144>our Cell BBox Decoder accuracy for cells with a class label of 'content' only using the PASCAL VOC mAP metric for pre-processing and post-processing. Note that we do not have post-processing results for SynthTabNet as images are only provided. To compare the performance of our proposed approach, we've integrated TableFormer's Cell BBox Decoder into EDD architecture. As mentioned previously, the Structure Decoder provides the Cell BBox Decoder with the features needed to predict the bounding box predictions. Therefore, the accuracy of the Structure Decoder directly influences the accuracy of the Cell BBox Decoder . If the Structure Decoder predicts an extra column, this will result in an extra column of predicted bounding boxes.</text>
+<otsl><loc_252><loc_156><loc_436><loc_192><ched>Model<ched>Dataset<ched>mAP<ched>mAP (PP)<nl><fcel>EDD+BBox<fcel>PubTabNet<fcel>79.2<fcel>82.7<nl><fcel>TableFormer<fcel>PubTabNet<fcel>82.1<fcel>86.8<nl><fcel>TableFormer<fcel>SynthTabNet<fcel>87.7<fcel>-<nl><caption><loc_252><loc_200><loc_445><loc_213>Table 3: Cell Bounding Box detection results on PubTabNet and SynthTabNet. PP: Post-processing.</caption></otsl>
+<text><loc_252><loc_232><loc_445><loc_328>Cell Content. In this section, we evaluate the entire pipeline of recovering a table with content. Here we put our approach to test by capitalizing on extracting content from the PDF cells rather than decoding from images. Tab. 4 shows the TEDs score of HTML code representing the structure of the table along with the content inserted in the data cell and compared with the ground-truth. Our method achieved a 5.3% increase over the state-of-the-art, and commercial solutions. We believe our scores would be higher if the HTML ground-truth matched the extracted PDF cell content. Unfortunately, there are small discrepancies such as spacings around words or special characters with various unicode representations.</text>
+<otsl><loc_272><loc_341><loc_426><loc_406><fcel>Model<ched>Simple<ched>TEDS Complex<ched>All<nl><rhed>Tabula<fcel>78.0<fcel>57.8<fcel>67.9<nl><rhed>Traprange<fcel>60.8<fcel>49.9<fcel>55.4<nl><rhed>Camelot<fcel>80.0<fcel>66.0<fcel>73.0<nl><rhed>Acrobat Pro<fcel>68.9<fcel>61.8<fcel>65.3<nl><rhed>EDD<fcel>91.2<fcel>85.4<fcel>88.3<nl><rhed>TableFormer<fcel>95.4<fcel>90.1<fcel>93.6<nl><caption><loc_252><loc_415><loc_445><loc_435>Table 4: Results of structure with content retrieved using cell detection on PubTabNet. In all cases the input is PDF documents with cropped tables.</caption></otsl>
+<unordered_list><page_break>
+<list_item><loc_44><loc_50><loc_50><loc_55>a.</list_item>
+<list_item><loc_54><loc_50><loc_408><loc_55>Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells</list_item>
 </unordered_list>
-<section_header_level_1><location><page_8><loc_9><loc_87><loc_46><loc_88></location>Japanese language (previously unseen by TableFormer):</section_header_level_1>
-<section_header_level_1><location><page_8><loc_50><loc_87><loc_70><loc_88></location>Example table from FinTabNet:</section_header_level_1>
-<figure>
-<location><page_8><loc_8><loc_76><loc_49><loc_87></location>
-</figure>
-<figure>
-<location><page_8><loc_50><loc_77><loc_91><loc_88></location>
-<caption>b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
-</figure>
-<table>
-<location><page_8><loc_9><loc_63><loc_49><loc_72></location>
-<row_0><col_0><body></col_0><col_1><body></col_1><col_2><col_header>論文ファイル</col_2><col_3><col_header>論文ファイル</col_3><col_4><col_header>参考文献</col_4><col_5><col_header>参考文献</col_5></row_0>
-<row_1><col_0><col_header>出典</col_0><col_1><col_header>ファイル 数</col_1><col_2><col_header>英語</col_2><col_3><col_header>日本語</col_3><col_4><col_header>英語</col_4><col_5><col_header>日本語</col_5></row_1>
-<row_2><col_0><row_header>Association for Computational Linguistics(ACL2003)</col_0><col_1><body>65</col_1><col_2><body>65</col_2><col_3><body>0</col_3><col_4><body>150</col_4><col_5><body>0</col_5></row_2>
-<row_3><col_0><row_header>Computational Linguistics(COLING2002)</col_0><col_1><body>140</col_1><col_2><body>140</col_2><col_3><body>0</col_3><col_4><body>150</col_4><col_5><body>0</col_5></row_3>
-<row_4><col_0><row_header>電気情報通信学会 2003 年総合大会</col_0><col_1><body>150</col_1><col_2><body>8</col_2><col_3><body>142</col_3><col_4><body>223</col_4><col_5><body>147</col_5></row_4>
-<row_5><col_0><row_header>情報処理学会第 65 回全国大会 (2003)</col_0><col_1><body>177</col_1><col_2><body>1</col_2><col_3><body>176</col_3><col_4><body>150</col_4><col_5><body>236</col_5></row_5>
-<row_6><col_0><row_header>第 17 回人工知能学会全国大会 (2003)</col_0><col_1><body>208</col_1><col_2><body>5</col_2><col_3><body>203</col_3><col_4><body>152</col_4><col_5><body>244</col_5></row_6>
-<row_7><col_0><row_header>自然言語処理研究会第 146 〜 155 回</col_0><col_1><body>98</col_1><col_2><body>2</col_2><col_3><body>96</col_3><col_4><body>150</col_4><col_5><body>232</col_5></row_7>
-<row_8><col_0><row_header>WWW から収集した論文</col_0><col_1><body>107</col_1><col_2><body>73</col_2><col_3><body>34</col_3><col_4><body>147</col_4><col_5><body>96</col_5></row_8>
-<row_9><col_0><body></col_0><col_1><body>945</col_1><col_2><body>294</col_2><col_3><body>651</col_3><col_4><body>1122</col_4><col_5><body>955</col_5></row_9>
-</table>
-<table>
-<location><page_8><loc_50><loc_64><loc_90><loc_72></location>
-<caption>Text is aligned to match original for ease of viewing</caption>
-<row_0><col_0><body></col_0><col_1><col_header>Shares (in millions)</col_1><col_2><col_header>Shares (in millions)</col_2><col_3><col_header>Weighted Average Grant Date Fair Value</col_3><col_4><col_header>Weighted Average Grant Date Fair Value</col_4></row_0>
-<row_1><col_0><body></col_0><col_1><col_header>RS U s</col_1><col_2><col_header>PSUs</col_2><col_3><col_header>RSUs</col_3><col_4><col_header>PSUs</col_4></row_1>
-<row_2><col_0><row_header>Nonvested on Janua ry 1</col_0><col_1><body>1. 1</col_1><col_2><body>0.3</col_2><col_3><body>90.10 $</col_3><col_4><body>$ 91.19</col_4></row_2>
-<row_3><col_0><row_header>Granted</col_0><col_1><body>0. 5</col_1><col_2><body>0.1</col_2><col_3><body>117.44</col_3><col_4><body>122.41</col_4></row_3>
-<row_4><col_0><row_header>Vested</col_0><col_1><body>(0. 5 )</col_1><col_2><body>(0.1)</col_2><col_3><body>87.08</col_3><col_4><body>81.14</col_4></row_4>
-<row_5><col_0><row_header>Canceled or forfeited</col_0><col_1><body>(0. 1 )</col_1><col_2><body>-</col_2><col_3><body>102.01</col_3><col_4><body>92.18</col_4></row_5>
-<row_6><col_0><row_header>Nonvested on December 31</col_0><col_1><body>1.0</col_1><col_2><body>0.3</col_2><col_3><body>104.85 $</col_3><col_4><body>$ 104.51</col_4></row_6>
-</table>
-<figure>
-<location><page_8><loc_8><loc_44><loc_35><loc_52></location>
-<caption>Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.</caption>
-</figure>
-<figure>
-<location><page_8><loc_63><loc_44><loc_89><loc_52></location>
-</figure>
-<figure>
-<location><page_8><loc_35><loc_44><loc_61><loc_52></location>
-<caption>Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.</caption>
-</figure>
-<section_header_level_1><location><page_8><loc_8><loc_37><loc_27><loc_38></location>5.5. Qualitative Analysis</section_header_level_1>
-<text><location><page_8><loc_8><loc_10><loc_47><loc_32></location>We showcase several visualizations for the different components of our network on various "complex" tables within datasets presented in this work in Fig. 5 and Fig. 6 As it is shown, our model is able to predict bounding boxes for all table cells, even for the empty ones. Additionally, our post-processing techniques can extract the cell content by matching the predicted bounding boxes to the PDF cells based on their overlap and spatial proximity. The left part of Fig. 5 demonstrates also the adaptability of our method to any language, as it can successfully extract Japanese text, although the training set contains only English content. We provide more visualizations including the intermediate steps in the supplementary material. Overall these illustrations justify the versatility of our method across a diverse range of table appearances and content type.</text>
-<section_header_level_1><location><page_8><loc_50><loc_37><loc_75><loc_38></location>6. Future Work & Conclusion</section_header_level_1>
-<text><location><page_8><loc_50><loc_18><loc_89><loc_35></location>In this paper, we presented TableFormer an end-to-end transformer based approach to predict table structures and bounding boxes of cells from an image. This approach enables us to recreate the table structure, and extract the cell content from PDF or OCR by using bounding boxes. Additionally, it provides the versatility required in real-world scenarios when dealing with various types of PDF documents, and languages. Furthermore, our method outperforms all state-of-the-arts with a wide margin. Finally, we introduce "SynthTabNet" a challenging synthetically generated dataset that reinforces missing characteristics from other datasets.</text>
-<section_header_level_1><location><page_8><loc_50><loc_14><loc_60><loc_15></location>References</section_header_level_1>
-<unordered_list>
-<list_item><location><page_8><loc_51><loc_10><loc_89><loc_12></location>[1] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-</list_item>
+<section_header_level_1><loc_44><loc_60><loc_232><loc_64>Japanese language (previously unseen by TableFormer):</section_header_level_1>
+<section_header_level_1><loc_249><loc_60><loc_352><loc_64>Example table from FinTabNet:</section_header_level_1>
+<picture><loc_41><loc_65><loc_246><loc_118></picture>
+<picture><loc_250><loc_62><loc_453><loc_114><caption><loc_44><loc_131><loc_315><loc_136>b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption></picture>
+<otsl><loc_44><loc_138><loc_244><loc_185><ecel><ecel><ched>論文ファイル<lcel><ched>参考文献<lcel><nl><ched>出典<ched>ファイル 数<ched>英語<ched>日本語<ched>英語<ched>日本語<nl><rhed>Association for Computational Linguistics(ACL2003)<fcel>65<fcel>65<fcel>0<fcel>150<fcel>0<nl><rhed>Computational Linguistics(COLING2002)<fcel>140<fcel>140<fcel>0<fcel>150<fcel>0<nl><rhed>電気情報通信学会 2003 年総合大会<fcel>150<fcel>8<fcel>142<fcel>223<fcel>147<nl><rhed>情報処理学会第 65 回全国大会 (2003)<fcel>177<fcel>1<fcel>176<fcel>150<fcel>236<nl><rhed>第 17 回人工知能学会全国大会 (2003)<fcel>208<fcel>5<fcel>203<fcel>152<fcel>244<nl><rhed>自然言語処理研究会第 146 〜 155 回<fcel>98<fcel>2<fcel>96<fcel>150<fcel>232<nl><rhed>WWW から収集した論文<fcel>107<fcel>73<fcel>34<fcel>147<fcel>96<nl><ecel><fcel>945<fcel>294<fcel>651<fcel>1122<fcel>955<nl></otsl>
+<otsl><loc_249><loc_138><loc_450><loc_182><ecel><ched>Shares (in millions)<lcel><ched>Weighted Average Grant Date Fair Value<lcel><nl><ecel><ched>RS U s<ched>PSUs<ched>RSUs<ched>PSUs<nl><rhed>Nonvested on Janua ry 1<fcel>1. 1<fcel>0.3<fcel>90.10 $<fcel>$ 91.19<nl><rhed>Granted<fcel>0. 5<fcel>0.1<fcel>117.44<fcel>122.41<nl><rhed>Vested<fcel>(0. 5 )<fcel>(0.1)<fcel>87.08<fcel>81.14<nl><rhed>Canceled or forfeited<fcel>(0. 1 )<fcel>-<fcel>102.01<fcel>92.18<nl><rhed>Nonvested on December 31<fcel>1.0<fcel>0.3<fcel>104.85 $<fcel>$ 104.51<nl><caption><loc_311><loc_185><loc_449><loc_189>Text is aligned to match original for ease of viewing</caption></otsl>
+<picture><loc_42><loc_240><loc_173><loc_280><caption><loc_41><loc_203><loc_445><loc_231>Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.</caption></picture>
+<picture><loc_313><loc_241><loc_443><loc_280></picture>
+<picture><loc_177><loc_240><loc_307><loc_280><caption><loc_51><loc_290><loc_435><loc_295>Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.</caption></picture>
+<section_header_level_1><loc_41><loc_310><loc_134><loc_316>5.5. Qualitative Analysis</section_header_level_1>
+<text><loc_41><loc_339><loc_234><loc_450>We showcase several visualizations for the different components of our network on various "complex" tables within datasets presented in this work in Fig. 5 and Fig. 6 As it is shown, our model is able to predict bounding boxes for all table cells, even for the empty ones. Additionally, our post-processing techniques can extract the cell content by matching the predicted bounding boxes to the PDF cells based on their overlap and spatial proximity. The left part of Fig. 5 demonstrates also the adaptability of our method to any language, as it can successfully extract Japanese text, although the training set contains only English content. We provide more visualizations including the intermediate steps in the supplementary material. Overall these illustrations justify the versatility of our method across a diverse range of table appearances and content type.</text>
+<section_header_level_1><loc_252><loc_310><loc_377><loc_317>6. Future Work & Conclusion</section_header_level_1>
+<text><loc_252><loc_324><loc_445><loc_412>In this paper, we presented TableFormer an end-to-end transformer based approach to predict table structures and bounding boxes of cells from an image. This approach enables us to recreate the table structure, and extract the cell content from PDF or OCR by using bounding boxes. Additionally, it provides the versatility required in real-world scenarios when dealing with various types of PDF documents, and languages. Furthermore, our method outperforms all state-of-the-arts with a wide margin. Finally, we introduce "SynthTabNet" a challenging synthetically generated dataset that reinforces missing characteristics from other datasets.</text>
+<section_header_level_1><loc_252><loc_424><loc_298><loc_431>References</section_header_level_1>
+<unordered_list><list_item><loc_256><loc_438><loc_445><loc_450>[1] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-</list_item>
 </unordered_list>
-<unordered_list>
-<list_item><location><page_9><loc_11><loc_85><loc_47><loc_90></location>end object detection with transformers. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision - ECCV 2020 , pages 213-229, Cham, 2020. Springer International Publishing. 5</list_item>
-<list_item><location><page_9><loc_9><loc_81><loc_47><loc_85></location>[2] Zewen Chi, Heyan Huang, Heng-Da Xu, Houjin Yu, Wanxuan Yin, and Xian-Ling Mao. Complicated table structure recognition. arXiv preprint arXiv:1908.04729 , 2019. 3</list_item>
-<list_item><location><page_9><loc_9><loc_77><loc_47><loc_81></location>[3] Bertrand Couasnon and Aurelie Lemaitre. Recognition of Tables and Forms , pages 647-677. Springer London, London, 2014. 2</list_item>
-<list_item><location><page_9><loc_9><loc_71><loc_47><loc_76></location>[4] Herv'e D'ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2</list_item>
-<list_item><location><page_9><loc_9><loc_66><loc_47><loc_71></location>[5] Basilios Gatos, Dimitrios Danatsas, Ioannis Pratikakis, and Stavros J Perantonis. Automatic table detection in document images. In International Conference on Pattern Recognition and Image Analysis , pages 609-618. Springer, 2005. 2</list_item>
-<list_item><location><page_9><loc_9><loc_60><loc_47><loc_65></location>[6] Max Gobel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2</list_item>
-<list_item><location><page_9><loc_9><loc_56><loc_47><loc_60></location>[7] EA Green and M Krishnamoorthy. Recognition of tables using table grammars. procs. In Symposium on Document Analysis and Recognition (SDAIR'95) , pages 261-277. 2</list_item>
-<list_item><location><page_9><loc_9><loc_49><loc_47><loc_56></location>[8] Khurram Azeem Hashmi, Alain Pagani, Marcus Liwicki, Didier Stricker, and Muhammad Zeshan Afzal. Castabdetectors: Cascade network for table detection in document images with recursive feature pyramid and switchable atrous convolution. Journal of Imaging , 7(10), 2021. 1</list_item>
-<list_item><location><page_9><loc_9><loc_45><loc_47><loc_49></location>[9] Kaiming He, Georgia Gkioxari, Piotr Dollar, and Ross Girshick. Mask r-cnn. In Proceedings of the IEEE International Conference on Computer Vision (ICCV) , Oct 2017. 1</list_item>
-<list_item><location><page_9><loc_8><loc_39><loc_47><loc_44></location>[10] Yelin He, X. Qi, Jiaquan Ye, Peng Gao, Yihao Chen, Bingcong Li, Xin Tang, and Rong Xiao. Pingan-vcgroup's solution for icdar 2021 competition on scientific table image recognition to latex. ArXiv , abs/2105.01846, 2021. 2</list_item>
-<list_item><location><page_9><loc_8><loc_32><loc_47><loc_39></location>[11] Jianying Hu, Ramanujan S Kashi, Daniel P Lopresti, and Gordon Wilfong. Medium-independent table detection. In Document Recognition and Retrieval VII , volume 3967, pages 291-302. International Society for Optics and Photonics, 1999. 2</list_item>
-<list_item><location><page_9><loc_8><loc_25><loc_47><loc_32></location>[12] Matthew Hurst. A constraint-based approach to table structure derivation. In Proceedings of the Seventh International Conference on Document Analysis and Recognition - Volume 2 , ICDAR '03, page 911, USA, 2003. IEEE Computer Society. 2</list_item>
-<list_item><location><page_9><loc_8><loc_18><loc_47><loc_25></location>[13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl'ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2</list_item>
-<list_item><location><page_9><loc_8><loc_14><loc_47><loc_18></location>[14] Pratik Kayal, Mrinal Anand, Harsh Desai, and Mayank Singh. Icdar 2021 competition on scientific table image recognition to latex, 2021. 2</list_item>
-<list_item><location><page_9><loc_8><loc_10><loc_47><loc_14></location>[15] Harold W Kuhn. The hungarian method for the assignment problem. Naval research logistics quarterly , 2(1-2):83-97, 1955. 6</list_item>
+<page_footer><loc_241><loc_463><loc_245><loc_469>8</page_footer>
+<unordered_list><page_break>
+<list_item><loc_57><loc_48><loc_234><loc_74>end object detection with transformers. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision - ECCV 2020 , pages 213-229, Cham, 2020. Springer International Publishing. 5</list_item>
+<list_item><loc_45><loc_76><loc_234><loc_95>[2] Zewen Chi, Heyan Huang, Heng-Da Xu, Houjin Yu, Wanxuan Yin, and Xian-Ling Mao. Complicated table structure recognition. arXiv preprint arXiv:1908.04729 , 2019. 3</list_item>
+<list_item><loc_45><loc_97><loc_234><loc_116>[3] Bertrand Couasnon and Aurelie Lemaitre. Recognition of Tables and Forms , pages 647-677. Springer London, London, 2014. 2</list_item>
+<list_item><loc_45><loc_118><loc_234><loc_143>[4] Herv'e D'ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2</list_item>
+<list_item><loc_45><loc_146><loc_234><loc_171>[5] Basilios Gatos, Dimitrios Danatsas, Ioannis Pratikakis, and Stavros J Perantonis. Automatic table detection in document images. In International Conference on Pattern Recognition and Image Analysis , pages 609-618. Springer, 2005. 2</list_item>
+<list_item><loc_45><loc_173><loc_234><loc_199>[6] Max Gobel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2</list_item>
+<list_item><loc_45><loc_201><loc_234><loc_220>[7] EA Green and M Krishnamoorthy. Recognition of tables using table grammars. procs. In Symposium on Document Analysis and Recognition (SDAIR'95) , pages 261-277. 2</list_item>
+<list_item><loc_45><loc_222><loc_234><loc_255>[8] Khurram Azeem Hashmi, Alain Pagani, Marcus Liwicki, Didier Stricker, and Muhammad Zeshan Afzal. Castabdetectors: Cascade network for table detection in document images with recursive feature pyramid and switchable atrous convolution. Journal of Imaging , 7(10), 2021. 1</list_item>
+<list_item><loc_45><loc_257><loc_234><loc_276>[9] Kaiming He, Georgia Gkioxari, Piotr Dollar, and Ross Girshick. Mask r-cnn. In Proceedings of the IEEE International Conference on Computer Vision (ICCV) , Oct 2017. 1</list_item>
+<list_item><loc_41><loc_278><loc_234><loc_304>[10] Yelin He, X. Qi, Jiaquan Ye, Peng Gao, Yihao Chen, Bingcong Li, Xin Tang, and Rong Xiao. Pingan-vcgroup's solution for icdar 2021 competition on scientific table image recognition to latex. ArXiv , abs/2105.01846, 2021. 2</list_item>
+<list_item><loc_41><loc_306><loc_234><loc_339>[11] Jianying Hu, Ramanujan S Kashi, Daniel P Lopresti, and Gordon Wilfong. Medium-independent table detection. In Document Recognition and Retrieval VII , volume 3967, pages 291-302. International Society for Optics and Photonics, 1999. 2</list_item>
+<list_item><loc_41><loc_341><loc_234><loc_373>[12] Matthew Hurst. A constraint-based approach to table structure derivation. In Proceedings of the Seventh International Conference on Document Analysis and Recognition - Volume 2 , ICDAR '03, page 911, USA, 2003. IEEE Computer Society. 2</list_item>
+<list_item><loc_41><loc_375><loc_234><loc_408>[13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl'ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2</list_item>
+<list_item><loc_41><loc_410><loc_234><loc_429>[14] Pratik Kayal, Mrinal Anand, Harsh Desai, and Mayank Singh. Icdar 2021 competition on scientific table image recognition to latex, 2021. 2</list_item>
+<list_item><loc_41><loc_431><loc_234><loc_450>[15] Harold W Kuhn. The hungarian method for the assignment problem. Naval research logistics quarterly , 2(1-2):83-97, 1955. 6</list_item>
 </unordered_list>
-<unordered_list>
-<list_item><location><page_9><loc_50><loc_82><loc_89><loc_90></location>[16] Girish Kulkarni, Visruth Premraj, Vicente Ordonez, Sagnik Dhar, Siming Li, Yejin Choi, Alexander C. Berg, and Tamara L. Berg. Babytalk: Understanding and generating simple image descriptions. IEEE Transactions on Pattern Analysis and Machine Intelligence , 35(12):2891-2903, 2013. 4</list_item>
-<list_item><location><page_9><loc_50><loc_78><loc_89><loc_82></location>[17] Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou, and Zhoujun Li. Tablebank: A benchmark dataset for table detection and recognition, 2019. 2, 3</list_item>
-<list_item><location><page_9><loc_50><loc_67><loc_89><loc_78></location>[18] Yiren Li, Zheng Huang, Junchi Yan, Yi Zhou, Fan Ye, and Xianhui Liu. Gfte: Graph-based financial table extraction. In Alberto Del Bimbo, Rita Cucchiara, Stan Sclaroff, Giovanni Maria Farinella, Tao Mei, Marco Bertini, Hugo Jair Escalante, and Roberto Vezzani, editors, Pattern Recognition. ICPR International Workshops and Challenges , pages 644-658, Cham, 2021. Springer International Publishing. 2, 3</list_item>
-<list_item><location><page_9><loc_50><loc_59><loc_89><loc_67></location>[19] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter Staar. Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence , 35(17):15137-15145, May 2021. 1</list_item>
-<list_item><location><page_9><loc_50><loc_53><loc_89><loc_58></location>[20] Rujiao Long, Wen Wang, Nan Xue, Feiyu Gao, Zhibo Yang, Yongpan Wang, and Gui-Song Xia. Parsing table structures in the wild. In Proceedings of the IEEE/CVF International Conference on Computer Vision , pages 944-952, 2021. 2</list_item>
-<list_item><location><page_9><loc_50><loc_45><loc_89><loc_53></location>[21] Shubham Singh Paliwal, D Vishwanath, Rohit Rahul, Monika Sharma, and Lovekesh Vig. Tablenet: Deep learning model for end-to-end table detection and tabular data extraction from scanned document images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 128-133. IEEE, 2019. 1</list_item>
-<list_item><location><page_9><loc_50><loc_30><loc_89><loc_44></location>[22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch'e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6</list_item>
-<list_item><location><page_9><loc_50><loc_21><loc_89><loc_29></location>[23] Devashish Prasad, Ayan Gadpal, Kshitij Kapadni, Manish Visave, and Kavita Sultanpure. Cascadetabnet: An approach for end to end table detection and structure recognition from image-based documents. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops , pages 572-573, 2020. 1</list_item>
-<list_item><location><page_9><loc_50><loc_16><loc_89><loc_21></location>[24] Shah Rukh Qasim, Hassan Mahmood, and Faisal Shafait. Rethinking table recognition using graph neural networks. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 142-147. IEEE, 2019. 3</list_item>
-<list_item><location><page_9><loc_50><loc_10><loc_89><loc_15></location>[25] Hamid Rezatofighi, Nathan Tsoi, JunYoung Gwak, Amir Sadeghian, Ian Reid, and Silvio Savarese. Generalized intersection over union: A metric and a loss for bounding box regression. In Proceedings of the IEEE/CVF Conference on</list_item>
+<page_footer><loc_241><loc_463><loc_245><loc_469>9</page_footer>
+<unordered_list><list_item><loc_252><loc_48><loc_445><loc_88>[16] Girish Kulkarni, Visruth Premraj, Vicente Ordonez, Sagnik Dhar, Siming Li, Yejin Choi, Alexander C. Berg, and Tamara L. Berg. Babytalk: Understanding and generating simple image descriptions. IEEE Transactions on Pattern Analysis and Machine Intelligence , 35(12):2891-2903, 2013. 4</list_item>
+<list_item><loc_252><loc_90><loc_445><loc_109>[17] Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou, and Zhoujun Li. Tablebank: A benchmark dataset for table detection and recognition, 2019. 2, 3</list_item>
+<list_item><loc_252><loc_111><loc_445><loc_164>[18] Yiren Li, Zheng Huang, Junchi Yan, Yi Zhou, Fan Ye, and Xianhui Liu. Gfte: Graph-based financial table extraction. In Alberto Del Bimbo, Rita Cucchiara, Stan Sclaroff, Giovanni Maria Farinella, Tao Mei, Marco Bertini, Hugo Jair Escalante, and Roberto Vezzani, editors, Pattern Recognition. ICPR International Workshops and Challenges , pages 644-658, Cham, 2021. Springer International Publishing. 2, 3</list_item>
+<list_item><loc_252><loc_166><loc_445><loc_206>[19] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter Staar. Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence , 35(17):15137-15145, May 2021. 1</list_item>
+<list_item><loc_252><loc_208><loc_445><loc_234>[20] Rujiao Long, Wen Wang, Nan Xue, Feiyu Gao, Zhibo Yang, Yongpan Wang, and Gui-Song Xia. Parsing table structures in the wild. In Proceedings of the IEEE/CVF International Conference on Computer Vision , pages 944-952, 2021. 2</list_item>
+<list_item><loc_252><loc_236><loc_445><loc_276>[21] Shubham Singh Paliwal, D Vishwanath, Rohit Rahul, Monika Sharma, and Lovekesh Vig. Tablenet: Deep learning model for end-to-end table detection and tabular data extraction from scanned document images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 128-133. IEEE, 2019. 1</list_item>
+<list_item><loc_252><loc_278><loc_445><loc_352>[22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch'e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6</list_item>
+<list_item><loc_252><loc_354><loc_445><loc_394>[23] Devashish Prasad, Ayan Gadpal, Kshitij Kapadni, Manish Visave, and Kavita Sultanpure. Cascadetabnet: An approach for end to end table detection and structure recognition from image-based documents. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops , pages 572-573, 2020. 1</list_item>
+<list_item><loc_252><loc_396><loc_445><loc_422>[24] Shah Rukh Qasim, Hassan Mahmood, and Faisal Shafait. Rethinking table recognition using graph neural networks. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 142-147. IEEE, 2019. 3</list_item>
+<list_item><loc_252><loc_424><loc_445><loc_450>[25] Hamid Rezatofighi, Nathan Tsoi, JunYoung Gwak, Amir Sadeghian, Ian Reid, and Silvio Savarese. Generalized intersection over union: A metric and a loss for bounding box regression. In Proceedings of the IEEE/CVF Conference on</list_item>
 </unordered_list>
-<text><location><page_10><loc_11><loc_88><loc_47><loc_90></location>Computer Vision and Pattern Recognition , pages 658-666, 2019. 6</text>
-<unordered_list>
-<list_item><location><page_10><loc_8><loc_80><loc_47><loc_88></location>[26] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 11621167, 2017. 1</list_item>
-<list_item><location><page_10><loc_8><loc_71><loc_47><loc_79></location>[27] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR international conference on document analysis and recognition (ICDAR) , volume 1, pages 1162-1167. IEEE, 2017. 3</list_item>
-<list_item><location><page_10><loc_8><loc_66><loc_47><loc_71></location>[28] Faisal Shafait and Ray Smith. Table detection in heterogeneous documents. In Proceedings of the 9th IAPR International Workshop on Document Analysis Systems , pages 6572, 2010. 2</list_item>
-<list_item><location><page_10><loc_8><loc_59><loc_47><loc_65></location>[29] Shoaib Ahmed Siddiqui, Imran Ali Fateh, Syed Tahseen Raza Rizvi, Andreas Dengel, and Sheraz Ahmed. Deeptabstr: Deep learning based table structure recognition. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1403-1409. IEEE, 2019. 3</list_item>
-<list_item><location><page_10><loc_8><loc_52><loc_47><loc_58></location>[30] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD , KDD '18, pages 774-782, New York, NY, USA, 2018. ACM. 1</list_item>
-<list_item><location><page_10><loc_8><loc_42><loc_47><loc_51></location>[31] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Ł ukasz Kaiser, and Illia Polosukhin. Attention is all you need. In I. Guyon, U. V. Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett, editors, Advances in Neural Information Processing Systems 30 , pages 5998-6008. Curran Associates, Inc., 2017. 5</list_item>
-<list_item><location><page_10><loc_8><loc_37><loc_47><loc_42></location>[32] Oriol Vinyals, Alexander Toshev, Samy Bengio, and Dumitru Erhan. Show and tell: A neural image caption generator. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) , June 2015. 2</list_item>
-<list_item><location><page_10><loc_8><loc_31><loc_47><loc_36></location>[33] Wenyuan Xue, Qingyong Li, and Dacheng Tao. Res2tim: reconstruct syntactic structures from table images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 749-755. IEEE, 2019. 3</list_item>
-<list_item><location><page_10><loc_8><loc_25><loc_47><loc_31></location>[34] Wenyuan Xue, Baosheng Yu, Wen Wang, Dacheng Tao, and Qingyong Li. Tgrnet: A table graph reconstruction network for table structure recognition. arXiv preprint arXiv:2106.10598 , 2021. 3</list_item>
-<list_item><location><page_10><loc_8><loc_20><loc_47><loc_25></location>[35] Quanzeng You, Hailin Jin, Zhaowen Wang, Chen Fang, and Jiebo Luo. Image captioning with semantic attention. In Proceedings of the IEEE conference on computer vision and pattern recognition , pages 4651-4659, 2016. 4</list_item>
-<list_item><location><page_10><loc_8><loc_13><loc_47><loc_19></location>[36] Xinyi Zheng, Doug Burdick, Lucian Popa, Peter Zhong, and Nancy Xin Ru Wang. Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. Winter Conference for Applications in Computer Vision (WACV) , 2021. 2, 3</list_item>
-<list_item><location><page_10><loc_8><loc_10><loc_47><loc_12></location>[37] Xu Zhong, Elaheh ShafieiBavani, and Antonio Jimeno Yepes. Image-based table recognition: Data, model,</list_item>
+<page_break>
+<text><loc_57><loc_48><loc_234><loc_60>Computer Vision and Pattern Recognition , pages 658-666, 2019. 6</text>
+<unordered_list><list_item><loc_41><loc_62><loc_234><loc_102>[26] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 11621167, 2017. 1</list_item>
+<list_item><loc_41><loc_104><loc_234><loc_143>[27] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR international conference on document analysis and recognition (ICDAR) , volume 1, pages 1162-1167. IEEE, 2017. 3</list_item>
+<list_item><loc_41><loc_145><loc_234><loc_171>[28] Faisal Shafait and Ray Smith. Table detection in heterogeneous documents. In Proceedings of the 9th IAPR International Workshop on Document Analysis Systems , pages 6572, 2010. 2</list_item>
+<list_item><loc_41><loc_173><loc_234><loc_206>[29] Shoaib Ahmed Siddiqui, Imran Ali Fateh, Syed Tahseen Raza Rizvi, Andreas Dengel, and Sheraz Ahmed. Deeptabstr: Deep learning based table structure recognition. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1403-1409. IEEE, 2019. 3</list_item>
+<list_item><loc_41><loc_208><loc_234><loc_241>[30] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD , KDD '18, pages 774-782, New York, NY, USA, 2018. ACM. 1</list_item>
+<list_item><loc_41><loc_243><loc_234><loc_290>[31] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Ł ukasz Kaiser, and Illia Polosukhin. Attention is all you need. In I. Guyon, U. V. Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett, editors, Advances in Neural Information Processing Systems 30 , pages 5998-6008. Curran Associates, Inc., 2017. 5</list_item>
+<list_item><loc_41><loc_292><loc_234><loc_317>[32] Oriol Vinyals, Alexander Toshev, Samy Bengio, and Dumitru Erhan. Show and tell: A neural image caption generator. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) , June 2015. 2</list_item>
+<list_item><loc_41><loc_320><loc_234><loc_345>[33] Wenyuan Xue, Qingyong Li, and Dacheng Tao. Res2tim: reconstruct syntactic structures from table images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 749-755. IEEE, 2019. 3</list_item>
+<list_item><loc_41><loc_347><loc_234><loc_373>[34] Wenyuan Xue, Baosheng Yu, Wen Wang, Dacheng Tao, and Qingyong Li. Tgrnet: A table graph reconstruction network for table structure recognition. arXiv preprint arXiv:2106.10598 , 2021. 3</list_item>
+<list_item><loc_41><loc_375><loc_234><loc_401>[35] Quanzeng You, Hailin Jin, Zhaowen Wang, Chen Fang, and Jiebo Luo. Image captioning with semantic attention. In Proceedings of the IEEE conference on computer vision and pattern recognition , pages 4651-4659, 2016. 4</list_item>
+<list_item><loc_41><loc_403><loc_234><loc_436>[36] Xinyi Zheng, Doug Burdick, Lucian Popa, Peter Zhong, and Nancy Xin Ru Wang. Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. Winter Conference for Applications in Computer Vision (WACV) , 2021. 2, 3</list_item>
+<list_item><loc_41><loc_438><loc_234><loc_450>[37] Xu Zhong, Elaheh ShafieiBavani, and Antonio Jimeno Yepes. Image-based table recognition: Data, model,</list_item>
 </unordered_list>
-<unordered_list>
-<list_item><location><page_10><loc_54><loc_85><loc_89><loc_90></location>and evaluation. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision ECCV 2020 , pages 564-580, Cham, 2020. Springer International Publishing. 2, 3, 7</list_item>
-<list_item><location><page_10><loc_50><loc_80><loc_89><loc_85></location>[38] Xu Zhong, Jianbin Tang, and Antonio Jimeno Yepes. Publaynet: Largest dataset ever for document layout analysis. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1015-1022, 2019. 1</list_item>
+<page_footer><loc_239><loc_463><loc_247><loc_469>10</page_footer>
+<unordered_list><list_item><loc_269><loc_48><loc_445><loc_74>and evaluation. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision ECCV 2020 , pages 564-580, Cham, 2020. Springer International Publishing. 2, 3, 7</list_item>
+<list_item><loc_252><loc_76><loc_445><loc_102>[38] Xu Zhong, Jianbin Tang, and Antonio Jimeno Yepes. Publaynet: Largest dataset ever for document layout analysis. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1015-1022, 2019. 1</list_item>
 </unordered_list>
-<section_header_level_1><location><page_11><loc_22><loc_83><loc_76><loc_86></location>TableFormer: Table Structure Understanding with Transformers Supplementary Material</section_header_level_1>
-<section_header_level_1><location><page_11><loc_8><loc_78><loc_29><loc_80></location>1. Details on the datasets</section_header_level_1>
-<section_header_level_1><location><page_11><loc_8><loc_76><loc_25><loc_77></location>1.1. Data preparation</section_header_level_1>
-<text><location><page_11><loc_8><loc_51><loc_47><loc_75></location>As a first step of our data preparation process, we have calculated statistics over the datasets across the following dimensions: (1) table size measured in the number of rows and columns, (2) complexity of the table, (3) strictness of the provided HTML structure and (4) completeness (i.e. no omitted bounding boxes). A table is considered to be simple if it does not contain row spans or column spans. Additionally, a table has a strict HTML structure if every row has the same number of columns after taking into account any row or column spans. Therefore a strict HTML structure looks always rectangular. However, HTML is a lenient encoding format, i.e. tables with rows of different sizes might still be regarded as correct due to implicit display rules. These implicit rules leave room for ambiguity, which we want to avoid. As such, we prefer to have "strict" tables, i.e. tables where every row has exactly the same length.</text>
-<text><location><page_11><loc_8><loc_21><loc_47><loc_51></location>We have developed a technique that tries to derive a missing bounding box out of its neighbors. As a first step, we use the annotation data to generate the most fine-grained grid that covers the table structure. In case of strict HTML tables, all grid squares are associated with some table cell and in the presence of table spans a cell extends across multiple grid squares. When enough bounding boxes are known for a rectangular table, it is possible to compute the geometrical border lines between the grid rows and columns. Eventually this information is used to generate the missing bounding boxes. Additionally, the existence of unused grid squares indicates that the table rows have unequal number of columns and the overall structure is non-strict. The generation of missing bounding boxes for non-strict HTML tables is ambiguous and therefore quite challenging. Thus, we have decided to simply discard those tables. In case of PubTabNet we have computed missing bounding boxes for 48% of the simple and 69% of the complex tables. Regarding FinTabNet, 68% of the simple and 98% of the complex tables require the generation of bounding boxes.</text>
-<text><location><page_11><loc_8><loc_18><loc_47><loc_20></location>Figure 7 illustrates the distribution of the tables across different dimensions per dataset.</text>
-<section_header_level_1><location><page_11><loc_8><loc_15><loc_25><loc_16></location>1.2. Synthetic datasets</section_header_level_1>
-<text><location><page_11><loc_8><loc_10><loc_47><loc_14></location>Aiming to train and evaluate our models in a broader spectrum of table data we have synthesized four types of datasets. Each one contains tables with different appear-</text>
-<text><location><page_11><loc_50><loc_74><loc_89><loc_79></location>ances in regard to their size, structure, style and content. Every synthetic dataset contains 150k examples, summing up to 600k synthetic examples. All datasets are divided into Train, Test and Val splits (80%, 10%, 10%).</text>
-<text><location><page_11><loc_50><loc_71><loc_89><loc_73></location>The process of generating a synthetic dataset can be decomposed into the following steps:</text>
-<unordered_list>
-<list_item><location><page_11><loc_50><loc_60><loc_89><loc_70></location>1. Prepare styling and content templates: The styling templates have been manually designed and organized into groups of scope specific appearances (e.g. financial data, marketing data, etc.) Additionally, we have prepared curated collections of content templates by extracting the most frequently used terms out of non-synthetic datasets (e.g. PubTabNet, FinTabNet, etc.).</list_item>
-<list_item><location><page_11><loc_50><loc_43><loc_89><loc_60></location>2. Generate table structures: The structure of each synthetic dataset assumes a horizontal table header which potentially spans over multiple rows and a table body that may contain a combination of row spans and column spans. However, spans are not allowed to cross the header - body boundary. The table structure is described by the parameters: Total number of table rows and columns, number of header rows, type of spans (header only spans, row only spans, column only spans, both row and column spans), maximum span size and the ratio of the table area covered by spans.</list_item>
-<list_item><location><page_11><loc_50><loc_37><loc_89><loc_43></location>3. Generate content: Based on the dataset theme , a set of suitable content templates is chosen first. Then, this content can be combined with purely random text to produce the synthetic content.</list_item>
-<list_item><location><page_11><loc_50><loc_31><loc_89><loc_37></location>4. Apply styling templates: Depending on the domain of the synthetic dataset, a set of styling templates is first manually selected. Then, a style is randomly selected to format the appearance of the synthesized table.</list_item>
-<list_item><location><page_11><loc_50><loc_23><loc_89><loc_31></location>5. Render the complete tables: The synthetic table is finally rendered by a web browser engine to generate the bounding boxes for each table cell. A batching technique is utilized to optimize the runtime overhead of the rendering process.</list_item>
+<page_break>
+<section_header_level_1><loc_109><loc_70><loc_380><loc_86>TableFormer: Table Structure Understanding with Transformers Supplementary Material</section_header_level_1>
+<section_header_level_1><loc_41><loc_102><loc_144><loc_109>1. Details on the datasets</section_header_level_1>
+<section_header_level_1><loc_41><loc_114><loc_123><loc_120>1.1. Data preparation</section_header_level_1>
+<text><loc_41><loc_126><loc_234><loc_245>As a first step of our data preparation process, we have calculated statistics over the datasets across the following dimensions: (1) table size measured in the number of rows and columns, (2) complexity of the table, (3) strictness of the provided HTML structure and (4) completeness (i.e. no omitted bounding boxes). A table is considered to be simple if it does not contain row spans or column spans. Additionally, a table has a strict HTML structure if every row has the same number of columns after taking into account any row or column spans. Therefore a strict HTML structure looks always rectangular. However, HTML is a lenient encoding format, i.e. tables with rows of different sizes might still be regarded as correct due to implicit display rules. These implicit rules leave room for ambiguity, which we want to avoid. As such, we prefer to have "strict" tables, i.e. tables where every row has exactly the same length.</text>
+<text><loc_41><loc_247><loc_234><loc_396>We have developed a technique that tries to derive a missing bounding box out of its neighbors. As a first step, we use the annotation data to generate the most fine-grained grid that covers the table structure. In case of strict HTML tables, all grid squares are associated with some table cell and in the presence of table spans a cell extends across multiple grid squares. When enough bounding boxes are known for a rectangular table, it is possible to compute the geometrical border lines between the grid rows and columns. Eventually this information is used to generate the missing bounding boxes. Additionally, the existence of unused grid squares indicates that the table rows have unequal number of columns and the overall structure is non-strict. The generation of missing bounding boxes for non-strict HTML tables is ambiguous and therefore quite challenging. Thus, we have decided to simply discard those tables. In case of PubTabNet we have computed missing bounding boxes for 48% of the simple and 69% of the complex tables. Regarding FinTabNet, 68% of the simple and 98% of the complex tables require the generation of bounding boxes.</text>
+<text><loc_41><loc_398><loc_234><loc_411>Figure 7 illustrates the distribution of the tables across different dimensions per dataset.</text>
+<section_header_level_1><loc_41><loc_418><loc_125><loc_424>1.2. Synthetic datasets</section_header_level_1>
+<text><loc_41><loc_430><loc_234><loc_451>Aiming to train and evaluate our models in a broader spectrum of table data we have synthesized four types of datasets. Each one contains tables with different appear-</text>
+<text><loc_252><loc_103><loc_445><loc_131>ances in regard to their size, structure, style and content. Every synthetic dataset contains 150k examples, summing up to 600k synthetic examples. All datasets are divided into Train, Test and Val splits (80%, 10%, 10%).</text>
+<text><loc_252><loc_133><loc_445><loc_147>The process of generating a synthetic dataset can be decomposed into the following steps:</text>
+<unordered_list><list_item><loc_252><loc_149><loc_445><loc_200>1. Prepare styling and content templates: The styling templates have been manually designed and organized into groups of scope specific appearances (e.g. financial data, marketing data, etc.) Additionally, we have prepared curated collections of content templates by extracting the most frequently used terms out of non-synthetic datasets (e.g. PubTabNet, FinTabNet, etc.).</list_item>
+<list_item><loc_252><loc_202><loc_445><loc_283>2. Generate table structures: The structure of each synthetic dataset assumes a horizontal table header which potentially spans over multiple rows and a table body that may contain a combination of row spans and column spans. However, spans are not allowed to cross the header - body boundary. The table structure is described by the parameters: Total number of table rows and columns, number of header rows, type of spans (header only spans, row only spans, column only spans, both row and column spans), maximum span size and the ratio of the table area covered by spans.</list_item>
+<list_item><loc_252><loc_286><loc_445><loc_314>3. Generate content: Based on the dataset theme , a set of suitable content templates is chosen first. Then, this content can be combined with purely random text to produce the synthetic content.</list_item>
+<list_item><loc_252><loc_316><loc_445><loc_345>4. Apply styling templates: Depending on the domain of the synthetic dataset, a set of styling templates is first manually selected. Then, a style is randomly selected to format the appearance of the synthesized table.</list_item>
+<list_item><loc_252><loc_347><loc_445><loc_383>5. Render the complete tables: The synthetic table is finally rendered by a web browser engine to generate the bounding boxes for each table cell. A batching technique is utilized to optimize the runtime overhead of the rendering process.</list_item>
 </unordered_list>
-<section_header_level_1><location><page_11><loc_50><loc_18><loc_89><loc_21></location>2. Prediction post-processing for PDF documents</section_header_level_1>
-<text><location><page_11><loc_50><loc_10><loc_89><loc_17></location>Although TableFormer can predict the table structure and the bounding boxes for tables recognized inside PDF documents, this is not enough when a full reconstruction of the original table is required. This happens mainly due the following reasons:</text>
-<figure>
-<location><page_12><loc_9><loc_81><loc_89><loc_91></location>
-<caption>Figure 7: Distribution of the tables across different dimensions per dataset. Simple vs complex tables per dataset and split, strict vs non strict html structures per dataset and table complexity, missing bboxes per dataset and table complexity.</caption>
-</figure>
-<unordered_list>
-<list_item><location><page_12><loc_10><loc_71><loc_47><loc_73></location>· TableFormer output does not include the table cell content.</list_item>
-<list_item><location><page_12><loc_10><loc_67><loc_47><loc_69></location>· There are occasional inaccuracies in the predictions of the bounding boxes.</list_item>
+<section_header_level_1><loc_252><loc_393><loc_445><loc_408>2. Prediction post-processing for PDF documents</section_header_level_1>
+<text><loc_252><loc_415><loc_445><loc_451>Although TableFormer can predict the table structure and the bounding boxes for tables recognized inside PDF documents, this is not enough when a full reconstruction of the original table is required. This happens mainly due the following reasons:</text>
+<page_footer><loc_239><loc_463><loc_247><loc_469>11</page_footer>
+<page_break>
+<picture><loc_44><loc_47><loc_445><loc_93><caption><loc_41><loc_104><loc_445><loc_118>Figure 7: Distribution of the tables across different dimensions per dataset. Simple vs complex tables per dataset and split, strict vs non strict html structures per dataset and table complexity, missing bboxes per dataset and table complexity.</caption></picture>
+<unordered_list><list_item><loc_50><loc_133><loc_234><loc_146>· TableFormer output does not include the table cell content.</list_item>
+<list_item><loc_50><loc_154><loc_234><loc_167>· There are occasional inaccuracies in the predictions of the bounding boxes.</list_item>
 </unordered_list>
-<text><location><page_12><loc_8><loc_50><loc_47><loc_65></location>However, it is possible to mitigate those limitations by combining the TableFormer predictions with the information already present inside a programmatic PDF document. More specifically, PDF documents can be seen as a sequence of PDF cells where each cell is described by its content and bounding box. If we are able to associate the PDF cells with the predicted table cells, we can directly link the PDF cell content to the table cell structure and use the PDF bounding boxes to correct misalignments in the predicted table cell bounding boxes.</text>
-<text><location><page_12><loc_8><loc_47><loc_47><loc_50></location>Here is a step-by-step description of the prediction postprocessing:</text>
-<unordered_list>
-<list_item><location><page_12><loc_8><loc_42><loc_47><loc_47></location>1. Get the minimal grid dimensions - number of rows and columns for the predicted table structure. This represents the most granular grid for the underlying table structure.</list_item>
-<list_item><location><page_12><loc_8><loc_36><loc_47><loc_42></location>2. Generate pair-wise matches between the bounding boxes of the PDF cells and the predicted cells. The Intersection Over Union (IOU) metric is used to evaluate the quality of the matches.</list_item>
-<list_item><location><page_12><loc_8><loc_33><loc_47><loc_36></location>3. Use a carefully selected IOU threshold to designate the matches as "good" ones and "bad" ones.</list_item>
-<list_item><location><page_12><loc_8><loc_29><loc_47><loc_33></location>3.a. If all IOU scores in a column are below the threshold, discard all predictions (structure and bounding boxes) for that column.</list_item>
-<list_item><location><page_12><loc_8><loc_24><loc_47><loc_28></location>4. Find the best-fitting content alignment for the predicted cells with good IOU per each column. The alignment of the column can be identified by the following formula:</list_item>
+<text><loc_41><loc_176><loc_234><loc_250>However, it is possible to mitigate those limitations by combining the TableFormer predictions with the information already present inside a programmatic PDF document. More specifically, PDF documents can be seen as a sequence of PDF cells where each cell is described by its content and bounding box. If we are able to associate the PDF cells with the predicted table cells, we can directly link the PDF cell content to the table cell structure and use the PDF bounding boxes to correct misalignments in the predicted table cell bounding boxes.</text>
+<text><loc_41><loc_252><loc_234><loc_265>Here is a step-by-step description of the prediction postprocessing:</text>
+<unordered_list><list_item><loc_41><loc_267><loc_234><loc_288>1. Get the minimal grid dimensions - number of rows and columns for the predicted table structure. This represents the most granular grid for the underlying table structure.</list_item>
+<list_item><loc_41><loc_290><loc_234><loc_318>2. Generate pair-wise matches between the bounding boxes of the PDF cells and the predicted cells. The Intersection Over Union (IOU) metric is used to evaluate the quality of the matches.</list_item>
+<list_item><loc_41><loc_320><loc_234><loc_334>3. Use a carefully selected IOU threshold to designate the matches as "good" ones and "bad" ones.</list_item>
+<list_item><loc_41><loc_336><loc_234><loc_356>3.a. If all IOU scores in a column are below the threshold, discard all predictions (structure and bounding boxes) for that column.</list_item>
+<list_item><loc_41><loc_359><loc_234><loc_379>4. Find the best-fitting content alignment for the predicted cells with good IOU per each column. The alignment of the column can be identified by the following formula:</list_item>
 </unordered_list>
-<formula><location><page_12><loc_18><loc_17><loc_47><loc_21></location></formula>
-<text><location><page_12><loc_8><loc_13><loc_47><loc_16></location>where c is one of { left, centroid, right } and x$_{c}$ is the xcoordinate for the corresponding point.</text>
-<unordered_list>
-<list_item><location><page_12><loc_8><loc_10><loc_47><loc_13></location>5. Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me-</list_item>
+<formula><loc_90><loc_394><loc_234><loc_413></formula>
+<text><loc_41><loc_421><loc_234><loc_435>where c is one of { left, centroid, right } and x$_{c}$ is the xcoordinate for the corresponding point.</text>
+<unordered_list><list_item><loc_41><loc_437><loc_234><loc_450>5. Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me-</list_item>
 </unordered_list>
-<text><location><page_12><loc_50><loc_68><loc_89><loc_73></location>dian cell size for all table cells. The usage of median during the computations, helps to eliminate outliers caused by occasional column spans which are usually wider than the normal.</text>
-<unordered_list>
-<list_item><location><page_12><loc_50><loc_65><loc_89><loc_67></location>6. Snap all cells with bad IOU to their corresponding median x -coordinates and cell sizes.</list_item>
-<list_item><location><page_12><loc_50><loc_51><loc_89><loc_64></location>7. Generate a new set of pair-wise matches between the corrected bounding boxes and PDF cells. This time use a modified version of the IOU metric, where the area of the intersection between the predicted and PDF cells is divided by the PDF cell area. In case there are multiple matches for the same PDF cell, the prediction with the higher score is preferred. This covers the cases where the PDF cells are smaller than the area of predicted or corrected prediction cells.</list_item>
-<list_item><location><page_12><loc_50><loc_42><loc_89><loc_51></location>8. In some rare occasions, we have noticed that TableFormer can confuse a single column as two. When the postprocessing steps are applied, this results with two predicted columns pointing to the same PDF column. In such case we must de-duplicate the columns according to highest total column intersection score.</list_item>
-<list_item><location><page_12><loc_50><loc_28><loc_89><loc_41></location>9. Pick up the remaining orphan cells. There could be cases, when after applying all the previous post-processing steps, some PDF cells could still remain without any match to predicted cells. However, it is still possible to deduce the correct matching for an orphan PDF cell by mapping its bounding box on the geometry of the grid. This mapping decides if the content of the orphan cell will be appended to an already matched table cell, or a new table cell should be created to match with the orphan.</list_item>
+<text><loc_252><loc_133><loc_445><loc_161>dian cell size for all table cells. The usage of median during the computations, helps to eliminate outliers caused by occasional column spans which are usually wider than the normal.</text>
+<unordered_list><list_item><loc_252><loc_164><loc_445><loc_177>6. Snap all cells with bad IOU to their corresponding median x -coordinates and cell sizes.</list_item>
+<list_item><loc_252><loc_179><loc_445><loc_245>7. Generate a new set of pair-wise matches between the corrected bounding boxes and PDF cells. This time use a modified version of the IOU metric, where the area of the intersection between the predicted and PDF cells is divided by the PDF cell area. In case there are multiple matches for the same PDF cell, the prediction with the higher score is preferred. This covers the cases where the PDF cells are smaller than the area of predicted or corrected prediction cells.</list_item>
+<list_item><loc_252><loc_247><loc_445><loc_290>8. In some rare occasions, we have noticed that TableFormer can confuse a single column as two. When the postprocessing steps are applied, this results with two predicted columns pointing to the same PDF column. In such case we must de-duplicate the columns according to highest total column intersection score.</list_item>
+<list_item><loc_252><loc_293><loc_445><loc_359>9. Pick up the remaining orphan cells. There could be cases, when after applying all the previous post-processing steps, some PDF cells could still remain without any match to predicted cells. However, it is still possible to deduce the correct matching for an orphan PDF cell by mapping its bounding box on the geometry of the grid. This mapping decides if the content of the orphan cell will be appended to an already matched table cell, or a new table cell should be created to match with the orphan.</list_item>
 </unordered_list>
-<text><location><page_12><loc_50><loc_24><loc_89><loc_28></location>9a. Compute the top and bottom boundary of the horizontal band for each grid row (min/max y coordinates per row).</text>
-<unordered_list>
-<list_item><location><page_12><loc_50><loc_21><loc_89><loc_23></location>9b. Intersect the orphan's bounding box with the row bands, and map the cell to the closest grid row.</list_item>
-<list_item><location><page_12><loc_50><loc_16><loc_89><loc_20></location>9c. Compute the left and right boundary of the vertical band for each grid column (min/max x coordinates per column).</list_item>
-<list_item><location><page_12><loc_50><loc_13><loc_89><loc_16></location>9d. Intersect the orphan's bounding box with the column bands, and map the cell to the closest grid column.</list_item>
-<list_item><location><page_12><loc_50><loc_10><loc_89><loc_13></location>9e. If the table cell under the identified row and column is not empty, extend its content with the content of the or-</list_item>
+<text><loc_252><loc_361><loc_445><loc_381>9a. Compute the top and bottom boundary of the horizontal band for each grid row (min/max y coordinates per row).</text>
+<unordered_list><list_item><loc_252><loc_384><loc_445><loc_397>9b. Intersect the orphan's bounding box with the row bands, and map the cell to the closest grid row.</list_item>
+<list_item><loc_252><loc_399><loc_445><loc_420>9c. Compute the left and right boundary of the vertical band for each grid column (min/max x coordinates per column).</list_item>
+<list_item><loc_252><loc_422><loc_445><loc_435>9d. Intersect the orphan's bounding box with the column bands, and map the cell to the closest grid column.</list_item>
+<list_item><loc_252><loc_437><loc_445><loc_450>9e. If the table cell under the identified row and column is not empty, extend its content with the content of the or-</list_item>
 </unordered_list>
-<text><location><page_13><loc_8><loc_89><loc_15><loc_91></location>phan cell.</text>
-<text><location><page_13><loc_8><loc_86><loc_47><loc_89></location>9f. Otherwise create a new structural cell and match it wit the orphan cell.</text>
-<text><location><page_13><loc_8><loc_83><loc_47><loc_86></location>Aditional images with examples of TableFormer predictions and post-processing can be found below.</text>
-<table>
-<location><page_13><loc_14><loc_73><loc_39><loc_80></location>
-</table>
-<table>
-<location><page_13><loc_14><loc_63><loc_39><loc_70></location>
-</table>
-<table>
-<location><page_13><loc_14><loc_54><loc_39><loc_61></location>
-</table>
-<table>
-<location><page_13><loc_14><loc_38><loc_41><loc_50></location>
-<caption>Figure 8: Example of a table with multi-line header.</caption>
-</table>
-<table>
-<location><page_13><loc_51><loc_83><loc_91><loc_87></location>
-</table>
-<table>
-<location><page_13><loc_51><loc_77><loc_91><loc_80></location>
-</table>
-<table>
-<location><page_13><loc_51><loc_71><loc_91><loc_75></location>
-</table>
-<figure>
-<location><page_13><loc_51><loc_63><loc_70><loc_68></location>
-</figure>
-<table>
-<location><page_13><loc_51><loc_63><loc_70><loc_68></location>
-<caption>Figure 9: Example of a table with big empty distance between cells.</caption>
-</table>
-<table>
-<location><page_13><loc_55><loc_45><loc_80><loc_51></location>
-</table>
-<table>
-<location><page_13><loc_55><loc_37><loc_80><loc_43></location>
-</table>
-<table>
-<location><page_13><loc_55><loc_28><loc_80><loc_34></location>
-</table>
-<figure>
-<location><page_13><loc_55><loc_16><loc_85><loc_25></location>
-</figure>
-<table>
-<location><page_13><loc_55><loc_16><loc_85><loc_25></location>
-<caption>Figure 10: Example of a complex table with empty cells.</caption>
-</table>
-<table>
-<location><page_14><loc_8><loc_57><loc_46><loc_65></location>
-</table>
-<figure>
-<location><page_14><loc_8><loc_56><loc_46><loc_87></location>
-<caption>Figure 11: Simple table with different style and empty cells.</caption>
-</figure>
-<table>
-<location><page_14><loc_8><loc_38><loc_51><loc_43></location>
-</table>
-<table>
-<location><page_14><loc_8><loc_32><loc_51><loc_36></location>
-</table>
-<table>
-<location><page_14><loc_8><loc_25><loc_51><loc_30></location>
-</table>
-<figure>
-<location><page_14><loc_8><loc_17><loc_29><loc_23></location>
-<caption>Figure 12: Simple table predictions and post processing.</caption>
-</figure>
-<table>
-<location><page_14><loc_52><loc_73><loc_87><loc_80></location>
-</table>
-<table>
-<location><page_14><loc_52><loc_65><loc_87><loc_71></location>
-</table>
-<table>
-<location><page_14><loc_54><loc_55><loc_86><loc_64></location>
-</table>
-<figure>
-<location><page_14><loc_52><loc_55><loc_87><loc_89></location>
-<caption>Figure 13: Table predictions example on colorful table.</caption>
-</figure>
-<table>
-<location><page_14><loc_52><loc_40><loc_85><loc_46></location>
-</table>
-<table>
-<location><page_14><loc_52><loc_32><loc_85><loc_38></location>
-</table>
-<table>
-<location><page_14><loc_52><loc_25><loc_85><loc_31></location>
-</table>
-<table>
-<location><page_14><loc_52><loc_16><loc_87><loc_23></location>
-<caption>Figure 14: Example with multi-line text.</caption>
-</table>
-<figure>
-<location><page_15><loc_9><loc_69><loc_46><loc_83></location>
-</figure>
-<table>
-<location><page_15><loc_9><loc_69><loc_46><loc_83></location>
-</table>
-<figure>
-<location><page_15><loc_9><loc_53><loc_46><loc_67></location>
-</figure>
-<table>
-<location><page_15><loc_9><loc_53><loc_46><loc_67></location>
-</table>
-<figure>
-<location><page_15><loc_9><loc_37><loc_46><loc_51></location>
-</figure>
-<figure>
-<location><page_15><loc_8><loc_20><loc_52><loc_36></location>
-</figure>
-<table>
-<location><page_15><loc_8><loc_20><loc_52><loc_36></location>
-<caption>Figure 15: Example with triangular table.</caption>
-</table>
-<table>
-<location><page_15><loc_53><loc_72><loc_86><loc_85></location>
-</table>
-<table>
-<location><page_15><loc_53><loc_57><loc_86><loc_69></location>
-</table>
-<figure>
-<location><page_15><loc_53><loc_41><loc_86><loc_54></location>
-</figure>
-<table>
-<location><page_15><loc_53><loc_41><loc_86><loc_54></location>
-</table>
-<figure>
-<location><page_15><loc_58><loc_20><loc_81><loc_38></location>
-</figure>
-<table>
-<location><page_15><loc_58><loc_20><loc_81><loc_38></location>
-<caption>Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.</caption>
-</table>
-<figure>
-<location><page_16><loc_11><loc_37><loc_86><loc_68></location>
-<caption>Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure.</caption>
-</figure>
-</document>
+<page_footer><loc_239><loc_463><loc_247><loc_469>12</page_footer>
+<page_break>
+<text><loc_41><loc_47><loc_73><loc_53>phan cell.</text>
+<text><loc_41><loc_55><loc_234><loc_68>9f. Otherwise create a new structural cell and match it wit the orphan cell.</text>
+<text><loc_41><loc_70><loc_234><loc_83>Aditional images with examples of TableFormer predictions and post-processing can be found below.</text>
+<otsl><loc_69><loc_99><loc_195><loc_135></otsl>
+<otsl><loc_68><loc_148><loc_195><loc_184></otsl>
+<otsl><loc_69><loc_195><loc_195><loc_232></otsl>
+<otsl><loc_68><loc_250><loc_203><loc_308><caption><loc_52><loc_317><loc_223><loc_323>Figure 8: Example of a table with multi-line header.</caption></otsl>
+<page_footer><loc_239><loc_463><loc_247><loc_469>13</page_footer>
+<otsl><loc_254><loc_64><loc_454><loc_86></otsl>
+<otsl><loc_253><loc_98><loc_454><loc_117></otsl>
+<otsl><loc_253><loc_124><loc_454><loc_147></otsl>
+<picture><loc_253><loc_160><loc_348><loc_185></picture>
+<otsl><loc_253><loc_160><loc_348><loc_185><caption><loc_252><loc_194><loc_445><loc_207>Figure 9: Example of a table with big empty distance between cells.</caption></otsl>
+<otsl><loc_274><loc_245><loc_400><loc_276></otsl>
+<otsl><loc_274><loc_287><loc_400><loc_317></otsl>
+<otsl><loc_274><loc_328><loc_401><loc_358></otsl>
+<picture><loc_273><loc_374><loc_424><loc_420></picture>
+<otsl><loc_273><loc_374><loc_424><loc_420><caption><loc_255><loc_430><loc_443><loc_435>Figure 10: Example of a complex table with empty cells.</caption></otsl>
+<page_break>
+<otsl><loc_42><loc_173><loc_231><loc_217></otsl>
+<picture><loc_42><loc_66><loc_231><loc_218><caption><loc_41><loc_225><loc_234><loc_238>Figure 11: Simple table with different style and empty cells.</caption></picture>
+<otsl><loc_42><loc_286><loc_254><loc_310></otsl>
+<otsl><loc_42><loc_318><loc_254><loc_342></otsl>
+<otsl><loc_42><loc_350><loc_254><loc_374></otsl>
+<picture><loc_41><loc_386><loc_145><loc_414><caption><loc_45><loc_424><loc_230><loc_430>Figure 12: Simple table predictions and post processing.</caption></picture>
+<page_footer><loc_239><loc_463><loc_247><loc_469>14</page_footer>
+<otsl><loc_261><loc_102><loc_437><loc_135></otsl>
+<otsl><loc_261><loc_143><loc_437><loc_177></otsl>
+<otsl><loc_268><loc_182><loc_428><loc_226></otsl>
+<picture><loc_260><loc_57><loc_437><loc_227><caption><loc_258><loc_235><loc_440><loc_240>Figure 13: Table predictions example on colorful table.</caption></picture>
+<otsl><loc_261><loc_272><loc_424><loc_302></otsl>
+<otsl><loc_261><loc_309><loc_424><loc_338></otsl>
+<otsl><loc_261><loc_345><loc_425><loc_374></otsl>
+<otsl><loc_261><loc_385><loc_436><loc_422><caption><loc_282><loc_432><loc_416><loc_437>Figure 14: Example with multi-line text.</caption></otsl>
+<page_break>
+<picture><loc_45><loc_86><loc_228><loc_157></picture>
+<otsl><loc_45><loc_86><loc_228><loc_157></otsl>
+<picture><loc_44><loc_164><loc_228><loc_236></picture>
+<otsl><loc_44><loc_164><loc_228><loc_236></otsl>
+<picture><loc_45><loc_243><loc_229><loc_314></picture>
+<picture><loc_41><loc_319><loc_261><loc_399></picture>
+<otsl><loc_41><loc_319><loc_261><loc_399><caption><loc_69><loc_407><loc_206><loc_412>Figure 15: Example with triangular table.</caption></otsl>
+<page_footer><loc_239><loc_463><loc_247><loc_469>15</page_footer>
+<otsl><loc_264><loc_77><loc_430><loc_141></otsl>
+<otsl><loc_264><loc_153><loc_430><loc_217></otsl>
+<picture><loc_264><loc_229><loc_430><loc_293></picture>
+<otsl><loc_264><loc_229><loc_430><loc_293></otsl>
+<picture><loc_289><loc_308><loc_405><loc_401></picture>
+<otsl><loc_289><loc_308><loc_405><loc_401><caption><loc_252><loc_412><loc_445><loc_425>Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.</caption></otsl>
+<page_break>
+<picture><loc_55><loc_160><loc_432><loc_314><caption><loc_41><loc_321><loc_445><loc_334>Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure.</caption></picture>
+<page_footer><loc_239><loc_463><loc_247><loc_469>16</page_footer>
+</doctag>
--- a/tests/data/groundtruth/docling_v2/2206.01062.doctags.txt
+++ b/tests/data/groundtruth/docling_v2/2206.01062.doctags.txt
@ -1,240 +1,157 @@
-<document>
-<section_header_level_1><location><page_1><loc_18><loc_85><loc_83><loc_89></location>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</section_header_level_1>
-<text><location><page_1><loc_15><loc_77><loc_32><loc_83></location>Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com</text>
-<text><location><page_1><loc_42><loc_77><loc_58><loc_83></location>Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com</text>
-<text><location><page_1><loc_69><loc_77><loc_85><loc_83></location>Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com</text>
-<text><location><page_1><loc_28><loc_70><loc_45><loc_76></location>Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com</text>
-<text><location><page_1><loc_55><loc_70><loc_72><loc_76></location>Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com</text>
-<section_header_level_1><location><page_1><loc_9><loc_67><loc_18><loc_69></location>ABSTRACT</section_header_level_1>
-<text><location><page_1><loc_9><loc_33><loc_48><loc_67></location>Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis.</text>
-<section_header_level_1><location><page_1><loc_9><loc_29><loc_22><loc_30></location>CCS CONCEPTS</section_header_level_1>
-<text><location><page_1><loc_9><loc_25><loc_49><loc_29></location>· Information systems → Document structure ; · Applied computing → Document analysis ; · Computing methodologies → Machine learning ; Computer vision ; Object detection ;</text>
-<text><location><page_1><loc_9><loc_15><loc_48><loc_20></location>Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).</text>
-<text><location><page_1><loc_9><loc_14><loc_32><loc_15></location>KDD '22, August 14-18, 2022, Washington, DC, USA</text>
-<text><location><page_1><loc_9><loc_13><loc_31><loc_14></location>© 2022 Copyright held by the owner/author(s).</text>
-<text><location><page_1><loc_9><loc_12><loc_26><loc_13></location>ACM ISBN 978-1-4503-9385-0/22/08.</text>
-<text><location><page_1><loc_9><loc_11><loc_27><loc_12></location>https://doi.org/10.1145/3534678.3539043</text>
-<figure>
-<location><page_1><loc_53><loc_34><loc_90><loc_68></location>
-<caption>Figure 1: Four examples of complex page layouts across different document categories</caption>
-</figure>
-<section_header_level_1><location><page_1><loc_52><loc_24><loc_62><loc_25></location>KEYWORDS</section_header_level_1>
-<text><location><page_1><loc_52><loc_21><loc_91><loc_23></location>PDF document conversion, layout segmentation, object-detection, data set, Machine Learning</text>
-<section_header_level_1><location><page_1><loc_52><loc_18><loc_66><loc_19></location>ACM Reference Format:</section_header_level_1>
-<text><location><page_1><loc_52><loc_11><loc_91><loc_18></location>Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043</text>
-<section_header_level_1><location><page_2><loc_9><loc_88><loc_26><loc_89></location>1 INTRODUCTION</section_header_level_1>
-<text><location><page_2><loc_9><loc_71><loc_50><loc_86></location>Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1.</text>
-<text><location><page_2><loc_9><loc_37><loc_48><loc_71></location>A key problem in the process of document conversion is to understand the structure of a single document page, i.e. which segments of text should be grouped together in a unit. To train models for this task, there are currently two large datasets available to the community, PubLayNet [6] and DocBank [7]. They were introduced in 2019 and 2020 respectively and significantly accelerated the implementation of layout detection and segmentation models due to their sizes of 300K and 500K ground-truth pages. These sizes were achieved by leveraging an automation approach. The benefit of automated ground-truth generation is obvious: one can generate large ground-truth datasets at virtually no cost. However, the automation introduces a constraint on the variability in the dataset, because corresponding structured source data must be available. PubLayNet and DocBank were both generated from scientific document repositories (PubMed and arXiv), which provide XML or L A T E X sources. Those scientific documents present a limited variability in their layouts, because they are typeset in uniform templates provided by the publishers. Obviously, documents such as technical manuals, annual company reports, legal text, government tenders, etc. have very different and partially unique layouts. As a consequence, the layout predictions obtained from models trained on PubLayNet or DocBank is very reasonable when applied on scientific documents. However, for more artistic or free-style layouts, we see sub-par prediction quality from these models, which we demonstrate in Section 5.</text>
-<text><location><page_2><loc_9><loc_27><loc_48><loc_36></location>In this paper, we present the DocLayNet dataset. It provides pageby-page layout annotation ground-truth using bounding-boxes for 11 distinct class labels on 80863 unique document pages, of which a fraction carry double- or triple-annotations. DocLayNet is similar in spirit to PubLayNet and DocBank and will likewise be made available to the public 1 in order to stimulate the document-layout analysis community. It distinguishes itself in the following aspects:</text>
-<unordered_list>
-<list_item><location><page_2><loc_11><loc_22><loc_48><loc_26></location>(1) Human Annotation : In contrast to PubLayNet and DocBank, we relied on human annotation instead of automation approaches to generate the data set.</list_item>
-<list_item><location><page_2><loc_11><loc_20><loc_48><loc_22></location>(2) Large Layout Variability : We include diverse and complex layouts from a large variety of public sources.</list_item>
-<list_item><location><page_2><loc_11><loc_15><loc_48><loc_19></location>(3) Detailed Label Set : We define 11 class labels to distinguish layout features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although not a superset of ours.</list_item>
-<list_item><location><page_2><loc_11><loc_13><loc_48><loc_15></location>(4) Redundant Annotations : A fraction of the pages in the DocLayNet data set carry more than one human annotation.</list_item>
+<doctag><page_header><loc_15><loc_138><loc_30><loc_350>arXiv:2206.01062v1 [cs.CV] 2 Jun 2022</page_header>
+<section_header_level_1><loc_88><loc_53><loc_413><loc_76>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</section_header_level_1>
+<text><loc_74><loc_84><loc_158><loc_114>Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com</text>
+<text><loc_208><loc_84><loc_292><loc_114>Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com</text>
+<text><loc_343><loc_84><loc_426><loc_114>Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com</text>
+<text><loc_141><loc_121><loc_225><loc_151>Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com</text>
+<text><loc_275><loc_121><loc_359><loc_151>Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com</text>
+<section_header_level_1><loc_44><loc_156><loc_91><loc_163>ABSTRACT</section_header_level_1>
+<text><loc_44><loc_166><loc_241><loc_337>Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis.</text>
+<section_header_level_1><loc_44><loc_348><loc_110><loc_354>CCS CONCEPTS</section_header_level_1>
+<text><loc_44><loc_357><loc_243><loc_377>· Information systems → Document structure ; · Applied computing → Document analysis ; · Computing methodologies → Machine learning ; Computer vision ; Object detection ;</text>
+<text><loc_44><loc_401><loc_241><loc_425>Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).</text>
+<text><loc_44><loc_426><loc_162><loc_430>KDD '22, August 14-18, 2022, Washington, DC, USA</text>
+<text><loc_44><loc_432><loc_153><loc_436>© 2022 Copyright held by the owner/author(s).</text>
+<text><loc_44><loc_437><loc_128><loc_441>ACM ISBN 978-1-4503-9385-0/22/08.</text>
+<text><loc_44><loc_442><loc_136><loc_446>https://doi.org/10.1145/3534678.3539043</text>
+<picture><loc_264><loc_158><loc_452><loc_332><caption><loc_260><loc_341><loc_457><loc_353>Figure 1: Four examples of complex page layouts across different document categories</caption></picture>
+<section_header_level_1><loc_260><loc_374><loc_310><loc_381>KEYWORDS</section_header_level_1>
+<text><loc_260><loc_384><loc_457><loc_396>PDF document conversion, layout segmentation, object-detection, data set, Machine Learning</text>
+<section_header_level_1><loc_260><loc_404><loc_331><loc_409>ACM Reference Format:</section_header_level_1>
+<text><loc_260><loc_410><loc_457><loc_447>Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043</text>
+<page_break>
+<page_header><loc_44><loc_38><loc_456><loc_43>KDD ’22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar</page_header>
+<section_header_level_1><loc_44><loc_54><loc_128><loc_61>1 INTRODUCTION</section_header_level_1>
+<text><loc_44><loc_70><loc_248><loc_145>Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1.</text>
+<text><loc_44><loc_146><loc_241><loc_317>A key problem in the process of document conversion is to understand the structure of a single document page, i.e. which segments of text should be grouped together in a unit. To train models for this task, there are currently two large datasets available to the community, PubLayNet [6] and DocBank [7]. They were introduced in 2019 and 2020 respectively and significantly accelerated the implementation of layout detection and segmentation models due to their sizes of 300K and 500K ground-truth pages. These sizes were achieved by leveraging an automation approach. The benefit of automated ground-truth generation is obvious: one can generate large ground-truth datasets at virtually no cost. However, the automation introduces a constraint on the variability in the dataset, because corresponding structured source data must be available. PubLayNet and DocBank were both generated from scientific document repositories (PubMed and arXiv), which provide XML or L A T E X sources. Those scientific documents present a limited variability in their layouts, because they are typeset in uniform templates provided by the publishers. Obviously, documents such as technical manuals, annual company reports, legal text, government tenders, etc. have very different and partially unique layouts. As a consequence, the layout predictions obtained from models trained on PubLayNet or DocBank is very reasonable when applied on scientific documents. However, for more artistic or free-style layouts, we see sub-par prediction quality from these models, which we demonstrate in Section 5.</text>
+<text><loc_44><loc_319><loc_241><loc_366>In this paper, we present the DocLayNet dataset. It provides pageby-page layout annotation ground-truth using bounding-boxes for 11 distinct class labels on 80863 unique document pages, of which a fraction carry double- or triple-annotations. DocLayNet is similar in spirit to PubLayNet and DocBank and will likewise be made available to the public 1 in order to stimulate the document-layout analysis community. It distinguishes itself in the following aspects:</text>
+<unordered_list><list_item><loc_53><loc_369><loc_241><loc_388>(1) Human Annotation : In contrast to PubLayNet and DocBank, we relied on human annotation instead of automation approaches to generate the data set.</list_item>
+<list_item><loc_53><loc_390><loc_240><loc_402>(2) Large Layout Variability : We include diverse and complex layouts from a large variety of public sources.</list_item>
+<list_item><loc_53><loc_404><loc_241><loc_423>(3) Detailed Label Set : We define 11 class labels to distinguish layout features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although not a superset of ours.</list_item>
+<list_item><loc_53><loc_424><loc_241><loc_437>(4) Redundant Annotations : A fraction of the pages in the DocLayNet data set carry more than one human annotation.</list_item>
 </unordered_list>
-<text><location><page_2><loc_56><loc_87><loc_91><loc_89></location>This enables experimentation with annotation uncertainty and quality control analysis.</text>
-<unordered_list>
-<list_item><location><page_2><loc_54><loc_80><loc_91><loc_86></location>(5) Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide fixed train-, test- & validation-sets to ensure proportional representation of the class-labels. Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores.</list_item>
+<footnote><loc_44><loc_443><loc_176><loc_447>$^{1}$https://developer.ibm.com/exchanges/data/all/doclaynet</footnote>
+<text><loc_279><loc_55><loc_456><loc_67>This enables experimentation with annotation uncertainty and quality control analysis.</text>
+<unordered_list><list_item><loc_269><loc_69><loc_457><loc_102>(5) Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide fixed train-, test- & validation-sets to ensure proportional representation of the class-labels. Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores.</list_item>
 </unordered_list>
-<text><location><page_2><loc_52><loc_72><loc_91><loc_79></location>All aspects outlined above are detailed in Section 3. In Section 4, we will elaborate on how we designed and executed this large-scale human annotation campaign. We will also share key insights and lessons learned that might prove helpful for other parties planning to set up annotation campaigns.</text>
-<text><location><page_2><loc_52><loc_61><loc_91><loc_72></location>In Section 5, we will present baseline accuracy numbers for a variety of object detection methods (Faster R-CNN, Mask R-CNN and YOLOv5) trained on DocLayNet. We further show how the model performance is impacted by varying the DocLayNet dataset size, reducing the label set and modifying the train/test-split. Last but not least, we compare the performance of models trained on PubLayNet, DocBank and DocLayNet and demonstrate that a model trained on DocLayNet provides overall more robust layout recovery.</text>
-<section_header_level_1><location><page_2><loc_52><loc_58><loc_69><loc_59></location>2 RELATED WORK</section_header_level_1>
-<text><location><page_2><loc_52><loc_41><loc_91><loc_56></location>While early approaches in document-layout analysis used rulebased algorithms and heuristics [8], the problem is lately addressed with deep learning methods. The most common approach is to leverage object detection models [9-15]. In the last decade, the accuracy and speed of these models has increased dramatically. Furthermore, most state-of-the-art object detection methods can be trained and applied with very little work, thanks to a standardisation effort of the ground-truth data format [16] and common deep-learning frameworks [17]. Reference data sets such as PubLayNet [6] and DocBank provide their data in the commonly accepted COCO format [16].</text>
-<text><location><page_2><loc_52><loc_30><loc_91><loc_41></location>Lately, new types of ML models for document-layout analysis have emerged in the community [18-21]. These models do not approach the problem of layout analysis purely based on an image representation of the page, as computer vision methods do. Instead, they combine the text tokens and image representation of a page in order to obtain a segmentation. While the reported accuracies appear to be promising, a broadly accepted data format which links geometric and textual features has yet to establish.</text>
-<section_header_level_1><location><page_2><loc_52><loc_27><loc_78><loc_29></location>3 THE DOCLAYNET DATASET</section_header_level_1>
-<text><location><page_2><loc_52><loc_15><loc_91><loc_25></location>DocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape of labeled, rectangular boundingboxes. We define 11 distinct labels for layout features, namely Caption , Footnote , Formula , List-item , Page-footer , Page-header , Picture , Section-header , Table , Text , and Title . Our reasoning for picking this particular label set is detailed in Section 4.</text>
-<text><location><page_2><loc_52><loc_11><loc_91><loc_14></location>In addition to open intellectual property constraints for the source documents, we required that the documents in DocLayNet adhere to a few conditions. Firstly, we kept scanned documents</text>
-<figure>
-<location><page_3><loc_14><loc_72><loc_43><loc_88></location>
-<caption>Figure 2: Distribution of DocLayNet pages across document categories.</caption>
-</figure>
-<text><location><page_3><loc_9><loc_54><loc_48><loc_64></location>to a minimum, since they introduce difficulties in annotation (see Section 4). As a second condition, we focussed on medium to large documents ( > 10 pages) with technical content, dense in complex tables, figures, plots and captions. Such documents carry a lot of information value, but are often hard to analyse with high accuracy due to their challenging layouts. Counterexamples of documents not included in the dataset are receipts, invoices, hand-written documents or photographs showing "text in the wild".</text>
-<text><location><page_3><loc_9><loc_36><loc_48><loc_53></location>The pages in DocLayNet can be grouped into six distinct categories, namely Financial Reports , Manuals , Scientific Articles , Laws & Regulations , Patents and Government Tenders . Each document category was sourced from various repositories. For example, Financial Reports contain both free-style format annual reports 2 which expose company-specific, artistic layouts as well as the more formal SEC filings. The two largest categories ( Financial Reports and Manuals ) contain a large amount of free-style layouts in order to obtain maximum variability. In the other four categories, we boosted the variability by mixing documents from independent providers, such as different government websites or publishers. In Figure 2, we show the document categories contained in DocLayNet with their respective sizes.</text>
-<text><location><page_3><loc_9><loc_23><loc_48><loc_35></location>We did not control the document selection with regard to language. The vast majority of documents contained in DocLayNet (close to 95%) are published in English language. However, DocLayNet also contains a number of documents in other languages such as German (2.5%), French (1.0%) and Japanese (1.0%). While the document language has negligible impact on the performance of computer vision methods such as object detection and segmentation models, it might prove challenging for layout analysis methods which exploit textual features.</text>
-<text><location><page_3><loc_9><loc_14><loc_48><loc_23></location>To ensure that future benchmarks in the document-layout analysis community can be easily compared, we have split up DocLayNet into pre-defined train-, test- and validation-sets. In this way, we can avoid spurious variations in the evaluation scores due to random splitting in train-, test- and validation-sets. We also ensured that less frequent labels are represented in train and test sets in equal proportions.</text>
-<text><location><page_3><loc_52><loc_80><loc_91><loc_89></location>Table 1 shows the overall frequency and distribution of the labels among the different sets. Importantly, we ensure that subsets are only split on full-document boundaries. This avoids that pages of the same document are spread over train, test and validation set, which can give an undesired evaluation advantage to models and lead to overestimation of their prediction accuracy. We will show the impact of this decision in Section 5.</text>
-<text><location><page_3><loc_52><loc_66><loc_91><loc_79></location>In order to accommodate the different types of models currently in use by the community, we provide DocLayNet in an augmented COCO format [16]. This entails the standard COCO ground-truth file (in JSON format) with the associated page images (in PNG format, 1025 × 1025 pixels). Furthermore, custom fields have been added to each COCO record to specify document category, original document filename and page number. In addition, we also provide the original PDF pages, as well as sidecar files containing parsed PDF text and text-cell coordinates (in JSON). All additional files are linked to the primary page images by their matching filenames.</text>
-<text><location><page_3><loc_52><loc_26><loc_91><loc_65></location>Despite being cost-intense and far less scalable than automation, human annotation has several benefits over automated groundtruth generation. The first and most obvious reason to leverage human annotations is the freedom to annotate any type of document without requiring a programmatic source. For most PDF documents, the original source document is not available. The latter is not a hard constraint with human annotation, but it is for automated methods. A second reason to use human annotations is that the latter usually provide a more natural interpretation of the page layout. The human-interpreted layout can significantly deviate from the programmatic layout used in typesetting. For example, "invisible" tables might be used solely for aligning text paragraphs on columns. Such typesetting tricks might be interpreted by automated methods incorrectly as an actual table, while the human annotation will interpret it correctly as Text or other styles. The same applies to multi-line text elements, when authors decided to space them as "invisible" list elements without bullet symbols. A third reason to gather ground-truth through human annotation is to estimate a "natural" upper bound on the segmentation accuracy. As we will show in Section 4, certain documents featuring complex layouts can have different but equally acceptable layout interpretations. This natural upper bound for segmentation accuracy can be found by annotating the same pages multiple times by different people and evaluating the inter-annotator agreement. Such a baseline consistency evaluation is very useful to define expectations for a good target accuracy in trained deep neural network models and avoid overfitting (see Table 1). On the flip side, achieving high annotation consistency proved to be a key challenge in human annotation, as we outline in Section 4.</text>
-<section_header_level_1><location><page_3><loc_52><loc_22><loc_77><loc_23></location>4 ANNOTATION CAMPAIGN</section_header_level_1>
-<text><location><page_3><loc_52><loc_11><loc_91><loc_20></location>The annotation campaign was carried out in four phases. In phase one, we identified and prepared the data sources for annotation. In phase two, we determined the class labels and how annotations should be done on the documents in order to obtain maximum consistency. The latter was guided by a detailed requirement analysis and exhaustive experiments. In phase three, we trained the annotation staff and performed exams for quality assurance. In phase four,</text>
-<table>
-<location><page_4><loc_16><loc_63><loc_84><loc_83></location>
-<caption>Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.</caption>
-<row_0><col_0><body></col_0><col_1><body></col_1><col_2><col_header>% of Total</col_2><col_3><col_header>% of Total</col_3><col_4><col_header>% of Total</col_4><col_5><col_header>% of Total</col_5><col_6><col_header>triple inter-annotator mAP @ 0.5-0.95 (%)</col_6><col_7><col_header>triple inter-annotator mAP @ 0.5-0.95 (%)</col_7><col_8><col_header>triple inter-annotator mAP @ 0.5-0.95 (%)</col_8><col_9><col_header>triple inter-annotator mAP @ 0.5-0.95 (%)</col_9><col_10><col_header>triple inter-annotator mAP @ 0.5-0.95 (%)</col_10><col_11><col_header>triple inter-annotator mAP @ 0.5-0.95 (%)</col_11></row_0>
-<row_1><col_0><col_header>class label</col_0><col_1><col_header>Count</col_1><col_2><col_header>Train</col_2><col_3><col_header>Test</col_3><col_4><col_header>Val</col_4><col_5><col_header>All</col_5><col_6><col_header>Fin</col_6><col_7><col_header>Man</col_7><col_8><col_header>Sci</col_8><col_9><col_header>Law</col_9><col_10><col_header>Pat</col_10><col_11><col_header>Ten</col_11></row_1>
-<row_2><col_0><row_header>Caption</col_0><col_1><body>22524</col_1><col_2><body>2.04</col_2><col_3><body>1.77</col_3><col_4><body>2.32</col_4><col_5><body>84-89</col_5><col_6><body>40-61</col_6><col_7><body>86-92</col_7><col_8><body>94-99</col_8><col_9><body>95-99</col_9><col_10><body>69-78</col_10><col_11><body>n/a</col_11></row_2>
-<row_3><col_0><row_header>Footnote</col_0><col_1><body>6318</col_1><col_2><body>0.60</col_2><col_3><body>0.31</col_3><col_4><body>0.58</col_4><col_5><body>83-91</col_5><col_6><body>n/a</col_6><col_7><body>100</col_7><col_8><body>62-88</col_8><col_9><body>85-94</col_9><col_10><body>n/a</col_10><col_11><body>82-97</col_11></row_3>
-<row_4><col_0><row_header>Formula</col_0><col_1><body>25027</col_1><col_2><body>2.25</col_2><col_3><body>1.90</col_3><col_4><body>2.96</col_4><col_5><body>83-85</col_5><col_6><body>n/a</col_6><col_7><body>n/a</col_7><col_8><body>84-87</col_8><col_9><body>86-96</col_9><col_10><body>n/a</col_10><col_11><body>n/a</col_11></row_4>
-<row_5><col_0><row_header>List-item</col_0><col_1><body>185660</col_1><col_2><body>17.19</col_2><col_3><body>13.34</col_3><col_4><body>15.82</col_4><col_5><body>87-88</col_5><col_6><body>74-83</col_6><col_7><body>90-92</col_7><col_8><body>97-97</col_8><col_9><body>81-85</col_9><col_10><body>75-88</col_10><col_11><body>93-95</col_11></row_5>
-<row_6><col_0><row_header>Page-footer</col_0><col_1><body>70878</col_1><col_2><body>6.51</col_2><col_3><body>5.58</col_3><col_4><body>6.00</col_4><col_5><body>93-94</col_5><col_6><body>88-90</col_6><col_7><body>95-96</col_7><col_8><body>100</col_8><col_9><body>92-97</col_9><col_10><body>100</col_10><col_11><body>96-98</col_11></row_6>
-<row_7><col_0><row_header>Page-header</col_0><col_1><body>58022</col_1><col_2><body>5.10</col_2><col_3><body>6.70</col_3><col_4><body>5.06</col_4><col_5><body>85-89</col_5><col_6><body>66-76</col_6><col_7><body>90-94</col_7><col_8><body>98-100</col_8><col_9><body>91-92</col_9><col_10><body>97-99</col_10><col_11><body>81-86</col_11></row_7>
-<row_8><col_0><row_header>Picture</col_0><col_1><body>45976</col_1><col_2><body>4.21</col_2><col_3><body>2.78</col_3><col_4><body>5.31</col_4><col_5><body>69-71</col_5><col_6><body>56-59</col_6><col_7><body>82-86</col_7><col_8><body>69-82</col_8><col_9><body>80-95</col_9><col_10><body>66-71</col_10><col_11><body>59-76</col_11></row_8>
-<row_9><col_0><row_header>Section-header</col_0><col_1><body>142884</col_1><col_2><body>12.60</col_2><col_3><body>15.77</col_3><col_4><body>12.85</col_4><col_5><body>83-84</col_5><col_6><body>76-81</col_6><col_7><body>90-92</col_7><col_8><body>94-95</col_8><col_9><body>87-94</col_9><col_10><body>69-73</col_10><col_11><body>78-86</col_11></row_9>
-<row_10><col_0><row_header>Table</col_0><col_1><body>34733</col_1><col_2><body>3.20</col_2><col_3><body>2.27</col_3><col_4><body>3.60</col_4><col_5><body>77-81</col_5><col_6><body>75-80</col_6><col_7><body>83-86</col_7><col_8><body>98-99</col_8><col_9><body>58-80</col_9><col_10><body>79-84</col_10><col_11><body>70-85</col_11></row_10>
-<row_11><col_0><row_header>Text</col_0><col_1><body>510377</col_1><col_2><body>45.82</col_2><col_3><body>49.28</col_3><col_4><body>45.00</col_4><col_5><body>84-86</col_5><col_6><body>81-86</col_6><col_7><body>88-93</col_7><col_8><body>89-93</col_8><col_9><body>87-92</col_9><col_10><body>71-79</col_10><col_11><body>87-95</col_11></row_11>
-<row_12><col_0><row_header>Title</col_0><col_1><body>5071</col_1><col_2><body>0.47</col_2><col_3><body>0.30</col_3><col_4><body>0.50</col_4><col_5><body>60-72</col_5><col_6><body>24-63</col_6><col_7><body>50-63</col_7><col_8><body>94-100</col_8><col_9><body>82-96</col_9><col_10><body>68-79</col_10><col_11><body>24-56</col_11></row_12>
-<row_13><col_0><row_header>Total</col_0><col_1><body>1107470</col_1><col_2><body>941123</col_2><col_3><body>99816</col_3><col_4><body>66531</col_4><col_5><body>82-83</col_5><col_6><body>71-74</col_6><col_7><body>79-81</col_7><col_8><body>89-94</col_8><col_9><body>86-91</col_9><col_10><body>71-76</col_10><col_11><body>68-85</col_11></row_13>
-</table>
-<figure>
-<location><page_4><loc_9><loc_32><loc_48><loc_61></location>
-<caption>Figure 3: Corpus Conversion Service annotation user interface. The PDF page is shown in the background, with overlaid text-cells (in darker shades). The annotation boxes can be drawn by dragging a rectangle over each segment with the respective label from the palette on the right.</caption>
-</figure>
-<text><location><page_4><loc_9><loc_15><loc_48><loc_20></location>we distributed the annotation workload and performed continuous quality controls. Phase one and two required a small team of experts only. For phases three and four, a group of 40 dedicated annotators were assembled and supervised.</text>
-<text><location><page_4><loc_9><loc_11><loc_48><loc_14></location>Phase 1: Data selection and preparation. Our inclusion criteria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources</text>
-<text><location><page_4><loc_52><loc_53><loc_91><loc_61></location>include publication repositories such as arXiv$^{3}$, government offices, company websites as well as data directory services for financial reports and patents. Scanned documents were excluded wherever possible because they can be rotated or skewed. This would not allow us to perform annotation with rectangular bounding-boxes and therefore complicate the annotation process.</text>
-<text><location><page_4><loc_52><loc_36><loc_91><loc_52></location>Preparation work included uploading and parsing the sourced PDF documents in the Corpus Conversion Service (CCS) [22], a cloud-native platform which provides a visual annotation interface and allows for dataset inspection and analysis. The annotation interface of CCS is shown in Figure 3. The desired balance of pages between the different document categories was achieved by selective subsampling of pages with certain desired properties. For example, we made sure to include the title page of each document and bias the remaining page selection to those with figures or tables. The latter was achieved by leveraging pre-trained object detection models from PubLayNet, which helped us estimate how many figures and tables a given page contains.</text>
-<text><location><page_4><loc_52><loc_12><loc_91><loc_36></location>Phase 2: Label selection and guideline. We reviewed the collected documents and identified the most common structural features they exhibit. This was achieved by identifying recurrent layout elements and lead us to the definition of 11 distinct class labels. These 11 class labels are Caption , Footnote , Formula , List-item , Pagefooter , Page-header , Picture , Section-header , Table , Text , and Title . Critical factors that were considered for the choice of these class labels were (1) the overall occurrence of the label, (2) the specificity of the label, (3) recognisability on a single page (i.e. no need for context from previous or next page) and (4) overall coverage of the page. Specificity ensures that the choice of label is not ambiguous, while coverage ensures that all meaningful items on a page can be annotated. We refrained from class labels that are very specific to a document category, such as Abstract in the Scientific Articles category. We also avoided class labels that are tightly linked to the semantics of the text. Labels such as Author and Affiliation , as seen in DocBank, are often only distinguishable by discriminating on</text>
-<text><location><page_5><loc_9><loc_87><loc_48><loc_89></location>the textual content of an element, which goes beyond visual layout recognition, in particular outside the Scientific Articles category.</text>
-<text><location><page_5><loc_9><loc_69><loc_48><loc_86></location>At first sight, the task of visual document-layout interpretation appears intuitive enough to obtain plausible annotations in most cases. However, during early trial-runs in the core team, we observed many cases in which annotators use different annotation styles, especially for documents with challenging layouts. For example, if a figure is presented with subfigures, one annotator might draw a single figure bounding-box, while another might annotate each subfigure separately. The same applies for lists, where one might annotate all list items in one block or each list item separately. In essence, we observed that challenging layouts would be annotated in different but plausible ways. To illustrate this, we show in Figure 4 multiple examples of plausible but inconsistent annotations on the same pages.</text>
-<text><location><page_5><loc_9><loc_57><loc_48><loc_68></location>Obviously, this inconsistency in annotations is not desirable for datasets which are intended to be used for model training. To minimise these inconsistencies, we created a detailed annotation guideline. While perfect consistency across 40 annotation staff members is clearly not possible to achieve, we saw a huge improvement in annotation consistency after the introduction of our annotation guideline. A few selected, non-trivial highlights of the guideline are:</text>
-<unordered_list>
-<list_item><location><page_5><loc_11><loc_51><loc_48><loc_56></location>(1) Every list-item is an individual object instance with class label List-item . This definition is different from PubLayNet and DocBank, where all list-items are grouped together into one List object.</list_item>
-<list_item><location><page_5><loc_11><loc_45><loc_48><loc_50></location>(2) A List-item is a paragraph with hanging indentation. Singleline elements can qualify as List-item if the neighbour elements expose hanging indentation. Bullet or enumeration symbols are not a requirement.</list_item>
-<list_item><location><page_5><loc_11><loc_42><loc_48><loc_45></location>(3) For every Caption , there must be exactly one corresponding Picture or Table .</list_item>
-<list_item><location><page_5><loc_11><loc_40><loc_48><loc_42></location>(4) Connected sub-pictures are grouped together in one Picture object.</list_item>
-<list_item><location><page_5><loc_11><loc_38><loc_43><loc_39></location>(5) Formula numbers are included in a Formula object.</list_item>
-<list_item><location><page_5><loc_11><loc_34><loc_48><loc_38></location>(6) Emphasised text (e.g. in italic or bold) at the beginning of a paragraph is not considered a Section-header , unless it appears exclusively on its own line.</list_item>
+<text><loc_259><loc_106><loc_457><loc_139>All aspects outlined above are detailed in Section 3. In Section 4, we will elaborate on how we designed and executed this large-scale human annotation campaign. We will also share key insights and lessons learned that might prove helpful for other parties planning to set up annotation campaigns.</text>
+<text><loc_260><loc_141><loc_457><loc_194>In Section 5, we will present baseline accuracy numbers for a variety of object detection methods (Faster R-CNN, Mask R-CNN and YOLOv5) trained on DocLayNet. We further show how the model performance is impacted by varying the DocLayNet dataset size, reducing the label set and modifying the train/test-split. Last but not least, we compare the performance of models trained on PubLayNet, DocBank and DocLayNet and demonstrate that a model trained on DocLayNet provides overall more robust layout recovery.</text>
+<section_header_level_1><loc_260><loc_203><loc_345><loc_209>2 RELATED WORK</section_header_level_1>
+<text><loc_259><loc_219><loc_457><loc_293>While early approaches in document-layout analysis used rulebased algorithms and heuristics [8], the problem is lately addressed with deep learning methods. The most common approach is to leverage object detection models [9-15]. In the last decade, the accuracy and speed of these models has increased dramatically. Furthermore, most state-of-the-art object detection methods can be trained and applied with very little work, thanks to a standardisation effort of the ground-truth data format [16] and common deep-learning frameworks [17]. Reference data sets such as PubLayNet [6] and DocBank provide their data in the commonly accepted COCO format [16].</text>
+<text><loc_260><loc_295><loc_457><loc_348>Lately, new types of ML models for document-layout analysis have emerged in the community [18-21]. These models do not approach the problem of layout analysis purely based on an image representation of the page, as computer vision methods do. Instead, they combine the text tokens and image representation of a page in order to obtain a segmentation. While the reported accuracies appear to be promising, a broadly accepted data format which links geometric and textual features has yet to establish.</text>
+<section_header_level_1><loc_260><loc_357><loc_390><loc_363>3 THE DOCLAYNET DATASET</section_header_level_1>
+<text><loc_260><loc_373><loc_457><loc_426>DocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape of labeled, rectangular boundingboxes. We define 11 distinct labels for layout features, namely Caption , Footnote , Formula , List-item , Page-footer , Page-header , Picture , Section-header , Table , Text , and Title . Our reasoning for picking this particular label set is detailed in Section 4.</text>
+<text><loc_260><loc_428><loc_456><loc_447>In addition to open intellectual property constraints for the source documents, we required that the documents in DocLayNet adhere to a few conditions. Firstly, we kept scanned documents</text>
+<page_break>
+<page_header><loc_44><loc_38><loc_284><loc_43>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</page_header>
+<page_header><loc_299><loc_38><loc_456><loc_43>KDD ’22, August 14-18, 2022, Washington, DC, USA</page_header>
+<picture><loc_72><loc_59><loc_215><loc_139><caption><loc_44><loc_149><loc_240><loc_161>Figure 2: Distribution of DocLayNet pages across document categories.</caption></picture>
+<text><loc_44><loc_178><loc_240><loc_232>to a minimum, since they introduce difficulties in annotation (see Section 4). As a second condition, we focussed on medium to large documents ( > 10 pages) with technical content, dense in complex tables, figures, plots and captions. Such documents carry a lot of information value, but are often hard to analyse with high accuracy due to their challenging layouts. Counterexamples of documents not included in the dataset are receipts, invoices, hand-written documents or photographs showing "text in the wild".</text>
+<text><loc_44><loc_233><loc_241><loc_322>The pages in DocLayNet can be grouped into six distinct categories, namely Financial Reports , Manuals , Scientific Articles , Laws & Regulations , Patents and Government Tenders . Each document category was sourced from various repositories. For example, Financial Reports contain both free-style format annual reports 2 which expose company-specific, artistic layouts as well as the more formal SEC filings. The two largest categories ( Financial Reports and Manuals ) contain a large amount of free-style layouts in order to obtain maximum variability. In the other four categories, we boosted the variability by mixing documents from independent providers, such as different government websites or publishers. In Figure 2, we show the document categories contained in DocLayNet with their respective sizes.</text>
+<text><loc_44><loc_323><loc_241><loc_384>We did not control the document selection with regard to language. The vast majority of documents contained in DocLayNet (close to 95%) are published in English language. However, DocLayNet also contains a number of documents in other languages such as German (2.5%), French (1.0%) and Japanese (1.0%). While the document language has negligible impact on the performance of computer vision methods such as object detection and segmentation models, it might prove challenging for layout analysis methods which exploit textual features.</text>
+<text><loc_44><loc_385><loc_241><loc_432>To ensure that future benchmarks in the document-layout analysis community can be easily compared, we have split up DocLayNet into pre-defined train-, test- and validation-sets. In this way, we can avoid spurious variations in the evaluation scores due to random splitting in train-, test- and validation-sets. We also ensured that less frequent labels are represented in train and test sets in equal proportions.</text>
+<footnote><loc_44><loc_443><loc_160><loc_447>$^{2}$e.g. AAPL from https://www.annualreports.com/</footnote>
+<text><loc_259><loc_55><loc_457><loc_102>Table 1 shows the overall frequency and distribution of the labels among the different sets. Importantly, we ensure that subsets are only split on full-document boundaries. This avoids that pages of the same document are spread over train, test and validation set, which can give an undesired evaluation advantage to models and lead to overestimation of their prediction accuracy. We will show the impact of this decision in Section 5.</text>
+<text><loc_260><loc_104><loc_456><loc_171>In order to accommodate the different types of models currently in use by the community, we provide DocLayNet in an augmented COCO format [16]. This entails the standard COCO ground-truth file (in JSON format) with the associated page images (in PNG format, 1025 × 1025 pixels). Furthermore, custom fields have been added to each COCO record to specify document category, original document filename and page number. In addition, we also provide the original PDF pages, as well as sidecar files containing parsed PDF text and text-cell coordinates (in JSON). All additional files are linked to the primary page images by their matching filenames.</text>
+<text><loc_259><loc_173><loc_457><loc_372>Despite being cost-intense and far less scalable than automation, human annotation has several benefits over automated groundtruth generation. The first and most obvious reason to leverage human annotations is the freedom to annotate any type of document without requiring a programmatic source. For most PDF documents, the original source document is not available. The latter is not a hard constraint with human annotation, but it is for automated methods. A second reason to use human annotations is that the latter usually provide a more natural interpretation of the page layout. The human-interpreted layout can significantly deviate from the programmatic layout used in typesetting. For example, "invisible" tables might be used solely for aligning text paragraphs on columns. Such typesetting tricks might be interpreted by automated methods incorrectly as an actual table, while the human annotation will interpret it correctly as Text or other styles. The same applies to multi-line text elements, when authors decided to space them as "invisible" list elements without bullet symbols. A third reason to gather ground-truth through human annotation is to estimate a "natural" upper bound on the segmentation accuracy. As we will show in Section 4, certain documents featuring complex layouts can have different but equally acceptable layout interpretations. This natural upper bound for segmentation accuracy can be found by annotating the same pages multiple times by different people and evaluating the inter-annotator agreement. Such a baseline consistency evaluation is very useful to define expectations for a good target accuracy in trained deep neural network models and avoid overfitting (see Table 1). On the flip side, achieving high annotation consistency proved to be a key challenge in human annotation, as we outline in Section 4.</text>
+<section_header_level_1><loc_260><loc_383><loc_384><loc_390>4 ANNOTATION CAMPAIGN</section_header_level_1>
+<text><loc_260><loc_399><loc_457><loc_446>The annotation campaign was carried out in four phases. In phase one, we identified and prepared the data sources for annotation. In phase two, we determined the class labels and how annotations should be done on the documents in order to obtain maximum consistency. The latter was guided by a detailed requirement analysis and exhaustive experiments. In phase three, we trained the annotation staff and performed exams for quality assurance. In phase four,</text>
+<page_break>
+<page_header><loc_44><loc_38><loc_456><loc_43>KDD ’22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar</page_header>
+<otsl><loc_81><loc_87><loc_419><loc_186><ecel><ecel><ched>% of Total<lcel><lcel><lcel><ched>triple inter-annotator mAP @ 0.5-0.95 (%)<lcel><lcel><lcel><lcel><lcel><nl><ched>class label<ched>Count<ched>Train<ched>Test<ched>Val<ched>All<ched>Fin<ched>Man<ched>Sci<ched>Law<ched>Pat<ched>Ten<nl><rhed>Caption<fcel>22524<fcel>2.04<fcel>1.77<fcel>2.32<fcel>84-89<fcel>40-61<fcel>86-92<fcel>94-99<fcel>95-99<fcel>69-78<fcel>n/a<nl><rhed>Footnote<fcel>6318<fcel>0.60<fcel>0.31<fcel>0.58<fcel>83-91<fcel>n/a<fcel>100<fcel>62-88<fcel>85-94<fcel>n/a<fcel>82-97<nl><rhed>Formula<fcel>25027<fcel>2.25<fcel>1.90<fcel>2.96<fcel>83-85<fcel>n/a<fcel>n/a<fcel>84-87<fcel>86-96<fcel>n/a<fcel>n/a<nl><rhed>List-item<fcel>185660<fcel>17.19<fcel>13.34<fcel>15.82<fcel>87-88<fcel>74-83<fcel>90-92<fcel>97-97<fcel>81-85<fcel>75-88<fcel>93-95<nl><rhed>Page-footer<fcel>70878<fcel>6.51<fcel>5.58<fcel>6.00<fcel>93-94<fcel>88-90<fcel>95-96<fcel>100<fcel>92-97<fcel>100<fcel>96-98<nl><rhed>Page-header<fcel>58022<fcel>5.10<fcel>6.70<fcel>5.06<fcel>85-89<fcel>66-76<fcel>90-94<fcel>98-100<fcel>91-92<fcel>97-99<fcel>81-86<nl><rhed>Picture<fcel>45976<fcel>4.21<fcel>2.78<fcel>5.31<fcel>69-71<fcel>56-59<fcel>82-86<fcel>69-82<fcel>80-95<fcel>66-71<fcel>59-76<nl><rhed>Section-header<fcel>142884<fcel>12.60<fcel>15.77<fcel>12.85<fcel>83-84<fcel>76-81<fcel>90-92<fcel>94-95<fcel>87-94<fcel>69-73<fcel>78-86<nl><rhed>Table<fcel>34733<fcel>3.20<fcel>2.27<fcel>3.60<fcel>77-81<fcel>75-80<fcel>83-86<fcel>98-99<fcel>58-80<fcel>79-84<fcel>70-85<nl><rhed>Text<fcel>510377<fcel>45.82<fcel>49.28<fcel>45.00<fcel>84-86<fcel>81-86<fcel>88-93<fcel>89-93<fcel>87-92<fcel>71-79<fcel>87-95<nl><rhed>Title<fcel>5071<fcel>0.47<fcel>0.30<fcel>0.50<fcel>60-72<fcel>24-63<fcel>50-63<fcel>94-100<fcel>82-96<fcel>68-79<fcel>24-56<nl><rhed>Total<fcel>1107470<fcel>941123<fcel>99816<fcel>66531<fcel>82-83<fcel>71-74<fcel>79-81<fcel>89-94<fcel>86-91<fcel>71-76<fcel>68-85<nl><caption><loc_44><loc_54><loc_456><loc_73>Table 1: DocLayNet dataset overview. 
Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.</caption></otsl>
+<picture><loc_43><loc_196><loc_242><loc_341><caption><loc_44><loc_350><loc_242><loc_383>Figure 3: Corpus Conversion Service annotation user interface. The PDF page is shown in the background, with overlaid text-cells (in darker shades). The annotation boxes can be drawn by dragging a rectangle over each segment with the respective label from the palette on the right.</caption></picture>
+<text><loc_44><loc_400><loc_240><loc_426>we distributed the annotation workload and performed continuous quality controls. Phase one and two required a small team of experts only. For phases three and four, a group of 40 dedicated annotators were assembled and supervised.</text>
+<text><loc_44><loc_428><loc_241><loc_447>Phase 1: Data selection and preparation. Our inclusion criteria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources</text>
+<text><loc_260><loc_197><loc_457><loc_237>include publication repositories such as arXiv$^{3}$, government offices, company websites as well as data directory services for financial reports and patents. Scanned documents were excluded wherever possible because they can be rotated or skewed. This would not allow us to perform annotation with rectangular bounding-boxes and therefore complicate the annotation process.</text>
+<text><loc_260><loc_239><loc_457><loc_320>Preparation work included uploading and parsing the sourced PDF documents in the Corpus Conversion Service (CCS) [22], a cloud-native platform which provides a visual annotation interface and allows for dataset inspection and analysis. The annotation interface of CCS is shown in Figure 3. The desired balance of pages between the different document categories was achieved by selective subsampling of pages with certain desired properties. For example, we made sure to include the title page of each document and bias the remaining page selection to those with figures or tables. The latter was achieved by leveraging pre-trained object detection models from PubLayNet, which helped us estimate how many figures and tables a given page contains.</text>
+<text><loc_259><loc_321><loc_457><loc_438>Phase 2: Label selection and guideline. We reviewed the collected documents and identified the most common structural features they exhibit. This was achieved by identifying recurrent layout elements and lead us to the definition of 11 distinct class labels. These 11 class labels are Caption , Footnote , Formula , List-item , Pagefooter , Page-header , Picture , Section-header , Table , Text , and Title . Critical factors that were considered for the choice of these class labels were (1) the overall occurrence of the label, (2) the specificity of the label, (3) recognisability on a single page (i.e. no need for context from previous or next page) and (4) overall coverage of the page. Specificity ensures that the choice of label is not ambiguous, while coverage ensures that all meaningful items on a page can be annotated. We refrained from class labels that are very specific to a document category, such as Abstract in the Scientific Articles category. We also avoided class labels that are tightly linked to the semantics of the text. Labels such as Author and Affiliation , as seen in DocBank, are often only distinguishable by discriminating on</text>
+<footnote><loc_260><loc_443><loc_302><loc_448>$^{3}$https://arxiv.org/</footnote>
+<page_break>
+<page_header><loc_44><loc_38><loc_284><loc_43>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</page_header>
+<page_header><loc_299><loc_38><loc_456><loc_43>KDD ’22, August 14-18, 2022, Washington, DC, USA</page_header>
+<text><loc_44><loc_55><loc_240><loc_67>the textual content of an element, which goes beyond visual layout recognition, in particular outside the Scientific Articles category.</text>
+<text><loc_44><loc_69><loc_241><loc_157>At first sight, the task of visual document-layout interpretation appears intuitive enough to obtain plausible annotations in most cases. However, during early trial-runs in the core team, we observed many cases in which annotators use different annotation styles, especially for documents with challenging layouts. For example, if a figure is presented with subfigures, one annotator might draw a single figure bounding-box, while another might annotate each subfigure separately. The same applies for lists, where one might annotate all list items in one block or each list item separately. In essence, we observed that challenging layouts would be annotated in different but plausible ways. To illustrate this, we show in Figure 4 multiple examples of plausible but inconsistent annotations on the same pages.</text>
+<text><loc_44><loc_159><loc_241><loc_213>Obviously, this inconsistency in annotations is not desirable for datasets which are intended to be used for model training. To minimise these inconsistencies, we created a detailed annotation guideline. While perfect consistency across 40 annotation staff members is clearly not possible to achieve, we saw a huge improvement in annotation consistency after the introduction of our annotation guideline. A few selected, non-trivial highlights of the guideline are:</text>
+<unordered_list><list_item><loc_53><loc_220><loc_240><loc_246>(1) Every list-item is an individual object instance with class label List-item . This definition is different from PubLayNet and DocBank, where all list-items are grouped together into one List object.</list_item>
+<list_item><loc_53><loc_248><loc_241><loc_274>(2) A List-item is a paragraph with hanging indentation. Single-line elements can qualify as List-item if the neighbour elements expose hanging indentation. Bullet or enumeration symbols are not a requirement.</list_item>
+<list_item><loc_53><loc_275><loc_240><loc_288>(3) For every Caption , there must be exactly one corresponding Picture or Table .</list_item>
+<list_item><loc_53><loc_289><loc_240><loc_301>(4) Connected sub-pictures are grouped together in one Picture object.</list_item>
+<list_item><loc_53><loc_303><loc_216><loc_308>(5) Formula numbers are included in a Formula object.</list_item>
+<list_item><loc_53><loc_310><loc_240><loc_329>(6) Emphasised text (e.g. in italic or bold) at the beginning of a paragraph is not considered a Section-header , unless it appears exclusively on its own line.</list_item>
 </unordered_list>
-<text><location><page_5><loc_9><loc_27><loc_48><loc_33></location>The complete annotation guideline is over 100 pages long and a detailed description is obviously out of scope for this paper. Nevertheless, it will be made publicly available alongside with DocLayNet for future reference.</text>
-<text><location><page_5><loc_9><loc_11><loc_48><loc_27></location>Phase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. Therefore we prepared a subset of pages with two different complexity levels, each with a practice and an exam part. 974 pages were reference-annotated by one proficient core team member. Annotation staff were then given the task to annotate the same subsets (blinded from the reference). By comparing the annotations of each staff member with the reference annotations, we could quantify how closely their annotations matched the reference. Only after passing two exam levels with high annotation quality, staff were admitted into the production phase. Practice iterations</text>
-<figure>
-<location><page_5><loc_52><loc_42><loc_91><loc_89></location>
-<caption>Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous.</caption>
-</figure>
-<text><location><page_5><loc_65><loc_42><loc_78><loc_42></location>05237a14f2524e3f53c8454b074409d05078038a6a36b770fcc8ec7e540deae0</text>
-<text><location><page_5><loc_52><loc_31><loc_91><loc_34></location>were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially allocated annotators did not pass the bar.</text>
-<text><location><page_5><loc_52><loc_10><loc_91><loc_31></location>Phase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11 class labels by 32 annotators. This production phase took around three months to complete. All annotations were created online through CCS, which visualises the programmatic PDF text-cells as an overlay on the page. The page annotations are obtained by drawing rectangular bounding-boxes, as shown in Figure 3. With regard to the annotation practices, we implemented a few constraints and capabilities on the tooling level. First, we only allow non-overlapping, vertically oriented, rectangular boxes. For the large majority of documents, this constraint was sufficient and it speeds up the annotation considerably in comparison with arbitrary segmentation shapes. Second, annotator staff were not able to see each other's annotations. This was enforced by design to avoid any bias in the annotation, which could skew the numbers of the inter-annotator agreement (see Table 1). We wanted</text>
-<table>
-<location><page_6><loc_10><loc_56><loc_47><loc_75></location>
-<caption>Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.</caption>
-<row_0><col_0><body></col_0><col_1><col_header>human</col_1><col_2><col_header>MRCNN</col_2><col_3><col_header>MRCNN</col_3><col_4><col_header>FRCNN</col_4><col_5><col_header>YOLO</col_5></row_0>
-<row_1><col_0><body></col_0><col_1><col_header>human</col_1><col_2><col_header>R50</col_2><col_3><col_header>R101</col_3><col_4><col_header>R101</col_4><col_5><col_header>v5x6</col_5></row_1>
-<row_2><col_0><row_header>Caption</col_0><col_1><body>84-89</col_1><col_2><body>68.4</col_2><col_3><body>71.5</col_3><col_4><body>70.1</col_4><col_5><body>77.7</col_5></row_2>
-<row_3><col_0><row_header>Footnote</col_0><col_1><body>83-91</col_1><col_2><body>70.9</col_2><col_3><body>71.8</col_3><col_4><body>73.7</col_4><col_5><body>77.2</col_5></row_3>
-<row_4><col_0><row_header>Formula</col_0><col_1><body>83-85</col_1><col_2><body>60.1</col_2><col_3><body>63.4</col_3><col_4><body>63.5</col_4><col_5><body>66.2</col_5></row_4>
-<row_5><col_0><row_header>List-item</col_0><col_1><body>87-88</col_1><col_2><body>81.2</col_2><col_3><body>80.8</col_3><col_4><body>81.0</col_4><col_5><body>86.2</col_5></row_5>
-<row_6><col_0><row_header>Page-footer</col_0><col_1><body>93-94</col_1><col_2><body>61.6</col_2><col_3><body>59.3</col_3><col_4><body>58.9</col_4><col_5><body>61.1</col_5></row_6>
-<row_7><col_0><row_header>Page-header</col_0><col_1><body>85-89</col_1><col_2><body>71.9</col_2><col_3><body>70.0</col_3><col_4><body>72.0</col_4><col_5><body>67.9</col_5></row_7>
-<row_8><col_0><row_header>Picture</col_0><col_1><body>69-71</col_1><col_2><body>71.7</col_2><col_3><body>72.7</col_3><col_4><body>72.0</col_4><col_5><body>77.1</col_5></row_8>
-<row_9><col_0><row_header>Section-header</col_0><col_1><body>83-84</col_1><col_2><body>67.6</col_2><col_3><body>69.3</col_3><col_4><body>68.4</col_4><col_5><body>74.6</col_5></row_9>
-<row_10><col_0><row_header>Table</col_0><col_1><body>77-81</col_1><col_2><body>82.2</col_2><col_3><body>82.9</col_3><col_4><body>82.2</col_4><col_5><body>86.3</col_5></row_10>
-<row_11><col_0><row_header>Text</col_0><col_1><body>84-86</col_1><col_2><body>84.6</col_2><col_3><body>85.8</col_3><col_4><body>85.4</col_4><col_5><body>88.1</col_5></row_11>
-<row_12><col_0><row_header>Title</col_0><col_1><body>60-72</col_1><col_2><body>76.7</col_2><col_3><body>80.4</col_3><col_4><body>79.9</col_4><col_5><body>82.7</col_5></row_12>
-<row_13><col_0><row_header>All</col_0><col_1><body>82-83</col_1><col_2><body>72.4</col_2><col_3><body>73.5</col_3><col_4><body>73.4</col_4><col_5><body>76.8</col_5></row_13>
-</table>
-<text><location><page_6><loc_9><loc_27><loc_48><loc_53></location>to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.</text>
-<section_header_level_1><location><page_6><loc_9><loc_24><loc_24><loc_26></location>5 EXPERIMENTS</section_header_level_1>
-<text><location><page_6><loc_9><loc_10><loc_48><loc_23></location>The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this</text>
-<figure>
-<location><page_6><loc_53><loc_67><loc_90><loc_89></location>
-<caption>Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network with ResNet50 backbone trained on increasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.</caption>
-</figure>
-<text><location><page_6><loc_52><loc_49><loc_91><loc_52></location>paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.</text>
-<text><location><page_6><loc_52><loc_39><loc_91><loc_49></location>In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16].</text>
-<section_header_level_1><location><page_6><loc_52><loc_36><loc_76><loc_37></location>Baselines for Object Detection</section_header_level_1>
-<text><location><page_6><loc_52><loc_11><loc_91><loc_35></location>In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 × 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document.</text>
-<text><location><page_7><loc_9><loc_84><loc_48><loc_89></location>Table 3: Performance of a Mask R-CNN R50 network in mAP@0.5-0.95 scores trained on DocLayNet with different class label sets. The reduced label sets were obtained by either down-mapping or dropping labels.</text>
-<table>
-<location><page_7><loc_13><loc_63><loc_44><loc_81></location>
-<caption>Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise split will result in ~10% point improvement.</caption>
-<row_0><col_0><col_header>Class-count</col_0><col_1><col_header>11</col_1><col_2><col_header>6</col_2><col_3><col_header>5</col_3><col_4><col_header>4</col_4></row_0>
-<row_1><col_0><row_header>Caption</col_0><col_1><body>68</col_1><col_2><body>Text</col_2><col_3><body>Text</col_3><col_4><body>Text</col_4></row_1>
-<row_2><col_0><row_header>Footnote</col_0><col_1><body>71</col_1><col_2><body>Text</col_2><col_3><body>Text</col_3><col_4><body>Text</col_4></row_2>
-<row_3><col_0><row_header>Formula</col_0><col_1><body>60</col_1><col_2><body>Text</col_2><col_3><body>Text</col_3><col_4><body>Text</col_4></row_3>
-<row_4><col_0><row_header>List-item</col_0><col_1><body>81</col_1><col_2><body>Text</col_2><col_3><body>82</col_3><col_4><body>Text</col_4></row_4>
-<row_5><col_0><row_header>Page-footer</col_0><col_1><body>62</col_1><col_2><body>62</col_2><col_3><body>-</col_3><col_4><body>-</col_4></row_5>
-<row_6><col_0><row_header>Page-header</col_0><col_1><body>72</col_1><col_2><body>68</col_2><col_3><body>-</col_3><col_4><body>-</col_4></row_6>
-<row_7><col_0><row_header>Picture</col_0><col_1><body>72</col_1><col_2><body>72</col_2><col_3><body>72</col_3><col_4><body>72</col_4></row_7>
-<row_8><col_0><row_header>Section-header</col_0><col_1><body>68</col_1><col_2><body>67</col_2><col_3><body>69</col_3><col_4><body>68</col_4></row_8>
-<row_9><col_0><row_header>Table</col_0><col_1><body>82</col_1><col_2><body>83</col_2><col_3><body>82</col_3><col_4><body>82</col_4></row_9>
-<row_10><col_0><row_header>Text</col_0><col_1><body>85</col_1><col_2><body>84</col_2><col_3><body>84</col_3><col_4><body>84</col_4></row_10>
-<row_11><col_0><row_header>Title</col_0><col_1><body>77</col_1><col_2><body>Sec.-h.</col_2><col_3><body>Sec.-h.</col_3><col_4><body>Sec.-h.</col_4></row_11>
-<row_12><col_0><row_header>Overall</col_0><col_1><body>72</col_1><col_2><body>73</col_2><col_3><body>78</col_3><col_4><body>77</col_4></row_12>
-</table>
-<section_header_level_1><location><page_7><loc_9><loc_58><loc_21><loc_60></location>Learning Curve</section_header_level_1>
-<text><location><page_7><loc_9><loc_33><loc_48><loc_58></location>One of the fundamental questions related to any dataset is if it is "large enough". To answer this question for DocLayNet, we performed a data ablation study in which we evaluated a Mask R-CNN model trained on increasing fractions of the DocLayNet dataset. As can be seen in Figure 5, the mAP score rises sharply in the beginning and eventually levels out. To estimate the error-bar on the metrics, we ran the training five times on the entire data-set. This resulted in a 1% error-bar, depicted by the shaded area in Figure 5. In the inset of Figure 5, we show the exact same data-points, but with a logarithmic scale on the x-axis. As is expected, the mAP score increases linearly as a function of the data-size in the inset. The curve ultimately flattens out between the 80% and 100% mark, with the 80% mark falling within the error-bars of the 100% mark. This provides a good indication that the model would not improve significantly by yet increasing the data size. Rather, it would probably benefit more from improved data consistency (as discussed in Section 3), data augmentation methods [23], or the addition of more document categories and styles.</text>
-<section_header_level_1><location><page_7><loc_9><loc_30><loc_27><loc_32></location>Impact of Class Labels</section_header_level_1>
-<text><location><page_7><loc_9><loc_11><loc_48><loc_30></location>The choice and number of labels can have a significant effect on the overall model performance. Since PubLayNet, DocBank and DocLayNet all have different label sets, it is of particular interest to understand and quantify this influence of the label set on the model performance. We investigate this by either down-mapping labels into more common ones (e.g. Caption → Text ) or excluding them from the annotations entirely. Furthermore, it must be stressed that all mappings and exclusions were performed on the data before model training. In Table 3, we present the mAP scores for a Mask R-CNN R50 network on different label sets. Where a label is down-mapped, we show its corresponding label, otherwise it was excluded. We present three different label sets, with 6, 5 and 4 different labels respectively. The set of 5 labels contains the same labels as PubLayNet. However, due to the different definition of</text>
-<table>
-<location><page_7><loc_58><loc_61><loc_85><loc_81></location>
-<row_0><col_0><body>Class-count</col_0><col_1><col_header>11</col_1><col_2><col_header>11</col_2><col_3><col_header>5</col_3><col_4><col_header>5</col_4></row_0>
-<row_1><col_0><body>Split</col_0><col_1><col_header>Doc</col_1><col_2><col_header>Page</col_2><col_3><col_header>Doc</col_3><col_4><col_header>Page</col_4></row_1>
-<row_2><col_0><row_header>Caption</col_0><col_1><body>68</col_1><col_2><body>83</col_2><col_3><body></col_3><col_4><body></col_4></row_2>
-<row_3><col_0><row_header>Footnote</col_0><col_1><body>71</col_1><col_2><body>84</col_2><col_3><body></col_3><col_4><body></col_4></row_3>
-<row_4><col_0><row_header>Formula</col_0><col_1><body>60</col_1><col_2><body>66</col_2><col_3><body></col_3><col_4><body></col_4></row_4>
-<row_5><col_0><row_header>List-item</col_0><col_1><body>81</col_1><col_2><body>88</col_2><col_3><body>82</col_3><col_4><body>88</col_4></row_5>
-<row_6><col_0><row_header>Page-footer</col_0><col_1><body>62</col_1><col_2><body>89</col_2><col_3><body></col_3><col_4><body></col_4></row_6>
-<row_7><col_0><row_header>Page-header</col_0><col_1><body>72</col_1><col_2><body>90</col_2><col_3><body></col_3><col_4><body></col_4></row_7>
-<row_8><col_0><row_header>Picture</col_0><col_1><body>72</col_1><col_2><body>82</col_2><col_3><body>72</col_3><col_4><body>82</col_4></row_8>
-<row_9><col_0><row_header>Section-header</col_0><col_1><body>68</col_1><col_2><body>83</col_2><col_3><body>69</col_3><col_4><body>83</col_4></row_9>
-<row_10><col_0><row_header>Table</col_0><col_1><body>82</col_1><col_2><body>89</col_2><col_3><body>82</col_3><col_4><body>90</col_4></row_10>
-<row_11><col_0><row_header>Text</col_0><col_1><body>85</col_1><col_2><body>91</col_2><col_3><body>84</col_3><col_4><body>90</col_4></row_11>
-<row_12><col_0><row_header>Title</col_0><col_1><body>77</col_1><col_2><body>81</col_2><col_3><body></col_3><col_4><body></col_4></row_12>
-<row_13><col_0><row_header>All</col_0><col_1><body>72</col_1><col_2><body>84</col_2><col_3><body>78</col_3><col_4><body>87</col_4></row_13>
-</table>
-<text><location><page_7><loc_52><loc_47><loc_91><loc_58></location>lists in PubLayNet (grouped list-items) versus DocLayNet (separate list-items), the label set of size 4 is the closest to PubLayNet, in the assumption that the List is down-mapped to Text in PubLayNet. The results in Table 3 show that the prediction accuracy on the remaining class labels does not change significantly when other classes are merged into them. The overall macro-average improves by around 5%, in particular when Page-footer and Page-header are excluded.</text>
-<section_header_level_1><location><page_7><loc_52><loc_44><loc_90><loc_46></location>Impact of Document Split in Train and Test Set</section_header_level_1>
-<text><location><page_7><loc_52><loc_25><loc_91><loc_44></location>Many documents in DocLayNet have a unique styling. In order to avoid overfitting on a particular style, we have split the train-, test- and validation-sets of DocLayNet on document boundaries, i.e. every document contributes pages to only one set. To the best of our knowledge, this was not considered in PubLayNet or DocBank. To quantify how this affects model performance, we trained and evaluated a Mask R-CNN R50 model on a modified dataset version. Here, the train-, test- and validation-sets were obtained by a randomised draw over the individual pages. As can be seen in Table 4, the difference in model performance is surprisingly large: page-wise splitting gains ~10% in mAP over the document-wise splitting. Thus, random page-wise splitting of DocLayNet can easily lead to accidental overestimation of model performance and should be avoided.</text>
-<section_header_level_1><location><page_7><loc_52><loc_22><loc_68><loc_23></location>Dataset Comparison</section_header_level_1>
-<text><location><page_7><loc_52><loc_11><loc_91><loc_21></location>Throughout this paper, we claim that DocLayNet's wider variety of document layouts leads to more robust layout detection models. In Table 5, we provide evidence for that. We trained models on each of the available datasets (PubLayNet, DocBank and DocLayNet) and evaluated them on the test sets of the other datasets. Due to the different label sets and annotation styles, a direct comparison is not possible. Hence, we focussed on the common labels among the datasets. Between PubLayNet and DocLayNet, these are Picture ,</text>
-<table>
-<location><page_8><loc_12><loc_57><loc_45><loc_78></location>
-<caption>Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.</caption>
-<row_0><col_0><body></col_0><col_1><body></col_1><col_2><col_header>Testing on</col_2><col_3><col_header>Testing on</col_3><col_4><col_header>Testing on</col_4></row_0>
-<row_1><col_0><col_header>Training on</col_0><col_1><col_header>labels</col_1><col_2><col_header>PLN</col_2><col_3><col_header>DB</col_3><col_4><col_header>DLN</col_4></row_1>
-<row_2><col_0><row_header>PubLayNet (PLN)</col_0><col_1><row_header>Figure</col_1><col_2><body>96</col_2><col_3><body>43</col_3><col_4><body>23</col_4></row_2>
-<row_3><col_0><row_header>PubLayNet (PLN)</col_0><col_1><row_header>Sec-header</col_1><col_2><body>87</col_2><col_3><body>-</col_3><col_4><body>32</col_4></row_3>
-<row_4><col_0><row_header>PubLayNet (PLN)</col_0><col_1><row_header>Table</col_1><col_2><body>95</col_2><col_3><body>24</col_3><col_4><body>49</col_4></row_4>
-<row_5><col_0><row_header>PubLayNet (PLN)</col_0><col_1><row_header>Text</col_1><col_2><body>96</col_2><col_3><body>-</col_3><col_4><body>42</col_4></row_5>
-<row_6><col_0><row_header>PubLayNet (PLN)</col_0><col_1><row_header>total</col_1><col_2><body>93</col_2><col_3><body>34</col_3><col_4><body>30</col_4></row_6>
-<row_7><col_0><row_header>DocBank (DB)</col_0><col_1><row_header>Figure</col_1><col_2><body>77</col_2><col_3><body>71</col_3><col_4><body>31</col_4></row_7>
-<row_8><col_0><row_header>DocBank (DB)</col_0><col_1><row_header>Table</col_1><col_2><body>19</col_2><col_3><body>65</col_3><col_4><body>22</col_4></row_8>
-<row_9><col_0><row_header>DocBank (DB)</col_0><col_1><row_header>total</col_1><col_2><body>48</col_2><col_3><body>68</col_3><col_4><body>27</col_4></row_9>
-<row_10><col_0><row_header>DocLayNet (DLN)</col_0><col_1><row_header>Figure</col_1><col_2><body>67</col_2><col_3><body>51</col_3><col_4><body>72</col_4></row_10>
-<row_11><col_0><row_header>DocLayNet (DLN)</col_0><col_1><row_header>Sec-header</col_1><col_2><body>53</col_2><col_3><body>-</col_3><col_4><body>68</col_4></row_11>
-<row_12><col_0><row_header>DocLayNet (DLN)</col_0><col_1><row_header>Table</col_1><col_2><body>87</col_2><col_3><body>43</col_3><col_4><body>82</col_4></row_12>
-<row_13><col_0><row_header>DocLayNet (DLN)</col_0><col_1><row_header>Text</col_1><col_2><body>77</col_2><col_3><body>-</col_3><col_4><body>84</col_4></row_13>
-<row_14><col_0><row_header>DocLayNet (DLN)</col_0><col_1><row_header>total</col_1><col_2><body>59</col_2><col_3><body>47</col_3><col_4><body>78</col_4></row_14>
-</table>
-<text><location><page_8><loc_9><loc_44><loc_48><loc_51></location>Section-header , Table and Text . Before training, we either mapped or excluded DocLayNet's other labels as specified in table 3, and also PubLayNet's List to Text . Note that the different clustering of lists (by list-element vs. whole list objects) naturally decreases the mAP score for Text .</text>
-<text><location><page_8><loc_9><loc_26><loc_48><loc_44></location>For comparison of DocBank with DocLayNet, we trained only on Picture and Table clusters of each dataset. We had to exclude Text because successive paragraphs are often grouped together into a single object in DocBank. This paragraph grouping is incompatible with the individual paragraphs of DocLayNet. As can be seen in Table 5, DocLayNet trained models yield better performance compared to the previous datasets. It is noteworthy that the models trained on PubLayNet and DocBank perform very well on their own test set, but have a much lower performance on the foreign datasets. While this also applies to DocLayNet, the difference is far less pronounced. Thus we conclude that DocLayNet trained models are overall more robust and will produce better results for challenging, unseen layouts.</text>
-<section_header_level_1><location><page_8><loc_9><loc_22><loc_25><loc_24></location>Example Predictions</section_header_level_1>
-<text><location><page_8><loc_9><loc_11><loc_48><loc_22></location>To conclude this section, we illustrate the quality of layout predictions one can expect from DocLayNet-trained models by providing a selection of examples without any further post-processing applied. Figure 6 shows selected layout predictions on pages from the test-set of DocLayNet. Results look decent in general across document categories, however one can also observe mistakes such as overlapping clusters of different classes, or entirely missing boxes due to low confidence.</text>
-<section_header_level_1><location><page_8><loc_52><loc_88><loc_66><loc_89></location>6 CONCLUSION</section_header_level_1>
-<text><location><page_8><loc_52><loc_76><loc_91><loc_87></location>In this paper, we presented the DocLayNet dataset. It provides the document conversion and layout analysis research community a new and challenging dataset to improve and fine-tune novel ML methods on. In contrast to many other datasets, DocLayNet was created by human annotation in order to obtain reliable layout ground-truth on a wide variety of publication- and typesetting-styles. Including a large proportion of documents outside the scientific publishing domain adds significant value in this respect.</text>
-<text><location><page_8><loc_52><loc_64><loc_91><loc_76></location>From the dataset, we have derived on the one hand reference metrics for human performance on document-layout annotation (through double and triple annotations) and on the other hand evaluated the baseline performance of commonly used object detection methods. We also illustrated the impact of various dataset-related aspects on model performance through data-ablation experiments, both from a size and class-label perspective. Last but not least, we compared the accuracy of models trained on other public datasets and showed that DocLayNet trained models are more robust.</text>
-<text><location><page_8><loc_52><loc_60><loc_91><loc_64></location>To date, there is still a significant gap between human and ML accuracy on the layout interpretation task, and we hope that this work will inspire the research community to close that gap.</text>
-<section_header_level_1><location><page_8><loc_52><loc_56><loc_63><loc_58></location>REFERENCES</section_header_level_1>
-<unordered_list>
-<list_item><location><page_8><loc_52><loc_53><loc_91><loc_56></location>[1] Max Göbel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013.</list_item>
-<list_item><location><page_8><loc_52><loc_49><loc_91><loc_53></location>[2] Christian Clausner, Apostolos Antonacopoulos, and Stefan Pletschacher. Icdar2017 competition on recognition of documents with complex layouts rdcl2017. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 1404-1410, 2017.</list_item>
-<list_item><location><page_8><loc_52><loc_46><loc_91><loc_49></location>[3] Hervé Déjean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), April 2019. http://sac.founderit.com/.</list_item>
-<list_item><location><page_8><loc_52><loc_42><loc_91><loc_46></location>[4] Antonio Jimeno Yepes, Peter Zhong, and Douglas Burdick. Competition on scientific literature parsing. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 605-617. LNCS 12824, Springer-Verlag, sep 2021.</list_item>
-<list_item><location><page_8><loc_52><loc_38><loc_91><loc_42></location>[5] Logan Markewich, Hao Zhang, Yubin Xing, Navid Lambert-Shirzad, Jiang Zhexin, Roy Lee, Zhi Li, and Seok-Bum Ko. Segmentation for document layout analysis: not dead yet. International Journal on Document Analysis and Recognition (IJDAR) , pages 1-11, 01 2022.</list_item>
-<list_item><location><page_8><loc_52><loc_35><loc_91><loc_38></location>[6] Xu Zhong, Jianbin Tang, and Antonio Jimeno-Yepes. Publaynet: Largest dataset ever for document layout analysis. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 1015-1022, sep 2019.</list_item>
-<list_item><location><page_8><loc_52><loc_30><loc_91><loc_35></location>[7] Minghao Li, Yiheng Xu, Lei Cui, Shaohan Huang, Furu Wei, Zhoujun Li, and Ming Zhou. Docbank: A benchmark dataset for document layout analysis. In Proceedings of the 28th International Conference on Computational Linguistics , COLING, pages 949-960. International Committee on Computational Linguistics, dec 2020.</list_item>
-<list_item><location><page_8><loc_52><loc_27><loc_91><loc_30></location>[8] Riaz Ahmad, Muhammad Tanvir Afzal, and M. Qadir. Information extraction from pdf sources based on rule-based system using integrated formats. In SemWebEval@ESWC , 2016.</list_item>
-<list_item><location><page_8><loc_52><loc_23><loc_91><loc_27></location>[9] Ross B. Girshick, Jeff Donahue, Trevor Darrell, and Jitendra Malik. Rich feature hierarchies for accurate object detection and semantic segmentation. In IEEE Conference on Computer Vision and Pattern Recognition , CVPR, pages 580-587. IEEE Computer Society, jun 2014.</list_item>
-<list_item><location><page_8><loc_52><loc_21><loc_91><loc_23></location>[10] Ross B. Girshick. Fast R-CNN. In 2015 IEEE International Conference on Computer Vision , ICCV, pages 1440-1448. IEEE Computer Society, dec 2015.</list_item>
-<list_item><location><page_8><loc_52><loc_18><loc_91><loc_21></location>[11] Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE Transactions on Pattern Analysis and Machine Intelligence , 39(6):1137-1149, 2017.</list_item>
-<list_item><location><page_8><loc_52><loc_15><loc_91><loc_18></location>[12] Kaiming He, Georgia Gkioxari, Piotr Dollár, and Ross B. Girshick. Mask R-CNN. In IEEE International Conference on Computer Vision , ICCV, pages 2980-2988. IEEE Computer Society, Oct 2017.</list_item>
-<list_item><location><page_8><loc_52><loc_11><loc_91><loc_15></location>[13] Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012, TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing, tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana, Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu</list_item>
+<text><loc_44><loc_336><loc_241><loc_363>The complete annotation guideline is over 100 pages long and a detailed description is obviously out of scope for this paper. Nevertheless, it will be made publicly available alongside with DocLayNet for future reference.</text>
+<text><loc_44><loc_364><loc_241><loc_446>Phase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. Therefore we prepared a subset of pages with two different complexity levels, each with a practice and an exam part. 974 pages were reference-annotated by one proficient core team member. Annotation staff were then given the task to annotate the same subsets (blinded from the reference). By comparing the annotations of each staff member with the reference annotations, we could quantify how closely their annotations matched the reference. Only after passing two exam levels with high annotation quality, staff were admitted into the production phase. Practice iterations</text>
+<picture><loc_258><loc_54><loc_457><loc_290><caption><loc_260><loc_299><loc_457><loc_318>Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous.</caption></picture>
+<text><loc_327><loc_289><loc_389><loc_291>05237a14f2524e3f53c8454b074409d05078038a6a36b770fcc8ec7e540deae0</text>
+<text><loc_259><loc_332><loc_456><loc_344>were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially allocated annotators did not pass the bar.</text>
+<text><loc_259><loc_346><loc_457><loc_448>Phase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11 class labels by 32 annotators. This production phase took around three months to complete. All annotations were created online through CCS, which visualises the programmatic PDF text-cells as an overlay on the page. The page annotation are obtained by drawing rectangular bounding-boxes, as shown in Figure 3. With regard to the annotation practices, we implemented a few constraints and capabilities on the tooling level. First, we only allow non-overlapping, vertically oriented, rectangular boxes. For the large majority of documents, this constraint was sufficient and it speeds up the annotation considerably in comparison with arbitrary segmentation shapes. Second, annotator staff were not able to see each other's annotations. This was enforced by design to avoid any bias in the annotation, which could skew the numbers of the inter-annotator agreement (see Table 1). We wanted</text>
+<page_break>
+<page_header><loc_44><loc_38><loc_456><loc_43>KDD ’22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar</page_header>
+<otsl><loc_51><loc_124><loc_233><loc_222><ecel><ched>human<ched>MRCNN<lcel><ched>FRCNN<ched>YOLO<nl><ecel><ucel><ched>R50<ched>R101<ched>R101<ched>v5x6<nl><rhed>Caption<fcel>84-89<fcel>68.4<fcel>71.5<fcel>70.1<fcel>77.7<nl><rhed>Footnote<fcel>83-91<fcel>70.9<fcel>71.8<fcel>73.7<fcel>77.2<nl><rhed>Formula<fcel>83-85<fcel>60.1<fcel>63.4<fcel>63.5<fcel>66.2<nl><rhed>List-item<fcel>87-88<fcel>81.2<fcel>80.8<fcel>81.0<fcel>86.2<nl><rhed>Page-footer<fcel>93-94<fcel>61.6<fcel>59.3<fcel>58.9<fcel>61.1<nl><rhed>Page-header<fcel>85-89<fcel>71.9<fcel>70.0<fcel>72.0<fcel>67.9<nl><rhed>Picture<fcel>69-71<fcel>71.7<fcel>72.7<fcel>72.0<fcel>77.1<nl><rhed>Section-header<fcel>83-84<fcel>67.6<fcel>69.3<fcel>68.4<fcel>74.6<nl><rhed>Table<fcel>77-81<fcel>82.2<fcel>82.9<fcel>82.2<fcel>86.3<nl><rhed>Text<fcel>84-86<fcel>84.6<fcel>85.8<fcel>85.4<fcel>88.1<nl><rhed>Title<fcel>60-72<fcel>76.7<fcel>80.4<fcel>79.9<fcel>82.7<nl><rhed>All<fcel>82-83<fcel>72.4<fcel>73.5<fcel>73.4<fcel>76.8<nl><caption><loc_44><loc_55><loc_242><loc_116>Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.</caption></otsl>
+<text><loc_44><loc_234><loc_241><loc_364>to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.</text>
+<section_header_level_1><loc_44><loc_371><loc_120><loc_378>5 EXPERIMENTS</section_header_level_1>
+<text><loc_44><loc_387><loc_241><loc_448>The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this</text>
+<picture><loc_264><loc_57><loc_452><loc_164><caption><loc_260><loc_176><loc_457><loc_216>Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network with ResNet50 backbone trained on increasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.</caption></picture>
+<text><loc_260><loc_242><loc_456><loc_255>paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.</text>
+<text><loc_260><loc_256><loc_456><loc_303>In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16].</text>
+<section_header_level_1><loc_260><loc_314><loc_381><loc_320>Baselines for Object Detection</section_header_level_1>
+<text><loc_260><loc_323><loc_456><loc_446>In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 × 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document.</text>
+<page_break>
+<page_header><loc_44><loc_38><loc_284><loc_43>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</page_header>
+<page_header><loc_299><loc_38><loc_456><loc_43>KDD ’22, August 14-18, 2022, Washington, DC, USA</page_header>
+<text><loc_44><loc_55><loc_242><loc_81>Table 3: Performance of a Mask R-CNN R50 network in mAP@0.5-0.95 scores trained on DocLayNet with different class label sets. The reduced label sets were obtained by either down-mapping or dropping labels.</text>
+<otsl><loc_66><loc_95><loc_218><loc_187><ched>Class-count<ched>11<ched>6<ched>5<ched>4<nl><rhed>Caption<fcel>68<fcel>Text<fcel>Text<fcel>Text<nl><rhed>Footnote<fcel>71<fcel>Text<fcel>Text<fcel>Text<nl><rhed>Formula<fcel>60<fcel>Text<fcel>Text<fcel>Text<nl><rhed>List-item<fcel>81<fcel>Text<fcel>82<fcel>Text<nl><rhed>Page-footer<fcel>62<fcel>62<fcel>-<fcel>-<nl><rhed>Page-header<fcel>72<fcel>68<fcel>-<fcel>-<nl><rhed>Picture<fcel>72<fcel>72<fcel>72<fcel>72<nl><rhed>Section-header<fcel>68<fcel>67<fcel>69<fcel>68<nl><rhed>Table<fcel>82<fcel>83<fcel>82<fcel>82<nl><rhed>Text<fcel>85<fcel>84<fcel>84<fcel>84<nl><rhed>Title<fcel>77<fcel>Sec.-h.<fcel>Sec.-h.<fcel>Sec.-h.<nl><rhed>Overall<fcel>72<fcel>73<fcel>78<fcel>77<nl><caption><loc_260><loc_55><loc_457><loc_81>Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise split will result in GLYPH<tildelow> 10% point improvement.</caption></otsl>
+<section_header_level_1><loc_44><loc_202><loc_107><loc_208>Learning Curve</section_header_level_1>
+<text><loc_43><loc_211><loc_241><loc_334>One of the fundamental questions related to any dataset is if it is "large enough". To answer this question for DocLayNet, we performed a data ablation study in which we evaluated a Mask R-CNN model trained on increasing fractions of the DocLayNet dataset. As can be seen in Figure 5, the mAP score rises sharply in the beginning and eventually levels out. To estimate the error-bar on the metrics, we ran the training five times on the entire data-set. This resulted in a 1% error-bar, depicted by the shaded area in Figure 5. In the inset of Figure 5, we show the exact same data-points, but with a logarithmic scale on the x-axis. As is expected, the mAP score increases linearly as a function of the data-size in the inset. The curve ultimately flattens out between the 80% and 100% mark, with the 80% mark falling within the error-bars of the 100% mark. This provides a good indication that the model would not improve significantly by yet increasing the data size. Rather, it would probably benefit more from improved data consistency (as discussed in Section 3), data augmentation methods [23], or the addition of more document categories and styles.</text>
+<section_header_level_1><loc_44><loc_342><loc_134><loc_349>Impact of Class Labels</section_header_level_1>
+<text><loc_44><loc_352><loc_241><loc_447>The choice and number of labels can have a significant effect on the overall model performance. Since PubLayNet, DocBank and DocLayNet all have different label sets, it is of particular interest to understand and quantify this influence of the label set on the model performance. We investigate this by either down-mapping labels into more common ones (e.g. Caption → Text ) or excluding them from the annotations entirely. Furthermore, it must be stressed that all mappings and exclusions were performed on the data before model training. In Table 3, we present the mAP scores for a Mask R-CNN R50 network on different label sets. Where a label is down-mapped, we show its corresponding label, otherwise it was excluded. We present three different label sets, with 6, 5 and 4 different labels respectively. The set of 5 labels contains the same labels as PubLayNet. However, due to the different definition of</text>
+<otsl><loc_288><loc_95><loc_427><loc_193><fcel>Class-count<ched>11<lcel><ched>5<lcel><nl><fcel>Split<ched>Doc<ched>Page<ched>Doc<ched>Page<nl><rhed>Caption<fcel>68<fcel>83<ecel><ecel><nl><rhed>Footnote<fcel>71<fcel>84<ecel><ecel><nl><rhed>Formula<fcel>60<fcel>66<ecel><ecel><nl><rhed>List-item<fcel>81<fcel>88<fcel>82<fcel>88<nl><rhed>Page-footer<fcel>62<fcel>89<ecel><ecel><nl><rhed>Page-header<fcel>72<fcel>90<ecel><ecel><nl><rhed>Picture<fcel>72<fcel>82<fcel>72<fcel>82<nl><rhed>Section-header<fcel>68<fcel>83<fcel>69<fcel>83<nl><rhed>Table<fcel>82<fcel>89<fcel>82<fcel>90<nl><rhed>Text<fcel>85<fcel>91<fcel>84<fcel>90<nl><rhed>Title<fcel>77<fcel>81<ecel><ecel><nl><rhed>All<fcel>72<fcel>84<fcel>78<fcel>87<nl></otsl>
+<text><loc_260><loc_209><loc_457><loc_263>lists in PubLayNet (grouped list-items) versus DocLayNet (separate list-items), the label set of size 4 is the closest to PubLayNet, in the assumption that the List is down-mapped to Text in PubLayNet. The results in Table 3 show that the prediction accuracy on the remaining class labels does not change significantly when other classes are merged into them. The overall macro-average improves by around 5%, in particular when Page-footer and Page-header are excluded.</text>
+<section_header_level_1><loc_260><loc_271><loc_449><loc_278>Impact of Document Split in Train and Test Set</section_header_level_1>
+<text><loc_259><loc_281><loc_457><loc_376>Many documents in DocLayNet have a unique styling. In order to avoid overfitting on a particular style, we have split the train-, test- and validation-sets of DocLayNet on document boundaries, i.e. every document contributes pages to only one set. To the best of our knowledge, this was not considered in PubLayNet or DocBank. To quantify how this affects model performance, we trained and evaluated a Mask R-CNN R50 model on a modified dataset version. Here, the train-, test- and validation-sets were obtained by a randomised draw over the individual pages. As can be seen in Table 4, the difference in model performance is surprisingly large: pagewise splitting gains ˜ 10% in mAP over the document-wise splitting. Thus, random page-wise splitting of DocLayNet can easily lead to accidental overestimation of model performance and should be avoided.</text>
+<section_header_level_1><loc_260><loc_384><loc_342><loc_391>Dataset Comparison</section_header_level_1>
+<text><loc_260><loc_394><loc_457><loc_447>Throughout this paper, we claim that DocLayNet's wider variety of document layouts leads to more robust layout detection models. In Table 5, we provide evidence for that. We trained models on each of the available datasets (PubLayNet, DocBank and DocLayNet) and evaluated them on the test sets of the other datasets. Due to the different label sets and annotation styles, a direct comparison is not possible. Hence, we focussed on the common labels among the datasets. Between PubLayNet and DocLayNet, these are Picture ,</text>
+<page_break>
+<page_header><loc_44><loc_38><loc_456><loc_43>KDD ’22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar</page_header>
+<otsl><loc_59><loc_109><loc_225><loc_215><ecel><ecel><ched>Testing on<lcel><lcel><nl><ched>Training on<ched>labels<ched>PLN<ched>DB<ched>DLN<nl><rhed>PubLayNet (PLN)<rhed>Figure<fcel>96<fcel>43<fcel>23<nl><ucel><rhed>Sec-header<fcel>87<fcel>-<fcel>32<nl><ucel><rhed>Table<fcel>95<fcel>24<fcel>49<nl><ucel><rhed>Text<fcel>96<fcel>-<fcel>42<nl><ucel><rhed>total<fcel>93<fcel>34<fcel>30<nl><rhed>DocBank (DB)<rhed>Figure<fcel>77<fcel>71<fcel>31<nl><ucel><rhed>Table<fcel>19<fcel>65<fcel>22<nl><ucel><rhed>total<fcel>48<fcel>68<fcel>27<nl><rhed>DocLayNet (DLN)<rhed>Figure<fcel>67<fcel>51<fcel>72<nl><ucel><rhed>Sec-header<fcel>53<fcel>-<fcel>68<nl><ucel><rhed>Table<fcel>87<fcel>43<fcel>82<nl><ucel><rhed>Text<fcel>77<fcel>-<fcel>84<nl><ucel><rhed>total<fcel>59<fcel>47<fcel>78<nl><caption><loc_44><loc_55><loc_242><loc_95>Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.</caption></otsl>
+<text><loc_44><loc_247><loc_240><loc_280>Section-header , Table and Text . Before training, we either mapped or excluded DocLayNet's other labels as specified in table 3, and also PubLayNet's List to Text . Note that the different clustering of lists (by list-element vs. whole list objects) naturally decreases the mAP score for Text .</text>
+<text><loc_44><loc_281><loc_241><loc_370>For comparison of DocBank with DocLayNet, we trained only on Picture and Table clusters of each dataset. We had to exclude Text because successive paragraphs are often grouped together into a single object in DocBank. This paragraph grouping is incompatible with the individual paragraphs of DocLayNet. As can be seen in Table 5, DocLayNet trained models yield better performance compared to the previous datasets. It is noteworthy that the models trained on PubLayNet and DocBank perform very well on their own test set, but have a much lower performance on the foreign datasets. While this also applies to DocLayNet, the difference is far less pronounced. Thus we conclude that DocLayNet trained models are overall more robust and will produce better results for challenging, unseen layouts.</text>
+<section_header_level_1><loc_44><loc_382><loc_127><loc_388>Example Predictions</section_header_level_1>
+<text><loc_44><loc_392><loc_241><loc_445>To conclude this section, we illustrate the quality of layout predictions one can expect from DocLayNet-trained models by providing a selection of examples without any further post-processing applied. Figure 6 shows selected layout predictions on pages from the test-set of DocLayNet. Results look decent in general across document categories, however one can also observe mistakes such as overlapping clusters of different classes, or entirely missing boxes due to low confidence.</text>
+<section_header_level_1><loc_260><loc_54><loc_331><loc_61>6 CONCLUSION</section_header_level_1>
+<text><loc_260><loc_64><loc_457><loc_118>In this paper, we presented the DocLayNet dataset. It provides the document conversion and layout analysis research community a new and challenging dataset to improve and fine-tune novel ML methods on. In contrast to many other datasets, DocLayNet was created by human annotation in order to obtain reliable layout ground-truth on a wide variety of publication- and typesettingstyles. Including a large proportion of documents outside the scientific publishing domain adds significant value in this respect.</text>
+<text><loc_260><loc_119><loc_457><loc_180>From the dataset, we have derived on the one hand reference metrics for human performance on document-layout annotation (through double and triple annotations) and on the other hand evaluated the baseline performance of commonly used object detection methods. We also illustrated the impact of various dataset-related aspects on model performance through data-ablation experiments, both from a size and class-label perspective. Last but not least, we compared the accuracy of models trained on other public datasets and showed that DocLayNet trained models are more robust.</text>
+<text><loc_259><loc_181><loc_456><loc_201>To date, there is still a significant gap between human and ML accuracy on the layout interpretation task, and we hope that this work will inspire the research community to close that gap.</text>
+<section_header_level_1><loc_260><loc_212><loc_316><loc_218>REFERENCES</section_header_level_1>
+<unordered_list><list_item><loc_262><loc_220><loc_456><loc_234>[1] Max Göbel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013.</list_item>
+<list_item><loc_262><loc_235><loc_457><loc_254>[2] Christian Clausner, Apostolos Antonacopoulos, and Stefan Pletschacher. Icdar2017 competition on recognition of documents with complex layouts rdcl2017. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 1404-1410, 2017.</list_item>
+<list_item><loc_262><loc_255><loc_456><loc_270>[3] Hervé Déjean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), April 2019. http://sac.founderit.com/.</list_item>
+<list_item><loc_262><loc_270><loc_457><loc_290>[4] Antonio Jimeno Yepes, Peter Zhong, and Douglas Burdick. Competition on scientific literature parsing. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 605-617. LNCS 12824, SpringerVerlag, sep 2021.</list_item>
+<list_item><loc_262><loc_291><loc_457><loc_310>[5] Logan Markewich, Hao Zhang, Yubin Xing, Navid Lambert-Shirzad, Jiang Zhexin, Roy Lee, Zhi Li, and Seok-Bum Ko. Segmentation for document layout analysis: not dead yet. International Journal on Document Analysis and Recognition (IJDAR) , pages 1-11, 01 2022.</list_item>
+<list_item><loc_262><loc_311><loc_456><loc_325>[6] Xu Zhong, Jianbin Tang, and Antonio Jimeno-Yepes. Publaynet: Largest dataset ever for document layout analysis. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 1015-1022, sep 2019.</list_item>
+<list_item><loc_262><loc_326><loc_457><loc_350>[7] Minghao Li, Yiheng Xu, Lei Cui, Shaohan Huang, Furu Wei, Zhoujun Li, and Ming Zhou. Docbank: A benchmark dataset for document layout analysis. In Proceedings of the 28th International Conference on Computational Linguistics , COLING, pages 949-960. International Committee on Computational Linguistics, dec 2020.</list_item>
+<list_item><loc_262><loc_351><loc_457><loc_365>[8] Riaz Ahmad, Muhammad Tanvir Afzal, and M. Qadir. Information extraction from pdf sources based on rule-based system using integrated formats. In SemWebEval@ESWC , 2016.</list_item>
+<list_item><loc_262><loc_366><loc_457><loc_385>[9] Ross B. Girshick, Jeff Donahue, Trevor Darrell, and Jitendra Malik. Rich feature hierarchies for accurate object detection and semantic segmentation. In IEEE Conference on Computer Vision and Pattern Recognition , CVPR, pages 580-587. IEEE Computer Society, jun 2014.</list_item>
+<list_item><loc_260><loc_386><loc_456><loc_395>[10] Ross B. Girshick. Fast R-CNN. In 2015 IEEE International Conference on Computer Vision , ICCV, pages 1440-1448. IEEE Computer Society, dec 2015.</list_item>
+<list_item><loc_260><loc_396><loc_456><loc_410>[11] Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE Transactions on Pattern Analysis and Machine Intelligence , 39(6):1137-1149, 2017.</list_item>
+<list_item><loc_260><loc_411><loc_457><loc_426>[12] Kaiming He, Georgia Gkioxari, Piotr Dollár, and Ross B. Girshick. Mask R-CNN. In IEEE International Conference on Computer Vision , ICCV, pages 2980-2988. IEEE Computer Society, Oct 2017.</list_item>
+<list_item><loc_260><loc_426><loc_457><loc_446>[13] Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012, TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing, tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana, Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu</list_item>
 </unordered_list>
-<figure>
-<location><page_9><loc_9><loc_44><loc_91><loc_89></location>
-<caption>Text Caption List-Item Formula Table Section-Header Picture Page-Header Page-Footer Title</caption>
-</figure>
-<text><location><page_9><loc_9><loc_36><loc_91><loc_41></location>Figure 6: Example layout predictions on selected pages from the DocLayNet test-set. (A, D) exhibit favourable results on coloured backgrounds. (B, C) show accurate list-item and paragraph differentiation despite densely-spaced lines. (E) demonstrates good table and figure distinction. (F) shows predictions on a Chinese patent with multiple overlaps, label confusion and missing boxes.</text>
-<text><location><page_9><loc_11><loc_31><loc_48><loc_33></location>Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang. ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow export, opencv dnn support, October 2021.</text>
-<unordered_list>
-<list_item><location><page_9><loc_9><loc_28><loc_48><loc_30></location>[14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers. CoRR , abs/2005.12872, 2020.</list_item>
-<list_item><location><page_9><loc_9><loc_26><loc_48><loc_27></location>[15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and efficient object detection. CoRR , abs/1911.09070, 2019.</list_item>
-<list_item><location><page_9><loc_9><loc_23><loc_48><loc_25></location>[16] Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick. Microsoft COCO: common objects in context, 2014.</list_item>
-<list_item><location><page_9><loc_9><loc_21><loc_48><loc_22></location>[17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross Girshick. Detectron2, 2019.</list_item>
-<list_item><location><page_9><loc_9><loc_16><loc_48><loc_20></location>[18] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter W. J. Staar. Robust pdf document conversion using recurrent neural networks. In Proceedings of the 35th Conference on Artificial Intelligence , AAAI, pages 1513715145, feb 2021.</list_item>
-<list_item><location><page_9><loc_9><loc_10><loc_48><loc_15></location>[19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. Layoutlm: Pre-training of text and layout for document image understanding. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing Machinery.</list_item>
-<list_item><location><page_9><loc_52><loc_32><loc_91><loc_33></location>[20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.</list_item>
-<list_item><location><page_9><loc_52><loc_29><loc_91><loc_31></location>[21] Peng Zhang, Can Li, Liang Qiao, Zhanzhan Cheng, Shiliang Pu, Yi Niu, and Fei Wu. Vsr: A unified framework for document layout analysis combining vision, semantics and relations, 2021.</list_item>
-<list_item><location><page_9><loc_52><loc_25><loc_91><loc_28></location>[22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018.</list_item>
-<list_item><location><page_9><loc_52><loc_23><loc_91><loc_24></location>[23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019.</list_item>
-</document>
+<page_break>
+<page_header><loc_44><loc_38><loc_284><loc_43>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</page_header>
+<page_header><loc_299><loc_38><loc_456><loc_43>KDD ’22, August 14-18, 2022, Washington, DC, USA</page_header>
+<picture><loc_43><loc_53><loc_455><loc_279><caption><loc_51><loc_279><loc_260><loc_283>Text Caption List-Item Formula Table Section-Header Picture Page-Header Page-Footer Title</caption></picture>
+<text><loc_44><loc_293><loc_457><loc_319>Figure 6: Example layout predictions on selected pages from the DocLayNet test-set. (A, D) exhibit favourable results on coloured backgrounds. (B, C) show accurate list-item and paragraph differentiation despite densely-spaced lines. (E) demonstrates good table and figure distinction. (F) shows predictions on a Chinese patent with multiple overlaps, label confusion and missing boxes.</text>
+<text><loc_57><loc_333><loc_241><loc_347>Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang. ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow export, opencv dnn support, October 2021.</text>
+<unordered_list><list_item><loc_44><loc_348><loc_241><loc_362>[14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers. CoRR , abs/2005.12872, 2020.</list_item>
+<list_item><loc_44><loc_363><loc_240><loc_372>[15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and efficient object detection. CoRR , abs/1911.09070, 2019.</list_item>
+<list_item><loc_44><loc_373><loc_241><loc_387>[16] Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick. Microsoft COCO: common objects in context, 2014.</list_item>
+<list_item><loc_44><loc_388><loc_241><loc_397>[17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross Girshick. Detectron2, 2019.</list_item>
+<list_item><loc_44><loc_398><loc_241><loc_422>[18] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter W. J. Staar. Robust pdf document conversion using recurrent neural networks. In Proceedings of the 35th Conference on Artificial Intelligence , AAAI, pages 1513715145, feb 2021.</list_item>
+<list_item><loc_44><loc_423><loc_241><loc_448>[19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. Layoutlm: Pre-training of text and layout for document image understanding. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing Machinery.</list_item>
+<list_item><loc_260><loc_333><loc_457><loc_342>[20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.</list_item>
+<list_item><loc_260><loc_343><loc_457><loc_357>[21] Peng Zhang, Can Li, Liang Qiao, Zhanzhan Cheng, Shiliang Pu, Yi Niu, and Fei Wu. Vsr: A unified framework for document layout analysis combining vision, semantics and relations, 2021.</list_item>
+<list_item><loc_260><loc_358><loc_457><loc_377>[22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018.</list_item>
+<list_item><loc_260><loc_378><loc_457><loc_387>[23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019.</list_item>
+</unordered_list>
+</doctag>
--- a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.doctags.txt
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.doctags.txt
@ -1,19 +1,10 @@
-<document>
-<text><location><page_1><loc_22><loc_81><loc_79><loc_85></location>order to compute the TED score. Inference timing results for all experiments were obtained from the same machine on a single core with AMD EPYC 7763 CPU @2.45 GHz.</text>
-<section_header_level_1><location><page_1><loc_22><loc_77><loc_52><loc_79></location>5.1 Hyper Parameter Optimization</section_header_level_1>
-<text><location><page_1><loc_22><loc_68><loc_79><loc_77></location>We have chosen the PubTabNet data set to perform HPO, since it includes a highly diverse set of tables. Also we report TED scores separately for simple and complex tables (tables with cell spans). Results are presented in Table. 1. It is evident that with OTSL, our model achieves the same TED score and slightly better mAP scores in comparison to HTML. However OTSL yields a 2x speed up in the inference runtime over HTML.</text>
-<table>
-<location><page_1><loc_23><loc_41><loc_78><loc_57></location>
-<caption>Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.</caption>
-<row_0><col_0><col_header>#</col_0><col_1><col_header>#</col_1><col_2><col_header>Language</col_2><col_3><col_header>TEDs</col_3><col_4><col_header>TEDs</col_4><col_5><col_header>TEDs</col_5><col_6><col_header>mAP</col_6><col_7><col_header>Inference</col_7></row_0>
-<row_1><col_0><col_header>enc-layers</col_0><col_1><col_header>dec-layers</col_1><col_2><col_header>Language</col_2><col_3><col_header>simple</col_3><col_4><col_header>complex</col_4><col_5><col_header>all</col_5><col_6><col_header>(0.75)</col_6><col_7><col_header>time (secs)</col_7></row_1>
-<row_2><col_0><body>6</col_0><col_1><body>6</col_1><col_2><body>OTSL HTML</col_2><col_3><body>0.965 0.969</col_3><col_4><body>0.934 0.927</col_4><col_5><body>0.955 0.955</col_5><col_6><body>0.88 0.857</col_6><col_7><body>2.73 5.39</col_7></row_2>
-<row_3><col_0><body>4</col_0><col_1><body>4</col_1><col_2><body>OTSL HTML</col_2><col_3><body>0.938</col_3><col_4><body>0.904</col_4><col_5><body>0.927</col_5><col_6><body>0.853</col_6><col_7><body>1.97</col_7></row_3>
-<row_4><col_0><body></col_0><col_1><body></col_1><col_2><body>OTSL</col_2><col_3><body>0.952 0.923</col_3><col_4><body>0.909</col_4><col_5><body>0.938</col_5><col_6><body>0.843</col_6><col_7><body>3.77</col_7></row_4>
-<row_5><col_0><body>2</col_0><col_1><body>4</col_1><col_2><body>HTML</col_2><col_3><body>0.945</col_3><col_4><body>0.897 0.901</col_4><col_5><body>0.915 0.931</col_5><col_6><body>0.859 0.834</col_6><col_7><body>1.91 3.81</col_7></row_5>
-<row_6><col_0><body>4</col_0><col_1><body>2</col_1><col_2><body>OTSL HTML</col_2><col_3><body>0.952 0.944</col_3><col_4><body>0.92 0.903</col_4><col_5><body>0.942 0.931</col_5><col_6><body>0.857 0.824</col_6><col_7><body>1.22 2</col_7></row_6>
-</table>
-<section_header_level_1><location><page_1><loc_22><loc_35><loc_43><loc_36></location>5.2 Quantitative Results</section_header_level_1>
-<text><location><page_1><loc_22><loc_22><loc_79><loc_34></location>We picked the model parameter configuration that produced the best prediction quality (enc=6, dec=6, heads=8) with PubTabNet alone, then independently trained and evaluated it on three publicly available data sets: PubTabNet (395k samples), FinTabNet (113k samples) and PubTables-1M (about 1M samples). Performance results are presented in Table. 2. It is clearly evident that the model trained on OTSL outperforms HTML across the board, keeping high TEDs and mAP scores even on difficult financial tables (FinTabNet) that contain sparse and large tables.</text>
-<text><location><page_1><loc_22><loc_16><loc_79><loc_22></location>Additionally, the results show that OTSL has an advantage over HTML when applied on a bigger data set like PubTables-1M and achieves significantly improved scores. Finally, OTSL achieves faster inference due to fewer decoding steps which is a result of the reduced sequence representation.</text>
-</document>
+<doctag><page_header><loc_159><loc_58><loc_366><loc_65>Optimized Table Tokenization for Table Structure Recognition</page_header>
+<page_header><loc_389><loc_58><loc_393><loc_65>9</page_header>
+<text><loc_110><loc_74><loc_393><loc_97>order to compute the TED score. Inference timing results for all experiments were obtained from the same machine on a single core with AMD EPYC 7763 CPU @2.45 GHz.</text>
+<section_header_level_1><loc_110><loc_105><loc_260><loc_113>5.1 Hyper Parameter Optimization</section_header_level_1>
+<text><loc_110><loc_116><loc_393><loc_161>We have chosen the PubTabNet data set to perform HPO, since it includes a highly diverse set of tables. Also we report TED scores separately for simple and complex tables (tables with cell spans). Results are presented in Table. 1. It is evident that with OTSL, our model achieves the same TED score and slightly better mAP scores in comparison to HTML. However OTSL yields a 2x speed up in the inference runtime over HTML.</text>
+<otsl><loc_114><loc_213><loc_388><loc_296><ched>#<ched>#<ched>Language<ched>TEDs<lcel><lcel><ched>mAP<ched>Inference<nl><ched>enc-layers<ched>dec-layers<ucel><ched>simple<ched>complex<ched>all<ched>(0.75)<ched>time (secs)<nl><fcel>6<fcel>6<fcel>OTSL HTML<fcel>0.965 0.969<fcel>0.934 0.927<fcel>0.955 0.955<fcel>0.88 0.857<fcel>2.73 5.39<nl><fcel>4<fcel>4<fcel>OTSL HTML<fcel>0.938<fcel>0.904<fcel>0.927<fcel>0.853<fcel>1.97<nl><ecel><ecel><fcel>OTSL<fcel>0.952 0.923<fcel>0.909<fcel>0.938<fcel>0.843<fcel>3.77<nl><fcel>2<fcel>4<fcel>HTML<fcel>0.945<fcel>0.897 0.901<fcel>0.915 0.931<fcel>0.859 0.834<fcel>1.91 3.81<nl><fcel>4<fcel>2<fcel>OTSL HTML<fcel>0.952 0.944<fcel>0.92 0.903<fcel>0.942 0.931<fcel>0.857 0.824<fcel>1.22 2<nl><caption><loc_110><loc_172><loc_393><loc_207>Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.</caption></otsl>
+<section_header_level_1><loc_110><loc_319><loc_216><loc_327>5.2 Quantitative Results</section_header_level_1>
+<text><loc_110><loc_330><loc_393><loc_390>We picked the model parameter configuration that produced the best prediction quality (enc=6, dec=6, heads=8) with PubTabNet alone, then independently trained and evaluated it on three publicly available data sets: PubTabNet (395k samples), FinTabNet (113k samples) and PubTables-1M (about 1M samples). Performance results are presented in Table. 2. It is clearly evident that the model trained on OTSL outperforms HTML across the board, keeping high TEDs and mAP scores even on difficult financial tables (FinTabNet) that contain sparse and large tables.</text>
+<text><loc_110><loc_390><loc_393><loc_421>Additionally, the results show that OTSL has an advantage over HTML when applied on a bigger data set like PubTables-1M and achieves significantly improved scores. Finally, OTSL achieves faster inference due to fewer decoding steps which is a result of the reduced sequence representation.</text>
+</doctag>
--- a/tests/data/groundtruth/docling_v2/2305.03393v1.doctags.txt
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1.doctags.txt
@ -1,154 +1,149 @@
-<document>
-<section_header_level_1><location><page_1><loc_22><loc_82><loc_79><loc_85></location>Optimized Table Tokenization for Table Structure Recognition</section_header_level_1>
-<text><location><page_1><loc_23><loc_75><loc_78><loc_79></location>Maksym Lysak [0000 − 0002 − 3723 − $^{6960]}$, Ahmed Nassar[0000 − 0002 − 9468 − $^{0822]}$, Nikolaos Livathinos [0000 − 0001 − 8513 − $^{3491]}$, Christoph Auer[0000 − 0001 − 5761 − $^{0422]}$, [0000 − 0002 − 8088 − 0823]</text>
-<text><location><page_1><loc_38><loc_74><loc_49><loc_75></location>and Peter Staar</text>
-<text><location><page_1><loc_46><loc_72><loc_55><loc_73></location>IBM Research</text>
-<text><location><page_1><loc_36><loc_70><loc_64><loc_71></location>{mly,ahn,nli,cau,taa}@zurich.ibm.com</text>
-<text><location><page_1><loc_27><loc_41><loc_74><loc_66></location>Abstract. Extracting tables from documents is a crucial task in any document conversion pipeline. Recently, transformer-based models have demonstrated that table-structure can be recognized with impressive accuracy using Image-to-Markup-Sequence (Im2Seq) approaches. Taking only the image of a table, such models predict a sequence of tokens (e.g. in HTML, LaTeX) which represent the structure of the table. Since the token representation of the table structure has a significant impact on the accuracy and run-time performance of any Im2Seq model, we investigate in this paper how table-structure representation can be optimised. We propose a new, optimised table-structure language (OTSL) with a minimized vocabulary and specific rules. The benefits of OTSL are that it reduces the number of tokens to 5 (HTML needs 28+) and shortens the sequence length to half of HTML on average. Consequently, model accuracy improves significantly, inference time is halved compared to HTML-based models, and the predicted table structures are always syntactically correct. This in turn eliminates most post-processing needs. Popular table structure data-sets will be published in OTSL format to the community.</text>
-<text><location><page_1><loc_27><loc_37><loc_74><loc_40></location>Keywords: Table Structure Recognition · Data Representation · Transformers · Optimization.</text>
-<section_header_level_1><location><page_1><loc_22><loc_33><loc_37><loc_34></location>1 Introduction</section_header_level_1>
-<text><location><page_1><loc_22><loc_21><loc_79><loc_31></location>Tables are ubiquitous in documents such as scientific papers, patents, reports, manuals, specification sheets or marketing material. They often encode highly valuable information and therefore need to be extracted with high accuracy. Unfortunately, tables appear in documents in various sizes, styling and structure, making it difficult to recover their correct structure with simple analytical methods. Therefore, accurate table extraction is achieved these days with machine-learning based methods.</text>
-<text><location><page_1><loc_22><loc_16><loc_79><loc_20></location>In modern document understanding systems [1,15], table extraction is typically a two-step process. Firstly, every table on a page is located with a bounding box, and secondly, their logical row and column structure is recognized. As of</text>
-<figure>
-<location><page_2><loc_24><loc_46><loc_76><loc_74></location>
-<caption>Fig. 1. Comparison between HTML and OTSL table structure representation: (A) table-example with complex row and column headers, including a 2D empty span, (B) minimal graphical representation of table structure using rectangular layout, (C) HTML representation, (D) OTSL representation. This example demonstrates many of the key-features of OTSL, namely its reduced vocabulary size (12 versus 5 in this case), its reduced sequence length (55 versus 30) and a enhanced internal structure (variable token sequence length per row in HTML versus a fixed length of rows in OTSL).</caption>
-</figure>
-<text><location><page_2><loc_22><loc_34><loc_79><loc_43></location>today, table detection in documents is a well understood problem, and the latest state-of-the-art (SOTA) object detection methods provide an accuracy comparable to human observers [7,8,10,14,23]. On the other hand, the problem of table structure recognition (TSR) is a lot more challenging and remains a very active area of research, in which many novel machine learning algorithms are being explored [3,4,5,9,11,12,13,14,17,18,21,22].</text>
-<text><location><page_2><loc_22><loc_16><loc_79><loc_34></location>Recently emerging SOTA methods for table structure recognition employ transformer-based models, in which an image of the table is provided to the network in order to predict the structure of the table as a sequence of tokens. These image-to-sequence (Im2Seq) models are extremely powerful, since they allow for a purely data-driven solution. The tokens of the sequence typically belong to a markup language such as HTML, Latex or Markdown, which allow to describe table structure as rows, columns and spanning cells in various configurations. In Figure 1, we illustrate how HTML is used to represent the table-structure of a particular example table. Public table-structure data sets such as PubTabNet [22], and FinTabNet [21], which were created in a semi-automated way from paired PDF and HTML sources (e.g. PubMed Central), popularized primarily the use of HTML as ground-truth representation format for TSR.</text>
-<text><location><page_3><loc_22><loc_73><loc_79><loc_85></location>While the majority of research in TSR is currently focused on the development and application of novel neural model architectures, the table structure representation language (e.g. HTML in PubTabNet and FinTabNet) is usually adopted as is for the sequence tokenization in Im2Seq models. In this paper, we aim for the opposite and investigate the impact of the table structure representation language with an otherwise unmodified Im2Seq transformer-based architecture. Since the current state-of-the-art Im2Seq model is TableFormer [9], we select this model to perform our experiments.</text>
-<text><location><page_3><loc_22><loc_58><loc_79><loc_73></location>The main contribution of this paper is the introduction of a new optimised table structure language (OTSL), specifically designed to describe table-structure in an compact and structured way for Im2Seq models. OTSL has a number of key features, which make it very attractive to use in Im2Seq models. Specifically, compared to other languages such as HTML, OTSL has a minimized vocabulary which yields short sequence length, strong inherent structure (e.g. strict rectangular layout) and a strict syntax with rules that only look backwards. The latter allows for syntax validation during inference and ensures a syntactically correct table-structure. These OTSL features are illustrated in Figure 1, in comparison to HTML.</text>
-<text><location><page_3><loc_22><loc_45><loc_79><loc_58></location>The paper is structured as follows. In section 2, we give an overview of the latest developments in table-structure reconstruction. In section 3 we review the current HTML table encoding (popularised by PubTabNet and FinTabNet) and discuss its flaws. Subsequently, we introduce OTSL in section 4, which includes the language definition, syntax rules and error-correction procedures. In section 5, we apply OTSL on the TableFormer architecture, compare it to TableFormer models trained on HTML and ultimately demonstrate the advantages of using OTSL. Finally, in section 6 we conclude our work and outline next potential steps.</text>
-<section_header_level_1><location><page_3><loc_22><loc_40><loc_39><loc_42></location>2 Related Work</section_header_level_1>
-<text><location><page_3><loc_22><loc_16><loc_79><loc_38></location>Approaches to formalize the logical structure and layout of tables in electronic documents date back more than two decades [16]. In the recent past, a wide variety of computer vision methods have been explored to tackle the problem of table structure recognition, i.e. the correct identification of columns, rows and spanning cells in a given table. Broadly speaking, the current deeplearning based approaches fall into three categories: object detection (OD) methods, Graph-Neural-Network (GNN) methods and Image-to-Markup-Sequence (Im2Seq) methods. Object-detection based methods [11,12,13,14,21] rely on tablestructure annotation using (overlapping) bounding boxes for training, and produce bounding-box predictions to define table cells, rows, and columns on a table image. Graph Neural Network (GNN) based methods [3,6,17,18], as the name suggests, represent tables as graph structures. The graph nodes represent the content of each table cell, an embedding vector from the table image, or geometric coordinates of the table cell. The edges of the graph define the relationship between the nodes, e.g. if they belong to the same column, row, or table cell.</text>
-<text><location><page_4><loc_22><loc_67><loc_79><loc_85></location>Other work [20] aims at predicting a grid for each table and deciding which cells must be merged using an attention network. Im2Seq methods cast the problem as a sequence generation task [4,5,9,22], and therefore need an internal tablestructure representation language, which is often implemented with standard markup languages (e.g. HTML, LaTeX, Markdown). In theory, Im2Seq methods have a natural advantage over the OD and GNN methods by virtue of directly predicting the table-structure. As such, no post-processing or rules are needed in order to obtain the table-structure, which is necessary with OD and GNN approaches. In practice, this is not entirely true, because a predicted sequence of table-structure markup does not necessarily have to be syntactically correct. Hence, depending on the quality of the predicted sequence, some post-processing needs to be performed to ensure a syntactically valid (let alone correct) sequence.</text>
-<text><location><page_4><loc_22><loc_39><loc_79><loc_67></location>Within the Im2Seq method, we find several popular models, namely the encoder-dual-decoder model (EDD) [22], TableFormer [9], Tabsplitter[2] and Ye et. al. [19]. EDD uses two consecutive long short-term memory (LSTM) decoders to predict a table in HTML representation. The tag decoder predicts a sequence of HTML tags. For each decoded table cell ( <td> ), the attention is passed to the cell decoder to predict the content with an embedded OCR approach. The latter makes it susceptible to transcription errors in the cell content of the table. TableFormer address this reliance on OCR and uses two transformer decoders for HTML structure and cell bounding box prediction in an end-to-end architecture. The predicted cell bounding box is then used to extract text tokens from an originating (digital) PDF page, circumventing any need for OCR. TabSplitter [2] proposes a compact double-matrix representation of table rows and columns to do error detection and error correction of HTML structure sequences based on predictions from [19]. This compact double-matrix representation can not be used directly by the Img2seq model training, so the model uses HTML as an intermediate form. Chi et. al. [4] introduce a data set and a baseline method using bidirectional LSTMs to predict LaTeX code. Kayal [5] introduces Gated ResNet transformers to predict LaTeX code, and a separate OCR module to extract content.</text>
-<text><location><page_4><loc_22><loc_26><loc_79><loc_38></location>Im2Seq approaches have shown to be well-suited for the TSR task and allow a full end-to-end network design that can output the final table structure without pre- or post-processing logic. Furthermore, Im2Seq models have demonstrated to deliver state-of-the-art prediction accuracy [9]. This motivated the authors to investigate if the performance (both in accuracy and inference time) can be further improved by optimising the table structure representation language. We believe this is a necessary step before further improving neural network architectures for this task.</text>
-<section_header_level_1><location><page_4><loc_22><loc_22><loc_44><loc_24></location>3 Problem Statement</section_header_level_1>
-<text><location><page_4><loc_22><loc_16><loc_79><loc_20></location>All known Im2Seq based models for TSR fundamentally work in similar ways. Given an image of a table, the Im2Seq model predicts the structure of the table by generating a sequence of tokens. These tokens originate from a finite vocab-</text>
-<text><location><page_5><loc_22><loc_76><loc_79><loc_85></location>ulary and can be interpreted as a table structure. For example, with the HTML tokens <table> , </table> , <tr> , </tr> , <td> and </td> , one can construct simple table structures without any spanning cells. In reality though, one needs at least 28 HTML tokens to describe the most common complex tables observed in real-world documents [21,22], due to a variety of spanning cells definitions in the HTML token vocabulary.</text>
-<figure>
-<location><page_5><loc_22><loc_57><loc_78><loc_71></location>
-<caption>Fig. 2. Frequency of tokens in HTML and OTSL as they appear in PubTabNet.</caption>
-</figure>
-<text><location><page_5><loc_22><loc_33><loc_79><loc_54></location>Obviously, HTML and other general-purpose markup languages were not designed for Im2Seq models. As such, they have some serious drawbacks. First, the token vocabulary needs to be artificially large in order to describe all plausible tabular structures. Since most Im2Seq models use an autoregressive approach, they generate the sequence token by token. Therefore, to reduce inference time, a shorter sequence length is critical. Every table-cell is represented by at least two tokens ( <td> and </td> ). Furthermore, when tokenizing the HTML structure, one needs to explicitly enumerate possible column-spans and row-spans as words. In practice, this ends up requiring 28 different HTML tokens (when including column- and row-spans up to 10 cells) just to describe every table in the PubTabNet dataset. Clearly, not every token is equally represented, as is depicted in Figure 2. This skewed distribution of tokens in combination with variable token row-length makes it challenging for models to learn the HTML structure.</text>
-<text><location><page_5><loc_22><loc_27><loc_79><loc_32></location>Additionally, it would be desirable if the representation would easily allow an early detection of invalid sequences on-the-go, before the prediction of the entire table structure is completed. HTML is not well-suited for this purpose as the verification of incomplete sequences is non-trivial or even impossible.</text>
-<text><location><page_5><loc_22><loc_16><loc_79><loc_26></location>In a valid HTML table, the token sequence must describe a 2D grid of table cells, serialised in row-major ordering, where each row and each column have the same length (while considering row- and column-spans). Furthermore, every opening tag in HTML needs to be matched by a closing tag in a correct hierarchical manner. Since the number of tokens for each table row and column can vary significantly, especially for large tables with many row- and column-spans, it is complex to verify the consistency of predicted structures during sequence</text>
-<text><location><page_6><loc_22><loc_82><loc_79><loc_85></location>generation. Implicitly, this also means that Im2Seq models need to learn these complex syntax rules, simply to deliver valid output.</text>
-<text><location><page_6><loc_22><loc_63><loc_79><loc_82></location>In practice, we observe two major issues with prediction quality when training Im2Seq models on HTML table structure generation from images. On the one hand, we find that on large tables, the visual attention of the model often starts to drift and is not accurately moving forward cell by cell anymore. This manifests itself in either in an increasing location drift for proposed table-cells in later rows on the same column or even complete loss of vertical alignment, as illustrated in Figure 5. Addressing this with post-processing is partially possible, but clearly undesired. On the other hand, we find many instances of predictions with structural inconsistencies or plain invalid HTML output, as shown in Figure 6, which are nearly impossible to properly correct. Both problems seriously impact the TSR model performance, since they reflect not only in the task of pure structure recognition but also in the equally crucial recognition or matching of table cell content.</text>
-<section_header_level_1><location><page_6><loc_22><loc_58><loc_61><loc_60></location>4 Optimised Table Structure Language</section_header_level_1>
-<text><location><page_6><loc_22><loc_44><loc_79><loc_56></location>To mitigate the issues with HTML in Im2Seq-based TSR models laid out before, we propose here our Optimised Table Structure Language (OTSL). OTSL is designed to express table structure with a minimized vocabulary and a simple set of rules, which are both significantly reduced compared to HTML. At the same time, OTSL enables easy error detection and correction during sequence generation. We further demonstrate how the compact structure representation and minimized sequence length improves prediction accuracy and inference time in the TableFormer architecture.</text>
-<section_header_level_1><location><page_6><loc_22><loc_40><loc_43><loc_41></location>4.1 Language Definition</section_header_level_1>
-<text><location><page_6><loc_22><loc_34><loc_79><loc_38></location>In Figure 3, we illustrate how the OTSL is defined. In essence, the OTSL defines only 5 tokens that directly describe a tabular structure based on an atomic 2D grid.</text>
-<text><location><page_6><loc_24><loc_33><loc_67><loc_34></location>The OTSL vocabulary is comprised of the following tokens:</text>
-<unordered_list>
-<list_item><location><page_6><loc_23><loc_30><loc_75><loc_31></location>-"C" cell a new table cell that either has or does not have cell content</list_item>
-<list_item><location><page_6><loc_23><loc_27><loc_79><loc_29></location>-"L" cell left-looking cell , merging with the left neighbor cell to create a span</list_item>
-<list_item><location><page_6><loc_23><loc_24><loc_79><loc_26></location>-"U" cell up-looking cell , merging with the upper neighbor cell to create a span</list_item>
-<list_item><location><page_6><loc_23><loc_22><loc_74><loc_23></location>-"X" cell cross cell , to merge with both left and upper neighbor cells</list_item>
-<list_item><location><page_6><loc_23><loc_20><loc_54><loc_21></location>-"NL" new-line , switch to the next row.</list_item>
+<doctag><page_header><loc_15><loc_132><loc_30><loc_350>arXiv:2305.03393v1 [cs.CV] 5 May 2023</page_header>
+<section_header_level_1><loc_110><loc_73><loc_393><loc_92>Optimized Table Tokenization for Table Structure Recognition</section_header_level_1>
+<text><loc_114><loc_107><loc_389><loc_126>Maksym Lysak [0000 − 0002 − 3723 − $^{6960]}$, Ahmed Nassar[0000 − 0002 − 9468 − $^{0822]}$, Nikolaos Livathinos [0000 − 0001 − 8513 − $^{3491]}$, Christoph Auer[0000 − 0001 − 5761 − $^{0422]}$, [0000 − 0002 − 8088 − 0823]</text>
+<text><loc_188><loc_123><loc_244><loc_129>and Peter Staar</text>
+<text><loc_228><loc_137><loc_275><loc_142>IBM Research</text>
+<text><loc_182><loc_144><loc_321><loc_149>{mly,ahn,nli,cau,taa}@zurich.ibm.com</text>
+<text><loc_133><loc_171><loc_369><loc_293>Abstract. Extracting tables from documents is a crucial task in any document conversion pipeline. Recently, transformer-based models have demonstrated that table-structure can be recognized with impressive accuracy using Image-to-Markup-Sequence (Im2Seq) approaches. Taking only the image of a table, such models predict a sequence of tokens (e.g. in HTML, LaTeX) which represent the structure of the table. Since the token representation of the table structure has a significant impact on the accuracy and run-time performance of any Im2Seq model, we investigate in this paper how table-structure representation can be optimised. We propose a new, optimised table-structure language (OTSL) with a minimized vocabulary and specific rules. The benefits of OTSL are that it reduces the number of tokens to 5 (HTML needs 28+) and shortens the sequence length to half of HTML on average. Consequently, model accuracy improves significantly, inference time is halved compared to HTML-based models, and the predicted table structures are always syntactically correct. This in turn eliminates most post-processing needs. Popular table structure data-sets will be published in OTSL format to the community.</text>
+<text><loc_133><loc_302><loc_369><loc_314>Keywords: Table Structure Recognition · Data Representation · Transformers · Optimization.</text>
+<section_header_level_1><loc_110><loc_330><loc_187><loc_336>1 Introduction</section_header_level_1>
+<text><loc_110><loc_346><loc_393><loc_397>Tables are ubiquitous in documents such as scientific papers, patents, reports, manuals, specification sheets or marketing material. They often encode highly valuable information and therefore need to be extracted with high accuracy. Unfortunately, tables appear in documents in various sizes, styling and structure, making it difficult to recover their correct structure with simple analytical methods. Therefore, accurate table extraction is achieved these days with machine-learning based methods.</text>
+<text><loc_110><loc_399><loc_393><loc_420>In modern document understanding systems [1,15], table extraction is typically a two-step process. Firstly, every table on a page is located with a bounding box, and secondly, their logical row and column structure is recognized. As of</text>
+<page_break>
+<page_header><loc_110><loc_59><loc_114><loc_64>2</page_header>
+<page_header><loc_137><loc_59><loc_189><loc_64>M. Lysak, et al.</page_header>
+<picture><loc_121><loc_132><loc_379><loc_269><caption><loc_110><loc_80><loc_393><loc_126>Fig. 1. Comparison between HTML and OTSL table structure representation: (A) table-example with complex row and column headers, including a 2D empty span, (B) minimal graphical representation of table structure using rectangular layout, (C) HTML representation, (D) OTSL representation. This example demonstrates many of the key-features of OTSL, namely its reduced vocabulary size (12 versus 5 in this case), its reduced sequence length (55 versus 30) and a enhanced internal structure (variable token sequence length per row in HTML versus a fixed length of rows in OTSL).</caption></picture>
+<text><loc_110><loc_286><loc_393><loc_329>today, table detection in documents is a well understood problem, and the latest state-of-the-art (SOTA) object detection methods provide an accuracy comparable to human observers [7,8,10,14,23]. On the other hand, the problem of table structure recognition (TSR) is a lot more challenging and remains a very active area of research, in which many novel machine learning algorithms are being explored [3,4,5,9,11,12,13,14,17,18,21,22].</text>
+<text><loc_110><loc_331><loc_393><loc_420>Recently emerging SOTA methods for table structure recognition employ transformer-based models, in which an image of the table is provided to the network in order to predict the structure of the table as a sequence of tokens. These image-to-sequence (Im2Seq) models are extremely powerful, since they allow for a purely data-driven solution. The tokens of the sequence typically belong to a markup language such as HTML, Latex or Markdown, which allow to describe table structure as rows, columns and spanning cells in various configurations. In Figure 1, we illustrate how HTML is used to represent the table-structure of a particular example table. Public table-structure data sets such as PubTabNet [22], and FinTabNet [21], which were created in a semi-automated way from paired PDF and HTML sources (e.g. PubMed Central), popularized primarily the use of HTML as ground-truth representation format for TSR.</text>
+<page_break>
+<page_header><loc_159><loc_59><loc_366><loc_64>Optimized Table Tokenization for Table Structure Recognition</page_header>
+<page_header><loc_389><loc_59><loc_393><loc_64>3</page_header>
+<text><loc_110><loc_75><loc_393><loc_133>While the majority of research in TSR is currently focused on the development and application of novel neural model architectures, the table structure representation language (e.g. HTML in PubTabNet and FinTabNet) is usually adopted as is for the sequence tokenization in Im2Seq models. In this paper, we aim for the opposite and investigate the impact of the table structure representation language with an otherwise unmodified Im2Seq transformer-based architecture. Since the current state-of-the-art Im2Seq model is TableFormer [9], we select this model to perform our experiments.</text>
+<text><loc_110><loc_136><loc_393><loc_209>The main contribution of this paper is the introduction of a new optimised table structure language (OTSL), specifically designed to describe table-structure in a compact and structured way for Im2Seq models. OTSL has a number of key features, which make it very attractive to use in Im2Seq models. Specifically, compared to other languages such as HTML, OTSL has a minimized vocabulary which yields short sequence length, strong inherent structure (e.g. strict rectangular layout) and a strict syntax with rules that only look backwards. The latter allows for syntax validation during inference and ensures a syntactically correct table-structure. These OTSL features are illustrated in Figure 1, in comparison to HTML.</text>
+<text><loc_110><loc_211><loc_393><loc_277>The paper is structured as follows. In section 2, we give an overview of the latest developments in table-structure reconstruction. In section 3 we review the current HTML table encoding (popularised by PubTabNet and FinTabNet) and discuss its flaws. Subsequently, we introduce OTSL in section 4, which includes the language definition, syntax rules and error-correction procedures. In section 5, we apply OTSL on the TableFormer architecture, compare it to TableFormer models trained on HTML and ultimately demonstrate the advantages of using OTSL. Finally, in section 6 we conclude our work and outline next potential steps.</text>
+<section_header_level_1><loc_110><loc_292><loc_193><loc_298>2 Related Work</section_header_level_1>
+<text><loc_110><loc_309><loc_396><loc_420>Approaches to formalize the logical structure and layout of tables in electronic documents date back more than two decades [16]. In the recent past, a wide variety of computer vision methods have been explored to tackle the problem of table structure recognition, i.e. the correct identification of columns, rows and spanning cells in a given table. Broadly speaking, the current deep-learning based approaches fall into three categories: object detection (OD) methods, Graph-Neural-Network (GNN) methods and Image-to-Markup-Sequence (Im2Seq) methods. Object-detection based methods [11,12,13,14,21] rely on table-structure annotation using (overlapping) bounding boxes for training, and produce bounding-box predictions to define table cells, rows, and columns on a table image. Graph Neural Network (GNN) based methods [3,6,17,18], as the name suggests, represent tables as graph structures. The graph nodes represent the content of each table cell, an embedding vector from the table image, or geometric coordinates of the table cell. The edges of the graph define the relationship between the nodes, e.g. if they belong to the same column, row, or table cell.</text>
+<page_break>
+<page_header><loc_110><loc_59><loc_114><loc_64>4</page_header>
+<page_header><loc_137><loc_59><loc_189><loc_64>M. Lysak, et al.</page_header>
+<text><loc_110><loc_75><loc_393><loc_164>Other work [20] aims at predicting a grid for each table and deciding which cells must be merged using an attention network. Im2Seq methods cast the problem as a sequence generation task [4,5,9,22], and therefore need an internal table-structure representation language, which is often implemented with standard markup languages (e.g. HTML, LaTeX, Markdown). In theory, Im2Seq methods have a natural advantage over the OD and GNN methods by virtue of directly predicting the table-structure. As such, no post-processing or rules are needed in order to obtain the table-structure, which is necessary with OD and GNN approaches. In practice, this is not entirely true, because a predicted sequence of table-structure markup does not necessarily have to be syntactically correct. Hence, depending on the quality of the predicted sequence, some post-processing needs to be performed to ensure a syntactically valid (let alone correct) sequence.</text>
+<text><loc_110><loc_166><loc_393><loc_307>Within the Im2Seq method, we find several popular models, namely the encoder-dual-decoder model (EDD) [22], TableFormer [9], TabSplitter [2] and Ye et al. [19]. EDD uses two consecutive long short-term memory (LSTM) decoders to predict a table in HTML representation. The tag decoder predicts a sequence of HTML tags. For each decoded table cell ( <td> ), the attention is passed to the cell decoder to predict the content with an embedded OCR approach. The latter makes it susceptible to transcription errors in the cell content of the table. TableFormer addresses this reliance on OCR and uses two transformer decoders for HTML structure and cell bounding box prediction in an end-to-end architecture. The predicted cell bounding box is then used to extract text tokens from an originating (digital) PDF page, circumventing any need for OCR. TabSplitter [2] proposes a compact double-matrix representation of table rows and columns to do error detection and error correction of HTML structure sequences based on predictions from [19]. This compact double-matrix representation cannot be used directly by the Im2Seq model training, so the model uses HTML as an intermediate form. Chi et al. [4] introduce a data set and a baseline method using bidirectional LSTMs to predict LaTeX code. Kayal [5] introduces Gated ResNet transformers to predict LaTeX code, and a separate OCR module to extract content.</text>
+<text><loc_110><loc_309><loc_393><loc_368>Im2Seq approaches have been shown to be well-suited for the TSR task and allow a full end-to-end network design that can output the final table structure without pre- or post-processing logic. Furthermore, Im2Seq models have been demonstrated to deliver state-of-the-art prediction accuracy [9]. This motivated the authors to investigate if the performance (both in accuracy and inference time) can be further improved by optimising the table structure representation language. We believe this is a necessary step before further improving neural network architectures for this task.</text>
+<section_header_level_1><loc_110><loc_382><loc_220><loc_389>3 Problem Statement</section_header_level_1>
+<text><loc_110><loc_399><loc_393><loc_420>All known Im2Seq based models for TSR fundamentally work in similar ways. Given an image of a table, the Im2Seq model predicts the structure of the table by generating a sequence of tokens. These tokens originate from a finite vocab-</text>
+<page_break>
+<page_header><loc_159><loc_59><loc_366><loc_64>Optimized Table Tokenization for Table Structure Recognition</page_header>
+<page_header><loc_389><loc_59><loc_393><loc_64>5</page_header>
+<text><loc_110><loc_75><loc_393><loc_118>ulary and can be interpreted as a table structure. For example, with the HTML tokens <table> , </table> , <tr> , </tr> , <td> and </td> , one can construct simple table structures without any spanning cells. In reality though, one needs at least 28 HTML tokens to describe the most common complex tables observed in real-world documents [21,22], due to a variety of spanning cells definitions in the HTML token vocabulary.</text>
+<picture><loc_112><loc_147><loc_389><loc_215><caption><loc_119><loc_140><loc_384><loc_145>Fig. 2. Frequency of tokens in HTML and OTSL as they appear in PubTabNet.</caption></picture>
+<text><loc_110><loc_232><loc_393><loc_336>Obviously, HTML and other general-purpose markup languages were not designed for Im2Seq models. As such, they have some serious drawbacks. First, the token vocabulary needs to be artificially large in order to describe all plausible tabular structures. Since most Im2Seq models use an autoregressive approach, they generate the sequence token by token. Therefore, to reduce inference time, a shorter sequence length is critical. Every table-cell is represented by at least two tokens ( <td> and </td> ). Furthermore, when tokenizing the HTML structure, one needs to explicitly enumerate possible column-spans and row-spans as words. In practice, this ends up requiring 28 different HTML tokens (when including column- and row-spans up to 10 cells) just to describe every table in the PubTabNet dataset. Clearly, not every token is equally represented, as is depicted in Figure 2. This skewed distribution of tokens in combination with variable token row-length makes it challenging for models to learn the HTML structure.</text>
+<text><loc_110><loc_338><loc_393><loc_367>Additionally, it would be desirable if the representation would easily allow an early detection of invalid sequences on-the-go, before the prediction of the entire table structure is completed. HTML is not well-suited for this purpose as the verification of incomplete sequences is non-trivial or even impossible.</text>
+<text><loc_110><loc_369><loc_393><loc_420>In a valid HTML table, the token sequence must describe a 2D grid of table cells, serialised in row-major ordering, where each row and each column have the same length (while considering row- and column-spans). Furthermore, every opening tag in HTML needs to be matched by a closing tag in a correct hierarchical manner. Since the number of tokens for each table row and column can vary significantly, especially for large tables with many row- and column-spans, it is complex to verify the consistency of predicted structures during sequence</text>
+<page_break>
+<page_header><loc_110><loc_59><loc_114><loc_64>6</page_header>
+<page_header><loc_137><loc_59><loc_189><loc_64>M. Lysak, et al.</page_header>
+<text><loc_110><loc_75><loc_393><loc_88>generation. Implicitly, this also means that Im2Seq models need to learn these complex syntax rules, simply to deliver valid output.</text>
+<text><loc_110><loc_91><loc_393><loc_187>In practice, we observe two major issues with prediction quality when training Im2Seq models on HTML table structure generation from images. On the one hand, we find that on large tables, the visual attention of the model often starts to drift and is not accurately moving forward cell by cell anymore. This manifests itself either in an increasing location drift for proposed table-cells in later rows on the same column or even complete loss of vertical alignment, as illustrated in Figure 5. Addressing this with post-processing is partially possible, but clearly undesired. On the other hand, we find many instances of predictions with structural inconsistencies or plain invalid HTML output, as shown in Figure 6, which are nearly impossible to properly correct. Both problems seriously impact the TSR model performance, since they reflect not only in the task of pure structure recognition but also in the equally crucial recognition or matching of table cell content.</text>
+<section_header_level_1><loc_110><loc_202><loc_304><loc_209>4 Optimised Table Structure Language</section_header_level_1>
+<text><loc_110><loc_220><loc_393><loc_279>To mitigate the issues with HTML in Im2Seq-based TSR models laid out before, we propose here our Optimised Table Structure Language (OTSL). OTSL is designed to express table structure with a minimized vocabulary and a simple set of rules, which are both significantly reduced compared to HTML. At the same time, OTSL enables easy error detection and correction during sequence generation. We further demonstrate how the compact structure representation and minimized sequence length improves prediction accuracy and inference time in the TableFormer architecture.</text>
+<section_header_level_1><loc_110><loc_294><loc_214><loc_300>4.1 Language Definition</section_header_level_1>
+<text><loc_110><loc_309><loc_393><loc_329>In Figure 3, we illustrate how the OTSL is defined. In essence, the OTSL defines only 5 tokens that directly describe a tabular structure based on an atomic 2D grid.</text>
+<text><loc_122><loc_332><loc_334><loc_337>The OTSL vocabulary is comprised of the following tokens:</text>
+<unordered_list><list_item><loc_115><loc_346><loc_376><loc_352>-"C" cell a new table cell that either has or does not have cell content</list_item>
+<list_item><loc_115><loc_354><loc_393><loc_367>-"L" cell left-looking cell , merging with the left neighbor cell to create a span</list_item>
+<list_item><loc_115><loc_369><loc_393><loc_382>-"U" cell up-looking cell , merging with the upper neighbor cell to create a span</list_item>
+<list_item><loc_115><loc_385><loc_371><loc_390>-"X" cell cross cell , to merge with both left and upper neighbor cells</list_item>
+<list_item><loc_115><loc_393><loc_268><loc_398>-"NL" new-line , switch to the next row.</list_item>
 </unordered_list>
-<text><location><page_6><loc_22><loc_16><loc_79><loc_19></location>A notable attribute of OTSL is that it has the capability of achieving lossless conversion to HTML.</text>
-<figure>
-<location><page_7><loc_27><loc_65><loc_73><loc_79></location>
-<caption>Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding</caption>
-</figure>
-<section_header_level_1><location><page_7><loc_22><loc_60><loc_40><loc_61></location>4.2 Language Syntax</section_header_level_1>
-<text><location><page_7><loc_22><loc_58><loc_59><loc_59></location>The OTSL representation follows these syntax rules:</text>
-<unordered_list>
-<list_item><location><page_7><loc_23><loc_54><loc_79><loc_56></location>1. Left-looking cell rule : The left neighbour of an "L" cell must be either another "L" cell or a "C" cell.</list_item>
-<list_item><location><page_7><loc_23><loc_51><loc_79><loc_53></location>2. Up-looking cell rule : The upper neighbour of a "U" cell must be either another "U" cell or a "C" cell.</list_item>
+<text><loc_110><loc_407><loc_393><loc_420>A notable attribute of OTSL is that it has the capability of achieving lossless conversion to HTML.</text>
+<page_break>
+<page_header><loc_159><loc_59><loc_366><loc_64>Optimized Table Tokenization for Table Structure Recognition</page_header>
+<page_header><loc_389><loc_59><loc_393><loc_64>7</page_header>
+<picture><loc_135><loc_103><loc_367><loc_177><caption><loc_110><loc_79><loc_393><loc_98>Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding</caption></picture>
+<section_header_level_1><loc_110><loc_193><loc_202><loc_198>4.2 Language Syntax</section_header_level_1>
+<text><loc_110><loc_205><loc_297><loc_211>The OTSL representation follows these syntax rules:</text>
+<unordered_list><list_item><loc_114><loc_219><loc_393><loc_232>1. Left-looking cell rule : The left neighbour of an "L" cell must be either another "L" cell or a "C" cell.</list_item>
+<list_item><loc_114><loc_234><loc_393><loc_247>2. Up-looking cell rule : The upper neighbour of a "U" cell must be either another "U" cell or a "C" cell.</list_item>
 </unordered_list>
-<section_header_level_1><location><page_7><loc_23><loc_49><loc_37><loc_50></location>3. Cross cell rule :</section_header_level_1>
-<unordered_list>
-<list_item><location><page_7><loc_25><loc_44><loc_79><loc_49></location>The left neighbour of an "X" cell must be either another "X" cell or a "U" cell, and the upper neighbour of an "X" cell must be either another "X" cell or an "L" cell.</list_item>
-<list_item><location><page_7><loc_23><loc_43><loc_78><loc_44></location>4. First row rule : Only "L" cells and "C" cells are allowed in the first row.</list_item>
-<list_item><location><page_7><loc_23><loc_40><loc_79><loc_43></location>5. First column rule : Only "U" cells and "C" cells are allowed in the first column.</list_item>
-<list_item><location><page_7><loc_23><loc_37><loc_79><loc_40></location>6. Rectangular rule : The table representation is always rectangular - all rows must have an equal number of tokens, terminated with "NL" token.</list_item>
+<section_header_level_1><loc_114><loc_249><loc_185><loc_255>3. Cross cell rule :</section_header_level_1>
+<unordered_list><list_item><loc_124><loc_257><loc_393><loc_278>The left neighbour of an "X" cell must be either another "X" cell or a "U" cell, and the upper neighbour of an "X" cell must be either another "X" cell or an "L" cell.</list_item>
+<list_item><loc_114><loc_280><loc_388><loc_285>4. First row rule : Only "L" cells and "C" cells are allowed in the first row.</list_item>
+<list_item><loc_114><loc_287><loc_393><loc_300>5. First column rule : Only "U" cells and "C" cells are allowed in the first column.</list_item>
+<list_item><loc_114><loc_302><loc_393><loc_315>6. Rectangular rule : The table representation is always rectangular - all rows must have an equal number of tokens, terminated with "NL" token.</list_item>
 </unordered_list>
-<text><location><page_7><loc_22><loc_19><loc_79><loc_35></location>The application of these rules gives OTSL a set of unique properties. First of all, the OTSL enforces a strictly rectangular structure representation, where every new-line token starts a new row. As a consequence, all rows and all columns have exactly the same number of tokens, irrespective of cell spans. Secondly, the OTSL representation is unambiguous: Every table structure is represented in one way. In this representation every table cell corresponds to a "C"-cell token, which in case of spans is always located in the top-left corner of the table cell definition. Third, OTSL syntax rules are only backward-looking. As a consequence, every predicted token can be validated straight during sequence generation by looking at the previously predicted sequence. As such, OTSL can guarantee that every predicted sequence is syntactically valid.</text>
-<text><location><page_7><loc_22><loc_16><loc_79><loc_19></location>These characteristics can be easily learned by sequence generator networks, as we demonstrate further below. We find strong indications that this pattern</text>
-<text><location><page_8><loc_22><loc_82><loc_79><loc_85></location>reduces significantly the column drift seen in the HTML based models (see Figure 5).</text>
-<section_header_level_1><location><page_8><loc_22><loc_78><loc_52><loc_80></location>4.3 Error-detection and -mitigation</section_header_level_1>
-<text><location><page_8><loc_22><loc_62><loc_79><loc_77></location>The design of OTSL allows to validate a table structure easily on an unfinished sequence. The detection of an invalid sequence token is a clear indication of a prediction mistake, however a valid sequence by itself does not guarantee prediction correctness. Different heuristics can be used to correct token errors in an invalid sequence and thus increase the chances for accurate predictions. Such heuristics can be applied either after the prediction of each token, or at the end on the entire predicted sequence. For example a simple heuristic which can correct the predicted OTSL sequence on-the-fly is to verify if the token with the highest prediction confidence invalidates the predicted sequence, and replace it by the token with the next highest confidence until OTSL rules are satisfied.</text>
-<section_header_level_1><location><page_8><loc_22><loc_58><loc_37><loc_59></location>5 Experiments</section_header_level_1>
-<text><location><page_8><loc_22><loc_43><loc_79><loc_56></location>To evaluate the impact of OTSL on prediction accuracy and inference times, we conducted a series of experiments based on the TableFormer model (Figure 4) with two objectives: Firstly we evaluate the prediction quality and performance of OTSL vs. HTML after performing Hyper Parameter Optimization (HPO) on the canonical PubTabNet data set. Secondly we pick the best hyper-parameters found in the first step and evaluate how OTSL impacts the performance of TableFormer after training on other publicly available data sets (FinTabNet, PubTables-1M [14]). The ground truth (GT) from all data sets has been converted into OTSL format for this purpose, and will be made publicly available.</text>
-<figure>
-<location><page_8><loc_23><loc_25><loc_77><loc_36></location>
-<caption>Fig. 4. Architecture sketch of the TableFormer model, which is a representative for the Im2Seq approach.</caption>
-</figure>
-<text><location><page_8><loc_22><loc_16><loc_79><loc_22></location>We rely on standard metrics such as Tree Edit Distance score (TEDs) for table structure prediction, and Mean Average Precision (mAP) with 0.75 Intersection Over Union (IOU) threshold for the bounding-box predictions of table cells. The predicted OTSL structures were converted back to HTML format in</text>
-<text><location><page_9><loc_22><loc_81><loc_79><loc_85></location>order to compute the TED score. Inference timing results for all experiments were obtained from the same machine on a single core with AMD EPYC 7763 CPU @2.45 GHz.</text>
-<section_header_level_1><location><page_9><loc_22><loc_78><loc_52><loc_79></location>5.1 Hyper Parameter Optimization</section_header_level_1>
-<text><location><page_9><loc_22><loc_68><loc_79><loc_77></location>We have chosen the PubTabNet data set to perform HPO, since it includes a highly diverse set of tables. Also we report TED scores separately for simple and complex tables (tables with cell spans). Results are presented in Table. 1. It is evident that with OTSL, our model achieves the same TED score and slightly better mAP scores in comparison to HTML. However OTSL yields a 2x speed up in the inference runtime over HTML.</text>
-<table>
-<location><page_9><loc_23><loc_41><loc_78><loc_57></location>
-<caption>Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.</caption>
-<row_0><col_0><col_header>#</col_0><col_1><col_header>#</col_1><col_2><col_header>Language</col_2><col_3><col_header>TEDs</col_3><col_4><col_header>TEDs</col_4><col_5><col_header>TEDs</col_5><col_6><col_header>mAP</col_6><col_7><col_header>Inference</col_7></row_0>
-<row_1><col_0><col_header>enc-layers</col_0><col_1><col_header>dec-layers</col_1><col_2><col_header>Language</col_2><col_3><col_header>simple</col_3><col_4><col_header>complex</col_4><col_5><col_header>all</col_5><col_6><col_header>(0.75)</col_6><col_7><col_header>time (secs)</col_7></row_1>
-<row_2><col_0><body>6</col_0><col_1><body>6</col_1><col_2><body>OTSL HTML</col_2><col_3><body>0.965 0.969</col_3><col_4><body>0.934 0.927</col_4><col_5><body>0.955 0.955</col_5><col_6><body>0.88 0.857</col_6><col_7><body>2.73 5.39</col_7></row_2>
-<row_3><col_0><body>4</col_0><col_1><body>4</col_1><col_2><body>OTSL HTML</col_2><col_3><body>0.938 0.952</col_3><col_4><body>0.904</col_4><col_5><body>0.927</col_5><col_6><body>0.853</col_6><col_7><body>1.97</col_7></row_3>
-<row_4><col_0><body>2</col_0><col_1><body>4</col_1><col_2><body>OTSL</col_2><col_3><body>0.923 0.945</col_3><col_4><body>0.909 0.897</col_4><col_5><body>0.938</col_5><col_6><body>0.843</col_6><col_7><body>3.77</col_7></row_4>
-<row_5><col_0><body></col_0><col_1><body></col_1><col_2><body>HTML</col_2><col_3><body></col_3><col_4><body>0.901</col_4><col_5><body>0.915 0.931</col_5><col_6><body>0.859 0.834</col_6><col_7><body>1.91 3.81</col_7></row_5>
-<row_6><col_0><body>4</col_0><col_1><body>2</col_1><col_2><body>OTSL HTML</col_2><col_3><body>0.952 0.944</col_3><col_4><body>0.92 0.903</col_4><col_5><body>0.942 0.931</col_5><col_6><body>0.857 0.824</col_6><col_7><body>1.22 2</col_7></row_6>
-</table>
-<section_header_level_1><location><page_9><loc_22><loc_35><loc_43><loc_36></location>5.2 Quantitative Results</section_header_level_1>
-<text><location><page_9><loc_22><loc_22><loc_79><loc_34></location>We picked the model parameter configuration that produced the best prediction quality (enc=6, dec=6, heads=8) with PubTabNet alone, then independently trained and evaluated it on three publicly available data sets: PubTabNet (395k samples), FinTabNet (113k samples) and PubTables-1M (about 1M samples). Performance results are presented in Table. 2. It is clearly evident that the model trained on OTSL outperforms HTML across the board, keeping high TEDs and mAP scores even on difficult financial tables (FinTabNet) that contain sparse and large tables.</text>
-<text><location><page_9><loc_22><loc_16><loc_79><loc_22></location>Additionally, the results show that OTSL has an advantage over HTML when applied on a bigger data set like PubTables-1M and achieves significantly improved scores. Finally, OTSL achieves faster inference due to fewer decoding steps which is a result of the reduced sequence representation.</text>
-<table>
-<location><page_10><loc_23><loc_67><loc_77><loc_80></location>
-<caption>Table 2. TSR and cell detection results compared between OTSL and HTML on the PubTabNet [22], FinTabNet [21] and PubTables-1M [14] data sets using TableFormer [9] (with enc=6, dec=6, heads=8).</caption>
-<row_0><col_0><body></col_0><col_1><col_header>Language</col_1><col_2><col_header>TEDs</col_2><col_3><col_header>TEDs</col_3><col_4><col_header>TEDs</col_4><col_5><col_header>mAP(0.75)</col_5><col_6><col_header>Inference time (secs)</col_6></row_0>
-<row_1><col_0><body></col_0><col_1><col_header>Language</col_1><col_2><col_header>simple</col_2><col_3><col_header>complex</col_3><col_4><col_header>all</col_4><col_5><col_header>mAP(0.75)</col_5><col_6><col_header>Inference time (secs)</col_6></row_1>
-<row_2><col_0><row_header>PubTabNet</col_0><col_1><row_header>OTSL</col_1><col_2><body>0.965</col_2><col_3><body>0.934</col_3><col_4><body>0.955</col_4><col_5><body>0.88</col_5><col_6><body>2.73</col_6></row_2>
-<row_3><col_0><row_header>PubTabNet</col_0><col_1><row_header>HTML</col_1><col_2><body>0.969</col_2><col_3><body>0.927</col_3><col_4><body>0.955</col_4><col_5><body>0.857</col_5><col_6><body>5.39</col_6></row_3>
-<row_4><col_0><row_header>FinTabNet</col_0><col_1><row_header>OTSL</col_1><col_2><body>0.955</col_2><col_3><body>0.961</col_3><col_4><body>0.959</col_4><col_5><body>0.862</col_5><col_6><body>1.85</col_6></row_4>
-<row_5><col_0><row_header>FinTabNet</col_0><col_1><row_header>HTML</col_1><col_2><body>0.917</col_2><col_3><body>0.922</col_3><col_4><body>0.92</col_4><col_5><body>0.722</col_5><col_6><body>3.26</col_6></row_5>
-<row_6><col_0><row_header>PubTables-1M</col_0><col_1><row_header>OTSL</col_1><col_2><body>0.987</col_2><col_3><body>0.964</col_3><col_4><body>0.977</col_4><col_5><body>0.896</col_5><col_6><body>1.79</col_6></row_6>
-<row_7><col_0><row_header>PubTables-1M</col_0><col_1><row_header>HTML</col_1><col_2><body>0.983</col_2><col_3><body>0.944</col_3><col_4><body>0.966</col_4><col_5><body>0.889</col_5><col_6><body>3.26</col_6></row_7>
-</table>
-<section_header_level_1><location><page_10><loc_22><loc_62><loc_42><loc_64></location>5.3 Qualitative Results</section_header_level_1>
-<text><location><page_10><loc_22><loc_54><loc_79><loc_61></location>To illustrate the qualitative differences between OTSL and HTML, Figure 5 demonstrates less overlap and more accurate bounding boxes with OTSL. In Figure 6, OTSL proves to be more effective in handling tables with longer token sequences, resulting in even more precise structure prediction and bounding boxes.</text>
-<figure>
-<location><page_10><loc_27><loc_16><loc_74><loc_44></location>
-<caption>Fig. 5. The OTSL model produces more accurate bounding boxes with less overlap (E) than the HTML model (D), when predicting the structure of a sparse table (A), at twice the inference speed because of shorter sequence length (B),(C). "PMC2807444_006_00.png" PubTabNet. μ</caption>
-</figure>
-<text><location><page_10><loc_37><loc_15><loc_38><loc_16></location>μ</text>
-<text><location><page_10><loc_49><loc_12><loc_49><loc_14></location>≥</text>
-<figure>
-<location><page_11><loc_28><loc_20><loc_73><loc_77></location>
-<caption>Fig. 6. Visualization of predicted structure and detected bounding boxes on a complex table with many rows. The OTSL model (B) captured repeating pattern of horizontally merged cells from the GT (A), unlike the HTML model (C). The HTML model also didn't complete the HTML sequence correctly and displayed a lot more of drift and overlap of bounding boxes. "PMC5406406_003_01.png" PubTabNet.</caption>
-</figure>
-<section_header_level_1><location><page_12><loc_22><loc_84><loc_36><loc_85></location>6 Conclusion</section_header_level_1>
-<text><location><page_12><loc_22><loc_74><loc_79><loc_81></location>We demonstrated that representing tables in HTML for the task of table structure recognition with Im2Seq models is ill-suited and has serious limitations. Furthermore, we presented in this paper an Optimized Table Structure Language (OTSL) which, when compared to commonly used general purpose languages, has several key benefits.</text>
-<text><location><page_12><loc_22><loc_59><loc_79><loc_74></location>First and foremost, given the same network configuration, inference time for a table-structure prediction is about 2 times faster compared to the conventional HTML approach. This is primarily owed to the shorter sequence length of the OTSL representation. Additional performance benefits can be obtained with HPO (hyper parameter optimization). As we demonstrate in our experiments, models trained on OTSL can be significantly smaller, e.g. by reducing the number of encoder and decoder layers, while preserving comparatively good prediction quality. This can further improve inference performance, yielding 5-6 times faster inference speed in OTSL with prediction quality comparable to models trained on HTML (see Table 1).</text>
-<text><location><page_12><loc_22><loc_41><loc_79><loc_59></location>Secondly, OTSL has more inherent structure and a significantly restricted vocabulary size. This allows autoregressive models to perform better in the TED metric, but especially with regards to prediction accuracy of the table-cell bounding boxes (see Table 2). As shown in Figure 5, we observe that the OTSL drastically reduces the drift for table cell bounding boxes at high row count and in sparse tables. This leads to more accurate predictions and a significant reduction in post-processing complexity, which is an undesired necessity in HTML-based Im2Seq models. Significant novelty lies in OTSL syntactical rules, which are few, simple and always backwards looking. Each new token can be validated only by analyzing the sequence of previous tokens, without requiring the entire sequence to detect mistakes. This in return allows to perform structural error detection and correction on-the-fly during sequence generation.</text>
-<section_header_level_1><location><page_12><loc_22><loc_36><loc_32><loc_38></location>References</section_header_level_1>
-<unordered_list>
-<list_item><location><page_12><loc_23><loc_29><loc_79><loc_34></location>1. Auer, C., Dolfi, M., Carvalho, A., Ramis, C.B., Staar, P.W.J.: Delivering document conversion as a cloud service with high throughput and responsiveness. CoRR abs/2206.00785 (2022). https://doi.org/10.48550/arXiv.2206.00785 , https://doi.org/10.48550/arXiv.2206.00785</list_item>
-<list_item><location><page_12><loc_23><loc_23><loc_79><loc_28></location>2. Chen, B., Peng, D., Zhang, J., Ren, Y., Jin, L.: Complex table structure recognition in the wild using transformer and identity matrix-based augmentation. In: Porwal, U., Fornés, A., Shafait, F. (eds.) Frontiers in Handwriting Recognition. pp. 545561. Springer International Publishing, Cham (2022)</list_item>
-<list_item><location><page_12><loc_23><loc_20><loc_79><loc_23></location>3. Chi, Z., Huang, H., Xu, H.D., Yu, H., Yin, W., Mao, X.L.: Complicated table structure recognition. arXiv preprint arXiv:1908.04729 (2019)</list_item>
-<list_item><location><page_12><loc_23><loc_16><loc_79><loc_20></location>4. Deng, Y., Rosenberg, D., Mann, G.: Challenges in end-to-end neural scientific table recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 894-901. IEEE (2019)</list_item>
+<text><loc_110><loc_324><loc_393><loc_405>The application of these rules gives OTSL a set of unique properties. First of all, the OTSL enforces a strictly rectangular structure representation, where every new-line token starts a new row. As a consequence, all rows and all columns have exactly the same number of tokens, irrespective of cell spans. Secondly, the OTSL representation is unambiguous: Every table structure is represented in one way. In this representation every table cell corresponds to a "C"-cell token, which in case of spans is always located in the top-left corner of the table cell definition. Third, OTSL syntax rules are only backward-looking. As a consequence, every predicted token can be validated straight during sequence generation by looking at the previously predicted sequence. As such, OTSL can guarantee that every predicted sequence is syntactically valid.</text>
+<text><loc_110><loc_407><loc_393><loc_420>These characteristics can be easily learned by sequence generator networks, as we demonstrate further below. We find strong indications that this pattern</text>
+<page_break>
+<page_header><loc_110><loc_59><loc_114><loc_64>8</page_header>
+<page_header><loc_137><loc_59><loc_189><loc_64>M. Lysak, et al.</page_header>
+<text><loc_110><loc_75><loc_393><loc_88>reduces significantly the column drift seen in the HTML based models (see Figure 5).</text>
+<section_header_level_1><loc_110><loc_102><loc_261><loc_108>4.3 Error-detection and -mitigation</section_header_level_1>
+<text><loc_110><loc_115><loc_393><loc_189>The design of OTSL allows to validate a table structure easily on an unfinished sequence. The detection of an invalid sequence token is a clear indication of a prediction mistake, however a valid sequence by itself does not guarantee prediction correctness. Different heuristics can be used to correct token errors in an invalid sequence and thus increase the chances for accurate predictions. Such heuristics can be applied either after the prediction of each token, or at the end on the entire predicted sequence. For example a simple heuristic which can correct the predicted OTSL sequence on-the-fly is to verify if the token with the highest prediction confidence invalidates the predicted sequence, and replace it by the token with the next highest confidence until OTSL rules are satisfied.</text>
+<section_header_level_1><loc_110><loc_203><loc_187><loc_209>5 Experiments</section_header_level_1>
+<text><loc_110><loc_219><loc_393><loc_285>To evaluate the impact of OTSL on prediction accuracy and inference times, we conducted a series of experiments based on the TableFormer model (Figure 4) with two objectives: Firstly we evaluate the prediction quality and performance of OTSL vs. HTML after performing Hyper Parameter Optimization (HPO) on the canonical PubTabNet data set. Secondly we pick the best hyper-parameters found in the first step and evaluate how OTSL impacts the performance of TableFormer after training on other publicly available data sets (FinTabNet, PubTables-1M [14]). The ground truth (GT) from all data sets has been converted into OTSL format for this purpose, and will be made publicly available.</text>
+<picture><loc_115><loc_321><loc_386><loc_375><caption><loc_110><loc_306><loc_393><loc_318>Fig. 4. Architecture sketch of the TableFormer model, which is a representative for the Im2Seq approach.</caption></picture>
+<text><loc_110><loc_392><loc_393><loc_420>We rely on standard metrics such as Tree Edit Distance score (TEDs) for table structure prediction, and Mean Average Precision (mAP) with 0.75 Intersection Over Union (IOU) threshold for the bounding-box predictions of table cells. The predicted OTSL structures were converted back to HTML format in</text>
+<page_break>
+<page_header><loc_159><loc_59><loc_366><loc_64>Optimized Table Tokenization for Table Structure Recognition</page_header>
+<page_header><loc_389><loc_59><loc_393><loc_64>9</page_header>
+<text><loc_110><loc_75><loc_393><loc_96>order to compute the TED score. Inference timing results for all experiments were obtained from the same machine on a single core with AMD EPYC 7763 CPU @2.45 GHz.</text>
+<section_header_level_1><loc_110><loc_107><loc_260><loc_112>5.1 Hyper Parameter Optimization</section_header_level_1>
+<text><loc_110><loc_117><loc_393><loc_160>We have chosen the PubTabNet data set to perform HPO, since it includes a highly diverse set of tables. Also we report TED scores separately for simple and complex tables (tables with cell spans). Results are presented in Table. 1. It is evident that with OTSL, our model achieves the same TED score and slightly better mAP scores in comparison to HTML. However OTSL yields a 2x speed up in the inference runtime over HTML.</text>
+<otsl><loc_114><loc_213><loc_388><loc_296><ched>#<ched>#<ched>Language<ched>TEDs<lcel><lcel><ched>mAP<ched>Inference<nl><ched>enc-layers<ched>dec-layers<ucel><ched>simple<ched>complex<ched>all<ched>(0.75)<ched>time (secs)<nl><fcel>6<fcel>6<fcel>OTSL HTML<fcel>0.965 0.969<fcel>0.934 0.927<fcel>0.955 0.955<fcel>0.88 0.857<fcel>2.73 5.39<nl><fcel>4<fcel>4<fcel>OTSL HTML<fcel>0.938 0.952<fcel>0.904<fcel>0.927<fcel>0.853<fcel>1.97<nl><fcel>2<fcel>4<fcel>OTSL<fcel>0.923 0.945<fcel>0.909 0.897<fcel>0.938<fcel>0.843<fcel>3.77<nl><ecel><ecel><fcel>HTML<ecel><fcel>0.901<fcel>0.915 0.931<fcel>0.859 0.834<fcel>1.91 3.81<nl><fcel>4<fcel>2<fcel>OTSL HTML<fcel>0.952 0.944<fcel>0.92 0.903<fcel>0.942 0.931<fcel>0.857 0.824<fcel>1.22 2<nl><caption><loc_110><loc_174><loc_393><loc_206>Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.</caption></otsl>
+<section_header_level_1><loc_110><loc_321><loc_216><loc_326>5.2 Quantitative Results</section_header_level_1>
+<text><loc_110><loc_331><loc_393><loc_390>We picked the model parameter configuration that produced the best prediction quality (enc=6, dec=6, heads=8) with PubTabNet alone, then independently trained and evaluated it on three publicly available data sets: PubTabNet (395k samples), FinTabNet (113k samples) and PubTables-1M (about 1M samples). Performance results are presented in Table. 2. It is clearly evident that the model trained on OTSL outperforms HTML across the board, keeping high TEDs and mAP scores even on difficult financial tables (FinTabNet) that contain sparse and large tables.</text>
+<text><loc_110><loc_392><loc_393><loc_420>Additionally, the results show that OTSL has an advantage over HTML when applied on a bigger data set like PubTables-1M and achieves significantly improved scores. Finally, OTSL achieves faster inference due to fewer decoding steps which is a result of the reduced sequence representation.</text>
+<page_break>
+<page_header><loc_110><loc_59><loc_118><loc_64>10</page_header>
+<page_header><loc_137><loc_59><loc_189><loc_64>M. Lysak, et al.</page_header>
+<otsl><loc_117><loc_99><loc_385><loc_166><ecel><ched>Language<ched>TEDs<lcel><lcel><ched>mAP(0.75)<ched>Inference time (secs)<nl><ecel><ucel><ched>simple<ched>complex<ched>all<ucel><ucel><nl><rhed>PubTabNet<rhed>OTSL<fcel>0.965<fcel>0.934<fcel>0.955<fcel>0.88<fcel>2.73<nl><ucel><rhed>HTML<fcel>0.969<fcel>0.927<fcel>0.955<fcel>0.857<fcel>5.39<nl><rhed>FinTabNet<rhed>OTSL<fcel>0.955<fcel>0.961<fcel>0.959<fcel>0.862<fcel>1.85<nl><ucel><rhed>HTML<fcel>0.917<fcel>0.922<fcel>0.92<fcel>0.722<fcel>3.26<nl><rhed>PubTables-1M<rhed>OTSL<fcel>0.987<fcel>0.964<fcel>0.977<fcel>0.896<fcel>1.79<nl><ucel><rhed>HTML<fcel>0.983<fcel>0.944<fcel>0.966<fcel>0.889<fcel>3.26<nl><caption><loc_110><loc_73><loc_393><loc_92>Table 2. TSR and cell detection results compared between OTSL and HTML on the PubTabNet [22], FinTabNet [21] and PubTables-1M [14] data sets using TableFormer [9] (with enc=6, dec=6, heads=8).</caption></otsl>
+<section_header_level_1><loc_110><loc_182><loc_210><loc_188>5.3 Qualitative Results</section_header_level_1>
+<text><loc_110><loc_196><loc_393><loc_231>To illustrate the qualitative differences between OTSL and HTML, Figure 5 demonstrates less overlap and more accurate bounding boxes with OTSL. In Figure 6, OTSL proves to be more effective in handling tables with longer token sequences, resulting in even more precise structure prediction and bounding boxes.</text>
+<picture><loc_133><loc_281><loc_369><loc_419><caption><loc_110><loc_251><loc_393><loc_278>Fig. 5. The OTSL model produces more accurate bounding boxes with less overlap (E) than the HTML model (D), when predicting the structure of a sparse table (A), at twice the inference speed because of shorter sequence length (B),(C). "PMC2807444_006_00.png" PubTabNet. μ</caption></picture>
+<text><loc_186><loc_420><loc_188><loc_426>μ</text>
+<text><loc_246><loc_432><loc_247><loc_438>≥</text>
+<page_break>
+<page_header><loc_159><loc_59><loc_366><loc_64>Optimized Table Tokenization for Table Structure Recognition</page_header>
+<page_header><loc_385><loc_59><loc_393><loc_64>11</page_header>
+<picture><loc_138><loc_115><loc_365><loc_400><caption><loc_110><loc_79><loc_393><loc_112>Fig. 6. Visualization of predicted structure and detected bounding boxes on a complex table with many rows. The OTSL model (B) captured repeating pattern of horizontally merged cells from the GT (A), unlike the HTML model (C). The HTML model also didn't complete the HTML sequence correctly and displayed a lot more of drift and overlap of bounding boxes. "PMC5406406_003_01.png" PubTabNet.</caption></picture>
+<page_break>
+<page_header><loc_110><loc_59><loc_118><loc_64>12</page_header>
+<page_header><loc_137><loc_59><loc_189><loc_64>M. Lysak, et al.</page_header>
+<section_header_level_1><loc_110><loc_74><loc_179><loc_81>6 Conclusion</section_header_level_1>
+<text><loc_110><loc_93><loc_393><loc_128>We demonstrated that representing tables in HTML for the task of table structure recognition with Im2Seq models is ill-suited and has serious limitations. Furthermore, we presented in this paper an Optimized Table Structure Language (OTSL) which, when compared to commonly used general purpose languages, has several key benefits.</text>
+<text><loc_110><loc_131><loc_393><loc_204>First and foremost, given the same network configuration, inference time for a table-structure prediction is about 2 times faster compared to the conventional HTML approach. This is primarily owed to the shorter sequence length of the OTSL representation. Additional performance benefits can be obtained with HPO (hyper parameter optimization). As we demonstrate in our experiments, models trained on OTSL can be significantly smaller, e.g. by reducing the number of encoder and decoder layers, while preserving comparatively good prediction quality. This can further improve inference performance, yielding 5-6 times faster inference speed in OTSL with prediction quality comparable to models trained on HTML (see Table 1).</text>
+<text><loc_110><loc_207><loc_393><loc_296>Secondly, OTSL has more inherent structure and a significantly restricted vocabulary size. This allows autoregressive models to perform better in the TED metric, but especially with regards to prediction accuracy of the table-cell bounding boxes (see Table 2). As shown in Figure 5, we observe that the OTSL drastically reduces the drift for table cell bounding boxes at high row count and in sparse tables. This leads to more accurate predictions and a significant reduction in post-processing complexity, which is an undesired necessity in HTML-based Im2Seq models. Significant novelty lies in OTSL syntactical rules, which are few, simple and always backwards looking. Each new token can be validated only by analyzing the sequence of previous tokens, without requiring the entire sequence to detect mistakes. This in return allows to perform structural error detection and correction on-the-fly during sequence generation.</text>
+<section_header_level_1><loc_110><loc_312><loc_162><loc_318>References</section_header_level_1>
+<unordered_list><list_item><loc_114><loc_330><loc_393><loc_356>1. Auer, C., Dolfi, M., Carvalho, A., Ramis, C.B., Staar, P.W.J.: Delivering document conversion as a cloud service with high throughput and responsiveness. CoRR abs/2206.00785 (2022). https://doi.org/10.48550/arXiv.2206.00785 , https://doi.org/10.48550/arXiv.2206.00785</list_item>
+<list_item><loc_114><loc_358><loc_393><loc_384>2. Chen, B., Peng, D., Zhang, J., Ren, Y., Jin, L.: Complex table structure recognition in the wild using transformer and identity matrix-based augmentation. In: Porwal, U., Fornés, A., Shafait, F. (eds.) Frontiers in Handwriting Recognition. pp. 545561. Springer International Publishing, Cham (2022)</list_item>
+<list_item><loc_114><loc_386><loc_393><loc_398>3. Chi, Z., Huang, H., Xu, H.D., Yu, H., Yin, W., Mao, X.L.: Complicated table structure recognition. arXiv preprint arXiv:1908.04729 (2019)</list_item>
+<list_item><loc_114><loc_401><loc_393><loc_420>4. Deng, Y., Rosenberg, D., Mann, G.: Challenges in end-to-end neural scientific table recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 894-901. IEEE (2019)</list_item>
 </unordered_list>
-<unordered_list>
-<list_item><location><page_13><loc_23><loc_81><loc_79><loc_85></location>5. Kayal, P., Anand, M., Desai, H., Singh, M.: Tables to latex: structure and content extraction from scientific tables. International Journal on Document Analysis and Recognition (IJDAR) pp. 1-10 (2022)</list_item>
-<list_item><location><page_13><loc_23><loc_76><loc_79><loc_81></location>6. Lee, E., Kwon, J., Yang, H., Park, J., Lee, S., Koo, H.I., Cho, N.I.: Table structure recognition based on grid shape graph. In: 2022 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC). pp. 18681873. IEEE (2022)</list_item>
-<list_item><location><page_13><loc_23><loc_73><loc_79><loc_75></location>7. Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: A benchmark dataset for table detection and recognition (2019)</list_item>
-<list_item><location><page_13><loc_23><loc_66><loc_79><loc_72></location>8. Livathinos, N., Berrospi, C., Lysak, M., Kuropiatnyk, V., Nassar, A., Carvalho, A., Dolfi, M., Auer, C., Dinkla, K., Staar, P.: Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence 35 (17), 15137-15145 (May 2021), https://ojs.aaai.org/index.php/ AAAI/article/view/17777</list_item>
-<list_item><location><page_13><loc_23><loc_62><loc_79><loc_66></location>9. Nassar, A., Livathinos, N., Lysak, M., Staar, P.: Tableformer: Table structure understanding with transformers. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4614-4623 (June 2022)</list_item>
-<list_item><location><page_13><loc_22><loc_53><loc_79><loc_61></location>10. Pfitzmann, B., Auer, C., Dolfi, M., Nassar, A.S., Staar, P.W.J.: Doclaynet: A large human-annotated dataset for document-layout segmentation. In: Zhang, A., Rangwala, H. (eds.) KDD '22: The 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, Washington, DC, USA, August 14 - 18, 2022. pp. 3743-3751. ACM (2022). https://doi.org/10.1145/3534678.3539043 , https:// doi.org/10.1145/3534678.3539043</list_item>
-<list_item><location><page_13><loc_22><loc_48><loc_79><loc_53></location>11. Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from imagebased documents. In: Proceedings of the IEEE/CVF conference on computer vision and pattern recognition workshops. pp. 572-573 (2020)</list_item>
-<list_item><location><page_13><loc_22><loc_42><loc_79><loc_48></location>12. Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162-1167. IEEE (2017)</list_item>
-<list_item><location><page_13><loc_22><loc_37><loc_79><loc_42></location>13. Siddiqui, S.A., Fateh, I.A., Rizvi, S.T.R., Dengel, A., Ahmed, S.: Deeptabstr: Deep learning based table structure recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1403-1409 (2019). https:// doi.org/10.1109/ICDAR.2019.00226</list_item>
-<list_item><location><page_13><loc_22><loc_31><loc_79><loc_36></location>14. Smock, B., Pesala, R., Abraham, R.: PubTables-1M: Towards comprehensive table extraction from unstructured documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4634-4642 (June 2022)</list_item>
-<list_item><location><page_13><loc_22><loc_23><loc_79><loc_31></location>15. Staar, P.W.J., Dolfi, M., Auer, C., Bekas, C.: Corpus conversion service: A machine learning platform to ingest documents at scale. In: Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. pp. 774-782. KDD '18, Association for Computing Machinery, New York, NY, USA (2018). https://doi.org/10.1145/3219819.3219834 , https://doi.org/10. 1145/3219819.3219834</list_item>
-<list_item><location><page_13><loc_22><loc_20><loc_79><loc_23></location>16. Wang, X.: Tabular Abstraction, Editing, and Formatting. Ph.D. thesis, CAN (1996), aAINN09397</list_item>
-<list_item><location><page_13><loc_22><loc_16><loc_79><loc_20></location>17. Xue, W., Li, Q., Tao, D.: Res2tim: Reconstruct syntactic structures from table images. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 749-755. IEEE (2019)</list_item>
+<page_break>
+<page_header><loc_159><loc_59><loc_366><loc_64>Optimized Table Tokenization for Table Structure Recognition</page_header>
+<page_header><loc_385><loc_59><loc_393><loc_64>13</page_header>
+<unordered_list><list_item><loc_114><loc_76><loc_393><loc_94>5. Kayal, P., Anand, M., Desai, H., Singh, M.: Tables to latex: structure and content extraction from scientific tables. International Journal on Document Analysis and Recognition (IJDAR) pp. 1-10 (2022)</list_item>
+<list_item><loc_114><loc_96><loc_393><loc_122>6. Lee, E., Kwon, J., Yang, H., Park, J., Lee, S., Koo, H.I., Cho, N.I.: Table structure recognition based on grid shape graph. In: 2022 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC). pp. 18681873. IEEE (2022)</list_item>
+<list_item><loc_114><loc_124><loc_393><loc_136>7. Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: A benchmark dataset for table detection and recognition (2019)</list_item>
+<list_item><loc_114><loc_138><loc_393><loc_171>8. Livathinos, N., Berrospi, C., Lysak, M., Kuropiatnyk, V., Nassar, A., Carvalho, A., Dolfi, M., Auer, C., Dinkla, K., Staar, P.: Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence 35 (17), 15137-15145 (May 2021), https://ojs.aaai.org/index.php/ AAAI/article/view/17777</list_item>
+<list_item><loc_114><loc_172><loc_393><loc_191>9. Nassar, A., Livathinos, N., Lysak, M., Staar, P.: Tableformer: Table structure understanding with transformers. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4614-4623 (June 2022)</list_item>
+<list_item><loc_110><loc_193><loc_393><loc_233>10. Pfitzmann, B., Auer, C., Dolfi, M., Nassar, A.S., Staar, P.W.J.: Doclaynet: A large human-annotated dataset for document-layout segmentation. In: Zhang, A., Rangwala, H. (eds.) KDD '22: The 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, Washington, DC, USA, August 14 - 18, 2022. pp. 3743-3751. ACM (2022). https://doi.org/10.1145/3534678.3539043 , https:// doi.org/10.1145/3534678.3539043</list_item>
+<list_item><loc_110><loc_235><loc_393><loc_261>11. Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from imagebased documents. In: Proceedings of the IEEE/CVF conference on computer vision and pattern recognition workshops. pp. 572-573 (2020)</list_item>
+<list_item><loc_110><loc_262><loc_393><loc_288>12. Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162-1167. IEEE (2017)</list_item>
+<list_item><loc_110><loc_290><loc_393><loc_316>13. Siddiqui, S.A., Fateh, I.A., Rizvi, S.T.R., Dengel, A., Ahmed, S.: Deeptabstr: Deep learning based table structure recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1403-1409 (2019). https:// doi.org/10.1109/ICDAR.2019.00226</list_item>
+<list_item><loc_110><loc_318><loc_393><loc_344>14. Smock, B., Pesala, R., Abraham, R.: PubTables-1M: Towards comprehensive table extraction from unstructured documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4634-4642 (June 2022)</list_item>
+<list_item><loc_110><loc_345><loc_393><loc_385>15. Staar, P.W.J., Dolfi, M., Auer, C., Bekas, C.: Corpus conversion service: A machine learning platform to ingest documents at scale. In: Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. pp. 774-782. KDD '18, Association for Computing Machinery, New York, NY, USA (2018). https://doi.org/10.1145/3219819.3219834 , https://doi.org/10. 1145/3219819.3219834</list_item>
+<list_item><loc_110><loc_387><loc_393><loc_399>16. Wang, X.: Tabular Abstraction, Editing, and Formatting. Ph.D. thesis, CAN (1996), aAINN09397</list_item>
+<list_item><loc_110><loc_401><loc_393><loc_420>17. Xue, W., Li, Q., Tao, D.: Res2tim: Reconstruct syntactic structures from table images. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 749-755. IEEE (2019)</list_item>
 </unordered_list>
-<unordered_list>
-<list_item><location><page_14><loc_22><loc_81><loc_79><loc_85></location>18. Xue, W., Yu, B., Wang, W., Tao, D., Li, Q.: Tgrnet: A table graph reconstruction network for table structure recognition. In: Proceedings of the IEEE/CVF International Conference on Computer Vision. pp. 1295-1304 (2021)</list_item>
-<list_item><location><page_14><loc_22><loc_76><loc_79><loc_81></location>19. Ye, J., Qi, X., He, Y., Chen, Y., Gu, D., Gao, P., Xiao, R.: Pingan-vcgroup's solution for icdar 2021 competition on scientific literature parsing task b: Table recognition to html (2021). https://doi.org/10.48550/ARXIV.2105.01848 , https://arxiv.org/abs/2105.01848</list_item>
-<list_item><location><page_14><loc_22><loc_73><loc_79><loc_75></location>20. Zhang, Z., Zhang, J., Du, J., Wang, F.: Split, embed and merge: An accurate table structure recognizer. Pattern Recognition 126 , 108565 (2022)</list_item>
-<list_item><location><page_14><loc_22><loc_66><loc_79><loc_72></location>21. Zheng, X., Burdick, D., Popa, L., Zhong, X., Wang, N.X.R.: Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. In: 2021 IEEE Winter Conference on Applications of Computer Vision (WACV). pp. 697-706 (2021). https://doi.org/10.1109/WACV48630.2021. 00074</list_item>
-<list_item><location><page_14><loc_22><loc_60><loc_79><loc_66></location>22. Zhong, X., ShafieiBavani, E., Jimeno Yepes, A.: Image-based table recognition: Data, model, and evaluation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.M. (eds.) Computer Vision - ECCV 2020. pp. 564-580. Springer International Publishing, Cham (2020)</list_item>
-<list_item><location><page_14><loc_22><loc_56><loc_79><loc_60></location>23. Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for document layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015-1022. IEEE (2019)</list_item>
-</document>
+<page_break>
+<page_header><loc_110><loc_59><loc_118><loc_64>14</page_header>
+<page_header><loc_137><loc_59><loc_189><loc_64>M. Lysak, et al.</page_header>
+<unordered_list><list_item><loc_110><loc_76><loc_393><loc_94>18. Xue, W., Yu, B., Wang, W., Tao, D., Li, Q.: Tgrnet: A table graph reconstruction network for table structure recognition. In: Proceedings of the IEEE/CVF International Conference on Computer Vision. pp. 1295-1304 (2021)</list_item>
+<list_item><loc_110><loc_96><loc_393><loc_122>19. Ye, J., Qi, X., He, Y., Chen, Y., Gu, D., Gao, P., Xiao, R.: Pingan-vcgroup's solution for icdar 2021 competition on scientific literature parsing task b: Table recognition to html (2021). https://doi.org/10.48550/ARXIV.2105.01848 , https://arxiv.org/abs/2105.01848</list_item>
+<list_item><loc_110><loc_124><loc_393><loc_136>20. Zhang, Z., Zhang, J., Du, J., Wang, F.: Split, embed and merge: An accurate table structure recognizer. Pattern Recognition 126 , 108565 (2022)</list_item>
+<list_item><loc_110><loc_138><loc_393><loc_171>21. Zheng, X., Burdick, D., Popa, L., Zhong, X., Wang, N.X.R.: Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. In: 2021 IEEE Winter Conference on Applications of Computer Vision (WACV). pp. 697-706 (2021). https://doi.org/10.1109/WACV48630.2021. 00074</list_item>
+<list_item><loc_110><loc_172><loc_393><loc_198>22. Zhong, X., ShafieiBavani, E., Jimeno Yepes, A.: Image-based table recognition: Data, model, and evaluation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.M. (eds.) Computer Vision - ECCV 2020. pp. 564-580. Springer International Publishing, Cham (2020)</list_item>
+<list_item><loc_110><loc_200><loc_393><loc_219>23. Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for document layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015-1022. IEEE (2019)</list_item>
+</unordered_list>
+</doctag>
--- a/tests/data/groundtruth/docling_v2/amt_handbook_sample.doctags.txt
+++ b/tests/data/groundtruth/docling_v2/amt_handbook_sample.doctags.txt
@ -1,23 +1,17 @@
-<document>
-<text><location><page_1><loc_12><loc_88><loc_53><loc_94></location>pulleys, provided the inner race of the bearing is clamped to the supporting structure by the nut and bolt. Plates must be attached to the structure in a positive manner to eliminate rotation or misalignment when tightening the bolts or screws.</text>
-<text><location><page_1><loc_12><loc_77><loc_53><loc_86></location>The two general types of self-locking nuts currently in use are the all-metal type and the fiber lock type. For the sake of simplicity, only three typical kinds of self-locking nuts are considered in this handbook: the Boots self-locking and the stainless steel self-locking nuts, representing the all-metal types; and the elastic stop nut, representing the fiber insert type.</text>
-<section_header_level_1><location><page_1><loc_12><loc_73><loc_28><loc_75></location>Boots Self-Locking Nut</section_header_level_1>
-<text><location><page_1><loc_12><loc_64><loc_54><loc_73></location>The Boots self-locking nut is of one piece, all-metal construction designed to hold tight despite severe vibration. Note in Figure 7-26 that it has two sections and is essentially two nuts in one: a locking nut and a load-carrying nut. The two sections are connected with a spring, which is an integral part of the nut.</text>
-<text><location><page_1><loc_12><loc_52><loc_53><loc_62></location>The spring keeps the locking and load-carrying sections such a distance apart that the two sets of threads are out of phase or spaced so that a bolt, which has been screwed through the load-carrying section, must push the locking section outward against the force of the spring to engage the threads of the locking section properly.</text>
-<text><location><page_1><loc_12><loc_38><loc_54><loc_50></location>The spring, through the medium of the locking section, exerts a constant locking force on the bolt in the same direction as a force that would tighten the nut. In this nut, the load-carrying section has the thread strength of a standard nut of comparable size, while the locking section presses against the threads of the bolt and locks the nut firmly in position. Only a wrench applied to the nut loosens it. The nut can be removed and reused without impairing its efficiency.</text>
-<text><location><page_1><loc_12><loc_33><loc_53><loc_36></location>Boots self-locking nuts are made with three different spring styles and in various shapes and sizes. The wing type that is</text>
-<figure>
-<location><page_1><loc_12><loc_10><loc_52><loc_31></location>
-<caption>Figure 7-26. Self-locking nuts.</caption>
-</figure>
-<text><location><page_1><loc_54><loc_85><loc_95><loc_94></location>the most common ranges in size for No. 6 up to 1 / 4 inch, the Rol-top ranges from 1 / 4 inch to 1 / 6 inch, and the bellows type ranges in size from No. 8 up to 3 / 8 inch. Wing-type nuts are made of anodized aluminum alloy, cadmium-plated carbon steel, or stainless steel. The Rol-top nut is cadmium-plated steel, and the bellows type is made of aluminum alloy only.</text>
-<text><location><page_1><loc_54><loc_83><loc_55><loc_85></location>.</text>
-<section_header_level_1><location><page_1><loc_54><loc_82><loc_76><loc_83></location>Stainless Steel Self-Locking Nut</section_header_level_1>
-<text><location><page_1><loc_54><loc_54><loc_96><loc_81></location>The stainless steel self-locking nut may be spun on and off by hand as its locking action takes places only when the nut is seated against a solid surface and tightened. The nut consists of two parts: a case with a beveled locking shoulder and key and a thread insert with a locking shoulder and slotted keyway. Until the nut is tightened, it spins on the bolt easily, because the threaded insert is the proper size for the bolt. However, when the nut is seated against a solid surface and tightened, the locking shoulder of the insert is pulled downward and wedged against the locking shoulder of the case. This action compresses the threaded insert and causes it to clench the bolt tightly. The cross-sectional view in Figure 7-27 shows how the key of the case fits into the slotted keyway of the insert so that when the case is turned, the threaded insert is turned with it. Note that the slot is wider than the key. This permits the slot to be narrowed and the insert to be compressed when the nut is tightened.</text>
-<section_header_level_1><location><page_1><loc_54><loc_51><loc_65><loc_52></location>Elastic Stop Nut</section_header_level_1>
-<text><location><page_1><loc_54><loc_47><loc_93><loc_50></location>The elastic stop nut is a standard nut with the height increased to accommodate a fiber locking collar. This</text>
-<figure>
-<location><page_1><loc_54><loc_11><loc_94><loc_46></location>
-<caption>Figure 7-27. Stainless steel self-locking nut.</caption>
-</figure>
-</document>
+<doctag><text><loc_61><loc_28><loc_264><loc_60>pulleys, provided the inner race of the bearing is clamped to the supporting structure by the nut and bolt. Plates must be attached to the structure in a positive manner to eliminate rotation or misalignment when tightening the bolts or screws.</text>
+<text><loc_61><loc_69><loc_264><loc_116>The two general types of self-locking nuts currently in use are the all-metal type and the fiber lock type. For the sake of simplicity, only three typical kinds of self-locking nuts are considered in this handbook: the Boots self-locking and the stainless steel self-locking nuts, representing the all-metal types; and the elastic stop nut, representing the fiber insert type.</text>
+<section_header_level_1><loc_61><loc_125><loc_141><loc_133>Boots Self-Locking Nut</section_header_level_1>
+<text><loc_61><loc_134><loc_268><loc_182>The Boots self-locking nut is of one piece, all-metal construction designed to hold tight despite severe vibration. Note in Figure 7-26 that it has two sections and is essentially two nuts in one: a locking nut and a load-carrying nut. The two sections are connected with a spring, which is an integral part of the nut.</text>
+<text><loc_61><loc_191><loc_267><loc_239>The spring keeps the locking and load-carrying sections such a distance apart that the two sets of threads are out of phase or spaced so that a bolt, which has been screwed through the load-carrying section, must push the locking section outward against the force of the spring to engage the threads of the locking section properly.</text>
+<text><loc_61><loc_248><loc_268><loc_311>The spring, through the medium of the locking section, exerts a constant locking force on the bolt in the same direction as a force that would tighten the nut. In this nut, the load-carrying section has the thread strength of a standard nut of comparable size, while the locking section presses against the threads of the bolt and locks the nut firmly in position. Only a wrench applied to the nut loosens it. The nut can be removed and reused without impairing its efficiency.</text>
+<text><loc_61><loc_320><loc_264><loc_336>Boots self-locking nuts are made with three different spring styles and in various shapes and sizes. The wing type that is</text>
+<picture><loc_59><loc_343><loc_261><loc_449><caption><loc_61><loc_454><loc_155><loc_461>Figure 7-26. Self-locking nuts.</caption></picture>
+<text><loc_270><loc_28><loc_473><loc_76>the most common ranges in size for No. 6 up to 1 / 4 inch, the Rol-top ranges from 1 / 4 inch to 1 / 6 inch, and the bellows type ranges in size from No. 8 up to 3 / 8 inch. Wing-type nuts are made of anodized aluminum alloy, cadmium-plated carbon steel, or stainless steel. The Rol-top nut is cadmium-plated steel, and the bellows type is made of aluminum alloy only.</text>
+<text><loc_270><loc_77><loc_274><loc_84>.</text>
+<section_header_level_1><loc_270><loc_85><loc_380><loc_92>Stainless Steel Self-Locking Nut</section_header_level_1>
+<text><loc_270><loc_94><loc_478><loc_231>The stainless steel self-locking nut may be spun on and off by hand as its locking action takes places only when the nut is seated against a solid surface and tightened. The nut consists of two parts: a case with a beveled locking shoulder and key and a thread insert with a locking shoulder and slotted keyway. Until the nut is tightened, it spins on the bolt easily, because the threaded insert is the proper size for the bolt. However, when the nut is seated against a solid surface and tightened, the locking shoulder of the insert is pulled downward and wedged against the locking shoulder of the case. This action compresses the threaded insert and causes it to clench the bolt tightly. The cross-sectional view in Figure 7-27 shows how the key of the case fits into the slotted keyway of the insert so that when the case is turned, the threaded insert is turned with it. Note that the slot is wider than the key. This permits the slot to be narrowed and the insert to be compressed when the nut is tightened.</text>
+<section_header_level_1><loc_270><loc_240><loc_327><loc_247>Elastic Stop Nut</section_header_level_1>
+<text><loc_270><loc_249><loc_465><loc_264>The elastic stop nut is a standard nut with the height increased to accommodate a fiber locking collar. This</text>
+<picture><loc_270><loc_272><loc_470><loc_447><caption><loc_270><loc_452><loc_405><loc_459>Figure 7-27. Stainless steel self-locking nut.</caption></picture>
+<page_footer><loc_453><loc_470><loc_472><loc_478>7-45</page_footer>
+</doctag>
--- a/tests/data/groundtruth/docling_v2/bmj_sample.xml.itxt
+++ b/tests/data/groundtruth/docling_v2/bmj_sample.xml.itxt
@ -0,0 +1,70 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: title: Evolving general practice consul ...  Britain: issues of length and context
+    item-2 at level 2: paragraph: George K Freeman, John P Horder, ... on P Hill, Nayan C Shah, Andrew Wilson
+    item-3 at level 2: paragraph: Centre for Primary Care and Soci ... ersity of Leicester, Leicester LE5 4PW
+    item-4 at level 2: text: In 1999 Shah1 and others said th ...  per consultation in general practice?
+    item-5 at level 2: text: We report on the outcome of exte ...  review identified 14 relevant papers.
+    item-6 at level 2: section_header: Summary points
+      item-7 at level 3: list: group list
+        item-8 at level 4: list_item: Longer consultations are associa ... ith a range of better patient outcomes
+        item-9 at level 4: list_item: Modern consultations in general  ... th more serious and chronic conditions
+        item-10 at level 4: list_item: Increasing patient participation ...  interaction, which demands extra time
+        item-11 at level 4: list_item: Difficulties with access and wit ... e and lead to further pressure on time
+        item-12 at level 4: list_item: Longer consultations should be a ... t to maximise interpersonal continuity
+        item-13 at level 4: list_item: Research on implementation is needed
+    item-14 at level 2: section_header: Longer consultations: benefits for patients
+      item-15 at level 3: text: The systematic review consistent ... ther some doctors insist on more time.
+      item-16 at level 3: text: A national survey in 1998 report ... s the effects of their own experience.
+    item-17 at level 2: section_header: Context of modern consultations
+      item-18 at level 3: text: Shorter consultations were more  ...  potential length of the consultation.
+    item-19 at level 2: section_header: Participatory consultation style
+      item-20 at level 3: text: The most effective consultations ... style usually lengthens consultations.
+    item-21 at level 2: section_header: Extended professional agenda
+      item-22 at level 3: text: The traditional consultation in  ... agerial expectations of good practice.
+      item-23 at level 3: text: Adequate time is essential. It m ...  inevitably leads to pressure on time.
+    item-24 at level 2: section_header: Access problems
+      item-25 at level 3: text: In a service free at the point o ... ort notice squeeze consultation times.
+      item-26 at level 3: text: While appointment systems can an ...  for the inadequate access to doctors.
+      item-27 at level 3: text: In response to perception of del ... ntation is currently being negotiated.
+      item-28 at level 3: text: Virtually all patients think tha ... e that is free at the point of access.
+      item-29 at level 3: text: A further government initiative  ... ealth advice and first line treatment.
+    item-30 at level 2: section_header: Loss of interpersonal continuity
+      item-31 at level 3: text: If a patient has to consult seve ... unning and professional frustration.18
+      item-32 at level 3: text: Mechanic described how loss of l ... patient and professional satisfaction.
+    item-33 at level 2: section_header: Health service reforms
+      item-34 at level 3: text: Finally, for the past 15 years t ... ents and staff) and what is delivered.
+    item-35 at level 2: section_header: The future
+      item-36 at level 3: text: We think that the way ahead must ... p further the care of chronic disease.
+      item-37 at level 3: text: The challenge posed to general p ... ermedicalisation need to be exploited.
+      item-38 at level 3: text: We must ensure better communicat ... between planned and ad hoc consulting.
+    item-39 at level 2: section_header: Next steps
+      item-40 at level 3: text: General practitioners do not beh ... ailable time in complex consultations.
+      item-41 at level 3: text: Devising appropriate incentives  ... and interpersonal knowledge and trust.
+    item-42 at level 2: section_header: Acknowledgments
+      item-43 at level 3: text: We thank the other members of th ... Practitioners for administrative help.
+    item-44 at level 2: section_header: References
+      item-45 at level 3: list: group list
+        item-46 at level 4: list_item: Shah NC. Viewpoint: Consultation ... y men!”. Br J Gen Pract 49:497 (1999).
+        item-47 at level 4: list_item: Mechanic D. How should hamsters  ... BMJ 323:266–268 (2001). PMID: 11485957
+        item-48 at level 4: list_item: Howie JGR, Porter AMD, Heaney DJ ... n Pract 41:48–54 (1991). PMID: 2031735
+        item-49 at level 4: list_item: Howie JGR, Heaney DJ, Maxwell M, ... BMJ 319:738–743 (1999). PMID: 10487999
+        item-50 at level 4: list_item: Kaplan SH, Greenfield S, Ware JE ... c disease. Med Care 27:110–125 (1989).
+        item-51 at level 4: list_item: Airey C, Erens B. National surve ... e, 1998. London: NHS Executive (1999).
+        item-52 at level 4: list_item: Hart JT. Expectations of health  ... h Expect 1:3–13 (1998). PMID: 11281857
+        item-53 at level 4: list_item: Tuckett D, Boulton M, Olson C, W ... London: Tavistock Publications (1985).
+        item-54 at level 4: list_item: General Medical Council. Draft r ... ctors/index.htm (accessed 2 Jan 2002).
+        item-55 at level 4: list_item: Balint M. The doctor, his patien ... the illness. London: Tavistock (1957).
+        item-56 at level 4: list_item: Stott NCH, Davies RH. The except ...  J R Coll Gen Pract 29:210–205 (1979).
+        item-57 at level 4: list_item: Hill AP, Hill AP. Challenges for ... nium. London: King's Fund75–86 (2000).
+        item-58 at level 4: list_item: National service framework for c ... . London: Department of Health (2000).
+        item-59 at level 4: list_item: Hart JT. A new kind of doctor: t ... ommunity. London: Merlin Press (1988).
+        item-60 at level 4: list_item: Morrison I, Smith R. Hamster hea ... J 321:1541–1542 (2000). PMID: 11124164
+        item-61 at level 4: list_item: Arber S, Sawyer L. Do appointmen ...  BMJ 284:478–480 (1982). PMID: 6800503
+        item-62 at level 4: list_item: Hjortdahl P, Borchgrevink CF. Co ... MJ 303:1181–1184 (1991). PMID: 1747619
+        item-63 at level 4: list_item: Howie JGR, Hopton JL, Heaney DJ, ... Pract 42:181–185 (1992). PMID: 1389427
+        item-64 at level 4: list_item: Freeman G, Shepperd S, Robinson  ... ), Summer 2000. London: NCCSDO (2001).
+        item-65 at level 4: list_item: Wilson A, McDonald P, Hayes L, C ... Pract 41:184–187 (1991). PMID: 1878267
+        item-66 at level 4: list_item: De Maeseneer J, Hjortdahl P, Sta ... J 320:1616–1617 (2000). PMID: 10856043
+        item-67 at level 4: list_item: Freeman G, Hjortdahl P. What fut ... MJ 314:1870–1873 (1997). PMID: 9224130
+        item-68 at level 4: list_item: Kibbe DC, Bentz E, McLaughlin CP ... Pract 36:304–308 (1993). PMID: 8454977
+        item-69 at level 4: list_item: Williams M, Neal RD. Time for a  ... ct 48:1783–1786 (1998). PMID: 10198490
--- a/tests/data/groundtruth/docling_v2/bmj_sample.xml.json
+++ b/tests/data/groundtruth/docling_v2/bmj_sample.xml.json
--- a/tests/data/groundtruth/docling_v2/bmj_sample.xml.md
+++ b/tests/data/groundtruth/docling_v2/bmj_sample.xml.md
@ -0,0 +1,105 @@
+# Evolving general practice consultation in Britain: issues of length and context
+
+George K Freeman, John P Horder, John G R Howie, A Pali Hungin, Alison P Hill, Nayan C Shah, Andrew Wilson
+
+Centre for Primary Care and Social Medicine, Imperial College of Science, Technology and Medicine, London W6 8RP; Royal College of General Practitioners, London SW7 1PU; Department of General Practice, University of Edinburgh, Edinburgh EH8 9DX; Centre for Health Studies, University of Durham, Durham DH1 3HN; Kilburn Park Medical Centre, London NW6; Department of General Practice and Primary Health Care, University of Leicester, Leicester LE5 4PW
+
+In 1999 Shah1 and others said that the Royal College of General Practitioners should advocate longer consultations in general practice as a matter of policy. The college set up a working group chaired by A P Hungin, and a systematic review of literature on consultation length in general practice was commissioned. The working group agreed that the available evidence would be hard to interpret without discussion of the changing context within which consultations now take place. For many years general practitioners and those who have surveyed patients' opinions in the United Kingdom have complained about short consultation time, despite a steady increase in actual mean length. Recently Mechanic pointed out that this is also true in the United States.2 Is there any justification for a further increase in mean time allocated per consultation in general practice?
+
+We report on the outcome of extensive debate among a group of general practitioners with an interest in the process of care, with reference to the interim findings of the commissioned systematic review and our personal databases. The review identified 14 relevant papers.
+
+## Summary points
+
+- Longer consultations are associated with a range of better patient outcomes
+- Modern consultations in general practice deal with patients with more serious and chronic conditions
+- Increasing patient participation means more complex interaction, which demands extra time
+- Difficulties with access and with loss of continuity add to perceived stress and poor performance and lead to further pressure on time
+- Longer consultations should be a professional priority, combined with increased use of technology and more flexible practice management to maximise interpersonal continuity
+- Research on implementation is needed
+
+## Longer consultations: benefits for patients
+
+The systematic review consistently showed that doctors with longer consultation times prescribe less and offer more advice on lifestyle and other health promoting activities. Longer consultations have been significantly associated with better recognition and handling of psychosocial problems3 and with better patient enablement.4 Also clinical care for some chronic illnesses is better in practices with longer booked intervals between one appointment and the next.5 It is not clear whether time is itself the main influence or whether some doctors insist on more time.
+
+A national survey in 1998 reported that most (87%) patients were satisfied with the length of their most recent consultation.6 Satisfaction with any service will be high if expectations are met or exceeded. But expectations are modified by previous experience.7 The result is that primary care patients are likely to be satisfied with what they are used to unless the context modifies the effects of their own experience.
+
+## Context of modern consultations
+
+Shorter consultations were more appropriate when the population was younger, when even a brief absence from employment due to sickness required a doctor's note, and when many simple remedies were available only on prescription. Recently at least five important influences have increased the content and hence the potential length of the consultation.
+
+## Participatory consultation style
+
+The most effective consultations are those in which doctors most directly acknowledge and perhaps respond to patients' problems and concerns. In addition, for patients to be committed to taking advantage of medical advice they must agree with both the goals and methods proposed. A landmark publication in the United Kingdom was Meetings Between Experts, which argued that while doctors are the experts about medical problems in general patients are the experts on how they themselves experience these problems.8 New emphasis on teaching consulting skills in general practice advocated specific attention to the patient's agenda, beliefs, understanding, and agreement. Currently the General Medical Council, aware that communication difficulties underlie many complaints about doctors, has further emphasised the importance of involving patients in consultations in its revised guidance to medical schools.9 More patient involvement should give a better outcome, but this participatory style usually lengthens consultations.
+
+## Extended professional agenda
+
+The traditional consultation in general practice was brief.2 The patient presented symptoms and the doctor prescribed treatment. In 1957 Balint gave new insights into the meaning of symptoms.10 By 1979 an enhanced model of consultation was presented, in which the doctors dealt with ongoing as well as presenting problems and added health promotion and education about future appropriate use of services.11 Now, with an ageing population and more community care of chronic illness, there are more issues to be considered at each consultation. Ideas of what constitutes good general practice are more complex.12 Good practice now includes both extended care of chronic medical problems—for example, coronary heart disease13—and a public health role. At first this model was restricted to those who lead change (“early adopters”) and enthusiasts14 but now it is embedded in professional and managerial expectations of good practice.
+
+Adequate time is essential. It may be difficult for an elderly patient with several active problems to undress, be examined, and get adequate professional consideration in under 15 minutes. Here the doctor is faced with the choice of curtailing the consultation or of reducing the time available for the next patient. Having to cope with these situations often contributes to professional dissatisfaction.15 This combination of more care, more options, and more genuine discussion of those options with informed patient choice inevitably leads to pressure on time.
+
+## Access problems
+
+In a service free at the point of access, rising demand will tend to increase rationing by delay. But attempts to improve access by offering more consultations at short notice squeeze consultation times.
+
+While appointment systems can and should reduce queuing time for consultations, they have long tended to be used as a brake on total demand.16 This may seriously erode patients' confidence in being able to see their doctor or nurse when they need to. Patients are offered appointments further ahead but may keep these even if their symptoms have remitted “just in case.” Availability of consultations is thus blocked. Receptionists are then inappropriately blamed for the inadequate access to doctors.
+
+In response to perception of delay, the government has set targets in the NHS plan of “guaranteed access to a primary care professional within 24 hours and to a primary care doctor within 48 hours.” Implementation is currently being negotiated.
+
+Virtually all patients think that they would not consult unless it was absolutely necessary. They do not think they are wasting NHS time and do not like being made to feel so. But underlying general practitioners' willingness to make patients wait several days is their perception that few of the problems are urgent. Patients and general practitioners evidently do not agree about the urgency of so called minor problems. To some extent general practice in the United Kingdom may have scored an “own goal” by setting up perceived access barriers (appointment systems and out of hours cooperatives) in the attempt to increase professional standards and control demand in a service that is free at the point of access.
+
+A further government initiative has been to bypass general practice with new services—notably, walk-in centres (primary care clinics in which no appointment is needed) and NHS Direct (a professional telephone helpline giving advice on simple remedies and access to services). Introduced widely and rapidly, these services each potentially provide significant features of primary care—namely, quick access to skilled health advice and first line treatment.
+
+## Loss of interpersonal continuity
+
+If a patient has to consult several different professionals, particularly over a short period of time, there is inevitable duplication of stories, risk of naive diagnoses, potential for conflicting advice, and perhaps loss of trust. Trust is essential if patients are to accept the “wait and see” management policy which is, or should be, an important part of the management of self limiting conditions, which are often on the boundary between illness and non-illness.17 Such duplication again increases pressure for more extra (unscheduled) consultations resulting in late running and professional frustration.18
+
+Mechanic described how loss of longitudinal (and perhaps personal and relational19) continuity influences the perception and use of time through an inability to build on previous consultations.2 Knowing the doctor well, particularly in smaller practices, is associated with enhanced patient enablement in shorter time.4 Though Mechanic pointed out that three quarters of UK patients have been registered with their general practitioner five years or more, this may be misleading. Practices are growing, with larger teams and more registered patients. Being registered with a doctor in a larger practice is usually no guarantee that the patient will be able to see the same doctor or the doctor of his or her choice, who may be different. Thus the system does not encourage adequate personal continuity. This adds to pressure on time and reduces both patient and professional satisfaction.
+
+## Health service reforms
+
+Finally, for the past 15 years the NHS has experienced unprecedented change with a succession of major administrative reforms. Recent reforms have focused on an NHS led by primary care, including the aim of shifting care from the secondary specialist sector to primary care. One consequence is increased demand for primary care of patients with more serious and less stable problems. With the limited piloting of reforms we do not know whether such major redirection can be achieved without greatly altering the delicate balance between expectations (of both patients and staff) and what is delivered.
+
+## The future
+
+We think that the way ahead must embrace both longer mean consultation times and more flexibility. More time is needed for high quality consultations with patients with major and complex problems of all kinds. But patients also need access to simpler services and advice. This should be more appropriate (and cost less) when it is given by professionals who know the patient and his or her medical history and social circumstances. For doctors, the higher quality associated with longer consultations may lead to greater professional satisfaction and, if these longer consultations are combined with more realistic scheduling, to reduced levels of stress.20 They will also find it easier to develop further the care of chronic disease.
+
+The challenge posed to general practice by walk-in centres and NHS Direct is considerable, and the diversion of funding from primary care is large. The risk of waste and duplication increases as more layers of complexity are added to a primary care service that started out as something familiar, simple, and local and which is still envied in other developed countries.21 Access needs to be simple, and the advantages of personal knowledge and trust in minimising duplication and overmedicalisation need to be exploited.
+
+We must ensure better communication and access so that patients can more easily deal with minor issues and queries with someone they know and trust and avoid the formality and inconvenience of a full face to face consultation. Too often this has to be with a different professional, unfamiliar with the nuances of the case. There should be far more managerial emphasis on helping patients to interact with their chosen practitioner22; such a programme has been described.23 Modern information systems make it much easier to record which doctor(s) a patient prefers to see and to monitor how often this is achieved. The telephone is hardly modern but is underused. Email avoids the problems inherent in arranging simultaneous availability necessary for telephone consultations but at the cost of reducing the communication of emotions. There is a place for both.2 Access without prior appointment is a valued feature of primary care, and we need to know more about the right balance between planned and ad hoc consulting.
+
+## Next steps
+
+General practitioners do not behave in a uniform way. They can be categorised as slow, medium, and fast and react in different ways to changes in consulting speed.18 They are likely to have differing views about a widespread move to lengthen consultation time. We do not need further confirmation that longer consultations are desirable and necessary, but research could show us the best way to learn how to introduce them with minimal disruption to the way in which patients and practices like primary care to be provided.24 We also need to learn how to make the most of available time in complex consultations.
+
+Devising appropriate incentives and helping practices move beyond just reacting to demand in the traditional way by working harder and faster is perhaps our greatest challenge in the United Kingdom. The new primary are trusts need to work together with the growing primary care research networks to carry out the necessary development work. In particular, research is needed on how a primary care team can best provide the right balance of quick access and interpersonal knowledge and trust.
+
+## Acknowledgments
+
+We thank the other members of the working group: Susan Childs, Paul Freeling, Iona Heath, Marshall Marinker, and Bonnie Sibbald. We also thank Fenny Green of the Royal College of General Practitioners for administrative help.
+
+## References
+
+- Shah NC. Viewpoint: Consultation time—time for a change? Still the “perfunctory work of perfunctory men!”. Br J Gen Pract 49:497 (1999).
+- Mechanic D. How should hamsters run? Some observations about sufficient patient time in primary care. BMJ 323:266–268 (2001). PMID: 11485957
+- Howie JGR, Porter AMD, Heaney DJ, Hopton JL. Long to short consultation ratio: a proxy measure of quality of care for general practice. Br J Gen Pract 41:48–54 (1991). PMID: 2031735
+- Howie JGR, Heaney DJ, Maxwell M, Walker JJ, Freeman GK, Rai H. Quality at general practice consultations: cross-sectional survey. BMJ 319:738–743 (1999). PMID: 10487999
+- Kaplan SH, Greenfield S, Ware JE. Assessing the effects of physician-patient interactions on the outcome of chronic disease. Med Care 27:110–125 (1989).
+- Airey C, Erens B. National surveys of NHS patients: general practice, 1998. London: NHS Executive (1999).
+- Hart JT. Expectations of health care: promoted, managed or shared?. Health Expect 1:3–13 (1998). PMID: 11281857
+- Tuckett D, Boulton M, Olson C, Williams A. Meetings between experts: an approach to sharing ideas in medical consultations. London: Tavistock Publications (1985).
+- General Medical Council. Draft recommendations on undergraduate medical education. July 2001. www.gmc-uk.org/med\_ed/tomorrowsdoctors/index.htm (accessed 2 Jan 2002).
+- Balint M. The doctor, his patient and the illness. London: Tavistock (1957).
+- Stott NCH, Davies RH. The exceptional potential in each primary care consultation. J R Coll Gen Pract 29:210–205 (1979).
+- Hill AP, Hill AP. Challenges for primary care. What's gone wrong with health care? Challenges for the new millennium. London: King's Fund75–86 (2000).
+- National service framework for coronary heart disease. London: Department of Health (2000).
+- Hart JT. A new kind of doctor: the general practitioner's part in the health of the community. London: Merlin Press (1988).
+- Morrison I, Smith R. Hamster health care. BMJ 321:1541–1542 (2000). PMID: 11124164
+- Arber S, Sawyer L. Do appointment systems work?. BMJ 284:478–480 (1982). PMID: 6800503
+- Hjortdahl P, Borchgrevink CF. Continuity of care: influence of general practitioners' knowledge about their patients on use of resources in consultations. BMJ 303:1181–1184 (1991). PMID: 1747619
+- Howie JGR, Hopton JL, Heaney DJ, Porter AMD. Attitudes to medical care, the organization of work, and stress among general practitioners. Br J Gen Pract 42:181–185 (1992). PMID: 1389427
+- Freeman G, Shepperd S, Robinson I, Ehrich K, Richards SC, Pitman P. Continuity of care: report of a scoping exercise for the national co-ordinating centre for NHS Service Delivery and Organisation R&amp;D (NCCSDO), Summer 2000. London: NCCSDO (2001).
+- Wilson A, McDonald P, Hayes L, Cooney J. Longer booking intervals in general practice: effects on doctors' stress and arousal. Br J Gen Pract 41:184–187 (1991). PMID: 1878267
+- De Maeseneer J, Hjortdahl P, Starfield B. Fix what's wrong, not what's right, with general practice in Britain. BMJ 320:1616–1617 (2000). PMID: 10856043
+- Freeman G, Hjortdahl P. What future for continuity of care in general practice?. BMJ 314:1870–1873 (1997). PMID: 9224130
+- Kibbe DC, Bentz E, McLaughlin CP. Continuous quality improvement for continuity of care. J Fam Pract 36:304–308 (1993). PMID: 8454977
+- Williams M, Neal RD. Time for a change? The process of lengthening booking intervals in general practice. Br J Gen Pract 48:1783–1786 (1998). PMID: 10198490
--- a/tests/data/groundtruth/docling_v2/code_and_formula.doctags.txt
+++ b/tests/data/groundtruth/docling_v2/code_and_formula.doctags.txt
@ -1,16 +1,18 @@
-<document>
-<section_header_level_1><location><page_1><loc_22><loc_83><loc_52><loc_84></location>JavaScript Code Example</section_header_level_1>
-<text><location><page_1><loc_22><loc_63><loc_78><loc_81></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</text>
-<text><location><page_1><loc_22><loc_57><loc_78><loc_63></location>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet,</text>
-<paragraph><location><page_1><loc_36><loc_55><loc_63><loc_56></location>Listing 1: Simple JavaScript Program</paragraph>
-<code><location><page_1><loc_22><loc_49><loc_43><loc_54></location>function add(a, b) { return a + b; } console.log(add(3, 5));</code>
-<text><location><page_1><loc_22><loc_29><loc_78><loc_47></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</text>
-<text><location><page_1><loc_22><loc_23><loc_78><loc_29></location>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet,</text>
-<section_header_level_1><location><page_2><loc_22><loc_84><loc_32><loc_85></location>Formula</section_header_level_1>
-<text><location><page_2><loc_22><loc_66><loc_80><loc_82></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</text>
-<text><location><page_2><loc_22><loc_58><loc_80><loc_65></location>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt.</text>
-<formula><location><page_2><loc_47><loc_56><loc_56><loc_57></location></formula>
-<text><location><page_2><loc_22><loc_38><loc_80><loc_55></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</text>
-<text><location><page_2><loc_22><loc_29><loc_80><loc_37></location>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat.</text>
-<text><location><page_2><loc_22><loc_21><loc_80><loc_29></location>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat.</text>
-</document>
+<doctag><section_header_level_1><loc_109><loc_79><loc_258><loc_87>JavaScript Code Example</section_header_level_1>
+<text><loc_109><loc_94><loc_390><loc_183>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</text>
+<text><loc_109><loc_185><loc_390><loc_213>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet,</text>
+<paragraph><loc_182><loc_221><loc_317><loc_226>Listing 1: Simple JavaScript Program</paragraph>
+<code<loc_110><loc_231><loc_215><loc_257><_unknown_>function add(a, b) { return a + b; } console.log(add(3, 5));</code
+<text><loc_109><loc_265><loc_390><loc_353>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</text>
+<text><loc_109><loc_355><loc_390><loc_383>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet,</text>
+<page_footer><loc_248><loc_439><loc_252><loc_445>1</page_footer>
+<page_break>
+<section_header_level_1><loc_112><loc_74><loc_161><loc_82>Formula</section_header_level_1>
+<text><loc_112><loc_89><loc_401><loc_172>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</text>
+<text><loc_112><loc_174><loc_401><loc_208>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt.</text>
+<formula><loc_236><loc_215><loc_278><loc_222></formula>
+<text><loc_112><loc_227><loc_401><loc_311>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</text>
+<text><loc_112><loc_313><loc_401><loc_353>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat.</text>
+<text><loc_112><loc_355><loc_401><loc_396>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat.</text>
+<page_footer><loc_255><loc_413><loc_259><loc_418>1</page_footer>
+</doctag>
--- a/tests/data/groundtruth/docling_v2/code_and_formula.json
+++ b/tests/data/groundtruth/docling_v2/code_and_formula.json
--- a/tests/data/groundtruth/docling_v2/csv-comma-in-cell.csv.itxt
+++ b/tests/data/groundtruth/docling_v2/csv-comma-in-cell.csv.itxt
@ -0,0 +1,2 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: table with [5x4]
--- a/tests/data/groundtruth/docling_v2/csv-comma-in-cell.csv.json
+++ b/tests/data/groundtruth/docling_v2/csv-comma-in-cell.csv.json
@ -0,0 +1,546 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.1.0",
+  "name": "csv-comma-in-cell",
+  "origin": {
+    "mimetype": "text/csv",
+    "binary_hash": 17599039665518552414,
+    "filename": "csv-comma-in-cell.csv"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/tables/0"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [],
+  "texts": [],
+  "pictures": [],
+  "tables": [
+    {
+      "self_ref": "#/tables/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "table",
+      "prov": [],
+      "captions": [],
+      "references": [],
+      "footnotes": [],
+      "data": {
+        "table_cells": [
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 0,
+            "end_row_offset_idx": 1,
+            "start_col_offset_idx": 0,
+            "end_col_offset_idx": 1,
+            "text": "1",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 0,
+            "end_row_offset_idx": 1,
+            "start_col_offset_idx": 1,
+            "end_col_offset_idx": 2,
+            "text": "2",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 0,
+            "end_row_offset_idx": 1,
+            "start_col_offset_idx": 2,
+            "end_col_offset_idx": 3,
+            "text": "3",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 0,
+            "end_row_offset_idx": 1,
+            "start_col_offset_idx": 3,
+            "end_col_offset_idx": 4,
+            "text": "4",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 1,
+            "end_row_offset_idx": 2,
+            "start_col_offset_idx": 0,
+            "end_col_offset_idx": 1,
+            "text": "a",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 1,
+            "end_row_offset_idx": 2,
+            "start_col_offset_idx": 1,
+            "end_col_offset_idx": 2,
+            "text": "b",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 1,
+            "end_row_offset_idx": 2,
+            "start_col_offset_idx": 2,
+            "end_col_offset_idx": 3,
+            "text": "c",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 1,
+            "end_row_offset_idx": 2,
+            "start_col_offset_idx": 3,
+            "end_col_offset_idx": 4,
+            "text": "d",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 2,
+            "end_row_offset_idx": 3,
+            "start_col_offset_idx": 0,
+            "end_col_offset_idx": 1,
+            "text": "a",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 2,
+            "end_row_offset_idx": 3,
+            "start_col_offset_idx": 1,
+            "end_col_offset_idx": 2,
+            "text": ",",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 2,
+            "end_row_offset_idx": 3,
+            "start_col_offset_idx": 2,
+            "end_col_offset_idx": 3,
+            "text": "c",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 2,
+            "end_row_offset_idx": 3,
+            "start_col_offset_idx": 3,
+            "end_col_offset_idx": 4,
+            "text": "d",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 3,
+            "end_row_offset_idx": 4,
+            "start_col_offset_idx": 0,
+            "end_col_offset_idx": 1,
+            "text": "a",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 3,
+            "end_row_offset_idx": 4,
+            "start_col_offset_idx": 1,
+            "end_col_offset_idx": 2,
+            "text": "b",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 3,
+            "end_row_offset_idx": 4,
+            "start_col_offset_idx": 2,
+            "end_col_offset_idx": 3,
+            "text": "c",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 3,
+            "end_row_offset_idx": 4,
+            "start_col_offset_idx": 3,
+            "end_col_offset_idx": 4,
+            "text": "d",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 4,
+            "end_row_offset_idx": 5,
+            "start_col_offset_idx": 0,
+            "end_col_offset_idx": 1,
+            "text": "a",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 4,
+            "end_row_offset_idx": 5,
+            "start_col_offset_idx": 1,
+            "end_col_offset_idx": 2,
+            "text": "b",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 4,
+            "end_row_offset_idx": 5,
+            "start_col_offset_idx": 2,
+            "end_col_offset_idx": 3,
+            "text": "c",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 4,
+            "end_row_offset_idx": 5,
+            "start_col_offset_idx": 3,
+            "end_col_offset_idx": 4,
+            "text": "d",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          }
+        ],
+        "num_rows": 5,
+        "num_cols": 4,
+        "grid": [
+          [
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 0,
+              "end_row_offset_idx": 1,
+              "start_col_offset_idx": 0,
+              "end_col_offset_idx": 1,
+              "text": "1",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 0,
+              "end_row_offset_idx": 1,
+              "start_col_offset_idx": 1,
+              "end_col_offset_idx": 2,
+              "text": "2",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 0,
+              "end_row_offset_idx": 1,
+              "start_col_offset_idx": 2,
+              "end_col_offset_idx": 3,
+              "text": "3",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 0,
+              "end_row_offset_idx": 1,
+              "start_col_offset_idx": 3,
+              "end_col_offset_idx": 4,
+              "text": "4",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            }
+          ],
+          [
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 1,
+              "end_row_offset_idx": 2,
+              "start_col_offset_idx": 0,
+              "end_col_offset_idx": 1,
+              "text": "a",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 1,
+              "end_row_offset_idx": 2,
+              "start_col_offset_idx": 1,
+              "end_col_offset_idx": 2,
+              "text": "b",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 1,
+              "end_row_offset_idx": 2,
+              "start_col_offset_idx": 2,
+              "end_col_offset_idx": 3,
+              "text": "c",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 1,
+              "end_row_offset_idx": 2,
+              "start_col_offset_idx": 3,
+              "end_col_offset_idx": 4,
+              "text": "d",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            }
+          ],
+          [
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 2,
+              "end_row_offset_idx": 3,
+              "start_col_offset_idx": 0,
+              "end_col_offset_idx": 1,
+              "text": "a",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 2,
+              "end_row_offset_idx": 3,
+              "start_col_offset_idx": 1,
+              "end_col_offset_idx": 2,
+              "text": ",",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 2,
+              "end_row_offset_idx": 3,
+              "start_col_offset_idx": 2,
+              "end_col_offset_idx": 3,
+              "text": "c",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 2,
+              "end_row_offset_idx": 3,
+              "start_col_offset_idx": 3,
+              "end_col_offset_idx": 4,
+              "text": "d",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            }
+          ],
+          [
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 3,
+              "end_row_offset_idx": 4,
+              "start_col_offset_idx": 0,
+              "end_col_offset_idx": 1,
+              "text": "a",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 3,
+              "end_row_offset_idx": 4,
+              "start_col_offset_idx": 1,
+              "end_col_offset_idx": 2,
+              "text": "b",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 3,
+              "end_row_offset_idx": 4,
+              "start_col_offset_idx": 2,
+              "end_col_offset_idx": 3,
+              "text": "c",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 3,
+              "end_row_offset_idx": 4,
+              "start_col_offset_idx": 3,
+              "end_col_offset_idx": 4,
+              "text": "d",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            }
+          ],
+          [
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 4,
+              "end_row_offset_idx": 5,
+              "start_col_offset_idx": 0,
+              "end_col_offset_idx": 1,
+              "text": "a",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 4,
+              "end_row_offset_idx": 5,
+              "start_col_offset_idx": 1,
+              "end_col_offset_idx": 2,
+              "text": "b",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 4,
+              "end_row_offset_idx": 5,
+              "start_col_offset_idx": 2,
+              "end_col_offset_idx": 3,
+              "text": "c",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 4,
+              "end_row_offset_idx": 5,
+              "start_col_offset_idx": 3,
+              "end_col_offset_idx": 4,
+              "text": "d",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            }
+          ]
+        ]
+      }
+    }
+  ],
+  "key_value_items": [],
+  "pages": {}
+}
--- a/tests/data/groundtruth/docling_v2/csv-comma-in-cell.csv.md
+++ b/tests/data/groundtruth/docling_v2/csv-comma-in-cell.csv.md
@ -0,0 +1,6 @@
+| 1   | 2   | 3   | 4   |
+|-----|-----|-----|-----|
+| a   | b   | c   | d   |
+| a   | ,   | c   | d   |
+| a   | b   | c   | d   |
+| a   | b   | c   | d   |
--- a/tests/data/groundtruth/docling_v2/csv-comma.csv.itxt
+++ b/tests/data/groundtruth/docling_v2/csv-comma.csv.itxt
@ -0,0 +1,2 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: table with [6x12]
--- a/tests/data/groundtruth/docling_v2/csv-comma.csv.json
+++ b/tests/data/groundtruth/docling_v2/csv-comma.csv.json
--- a/tests/data/groundtruth/docling_v2/csv-comma.csv.md
+++ b/tests/data/groundtruth/docling_v2/csv-comma.csv.md
@ -0,0 +1,7 @@
+|   Index | Customer Id     | First Name   | Last Name   | Company                         | City              | Country                    | Phone 1                | Phone 2               | Email                       | Subscription Date   | Website                     |
+|---------|-----------------|--------------|-------------|---------------------------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|
+|       1 | DD37Cf93aecA6Dc | Sheryl       | Baxter      | Rasmussen Group                 | East Leonard      | Chile                      | 229.077.5154           | 397.884.0519x718      | zunigavanessa@smith.info    | 2020-08-24          | http://www.stephenson.com/  |
+|       2 | 1Ef7b82A4CAAD10 | Preston      | Lozano, Dr  | Vega-Gentry                     | East Jimmychester | Djibouti                   | 5153435776             | 686-620-1820x944      | vmata@colon.com             | 2021-04-23          | http://www.hobbs.com/       |
+|       3 | 6F94879bDAfE5a6 | Roy          | Berry       | Murillo-Perry                   | Isabelborough     | Antigua and Barbuda        | +1-539-402-0259        | (496)978-3969x58947   | beckycarr@hogan.com         | 2020-03-25          | http://www.lawrence.com/    |
+|       4 | 5Cef8BFA16c5e3c | Linda        | Olsen       | Dominguez, Mcmillan and Donovan | Bensonview        | Dominican Republic         | 001-808-617-6467x12895 | +1-813-324-8756       | stanleyblackwell@benson.org | 2020-06-02          | http://www.good-lyons.com/  |
+|       5 | 053d585Ab6b3159 | Joanna       | Bender      | Martin, Lang and Andrade        | West Priscilla    | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net     | 2021-04-17          | https://goodwin-ingram.com/ |
--- a/tests/data/groundtruth/docling_v2/csv-inconsistent-header.csv.itxt
+++ b/tests/data/groundtruth/docling_v2/csv-inconsistent-header.csv.itxt
@ -0,0 +1,2 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: table with [5x4]
--- a/tests/data/groundtruth/docling_v2/csv-inconsistent-header.csv.json
+++ b/tests/data/groundtruth/docling_v2/csv-inconsistent-header.csv.json
@ -0,0 +1,534 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.1.0",
+  "name": "csv-inconsistent-header",
+  "origin": {
+    "mimetype": "text/csv",
+    "binary_hash": 5480400768780756370,
+    "filename": "csv-inconsistent-header.csv"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/tables/0"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [],
+  "texts": [],
+  "pictures": [],
+  "tables": [
+    {
+      "self_ref": "#/tables/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "table",
+      "prov": [],
+      "captions": [],
+      "references": [],
+      "footnotes": [],
+      "data": {
+        "table_cells": [
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 0,
+            "end_row_offset_idx": 1,
+            "start_col_offset_idx": 0,
+            "end_col_offset_idx": 1,
+            "text": "1",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 0,
+            "end_row_offset_idx": 1,
+            "start_col_offset_idx": 1,
+            "end_col_offset_idx": 2,
+            "text": "2",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 0,
+            "end_row_offset_idx": 1,
+            "start_col_offset_idx": 2,
+            "end_col_offset_idx": 3,
+            "text": "3",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 1,
+            "end_row_offset_idx": 2,
+            "start_col_offset_idx": 0,
+            "end_col_offset_idx": 1,
+            "text": "a",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 1,
+            "end_row_offset_idx": 2,
+            "start_col_offset_idx": 1,
+            "end_col_offset_idx": 2,
+            "text": "b",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 1,
+            "end_row_offset_idx": 2,
+            "start_col_offset_idx": 2,
+            "end_col_offset_idx": 3,
+            "text": "c",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 1,
+            "end_row_offset_idx": 2,
+            "start_col_offset_idx": 3,
+            "end_col_offset_idx": 4,
+            "text": "d",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 2,
+            "end_row_offset_idx": 3,
+            "start_col_offset_idx": 0,
+            "end_col_offset_idx": 1,
+            "text": "a",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 2,
+            "end_row_offset_idx": 3,
+            "start_col_offset_idx": 1,
+            "end_col_offset_idx": 2,
+            "text": "b",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 2,
+            "end_row_offset_idx": 3,
+            "start_col_offset_idx": 2,
+            "end_col_offset_idx": 3,
+            "text": "c",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 2,
+            "end_row_offset_idx": 3,
+            "start_col_offset_idx": 3,
+            "end_col_offset_idx": 4,
+            "text": "d",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 3,
+            "end_row_offset_idx": 4,
+            "start_col_offset_idx": 0,
+            "end_col_offset_idx": 1,
+            "text": "a",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 3,
+            "end_row_offset_idx": 4,
+            "start_col_offset_idx": 1,
+            "end_col_offset_idx": 2,
+            "text": "b",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 3,
+            "end_row_offset_idx": 4,
+            "start_col_offset_idx": 2,
+            "end_col_offset_idx": 3,
+            "text": "c",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 3,
+            "end_row_offset_idx": 4,
+            "start_col_offset_idx": 3,
+            "end_col_offset_idx": 4,
+            "text": "d",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 4,
+            "end_row_offset_idx": 5,
+            "start_col_offset_idx": 0,
+            "end_col_offset_idx": 1,
+            "text": "a",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 4,
+            "end_row_offset_idx": 5,
+            "start_col_offset_idx": 1,
+            "end_col_offset_idx": 2,
+            "text": "b",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 4,
+            "end_row_offset_idx": 5,
+            "start_col_offset_idx": 2,
+            "end_col_offset_idx": 3,
+            "text": "c",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 4,
+            "end_row_offset_idx": 5,
+            "start_col_offset_idx": 3,
+            "end_col_offset_idx": 4,
+            "text": "d",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          }
+        ],
+        "num_rows": 5,
+        "num_cols": 4,
+        "grid": [
+          [
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 0,
+              "end_row_offset_idx": 1,
+              "start_col_offset_idx": 0,
+              "end_col_offset_idx": 1,
+              "text": "1",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 0,
+              "end_row_offset_idx": 1,
+              "start_col_offset_idx": 1,
+              "end_col_offset_idx": 2,
+              "text": "2",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 0,
+              "end_row_offset_idx": 1,
+              "start_col_offset_idx": 2,
+              "end_col_offset_idx": 3,
+              "text": "3",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 0,
+              "end_row_offset_idx": 1,
+              "start_col_offset_idx": 3,
+              "end_col_offset_idx": 4,
+              "text": "",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            }
+          ],
+          [
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 1,
+              "end_row_offset_idx": 2,
+              "start_col_offset_idx": 0,
+              "end_col_offset_idx": 1,
+              "text": "a",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 1,
+              "end_row_offset_idx": 2,
+              "start_col_offset_idx": 1,
+              "end_col_offset_idx": 2,
+              "text": "b",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 1,
+              "end_row_offset_idx": 2,
+              "start_col_offset_idx": 2,
+              "end_col_offset_idx": 3,
+              "text": "c",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 1,
+              "end_row_offset_idx": 2,
+              "start_col_offset_idx": 3,
+              "end_col_offset_idx": 4,
+              "text": "d",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            }
+          ],
+          [
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 2,
+              "end_row_offset_idx": 3,
+              "start_col_offset_idx": 0,
+              "end_col_offset_idx": 1,
+              "text": "a",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 2,
+              "end_row_offset_idx": 3,
+              "start_col_offset_idx": 1,
+              "end_col_offset_idx": 2,
+              "text": "b",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 2,
+              "end_row_offset_idx": 3,
+              "start_col_offset_idx": 2,
+              "end_col_offset_idx": 3,
+              "text": "c",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 2,
+              "end_row_offset_idx": 3,
+              "start_col_offset_idx": 3,
+              "end_col_offset_idx": 4,
+              "text": "d",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            }
+          ],
+          [
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 3,
+              "end_row_offset_idx": 4,
+              "start_col_offset_idx": 0,
+              "end_col_offset_idx": 1,
+              "text": "a",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 3,
+              "end_row_offset_idx": 4,
+              "start_col_offset_idx": 1,
+              "end_col_offset_idx": 2,
+              "text": "b",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 3,
+              "end_row_offset_idx": 4,
+              "start_col_offset_idx": 2,
+              "end_col_offset_idx": 3,
+              "text": "c",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 3,
+              "end_row_offset_idx": 4,
+              "start_col_offset_idx": 3,
+              "end_col_offset_idx": 4,
+              "text": "d",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            }
+          ],
+          [
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 4,
+              "end_row_offset_idx": 5,
+              "start_col_offset_idx": 0,
+              "end_col_offset_idx": 1,
+              "text": "a",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 4,
+              "end_row_offset_idx": 5,
+              "start_col_offset_idx": 1,
+              "end_col_offset_idx": 2,
+              "text": "b",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 4,
+              "end_row_offset_idx": 5,
+              "start_col_offset_idx": 2,
+              "end_col_offset_idx": 3,
+              "text": "c",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 4,
+              "end_row_offset_idx": 5,
+              "start_col_offset_idx": 3,
+              "end_col_offset_idx": 4,
+              "text": "d",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            }
+          ]
+        ]
+      }
+    }
+  ],
+  "key_value_items": [],
+  "pages": {}
+}
--- a/tests/data/groundtruth/docling_v2/csv-inconsistent-header.csv.md
+++ b/tests/data/groundtruth/docling_v2/csv-inconsistent-header.csv.md
@ -0,0 +1,6 @@
+| 1   | 2   | 3   |    |
+|-----|-----|-----|----|
+| a   | b   | c   | d  |
+| a   | b   | c   | d  |
+| a   | b   | c   | d  |
+| a   | b   | c   | d  |
--- a/tests/data/groundtruth/docling_v2/csv-pipe.csv.itxt
+++ b/tests/data/groundtruth/docling_v2/csv-pipe.csv.itxt
@ -0,0 +1,2 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: table with [6x12]
--- a/tests/data/groundtruth/docling_v2/csv-pipe.csv.json
+++ b/tests/data/groundtruth/docling_v2/csv-pipe.csv.json
--- a/tests/data/groundtruth/docling_v2/csv-pipe.csv.md
+++ b/tests/data/groundtruth/docling_v2/csv-pipe.csv.md
@ -0,0 +1,7 @@
+|   Index | Customer Id     | First Name   | Last Name   | Company                        | City              | Country                    | Phone 1                | Phone 2               | Email                       | Subscription Date   | Website                     |
+|---------|-----------------|--------------|-------------|--------------------------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|
+|       1 | DD37Cf93aecA6Dc | Sheryl       | Baxter      | Rasmussen Group                | East Leonard      | Chile                      | 229.077.5154           | 397.884.0519x718      | zunigavanessa@smith.info    | 2020-08-24          | http://www.stephenson.com/  |
+|       2 | 1Ef7b82A4CAAD10 | Preston      | Lozano      | Vega-Gentry                    | East Jimmychester | Djibouti                   | 5153435776             | 686-620-1820x944      | vmata@colon.com             | 2021-04-23          | http://www.hobbs.com/       |
+|       3 | 6F94879bDAfE5a6 | Roy          | Berry       | Murillo-Perry                  | Isabelborough     | Antigua and Barbuda        | +1-539-402-0259        | (496)978-3969x58947   | beckycarr@hogan.com         | 2020-03-25          | http://www.lawrence.com/    |
+|       4 | 5Cef8BFA16c5e3c | Linda        | Olsen       | Dominguez|Mcmillan and Donovan | Bensonview        | Dominican Republic         | 001-808-617-6467x12895 | +1-813-324-8756       | stanleyblackwell@benson.org | 2020-06-02          | http://www.good-lyons.com/  |
+|       5 | 053d585Ab6b3159 | Joanna       | Bender      | Martin|Lang and Andrade        | West Priscilla    | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net     | 2021-04-17          | https://goodwin-ingram.com/ |
--- a/tests/data/groundtruth/docling_v2/csv-semicolon.csv.itxt
+++ b/tests/data/groundtruth/docling_v2/csv-semicolon.csv.itxt
@ -0,0 +1,2 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: table with [6x12]
--- a/tests/data/groundtruth/docling_v2/csv-semicolon.csv.json
+++ b/tests/data/groundtruth/docling_v2/csv-semicolon.csv.json
--- a/tests/data/groundtruth/docling_v2/csv-semicolon.csv.md
+++ b/tests/data/groundtruth/docling_v2/csv-semicolon.csv.md
@ -0,0 +1,7 @@
+|   Index | Customer Id     | First Name   | Last Name   | Company                        | City              | Country                    | Phone 1                | Phone 2               | Email                       | Subscription Date   | Website                     |
+|---------|-----------------|--------------|-------------|--------------------------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|
+|       1 | DD37Cf93aecA6Dc | Sheryl       | Baxter      | Rasmussen Group                | East Leonard      | Chile                      | 229.077.5154           | 397.884.0519x718      | zunigavanessa@smith.info    | 2020-08-24          | http://www.stephenson.com/  |
+|       2 | 1Ef7b82A4CAAD10 | Preston      | Lozano      | Vega-Gentry                    | East Jimmychester | Djibouti                   | 5153435776             | 686-620-1820x944      | vmata@colon.com             | 2021-04-23          | http://www.hobbs.com/       |
+|       3 | 6F94879bDAfE5a6 | Roy          | Berry       | Murillo-Perry                  | Isabelborough     | Antigua and Barbuda        | +1-539-402-0259        | (496)978-3969x58947   | beckycarr@hogan.com         | 2020-03-25          | http://www.lawrence.com/    |
+|       4 | 5Cef8BFA16c5e3c | Linda        | Olsen       | Dominguez;Mcmillan and Donovan | Bensonview        | Dominican Republic         | 001-808-617-6467x12895 | +1-813-324-8756       | stanleyblackwell@benson.org | 2020-06-02          | http://www.good-lyons.com/  |
+|       5 | 053d585Ab6b3159 | Joanna       | Bender      | Martin;Lang and Andrade        | West Priscilla    | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net     | 2021-04-17          | https://goodwin-ingram.com/ |
--- a/tests/data/groundtruth/docling_v2/csv-tab.csv.itxt
+++ b/tests/data/groundtruth/docling_v2/csv-tab.csv.itxt
@ -0,0 +1,2 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: table with [6x12]
--- a/tests/data/groundtruth/docling_v2/csv-tab.csv.json
+++ b/tests/data/groundtruth/docling_v2/csv-tab.csv.json
--- a/tests/data/groundtruth/docling_v2/csv-tab.csv.md
+++ b/tests/data/groundtruth/docling_v2/csv-tab.csv.md
@ -0,0 +1,7 @@
+|   Index | Customer Id     | First Name   | Last Name   | Company         | City              | Country                    | Phone 1                | Phone 2               | Email                       | Subscription Date   | Website                     |
+|---------|-----------------|--------------|-------------|-----------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|
+|       1 | DD37Cf93aecA6Dc | Sheryl       | Baxter      | Rasmussen Group | East Leonard      | Chile                      | 229.077.5154           | 397.884.0519x718      | zunigavanessa@smith.info    | 2020-08-24          | http://www.stephenson.com/  |
+|       2 | 1Ef7b82A4CAAD10 | Preston      | Lozano      | Vega-Gentry     | East Jimmychester | Djibouti                   | 5153435776             | 686-620-1820x944      | vmata@colon.com             | 2021-04-23          | http://www.hobbs.com/       |
+|       3 | 6F94879bDAfE5a6 | Roy          | Berry       | Murillo-Perry   | Isabelborough     | Antigua and Barbuda        | +1-539-402-0259        | (496)978-3969x58947   | beckycarr@hogan.com         | 2020-03-25          | http://www.lawrence.com/    |
+|       4 | 5Cef8BFA16c5e3c | Linda        | Olsen       | Dominguez	Mcmillan and Donovan                 | Bensonview        | Dominican Republic         | 001-808-617-6467x12895 | +1-813-324-8756       | stanleyblackwell@benson.org | 2020-06-02          | http://www.good-lyons.com/  |
+|       5 | 053d585Ab6b3159 | Joanna       | Bender      | Martin	Lang and Andrade                 | West Priscilla    | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net     | 2021-04-17          | https://goodwin-ingram.com/ |
--- a/tests/data/groundtruth/docling_v2/csv-too-few-columns.csv.itxt
+++ b/tests/data/groundtruth/docling_v2/csv-too-few-columns.csv.itxt
@ -0,0 +1,2 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: table with [5x4]
--- a/tests/data/groundtruth/docling_v2/csv-too-few-columns.csv.json
+++ b/tests/data/groundtruth/docling_v2/csv-too-few-columns.csv.json
@ -0,0 +1,534 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.1.0",
+  "name": "csv-too-few-columns",
+  "origin": {
+    "mimetype": "text/csv",
+    "binary_hash": 6079936590967298763,
+    "filename": "csv-too-few-columns.csv"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/tables/0"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [],
+  "texts": [],
+  "pictures": [],
+  "tables": [
+    {
+      "self_ref": "#/tables/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "table",
+      "prov": [],
+      "captions": [],
+      "references": [],
+      "footnotes": [],
+      "data": {
+        "table_cells": [
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 0,
+            "end_row_offset_idx": 1,
+            "start_col_offset_idx": 0,
+            "end_col_offset_idx": 1,
+            "text": "1",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 0,
+            "end_row_offset_idx": 1,
+            "start_col_offset_idx": 1,
+            "end_col_offset_idx": 2,
+            "text": "2",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 0,
+            "end_row_offset_idx": 1,
+            "start_col_offset_idx": 2,
+            "end_col_offset_idx": 3,
+            "text": "3",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 0,
+            "end_row_offset_idx": 1,
+            "start_col_offset_idx": 3,
+            "end_col_offset_idx": 4,
+            "text": "4",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 1,
+            "end_row_offset_idx": 2,
+            "start_col_offset_idx": 0,
+            "end_col_offset_idx": 1,
+            "text": "a",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 1,
+            "end_row_offset_idx": 2,
+            "start_col_offset_idx": 1,
+            "end_col_offset_idx": 2,
+            "text": "'b'",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 1,
+            "end_row_offset_idx": 2,
+            "start_col_offset_idx": 2,
+            "end_col_offset_idx": 3,
+            "text": "c",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 1,
+            "end_row_offset_idx": 2,
+            "start_col_offset_idx": 3,
+            "end_col_offset_idx": 4,
+            "text": "d",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 2,
+            "end_row_offset_idx": 3,
+            "start_col_offset_idx": 0,
+            "end_col_offset_idx": 1,
+            "text": "a",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 2,
+            "end_row_offset_idx": 3,
+            "start_col_offset_idx": 1,
+            "end_col_offset_idx": 2,
+            "text": "b",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 2,
+            "end_row_offset_idx": 3,
+            "start_col_offset_idx": 2,
+            "end_col_offset_idx": 3,
+            "text": "c",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 3,
+            "end_row_offset_idx": 4,
+            "start_col_offset_idx": 0,
+            "end_col_offset_idx": 1,
+            "text": "a",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 3,
+            "end_row_offset_idx": 4,
+            "start_col_offset_idx": 1,
+            "end_col_offset_idx": 2,
+            "text": "b",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 3,
+            "end_row_offset_idx": 4,
+            "start_col_offset_idx": 2,
+            "end_col_offset_idx": 3,
+            "text": "c",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 3,
+            "end_row_offset_idx": 4,
+            "start_col_offset_idx": 3,
+            "end_col_offset_idx": 4,
+            "text": "d",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 4,
+            "end_row_offset_idx": 5,
+            "start_col_offset_idx": 0,
+            "end_col_offset_idx": 1,
+            "text": "a",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 4,
+            "end_row_offset_idx": 5,
+            "start_col_offset_idx": 1,
+            "end_col_offset_idx": 2,
+            "text": "b",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 4,
+            "end_row_offset_idx": 5,
+            "start_col_offset_idx": 2,
+            "end_col_offset_idx": 3,
+            "text": "c",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 4,
+            "end_row_offset_idx": 5,
+            "start_col_offset_idx": 3,
+            "end_col_offset_idx": 4,
+            "text": "d",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          }
+        ],
+        "num_rows": 5,
+        "num_cols": 4,
+        "grid": [
+          [
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 0,
+              "end_row_offset_idx": 1,
+              "start_col_offset_idx": 0,
+              "end_col_offset_idx": 1,
+              "text": "1",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 0,
+              "end_row_offset_idx": 1,
+              "start_col_offset_idx": 1,
+              "end_col_offset_idx": 2,
+              "text": "2",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 0,
+              "end_row_offset_idx": 1,
+              "start_col_offset_idx": 2,
+              "end_col_offset_idx": 3,
+              "text": "3",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 0,
+              "end_row_offset_idx": 1,
+              "start_col_offset_idx": 3,
+              "end_col_offset_idx": 4,
+              "text": "4",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            }
+          ],
+          [
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 1,
+              "end_row_offset_idx": 2,
+              "start_col_offset_idx": 0,
+              "end_col_offset_idx": 1,
+              "text": "a",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 1,
+              "end_row_offset_idx": 2,
+              "start_col_offset_idx": 1,
+              "end_col_offset_idx": 2,
+              "text": "'b'",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 1,
+              "end_row_offset_idx": 2,
+              "start_col_offset_idx": 2,
+              "end_col_offset_idx": 3,
+              "text": "c",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 1,
+              "end_row_offset_idx": 2,
+              "start_col_offset_idx": 3,
+              "end_col_offset_idx": 4,
+              "text": "d",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            }
+          ],
+          [
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 2,
+              "end_row_offset_idx": 3,
+              "start_col_offset_idx": 0,
+              "end_col_offset_idx": 1,
+              "text": "a",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 2,
+              "end_row_offset_idx": 3,
+              "start_col_offset_idx": 1,
+              "end_col_offset_idx": 2,
+              "text": "b",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 2,
+              "end_row_offset_idx": 3,
+              "start_col_offset_idx": 2,
+              "end_col_offset_idx": 3,
+              "text": "c",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 2,
+              "end_row_offset_idx": 3,
+              "start_col_offset_idx": 3,
+              "end_col_offset_idx": 4,
+              "text": "",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            }
+          ],
+          [
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 3,
+              "end_row_offset_idx": 4,
+              "start_col_offset_idx": 0,
+              "end_col_offset_idx": 1,
+              "text": "a",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 3,
+              "end_row_offset_idx": 4,
+              "start_col_offset_idx": 1,
+              "end_col_offset_idx": 2,
+              "text": "b",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 3,
+              "end_row_offset_idx": 4,
+              "start_col_offset_idx": 2,
+              "end_col_offset_idx": 3,
+              "text": "c",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 3,
+              "end_row_offset_idx": 4,
+              "start_col_offset_idx": 3,
+              "end_col_offset_idx": 4,
+              "text": "d",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            }
+          ],
+          [
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 4,
+              "end_row_offset_idx": 5,
+              "start_col_offset_idx": 0,
+              "end_col_offset_idx": 1,
+              "text": "a",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 4,
+              "end_row_offset_idx": 5,
+              "start_col_offset_idx": 1,
+              "end_col_offset_idx": 2,
+              "text": "b",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 4,
+              "end_row_offset_idx": 5,
+              "start_col_offset_idx": 2,
+              "end_col_offset_idx": 3,
+              "text": "c",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 4,
+              "end_row_offset_idx": 5,
+              "start_col_offset_idx": 3,
+              "end_col_offset_idx": 4,
+              "text": "d",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            }
+          ]
+        ]
+      }
+    }
+  ],
+  "key_value_items": [],
+  "pages": {}
+}
--- a/tests/data/groundtruth/docling_v2/csv-too-few-columns.csv.md
+++ b/tests/data/groundtruth/docling_v2/csv-too-few-columns.csv.md
@ -0,0 +1,6 @@
+| 1   | 2   | 3   | 4   |
+|-----|-----|-----|-----|
+| a   | 'b' | c   | d   |
+| a   | b   | c   |     |
+| a   | b   | c   | d   |
+| a   | b   | c   | d   |
--- a/tests/data/groundtruth/docling_v2/csv-too-many-columns.csv.itxt
+++ b/tests/data/groundtruth/docling_v2/csv-too-many-columns.csv.itxt
@ -0,0 +1,2 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: table with [5x5]
--- a/tests/data/groundtruth/docling_v2/csv-too-many-columns.csv.json
+++ b/tests/data/groundtruth/docling_v2/csv-too-many-columns.csv.json
@ -0,0 +1,618 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.1.0",
+  "name": "csv-too-many-columns",
+  "origin": {
+    "mimetype": "text/csv",
+    "binary_hash": 10142252432152444595,
+    "filename": "csv-too-many-columns.csv"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/tables/0"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [],
+  "texts": [],
+  "pictures": [],
+  "tables": [
+    {
+      "self_ref": "#/tables/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "table",
+      "prov": [],
+      "captions": [],
+      "references": [],
+      "footnotes": [],
+      "data": {
+        "table_cells": [
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 0,
+            "end_row_offset_idx": 1,
+            "start_col_offset_idx": 0,
+            "end_col_offset_idx": 1,
+            "text": "1",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 0,
+            "end_row_offset_idx": 1,
+            "start_col_offset_idx": 1,
+            "end_col_offset_idx": 2,
+            "text": "2",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 0,
+            "end_row_offset_idx": 1,
+            "start_col_offset_idx": 2,
+            "end_col_offset_idx": 3,
+            "text": "3",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 0,
+            "end_row_offset_idx": 1,
+            "start_col_offset_idx": 3,
+            "end_col_offset_idx": 4,
+            "text": "4",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 1,
+            "end_row_offset_idx": 2,
+            "start_col_offset_idx": 0,
+            "end_col_offset_idx": 1,
+            "text": "a",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 1,
+            "end_row_offset_idx": 2,
+            "start_col_offset_idx": 1,
+            "end_col_offset_idx": 2,
+            "text": "b",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 1,
+            "end_row_offset_idx": 2,
+            "start_col_offset_idx": 2,
+            "end_col_offset_idx": 3,
+            "text": "c",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 1,
+            "end_row_offset_idx": 2,
+            "start_col_offset_idx": 3,
+            "end_col_offset_idx": 4,
+            "text": "d",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 2,
+            "end_row_offset_idx": 3,
+            "start_col_offset_idx": 0,
+            "end_col_offset_idx": 1,
+            "text": "a",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 2,
+            "end_row_offset_idx": 3,
+            "start_col_offset_idx": 1,
+            "end_col_offset_idx": 2,
+            "text": "b",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 2,
+            "end_row_offset_idx": 3,
+            "start_col_offset_idx": 2,
+            "end_col_offset_idx": 3,
+            "text": "c",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 2,
+            "end_row_offset_idx": 3,
+            "start_col_offset_idx": 3,
+            "end_col_offset_idx": 4,
+            "text": "d",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 2,
+            "end_row_offset_idx": 3,
+            "start_col_offset_idx": 4,
+            "end_col_offset_idx": 5,
+            "text": "e",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 3,
+            "end_row_offset_idx": 4,
+            "start_col_offset_idx": 0,
+            "end_col_offset_idx": 1,
+            "text": "a",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 3,
+            "end_row_offset_idx": 4,
+            "start_col_offset_idx": 1,
+            "end_col_offset_idx": 2,
+            "text": "b",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 3,
+            "end_row_offset_idx": 4,
+            "start_col_offset_idx": 2,
+            "end_col_offset_idx": 3,
+            "text": "c",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 3,
+            "end_row_offset_idx": 4,
+            "start_col_offset_idx": 3,
+            "end_col_offset_idx": 4,
+            "text": "d",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 4,
+            "end_row_offset_idx": 5,
+            "start_col_offset_idx": 0,
+            "end_col_offset_idx": 1,
+            "text": "a",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 4,
+            "end_row_offset_idx": 5,
+            "start_col_offset_idx": 1,
+            "end_col_offset_idx": 2,
+            "text": "b",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 4,
+            "end_row_offset_idx": 5,
+            "start_col_offset_idx": 2,
+            "end_col_offset_idx": 3,
+            "text": "c",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          },
+          {
+            "row_span": 1,
+            "col_span": 1,
+            "start_row_offset_idx": 4,
+            "end_row_offset_idx": 5,
+            "start_col_offset_idx": 3,
+            "end_col_offset_idx": 4,
+            "text": "d",
+            "column_header": false,
+            "row_header": false,
+            "row_section": false
+          }
+        ],
+        "num_rows": 5,
+        "num_cols": 5,
+        "grid": [
+          [
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 0,
+              "end_row_offset_idx": 1,
+              "start_col_offset_idx": 0,
+              "end_col_offset_idx": 1,
+              "text": "1",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 0,
+              "end_row_offset_idx": 1,
+              "start_col_offset_idx": 1,
+              "end_col_offset_idx": 2,
+              "text": "2",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 0,
+              "end_row_offset_idx": 1,
+              "start_col_offset_idx": 2,
+              "end_col_offset_idx": 3,
+              "text": "3",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 0,
+              "end_row_offset_idx": 1,
+              "start_col_offset_idx": 3,
+              "end_col_offset_idx": 4,
+              "text": "4",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 0,
+              "end_row_offset_idx": 1,
+              "start_col_offset_idx": 4,
+              "end_col_offset_idx": 5,
+              "text": "",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            }
+          ],
+          [
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 1,
+              "end_row_offset_idx": 2,
+              "start_col_offset_idx": 0,
+              "end_col_offset_idx": 1,
+              "text": "a",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 1,
+              "end_row_offset_idx": 2,
+              "start_col_offset_idx": 1,
+              "end_col_offset_idx": 2,
+              "text": "b",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 1,
+              "end_row_offset_idx": 2,
+              "start_col_offset_idx": 2,
+              "end_col_offset_idx": 3,
+              "text": "c",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 1,
+              "end_row_offset_idx": 2,
+              "start_col_offset_idx": 3,
+              "end_col_offset_idx": 4,
+              "text": "d",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 1,
+              "end_row_offset_idx": 2,
+              "start_col_offset_idx": 4,
+              "end_col_offset_idx": 5,
+              "text": "",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            }
+          ],
+          [
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 2,
+              "end_row_offset_idx": 3,
+              "start_col_offset_idx": 0,
+              "end_col_offset_idx": 1,
+              "text": "a",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 2,
+              "end_row_offset_idx": 3,
+              "start_col_offset_idx": 1,
+              "end_col_offset_idx": 2,
+              "text": "b",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 2,
+              "end_row_offset_idx": 3,
+              "start_col_offset_idx": 2,
+              "end_col_offset_idx": 3,
+              "text": "c",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 2,
+              "end_row_offset_idx": 3,
+              "start_col_offset_idx": 3,
+              "end_col_offset_idx": 4,
+              "text": "d",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 2,
+              "end_row_offset_idx": 3,
+              "start_col_offset_idx": 4,
+              "end_col_offset_idx": 5,
+              "text": "e",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            }
+          ],
+          [
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 3,
+              "end_row_offset_idx": 4,
+              "start_col_offset_idx": 0,
+              "end_col_offset_idx": 1,
+              "text": "a",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 3,
+              "end_row_offset_idx": 4,
+              "start_col_offset_idx": 1,
+              "end_col_offset_idx": 2,
+              "text": "b",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 3,
+              "end_row_offset_idx": 4,
+              "start_col_offset_idx": 2,
+              "end_col_offset_idx": 3,
+              "text": "c",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 3,
+              "end_row_offset_idx": 4,
+              "start_col_offset_idx": 3,
+              "end_col_offset_idx": 4,
+              "text": "d",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 3,
+              "end_row_offset_idx": 4,
+              "start_col_offset_idx": 4,
+              "end_col_offset_idx": 5,
+              "text": "",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            }
+          ],
+          [
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 4,
+              "end_row_offset_idx": 5,
+              "start_col_offset_idx": 0,
+              "end_col_offset_idx": 1,
+              "text": "a",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 4,
+              "end_row_offset_idx": 5,
+              "start_col_offset_idx": 1,
+              "end_col_offset_idx": 2,
+              "text": "b",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 4,
+              "end_row_offset_idx": 5,
+              "start_col_offset_idx": 2,
+              "end_col_offset_idx": 3,
+              "text": "c",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 4,
+              "end_row_offset_idx": 5,
+              "start_col_offset_idx": 3,
+              "end_col_offset_idx": 4,
+              "text": "d",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            },
+            {
+              "row_span": 1,
+              "col_span": 1,
+              "start_row_offset_idx": 4,
+              "end_row_offset_idx": 5,
+              "start_col_offset_idx": 4,
+              "end_col_offset_idx": 5,
+              "text": "",
+              "column_header": false,
+              "row_header": false,
+              "row_section": false
+            }
+          ]
+        ]
+      }
+    }
+  ],
+  "key_value_items": [],
+  "pages": {}
+}
--- a/tests/data/groundtruth/docling_v2/csv-too-many-columns.csv.md
+++ b/tests/data/groundtruth/docling_v2/csv-too-many-columns.csv.md
@ -0,0 +1,6 @@
+| 1   | 2   | 3   | 4   |    |
+|-----|-----|-----|-----|----|
+| a   | b   | c   | d   |    |
+| a   | b   | c   | d   | e  |
+| a   | b   | c   | d   |    |
+| a   | b   | c   | d   |    |
--- a/tests/data/groundtruth/docling_v2/elife-56337.xml.itxt
+++ b/tests/data/groundtruth/docling_v2/elife-56337.xml.itxt
@ -1,165 +1,149 @@
 item-0 at level 0: unspecified: group _root_
  item-1 at level 1: title: KRAB-zinc finger protein gene ex ... retrotransposons in the murine lineage
-    item-2 at level 2: paragraph: Wolf Gernot; 1: The Eunice Kenne ... tes of Health: Bethesda: United States
-    item-3 at level 2: section_header: Abstract
-      item-4 at level 3: text: The Krüppel-associated box zinc  ... edundant role restricting TE activity.
-    item-5 at level 2: section_header: Introduction
-      item-6 at level 3: text: Nearly half of the human and mou ... s are active beyond early development.
-      item-7 at level 3: text: TEs, especially long terminal re ... f evolutionarily young KRAB-ZFP genes.
-    item-8 at level 2: section_header: Results
-      item-9 at level 3: section_header: Mouse KRAB-ZFPs target retrotransposons
-        item-10 at level 4: text: We analyzed the RNA expression p ... duplications (Kauzlaric et al., 2017).
-        item-11 at level 4: text: To determine the binding sites o ... ctive in the early embryo (Figure 1A).
-        item-12 at level 4: text: We generally observed that KRAB- ... responsible for this silencing effect.
-        item-13 at level 4: text: To further test the hypothesis t ... t easily evade repression by mutation.
-        item-14 at level 4: text: Our KRAB-ZFP ChIP-seq dataset al ... ntirely shift the mode of DNA binding.
-      item-15 at level 3: section_header: Genetic deletion of KRAB-ZFP gen ...  leads to retrotransposon reactivation
-        item-16 at level 4: text: The majority of KRAB-ZFP genes a ... ung et al., 2014; Deniz et al., 2018).
-      item-17 at level 3: section_header: KRAB-ZFP cluster deletions license TE-borne enhancers
-        item-18 at level 4: text: We next used our RNA-seq dataset ... vating effects of TEs on nearby genes.
-        item-19 at level 4: text: While we generally observed that ... he internal region and not on the LTR.
-      item-20 at level 3: section_header: ETn retrotransposition in Chr4-cl KO and WT mice
-        item-21 at level 4: text: IAP, ETn/ETnERV and MuLV/RLTR4 r ... s may contribute to reduced viability.
-        item-22 at level 4: text: We reasoned that retrotransposon ... Tn insertions at a high recovery rate.
-        item-23 at level 4: text: Using this dataset, we first con ... nsertions in our pedigree (Figure 4A).
-        item-24 at level 4: text: To validate some of the novel ET ... ess might have truncated this element.
-        item-25 at level 4: text: Besides novel ETn insertions tha ... tions (Figure 4—figure supplement 3D).
-        item-26 at level 4: text: Finally, we asked whether there  ... s clearly also play an important role.
-    item-27 at level 2: section_header: Discussion
-      item-28 at level 3: text: C2H2 zinc finger proteins, about ... ) depending upon their insertion site.
-      item-29 at level 3: text: Despite a lack of widespread ETn ... ion of the majority of KRAB-ZFP genes.
-    item-30 at level 2: section_header: Materials and methods
-      item-31 at level 3: section_header: Cell lines and transgenic mice
-        item-32 at level 4: text: Mouse ES cells and F9 EC cells w ... KO/KO and KO/WT (B6/129 F2) offspring.
-      item-33 at level 3: section_header: Generation of KRAB-ZFP expressing cell lines
-        item-34 at level 4: text: KRAB-ZFP ORFs were PCR-amplified ... led and further expanded for ChIP-seq.
-      item-35 at level 3: section_header: CRISPR/Cas9 mediated deletion of KRAB-ZFP clusters and an MMETn insertion
-        item-36 at level 4: text: All gRNAs were expressed from th ... PCR genotyping (Supplementary file 3).
-      item-37 at level 3: section_header: ChIP-seq analysis
-        item-38 at level 4: text: For ChIP-seq analysis of KRAB-ZF ... 010 or Khil et al., 2012 respectively.
-        item-39 at level 4: text: ChIP-seq libraries were construc ...  were re-mapped using Bowtie (--best).
-      item-40 at level 3: section_header: Luciferase reporter assays
-        item-41 at level 4: text: For KRAB-ZFP repression assays,  ... after transfection as described above.
-      item-42 at level 3: section_header: RNA-seq analysis
-        item-43 at level 4: text: Whole RNA was purified using RNe ... lemented in the R function p.adjust().
-      item-44 at level 3: section_header: Reduced representation bisulfite sequencing (RRBS-seq)
-        item-45 at level 4: text: For RRBS-seq analysis, Chr4-cl W ... h sample were considered for analysis.
-      item-46 at level 3: section_header: Retrotransposition assay
-        item-47 at level 4: text: The retrotransposition vectors p ... were stained with Amido Black (Sigma).
-      item-48 at level 3: section_header: Capture-seq screen
-        item-49 at level 4: text: To identify novel retrotransposo ... assembly using the Unicycler software.
-    item-50 at level 2: section_header: Tables
-      item-51 at level 3: table with [9x5]
-        item-51 at level 4: caption: Table 1.: * Number of protein-coding KRAB-ZFP genes identified in a previously published screen (Imbeault et al., 2017) and the ChIP-seq data column indicates the number of KRAB-ZFPs for which ChIP-seq was performed in this study.
-      item-52 at level 3: table with [31x5]
-        item-52 at level 4: caption: Key resources table: 
-    item-53 at level 2: section_header: Figures
-      item-54 at level 3: picture
-        item-54 at level 4: caption: Figure 1.: Genome-wide binding patterns of mouse KRAB-ZFPs.
-(A) Probability heatmap of KRAB-ZFP binding to TEs. Blue color intensity (main field) corresponds to -log10 (adjusted p-value) enrichment of ChIP-seq peak overlap with TE groups (Fisher’s exact test). The green/red color intensity (top panel) represents mean KAP1 (GEO accession: GSM1406445) and H3K9me3 (GEO accession: GSM1327148) enrichment (respectively) at peaks overlapping significantly targeted TEs (adjusted p-value<1e-5) in WT ES cells. (B) Summarized ChIP-seq signal for indicated KRAB-ZFPs and previously published KAP1 and H3K9me3 in WT ES cells across 127 intact ETn elements. (C) Heatmaps of KRAB-ZFP ChIP-seq signal at ChIP-seq peaks. For better comparison, peaks for all three KRAB-ZFPs were called with the same parameters (p<1e-10, peak enrichment >20). The top panel shows a schematic of the arrangement of the contact amino acid composition of each zinc finger. Zinc fingers are grouped and colored according to similarity, with amino acid differences relative to the five consensus fingers highlighted in white.
-Figure 1—source data 1.KRAB-ZFP expression in 40 mouse tissues and cell lines (ENCODE).Mean values of replicates are shown as log2 transcripts per million.
-Figure 1—source data 2.Probability heatmap of KRAB-ZFP binding to TEs.Values corresponds to -log10 (adjusted p-value) enrichment of ChIP-seq peak overlap with TE groups (Fisher’s exact test).
-      item-55 at level 3: picture
-        item-55 at level 4: caption: Figure 1—figure supplement 1.: ES cell-specific expression of KRAB-ZFP gene clusters.
-(A) Heatmap showing expression patterns of mouse KRAB-ZFPs in 40 mouse tissues and cell lines (ENCODE). Heatmap colors indicate gene expression levels in log2 transcripts per million (TPM). The asterisk indicates a group of 30 KRAB-ZFPs that are exclusively expressed in ES cells. (B) Physical location of the genes encoding for the 30 KRAB-ZFPs that are exclusively expressed in ES cells. (C) Phylogenetic (Maximum likelihood) tree of the KRAB domains of mouse KRAB-ZFPs. KRAB-ZFPs encoded on the gene clusters on chromosome 2 and 4 are highlighted. The scale bar at the bottom indicates amino acid substitutions per site.
-      item-56 at level 3: picture
-        item-56 at level 4: caption: Figure 1—figure supplement 2.: KRAB-ZFP binding motifs and their repression activity.
-(A) Comparison of computationally predicted (bottom) and experimentally determined (top) KRAB-ZFP binding motifs. Only significant pairs are shown (FDR < 0.1). (B) Luciferase reporter assays to confirm KRAB-ZFP repression of the identified target sites. Bars show the luciferase activity (normalized to Renilla luciferase) of reporter plasmids containing the indicated target sites cloned upstream of the SV40 promoter. Reporter plasmids were co-transfected into 293 T cells with a Renilla luciferase plasmid for normalization and plasmids expressing the targeting KRAB-ZFP. Normalized mean luciferase activity (from three replicates) is shown relative to luciferase activity of the reporter plasmid co-transfected with an empty pcDNA3.1 vector.
-      item-57 at level 3: picture
-        item-57 at level 4: caption: Figure 1—figure supplement 3.: KRAB-ZFP binding to ETn retrotransposons.
-(A) Comparison of the PBSLys1,2 sequence with Zfp961 binding motifs in nonrepetitive peaks (Nonrep) and peaks at ETn elements. (B) Retrotransposition assays of original (ETnI1-neoTNF and MusD2-neoTNF Ribet et al., 2004) and modified reporter vectors where the Rex2 or Gm13051 binding motifs where removed. Schematic of reporter vectors are displayed at the top. HeLa cells were transfected as described in the Materials and Methods section and neo-resistant colonies, indicating retrotransposition events, were selected and stained. (C) Stem-loop structure of the ETn RNA export signal, the Gm13051 motif on the corresponding DNA is marked with red circles, the part of the motif that was deleted is indicated with grey crosses (adapted from Legiewicz et al., 2010).
-      item-58 at level 3: picture
-        item-58 at level 4: caption: Figure 2.: Retrotransposon reactivation in KRAB-ZFP cluster KO ES cells.
-(A) RNA-seq analysis of TE expression in five KRAB-ZFP cluster KO ES cells. Green and grey squares on top of the panel represent KRAB-ZFPs with or without ChIP-seq data, respectively, within each deleted gene cluster. Reactivated TEs that are bound by one or several KRAB-ZFPs are indicated by green squares in the panel. Significantly up- and downregulated elements (adjusted p-value<0.05) are highlighted in red and green, respectively. (B) Differential KAP1 binding and H3K9me3 enrichment at TE groups (summarized across all insertions) in Chr2-cl and Chr4-cl KO ES cells. TE groups targeted by one or several KRAB-ZFPs encoded within the deleted clusters are highlighted in blue (differential enrichment over the entire TE sequences) and red (differential enrichment at TE regions that overlap with KRAB-ZFP ChIP-seq peaks). (C) DNA methylation status of CpG sites at indicated TE groups in WT and Chr4-cl KO ES cells grown in serum containing media or in hypomethylation-inducing media (2i + Vitamin C). P-values were calculated using paired t-test.
-Figure 2—source data 1.Differential H3K9me3 and KAP1 distribution in WT and KRAB-ZFP cluster KO ES cells at TE families and KRAB-ZFP bound TE insertions.Differential read counts and statistical testing were determined by DESeq2.
-      item-59 at level 3: picture
-        item-59 at level 4: caption: Figure 2—figure supplement 1.: Epigenetic changes at TEs and TE-borne enhancers in KRAB-ZFP cluster KO ES cells.
-(A) Differential analysis of summative (all individual insertions combined) H3K9me3 enrichment at TE groups in Chr10-cl, Chr13.1-cl and Chr13.2-cl KO ES cells. TE groups targeted by one or several KRAB-ZFPs encoded within the deleted clusters are highlighted in orange (differential enrichment over the entire TE sequences) and red (differential enrichment at TE regions that overlap with KRAB-ZFP ChIP-seq peaks). (B) Top: Schematic view of the Cd59a/Cd59b locus with a 5’ truncated ETn insertion. ChIP-seq (Input subtracted from ChIP) data for overexpressed epitope-tagged Gm13051 (a Chr4-cl KRAB-ZFP) in F9 EC cells, and re-mapped KAP1 (GEO accession: GSM1406445) and H3K9me3 (GEO accession: GSM1327148) in WT ES cells are shown together with RNA-seq data from Chr4-cl WT and KO ES cells (mapped using Bowtie (-a -m 1 --strata -v 2) to exclude reads that cannot be uniquely mapped). Bottom: Transcriptional activity of a 5 kb fragment with or without fragments of the ETn insertion was tested by luciferase reporter assay in Chr4-cl WT and KO ES cells.
-      item-60 at level 3: picture
-        item-60 at level 4: caption: Figure 3.: TE-dependent gene activation in KRAB-ZFP cluster KO ES cells.
-(A) Differential gene expression in Chr2-cl and Chr4-cl KO ES cells. Significantly up- and downregulated genes (adjusted p-value<0.05) are highlighted in red and green, respectively, KRAB-ZFP genes within the deleted clusters are shown in blue. (B) Correlation of TEs and gene deregulation. Plots show enrichment of TE groups within 100 kb of up- and downregulated genes relative to all genes. Significantly overrepresented LTR and LINE groups (adjusted p-value<0.1) are highlighted in blue and red, respectively. (C) Schematic view of the downstream region of Chst1 where a 5’ truncated ETn insertion is located. ChIP-seq (Input subtracted from ChIP) data for overexpressed epitope-tagged Gm13051 (a Chr4-cl KRAB-ZFP) in F9 EC cells, and re-mapped KAP1 (GEO accession: GSM1406445) and H3K9me3 (GEO accession: GSM1327148) in WT ES cells are shown together with RNA-seq data from Chr4-cl WT and KO ES cells (mapped using Bowtie (-a -m 1 --strata -v 2) to exclude reads that cannot be uniquely mapped). (D) RT-qPCR analysis of Chst1 mRNA expression in Chr4-cl WT and KO ES cells with or without the CRISPR/Cas9 deleted ETn insertion near Chst1. Values represent mean expression (normalized to Gapdh) from three biological replicates per sample (each performed in three technical replicates) in arbitrary units. Error bars represent standard deviation and asterisks indicate significance (p<0.01, Student’s t-test). n.s.: not significant. (E) Mean coverage of ChIP-seq data (Input subtracted from ChIP) in Chr4-cl WT and KO ES cells over 127 full-length ETn insertions. The binding sites of the Chr4-cl KRAB-ZFPs Rex2 and Gm13051 are indicated by dashed lines.
-      item-61 at level 3: picture
-        item-61 at level 4: caption: Figure 4.: ETn retrotransposition in Chr4-cl KO mice.
-(A) Pedigree of mice used for transposon insertion screening by capture-seq in mice of different strain backgrounds. The number of novel ETn insertions (only present in one animal) are indicated. For animals whose direct ancestors have not been screened, the ETn insertions are shown in parentheses since parental inheritance cannot be excluded in that case. Germ line insertions are indicated by asterisks. All DNA samples were prepared from tail tissues unless noted (-S: spleen, -E: ear, -B:Blood) (B) Statistical analysis of ETn insertion frequency in tail tissue from 30 Chr4-cl KO, KO/WT and WT mice that were derived from one Chr4-c KO x KO/WT and two Chr4-cl KO/WT x KO/WT matings. Only DNA samples that were collected from juvenile tails were considered for this analysis. P-values were calculated using one-sided Wilcoxon Rank Sum Test. In the last panel, KO, WT and KO/WT mice derived from all matings were combined for the statistical analysis.
-Figure 4—source data 1.Coordinates of identified novel ETn insertions and supporting capture-seq read counts.Genomic regions indicate cluster of supporting reads.
-Figure 4—source data 2.Sequences of capture-seq probes used to enrich genomic DNA for ETn and MuLV (RLTR4) insertions.
-      item-62 at level 3: picture
-        item-62 at level 4: caption: Figure 4—figure supplement 1.: Birth statistics of KRAB-ZFP cluster KO mice and TE reactivation in adult tissues.
-(A) Birth statistics of Chr4- and Chr2-cl mice derived from KO/WT x KO/WT matings in different strain backgrounds. (B) RNA-seq analysis of TE expression in Chr2- (left) and Chr4-cl (right) KO tissues. TE groups with the highest reactivation phenotype in ES cells are shown separately. Significantly up- and downregulated elements (adjusted p-value<0.05) are highlighted in red and green, respectively. Experiments were performed in at least two biological replicates.
-      item-63 at level 3: picture
-        item-63 at level 4: caption: Figure 4—figure supplement 2.: Identification of polymorphic ETn and MuLV retrotransposon insertions in Chr4-cl KO and WT mice.
-Heatmaps show normalized capture-seq read counts in RPM (Read Per Million) for identified polymorphic ETn (A) and MuLV (B) loci in different mouse strains. Only loci with strong support for germ line ETn or MuLV insertions (at least 100 or 3000 ETn or MuLV RPM, respectively) in at least two animals are shown. Non-polymorphic insertion loci with high read counts in all screened mice were excluded for better visibility. The sample information (sample name and cell type/tissue) is annotated at the bottom, with the strain information indicated by color at the top. The color gradient indicates log10(RPM+1).
-      item-64 at level 3: picture
-        item-64 at level 4: caption: Figure 4—figure supplement 3.: Confirmation of novel ETn insertions identified by capture-seq.
-(A) PCR validation of novel ETn insertions in genomic DNA of three littermates (IDs: T09673, T09674 and T00436) and their parents (T3913 and T3921). Primer sequences are shown in Supplementary file 3. (B) ETn capture-seq read counts (RPM) at putative novel somatic (loci identified exclusively in one single animal), novel germ line (loci identified in several littermates) insertions, and at B6 reference ETn elements. (C) Heatmap shows capture-seq read counts (RPM) of a Chr4-cl KO mouse (ID: C6733) as determined in different tissues. Each row represents a novel ETn locus that was identified in at least one tissue. The color gradient indicates log10(RPM+1). (D) Heatmap shows the capture-seq RPM in technical replicates using the same Chr4-cl KO DNA sample (rep1/rep2) or replicates with DNA samples prepared from different sections of the tail from the same mouse at different ages (tail1/tail2). Each row represents a novel ETn locus that was identified in at least one of the displayed samples. The color gradient indicates log10(RPM+1).
-    item-65 at level 2: section_header: References
-      item-66 at level 3: list: group list
-        item-67 at level 4: list_item: TL Bailey; M Boden; FA Buske; M  ... arching. Nucleic Acids Research (2009)
-        item-68 at level 4: list_item: C Baust; L Gagnier; GJ Baillie;  ...  the mouse. Journal of Virology (2003)
-        item-69 at level 4: list_item: K Blaschke; KT Ebata; MM Karimi; ... -like state in ES cells. Nature (2013)
-        item-70 at level 4: list_item: A Brodziak; E Ziółko; M Muc-Wier ... erimental and Clinical Research (2012)
-        item-71 at level 4: list_item: N Castro-Diaz; G Ecco; A Colucci ... stem cells. Genes & Development (2014)
-        item-72 at level 4: list_item: EB Chuong; NC Elde; C Feschotte. ... ndogenous retroviruses. Science (2016)
-        item-73 at level 4: list_item: J Dan; Y Liu; N Liu; M Chiourea; ... n silencing. Developmental Cell (2014)
-        item-74 at level 4: list_item: A De Iaco; E Planet; A Coluccio; ... cental mammals. Nature Genetics (2017)
-        item-75 at level 4: list_item: Ö Deniz; L de la Rica; KCL Cheng ... onic stem cells. Genome Biology (2018)
-        item-76 at level 4: list_item: M Dewannieux; T Heidmann. Endoge ... rs. Current Opinion in Virology (2013)
-        item-77 at level 4: list_item: G Ecco; M Cassano; A Kauzlaric;  ... ult tissues. Developmental Cell (2016)
-        item-78 at level 4: list_item: G Ecco; M Imbeault; D Trono. KRAB zinc finger proteins. Development (2017)
-        item-79 at level 4: list_item: JA Frank; C Feschotte. Co-option ... on. Current Opinion in Virology (2017)
-        item-80 at level 4: list_item: L Gagnier; VP Belancio; DL Mager ... ansposon insertions. Mobile DNA (2019)
-        item-81 at level 4: list_item: AC Groner; S Meylan; A Ciuffi; N ... omatin spreading. PLOS Genetics (2010)
-        item-82 at level 4: list_item: DC Hancks; HH Kazazian. Roles fo ... ns in human disease. Mobile DNA (2016)
-        item-83 at level 4: list_item: M Imbeault; PY Helleboid; D Tron ... ene regulatory networks. Nature (2017)
-        item-84 at level 4: list_item: FM Jacobs; D Greenberg; N Nguyen ... SVA/L1 retrotransposons. Nature (2014)
-        item-85 at level 4: list_item: H Kano; H Kurahashi; T Toda. Gen ... e dactylaplasia phenotype. PNAS (2007)
-        item-86 at level 4: list_item: MM Karimi; P Goyal; IA Maksakova ... cripts in mESCs. Cell Stem Cell (2011)
-        item-87 at level 4: list_item: A Kauzlaric; G Ecco; M Cassano;  ... related genetic units. PLOS ONE (2017)
-        item-88 at level 4: list_item: PP Khil; F Smagulova; KM Brick;  ... ction of ssDNA. Genome Research (2012)
-        item-89 at level 4: list_item: F Krueger; SR Andrews. Bismark:  ... eq applications. Bioinformatics (2011)
-        item-90 at level 4: list_item: B Langmead; SL Salzberg. Fast ga ... t with bowtie 2. Nature Methods (2012)
-        item-91 at level 4: list_item: M Legiewicz; AS Zolotukhin; GR P ... Journal of Biological Chemistry (2010)
-        item-92 at level 4: list_item: JA Lehoczky; PE Thomas; KM Patri ... n Polypodia mice. PLOS Genetics (2013)
-        item-93 at level 4: list_item: D Leung; T Du; U Wagner; W Xie;  ...  methyltransferase Setdb1. PNAS (2014)
-        item-94 at level 4: list_item: J Lilue; AG Doran; IT Fiddes; M  ... unctional loci. Nature Genetics (2018)
-        item-95 at level 4: list_item: S Liu; J Brind'Amour; MM Karimi; ... germ cells. Genes & Development (2014)
-        item-96 at level 4: list_item: MI Love; W Huber; S Anders. Mode ... ata with DESeq2. Genome Biology (2014)
-        item-97 at level 4: list_item: F Lugani; R Arora; N Papeta; A P ... short tail mouse. PLOS Genetics (2013)
-        item-98 at level 4: list_item: TS Macfarlan; WD Gifford; S Dris ... ous retrovirus activity. Nature (2012)
-        item-99 at level 4: list_item: IA Maksakova; MT Romanish; L Gag ...  mouse germ line. PLOS Genetics (2006)
-        item-100 at level 4: list_item: T Matsui; D Leung; H Miyashita;  ...  methyltransferase ESET. Nature (2010)
-        item-101 at level 4: list_item: HS Najafabadi; S Mnaimneh; FW Sc ... y lexicon. Nature Biotechnology (2015)
-        item-102 at level 4: list_item: C Nellåker; TM Keane; B Yalcin;  ... 8 mouse strains. Genome Biology (2012)
-        item-103 at level 4: list_item: H O'Geen; S Frietze; PJ Farnham. ... s. Methods in Molecular Biology (2010)
-        item-104 at level 4: list_item: A Patel; P Yang; M Tinkham; M Pr ... ndem zinc finger proteins. Cell (2018)
-        item-105 at level 4: list_item: D Ribet; M Dewannieux; T Heidman ... s-mobilization. Genome Research (2004)
-        item-106 at level 4: list_item: SR Richardson; P Gerdes; DJ Gerh ... d early embryo. Genome Research (2017)
-        item-107 at level 4: list_item: HM Rowe; J Jakobsson; D Mesnard; ... in embryonic stem cells. Nature (2010)
-        item-108 at level 4: list_item: HM Rowe; A Kapopoulou; A Corsino ... nic stem cells. Genome Research (2013)
-        item-109 at level 4: list_item: SN Schauer; PE Carreira; R Shukl ... carcinogenesis. Genome Research (2018)
-        item-110 at level 4: list_item: DC Schultz; K Ayyanathan; D Nego ... r proteins. Genes & Development (2002)
-        item-111 at level 4: list_item: K Semba; K Araki; K Matsumoto; H ...  short tail mice. PLOS Genetics (2013)
-        item-112 at level 4: list_item: SP Sripathy; J Stevens; DC Schul ...  Molecular and Cellular Biology (2006)
-        item-113 at level 4: list_item: JH Thomas; S Schneider. Coevolut ... c finger genes. Genome Research (2011)
-        item-114 at level 4: list_item: PJ Thompson; TS Macfarlan; MC Lo ... tory repertoire. Molecular Cell (2016)
-        item-115 at level 4: list_item: RS Treger; SD Pope; Y Kong; M To ... irus expression SNERV. Immunity (2019)
-        item-116 at level 4: list_item: CN Vlangos; AN Siuniak; D Robins ... Ptf1a expression. PLOS Genetics (2013)
-        item-117 at level 4: list_item: J Wang; G Xie; M Singh; AT Ghanb ... s naive-like stem cells. Nature (2014)
-        item-118 at level 4: list_item: D Wolf; K Hug; SP Goff. TRIM28 m ... iruses in embryonic cells. PNAS (2008)
-        item-119 at level 4: list_item: G Wolf; D Greenberg; TS Macfarla ... ger protein family. Mobile DNA (2015a)
-        item-120 at level 4: list_item: G Wolf; P Yang; AC Füchtbauer; E ... roviruses. Genes & Development (2015b)
-        item-121 at level 4: list_item: M Yamauchi; B Freitag; C Khan; B ...  silencers. Journal of Virology (1995)
-        item-122 at level 4: list_item: Y Zhang; T Liu; CA Meyer; J Eeck ... ChIP-Seq (MACS). Genome Biology (2008)
-  item-123 at level 1: caption: Table 1.: * Number of protein-co ...  ChIP-seq was performed in this study.
-  item-124 at level 1: caption: Key resources table: 
-  item-125 at level 1: caption: Figure 1.: Genome-wide binding p ...  with TE groups (Fisher’s exact test).
-  item-126 at level 1: caption: Figure 1—figure supplement 1.: E ... tes amino acid substitutions per site.
-  item-127 at level 1: caption: Figure 1—figure supplement 2.: K ... sfected with an empty pcDNA3.1 vector.
-  item-128 at level 1: caption: Figure 1—figure supplement 3.: K ... (adapted from Legiewicz et al., 2010).
-  item-129 at level 1: caption: Figure 2.: Retrotransposon react ... cal testing were determined by DESeq2.
-  item-130 at level 1: caption: Figure 2—figure supplement 1.: E ... r assay in Chr4-cl WT and KO ES cells.
-  item-131 at level 1: caption: Figure 3.: TE-dependent gene act ... Gm13051 are indicated by dashed lines.
-  item-132 at level 1: caption: Figure 4.: ETn retrotranspositio ... A for ETn and MuLV (RLTR4) insertions.
-  item-133 at level 1: caption: Figure 4—figure supplement 1.: B ... in at least two biological replicates.
-  item-134 at level 1: caption: Figure 4—figure supplement 2.: I ... color gradient indicates log10(RPM+1).
-  item-135 at level 1: caption: Figure 4—figure supplement 3.: C ... color gradient indicates log10(RPM+1).
+    item-2 at level 2: paragraph: Gernot Wolf, Alberto de Iaco, Mi ...  Ralls, Didier Trono, Todd S Macfarlan
+    item-3 at level 2: paragraph: The Eunice Kennedy Shriver Natio ... Lausanne (EPFL), Lausanne, Switzerland
+    item-4 at level 2: section_header: Abstract
+      item-5 at level 3: text: The Krüppel-associated box zinc  ... edundant role restricting TE activity.
+    item-6 at level 2: section_header: Introduction
+      item-7 at level 3: text: Nearly half of the human and mou ... s are active beyond early development.
+      item-8 at level 3: text: TEs, especially long terminal re ... f evolutionarily young KRAB-ZFP genes.
+    item-9 at level 2: section_header: Results
+      item-10 at level 3: section_header: Mouse KRAB-ZFPs target retrotransposons
+        item-11 at level 4: text: We analyzed the RNA expression p ... duplications (Kauzlaric et al., 2017).
+        item-12 at level 4: text: To determine the binding sites o ... ctive in the early embryo (Figure 1A).
+        item-13 at level 4: picture
+          item-13 at level 5: caption: Figure 1. Genome-wide binding patterns of mouse KRAB-ZFPs. (A) Probability heatmap of KRAB-ZFP binding to TEs. Blue color intensity (main field) corresponds to -log10 (adjusted p-value) enrichment of ChIP-seq peak overlap with TE groups (Fisher’s exact test). The green/red color intensity (top panel) represents mean KAP1 (GEO accession: GSM1406445) and H3K9me3 (GEO accession: GSM1327148) enrichment (respectively) at peaks overlapping significantly targeted TEs (adjusted p-value<1e-5) in WT ES cells. (B) Summarized ChIP-seq signal for indicated KRAB-ZFPs and previously published KAP1 and H3K9me3 in WT ES cells across 127 intact ETn elements. (C) Heatmaps of KRAB-ZFP ChIP-seq signal at ChIP-seq peaks. For better comparison, peaks for all three KRAB-ZFPs were called with the same parameters (p<1e-10, peak enrichment >20). The top panel shows a schematic of the arrangement of the contact amino acid composition of each zinc finger. Zinc fingers are grouped and colored according to similarity, with amino acid differences relative to the five consensus fingers highlighted in white.
+        item-14 at level 4: table with [9x5]
+          item-14 at level 5: caption: Table 1. KRAB-ZFP genes clusters in the mouse genome that were investigated in this study. * Number of protein-coding KRAB-ZFP genes identified in a previously published screen (Imbeault et al., 2017) and the ChIP-seq data column indicates the number of KRAB-ZFPs for which ChIP-seq was performed in this study.
+        item-15 at level 4: text: We generally observed that KRAB- ... responsible for this silencing effect.
+        item-16 at level 4: text: To further test the hypothesis t ... t easily evade repression by mutation.
+        item-17 at level 4: text: Our KRAB-ZFP ChIP-seq dataset al ... ntirely shift the mode of DNA binding.
+      item-18 at level 3: section_header: Genetic deletion of KRAB-ZFP gen ...  leads to retrotransposon reactivation
+        item-19 at level 4: text: The majority of KRAB-ZFP genes a ... ung et al., 2014; Deniz et al., 2018).
+        item-20 at level 4: picture
+          item-20 at level 5: caption: Figure 2. Retrotransposon reactivation in KRAB-ZFP cluster KO ES cells. (A) RNA-seq analysis of TE expression in five KRAB-ZFP cluster KO ES cells. Green and grey squares on top of the panel represent KRAB-ZFPs with or without ChIP-seq data, respectively, within each deleted gene cluster. Reactivated TEs that are bound by one or several KRAB-ZFPs are indicated by green squares in the panel. Significantly up- and downregulated elements (adjusted p-value<0.05) are highlighted in red and green, respectively. (B) Differential KAP1 binding and H3K9me3 enrichment at TE groups (summarized across all insertions) in Chr2-cl and Chr4-cl KO ES cells. TE groups targeted by one or several KRAB-ZFPs encoded within the deleted clusters are highlighted in blue (differential enrichment over the entire TE sequences) and red (differential enrichment at TE regions that overlap with KRAB-ZFP ChIP-seq peaks). (C) DNA methylation status of CpG sites at indicated TE groups in WT and Chr4-cl KO ES cells grown in serum containing media or in hypomethylation-inducing media (2i + Vitamin C). P-values were calculated using paired t-test.
+      item-21 at level 3: section_header: KRAB-ZFP cluster deletions license TE-borne enhancers
+        item-22 at level 4: text: We next used our RNA-seq dataset ... vating effects of TEs on nearby genes.
+        item-23 at level 4: picture
+          item-23 at level 5: caption: Figure 3. TE-dependent gene activation in KRAB-ZFP cluster KO ES cells. (A) Differential gene expression in Chr2-cl and Chr4-cl KO ES cells. Significantly up- and downregulated genes (adjusted p-value<0.05) are highlighted in red and green, respectively, KRAB-ZFP genes within the deleted clusters are shown in blue. (B) Correlation of TEs and gene deregulation. Plots show enrichment of TE groups within 100 kb of up- and downregulated genes relative to all genes. Significantly overrepresented LTR and LINE groups (adjusted p-value<0.1) are highlighted in blue and red, respectively. (C) Schematic view of the downstream region of Chst1 where a 5’ truncated ETn insertion is located. ChIP-seq (Input subtracted from ChIP) data for overexpressed epitope-tagged Gm13051 (a Chr4-cl KRAB-ZFP) in F9 EC cells, and re-mapped KAP1 (GEO accession: GSM1406445) and H3K9me3 (GEO accession: GSM1327148) in WT ES cells are shown together with RNA-seq data from Chr4-cl WT and KO ES cells (mapped using Bowtie (-a -m 1 --strata -v 2) to exclude reads that cannot be uniquely mapped). (D) RT-qPCR analysis of Chst1 mRNA expression in Chr4-cl WT and KO ES cells with or without the CRISPR/Cas9 deleted ETn insertion near Chst1. Values represent mean expression (normalized to Gapdh) from three biological replicates per sample (each performed in three technical replicates) in arbitrary units. Error bars represent standard deviation and asterisks indicate significance (p<0.01, Student’s t-test). n.s.: not significant. (E) Mean coverage of ChIP-seq data (Input subtracted from ChIP) in Chr4-cl WT and KO ES cells over 127 full-length ETn insertions. The binding sites of the Chr4-cl KRAB-ZFPs Rex2 and Gm13051 are indicated by dashed lines.
+        item-24 at level 4: text: While we generally observed that ... he internal region and not on the LTR.
+      item-25 at level 3: section_header: ETn retrotransposition in Chr4-cl KO and WT mice
+        item-26 at level 4: text: IAP, ETn/ETnERV and MuLV/RLTR4 r ... s may contribute to reduced viability.
+        item-27 at level 4: text: We reasoned that retrotransposon ... Tn insertions at a high recovery rate.
+        item-28 at level 4: text: Using this dataset, we first con ... nsertions in our pedigree (Figure 4A).
+        item-29 at level 4: picture
+          item-29 at level 5: caption: Figure 4. ETn retrotransposition in Chr4-cl KO mice. (A) Pedigree of mice used for transposon insertion screening by capture-seq in mice of different strain backgrounds. The number of novel ETn insertions (only present in one animal) are indicated. For animals whose direct ancestors have not been screened, the ETn insertions are shown in parentheses since parental inheritance cannot be excluded in that case. Germ line insertions are indicated by asterisks. All DNA samples were prepared from tail tissues unless noted (-S: spleen, -E: ear, -B:Blood) (B) Statistical analysis of ETn insertion frequency in tail tissue from 30 Chr4-cl KO, KO/WT and WT mice that were derived from one Chr4-c KO x KO/WT and two Chr4-cl KO/WT x KO/WT matings. Only DNA samples that were collected from juvenile tails were considered for this analysis. P-values were calculated using one-sided Wilcoxon Rank Sum Test. In the last panel, KO, WT and KO/WT mice derived from all matings were combined for the statistical analysis.
+        item-30 at level 4: text: To validate some of the novel ET ... ess might have truncated this element.
+        item-31 at level 4: text: Besides novel ETn insertions tha ... tions (Figure 4—figure supplement 3D).
+        item-32 at level 4: text: Finally, we asked whether there  ... s clearly also play an important role.
+    item-33 at level 2: section_header: Discussion
+      item-34 at level 3: text: C2H2 zinc finger proteins, about ... ) depending upon their insertion site.
+      item-35 at level 3: text: Despite a lack of widespread ETn ... ion of the majority of KRAB-ZFP genes.
+    item-36 at level 2: section_header: Materials and methods
+      item-37 at level 3: table with [31x5]
+        item-37 at level 4: caption: Key resources table
+      item-38 at level 3: section_header: Cell lines and transgenic mice
+        item-39 at level 4: text: Mouse ES cells and F9 EC cells w ... KO/KO and KO/WT (B6/129 F2) offspring.
+      item-40 at level 3: section_header: Generation of KRAB-ZFP expressing cell lines
+        item-41 at level 4: text: KRAB-ZFP ORFs were PCR-amplified ... led and further expanded for ChIP-seq.
+      item-42 at level 3: section_header: CRISPR/Cas9 mediated deletion of KRAB-ZFP clusters and an MMETn insertion
+        item-43 at level 4: text: All gRNAs were expressed from th ... PCR genotyping (Supplementary file 3).
+      item-44 at level 3: section_header: ChIP-seq analysis
+        item-45 at level 4: text: For ChIP-seq analysis of KRAB-ZF ... 010 or Khil et al., 2012 respectively.
+        item-46 at level 4: text: ChIP-seq libraries were construc ...  were re-mapped using Bowtie (--best).
+      item-47 at level 3: section_header: Luciferase reporter assays
+        item-48 at level 4: text: For KRAB-ZFP repression assays,  ... after transfection as described above.
+      item-49 at level 3: section_header: RNA-seq analysis
+        item-50 at level 4: text: Whole RNA was purified using RNe ... lemented in the R function p.adjust().
+      item-51 at level 3: section_header: Reduced representation bisulfite sequencing (RRBS-seq)
+        item-52 at level 4: text: For RRBS-seq analysis, Chr4-cl W ... h sample were considered for analysis.
+      item-53 at level 3: section_header: Retrotransposition assay
+        item-54 at level 4: text: The retrotransposition vectors p ... were stained with Amido Black (Sigma).
+      item-55 at level 3: section_header: Capture-seq screen
+        item-56 at level 4: text: To identify novel retrotransposo ... assembly using the Unicycler software.
+    item-57 at level 2: section_header: Funding Information
+      item-58 at level 3: text: This paper was supported by the following grants:
+      item-59 at level 3: list: group list
+        item-60 at level 4: list_item: http://dx.doi.org/10.13039/10000 ... ment 1ZIAHD008933 to Todd S Macfarlan.
+        item-61 at level 4: list_item: http://dx.doi.org/10.13039/50110 ... ndation 310030_152879 to Didier Trono.
+        item-62 at level 4: list_item: http://dx.doi.org/10.13039/50110 ... dation 310030B_173337 to Didier Trono.
+        item-63 at level 4: list_item: http://dx.doi.org/10.13039/50110 ... ch Council No. 268721 to Didier Trono.
+        item-64 at level 4: list_item: http://dx.doi.org/10.13039/50110 ... rch Council No 694658 to Didier Trono.
+    item-65 at level 2: section_header: Acknowledgements
+      item-66 at level 3: text: We thank Alex Grinberg, Jeanne Y ...  268721; Transpos-X, No. 694658) (DT).
+    item-67 at level 2: section_header: Additional information
+    item-68 at level 2: section_header: Additional files
+    item-69 at level 2: section_header: Data availability
+      item-70 at level 3: text: All NGS data has been deposited  ... GenBank database (MH449667- MH449669).
+      item-71 at level 3: text: The following datasets were generated:
+      item-72 at level 3: text: Wolf G. Retrotransposon reactiva ... ession Omnibus (2019). NCBI: GSE115291
+      item-73 at level 3: text: Wolf G. Mus musculus musculus st ... e. NCBI GenBank (2019). NCBI: MH449667
+      item-74 at level 3: text: Wolf G. Mus musculus musculus st ... e. NCBI GenBank (2019). NCBI: MH449668
+      item-75 at level 3: text: Wolf G. Mus musculus musculus st ... e. NCBI GenBank (2019). NCBI: MH449669
+      item-76 at level 3: text: The following previously published datasets were used:
+      item-77 at level 3: text: Castro-Diaz N, Ecco G, Coluccio  ... ssion Omnibus (2014). NCBI: GSM1406445
+      item-78 at level 3: text: Andrew ZX. H3K9me3_ChIPSeq (Ctrl ... ssion Omnibus (2014). NCBI: GSM1327148
+    item-79 at level 2: section_header: References
+      item-80 at level 3: list: group list
+        item-81 at level 4: list_item: Bailey TL, Boden M, Buske FA, Fr ... OI: 10.1093/nar/gkp335, PMID: 19458158
+        item-82 at level 4: list_item: Baust C, Gagnier L, Baillie GJ,  ... 77.21.11448-11458.2003, PMID: 14557630
+        item-83 at level 4: list_item: Blaschke K, Ebata KT, Karimi MM, ... I: 10.1038/nature12362, PMID: 23812591
+        item-84 at level 4: list_item: Brodziak A, Ziółko E, Muc-Wierzg ... I: 10.12659/msm.882892, PMID: 22648263
+        item-85 at level 4: list_item: Castro-Diaz N, Ecco G, Coluccio  ... 10.1101/gad.241661.114, PMID: 24939876
+        item-86 at level 4: list_item: Chuong EB, Elde NC, Feschotte C. ... 0.1126/science.aad5497, PMID: 26941318
+        item-87 at level 4: list_item: Dan J, Liu Y, Liu N, Chiourea M, ... 6/j.devcel.2014.03.004, PMID: 24735877
+        item-88 at level 4: list_item: De Iaco A, Planet E, Coluccio A, ... . DOI: 10.1038/ng.3858, PMID: 28459456
+        item-89 at level 4: list_item: Deniz Ö, de la Rica L, Cheng KCL ... 1186/s13059-017-1376-y, PMID: 29351814
+        item-90 at level 4: list_item: Dewannieux M, Heidmann T. Endoge ... 6/j.coviro.2013.08.005, PMID: 24004725
+        item-91 at level 4: list_item: Ecco G, Cassano M, Kauzlaric A,  ... 6/j.devcel.2016.02.024, PMID: 27003935
+        item-92 at level 4: list_item: Ecco G, Imbeault M, Trono D. KRA ... OI: 10.1242/dev.132605, PMID: 28765213
+        item-93 at level 4: list_item: Frank JA, Feschotte C. Co-option ... 6/j.coviro.2017.07.021, PMID: 28818736
+        item-94 at level 4: list_item: Gagnier L, Belancio VP, Mager DL ... 1186/s13100-019-0157-4, PMID: 31011371
+        item-95 at level 4: list_item: Groner AC, Meylan S, Ciuffi A, Z ... 1/journal.pgen.1000869, PMID: 20221260
+        item-96 at level 4: list_item: Hancks DC, Kazazian HH. Roles fo ... 1186/s13100-016-0065-9, PMID: 27158268
+        item-97 at level 4: list_item: Imbeault M, Helleboid PY, Trono  ... I: 10.1038/nature21683, PMID: 28273063
+        item-98 at level 4: list_item: Jacobs FM, Greenberg D, Nguyen N ... I: 10.1038/nature13760, PMID: 25274305
+        item-99 at level 4: list_item: Kano H, Kurahashi H, Toda T. Gen ... 0.1073/pnas.0705483104, PMID: 17984064
+        item-100 at level 4: list_item: Karimi MM, Goyal P, Maksakova IA ... 016/j.stem.2011.04.004, PMID: 21624812
+        item-101 at level 4: list_item: Kauzlaric A, Ecco G, Cassano M,  ... 1/journal.pone.0173746, PMID: 28334004
+        item-102 at level 4: list_item: Khil PP, Smagulova F, Brick KM,  ...  10.1101/gr.130583.111, PMID: 22367190
+        item-103 at level 4: list_item: Krueger F, Andrews SR. Bismark:  ... /bioinformatics/btr167, PMID: 21493656
+        item-104 at level 4: list_item: Langmead B, Salzberg SL. Fast ga ... OI: 10.1038/nmeth.1923, PMID: 22388286
+        item-105 at level 4: list_item: Legiewicz M, Zolotukhin AS, Pilk ... 0.1074/jbc.M110.182840, PMID: 20978285
+        item-106 at level 4: list_item: Lehoczky JA, Thomas PE, Patrie K ... 1/journal.pgen.1003967, PMID: 24339789
+        item-107 at level 4: list_item: Leung D, Du T, Wagner U, Xie W,  ... 0.1073/pnas.1322273111, PMID: 24757056
+        item-108 at level 4: list_item: Lilue J, Doran AG, Fiddes IT, Ab ... 1038/s41588-018-0223-8, PMID: 30275530
+        item-109 at level 4: list_item: Liu S, Brind'Amour J, Karimi MM, ... 10.1101/gad.244848.114, PMID: 25228647
+        item-110 at level 4: list_item: Love MI, Huber W, Anders S. Mode ... 1186/s13059-014-0550-8, PMID: 25516281
+        item-111 at level 4: list_item: Lugani F, Arora R, Papeta N, Pat ... 1/journal.pgen.1003206, PMID: 23437001
+        item-112 at level 4: list_item: Macfarlan TS, Gifford WD, Drisco ... I: 10.1038/nature11244, PMID: 22722858
+        item-113 at level 4: list_item: Maksakova IA, Romanish MT, Gagni ... 1/journal.pgen.0020002, PMID: 16440055
+        item-114 at level 4: list_item: Matsui T, Leung D, Miyashita H,  ... I: 10.1038/nature08858, PMID: 20164836
+        item-115 at level 4: list_item: Najafabadi HS, Mnaimneh S, Schmi ...  DOI: 10.1038/nbt.3128, PMID: 25690854
+        item-116 at level 4: list_item: Nellåker C, Keane TM, Yalcin B,  ... .1186/gb-2012-13-6-r45, PMID: 22703977
+        item-117 at level 4: list_item: O'Geen H, Frietze S, Farnham PJ. ... 7/978-1-60761-753-2_27, PMID: 20680851
+        item-118 at level 4: list_item: Patel A, Yang P, Tinkham M, Prad ... 016/j.cell.2018.02.058, PMID: 29551271
+        item-119 at level 4: list_item: Ribet D, Dewannieux M, Heidmann  ... OI: 10.1101/gr.2924904, PMID: 15479948
+        item-120 at level 4: list_item: Richardson SR, Gerdes P, Gerhard ...  10.1101/gr.219022.116, PMID: 28483779
+        item-121 at level 4: list_item: Rowe HM, Jakobsson J, Mesnard D, ... I: 10.1038/nature08674, PMID: 20075919
+        item-122 at level 4: list_item: Rowe HM, Kapopoulou A, Corsinott ...  10.1101/gr.147678.112, PMID: 23233547
+        item-123 at level 4: list_item: Schauer SN, Carreira PE, Shukla  ...  10.1101/gr.226993.117, PMID: 29643204
+        item-124 at level 4: list_item: Schultz DC, Ayyanathan K, Negore ... OI: 10.1101/gad.973302, PMID: 11959841
+        item-125 at level 4: list_item: Semba K, Araki K, Matsumoto K, S ... 1/journal.pgen.1003204, PMID: 23436999
+        item-126 at level 4: list_item: Sripathy SP, Stevens J, Schultz  ... : 10.1128/MCB.00487-06, PMID: 16954381
+        item-127 at level 4: list_item: Thomas JH, Schneider S. Coevolut ...  10.1101/gr.121749.111, PMID: 21784874
+        item-128 at level 4: list_item: Thompson PJ, Macfarlan TS, Lorin ... 6/j.molcel.2016.03.029, PMID: 27259207
+        item-129 at level 4: list_item: Treger RS, Pope SD, Kong Y, Toku ... 6/j.immuni.2018.12.022, PMID: 30709743
+        item-130 at level 4: list_item: Vlangos CN, Siuniak AN, Robinson ... 1/journal.pgen.1003205, PMID: 23437000
+        item-131 at level 4: list_item: Wang J, Xie G, Singh M, Ghanbari ... I: 10.1038/nature13804, PMID: 25317556
+        item-132 at level 4: list_item: Wolf D, Hug K, Goff SP. TRIM28 m ... 0.1073/pnas.0805540105, PMID: 18713861
+        item-133 at level 4: list_item: Wolf G, Greenberg D, Macfarlan T ... 1186/s13100-015-0050-8, PMID: 26435754
+        item-134 at level 4: list_item: Wolf G, Yang P, Füchtbauer AC, F ... 10.1101/gad.252767.114, PMID: 25737282
+        item-135 at level 4: list_item: Yamauchi M, Freitag B, Khan C, B ... JVI.69.2.1142-1149.1995, PMID: 7529329
+        item-136 at level 4: list_item: Zhang Y, Liu T, Meyer CA, Eeckho ... .1186/gb-2008-9-9-r137, PMID: 18798982
+  item-137 at level 1: caption: Figure 1. Genome-wide binding pa ... onsensus fingers highlighted in white.
+  item-138 at level 1: caption: Table 1. KRAB-ZFP genes clusters ...  ChIP-seq was performed in this study.
+  item-139 at level 1: caption: Figure 2. Retrotransposon reacti ... s were calculated using paired t-test.
+  item-140 at level 1: caption: Figure 3. TE-dependent gene acti ... Gm13051 are indicated by dashed lines.
+  item-141 at level 1: caption: Figure 4. ETn retrotransposition ... combined for the statistical analysis.
+  item-142 at level 1: caption: Key resources table
--- a/tests/data/groundtruth/docling_v2/elife-56337.xml.json
+++ b/tests/data/groundtruth/docling_v2/elife-56337.xml.json
--- a/tests/data/groundtruth/docling_v2/elife-56337.xml.md
+++ b/tests/data/groundtruth/docling_v2/elife-56337.xml.md
@ -1,6 +1,8 @@
 # KRAB-zinc finger protein gene expansion in response to active retrotransposons in the murine lineage

-Wolf Gernot; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; de Iaco Alberto; 2: School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL): Lausanne: Switzerland; Sun Ming-An; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Bruno Melania; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Tinkham Matthew; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Hoang Don; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Mitra Apratim; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Ralls Sherry; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Trono Didier; 2: School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL): Lausanne: Switzerland; Macfarlan Todd S; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States
+Gernot Wolf, Alberto de Iaco, Ming-An Sun, Melania Bruno, Matthew Tinkham, Don Hoang, Apratim Mitra, Sherry Ralls, Didier Trono, Todd S Macfarlan
+
+The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health, Bethesda, United States; School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL), Lausanne, Switzerland

 ## Abstract

@ -20,6 +22,23 @@ We analyzed the RNA expression profiles of mouse KRAB-ZFPs across a wide range o

 To determine the binding sites of the KRAB-ZFPs within these and other gene clusters, we expressed epitope-tagged KRAB-ZFPs using stably integrating vectors in mouse embryonic carcinoma (EC) or ES cells (Table 1, Supplementary file 1) and performed chromatin immunoprecipitation followed by deep sequencing (ChIP-seq). We then determined whether the identified binding sites are significantly enriched over annotated TEs and used the non-repetitive peak fraction to identify binding motifs. We discarded 7 of 68 ChIP-seq datasets because we could not obtain a binding motif or a target TE and manual inspection confirmed low signal to noise ratio. Of the remaining 61 KRAB-ZFPs, 51 significantly overlapped at least one TE subfamily (adjusted p-value&lt;1e-5). Altogether, 81 LTR retrotransposon, 18 LINE, 10 SINE and one DNA transposon subfamilies were targeted by at least one of the 51 KRAB-ZFPs (Figure 1A and Supplementary file 1). Chr2-cl KRAB-ZFPs preferably bound IAPEz retrotransposons and L1-type LINEs, while Chr4-cl KRAB-ZFPs targeted various retrotransposons, including the closely related MMETn (hereafter referred to as ETn) and ETnERV (also known as MusD) elements (Figure 1A). ETn elements are non-autonomous LTR retrotransposons that require trans-complementation by the fully coding ETnERV elements that contain Gag, Pro and Pol genes (Ribet et al., 2004). These elements have accumulated to ~240 and~100 copies in the reference C57BL/6 genome, respectively, with ~550 solitary LTRs (Baust et al., 2003). Both ETn and ETnERVs are still active, generating polymorphisms and mutations in several mouse strains (Gagnier et al., 2019). The validity of our ChIP-seq screen was confirmed by the identification of binding motifs - which often resembled the computationally predicted motifs (Figure 1—figure supplement 2A) - for the majority of screened KRAB-ZFPs (Supplementary file 1). 
Moreover, predicted and experimentally determined motifs were found in targeted TEs in most cases (Supplementary file 1), and reporter repression assays confirmed KRAB-ZFP induced silencing for all the tested sequences (Figure 1—figure supplement 2B). Finally, we observed KAP1 and H3K9me3 enrichment at most of the targeted TEs in wild type ES cells, indicating that most of these KRAB-ZFPs are functionally active in the early embryo (Figure 1A).

+Figure 1. Genome-wide binding patterns of mouse KRAB-ZFPs. (A) Probability heatmap of KRAB-ZFP binding to TEs. Blue color intensity (main field) corresponds to -log10 (adjusted p-value) enrichment of ChIP-seq peak overlap with TE groups (Fisher’s exact test). The green/red color intensity (top panel) represents mean KAP1 (GEO accession: GSM1406445) and H3K9me3 (GEO accession: GSM1327148) enrichment (respectively) at peaks overlapping significantly targeted TEs (adjusted p-value&lt;1e-5) in WT ES cells. (B) Summarized ChIP-seq signal for indicated KRAB-ZFPs and previously published KAP1 and H3K9me3 in WT ES cells across 127 intact ETn elements. (C) Heatmaps of KRAB-ZFP ChIP-seq signal at ChIP-seq peaks. For better comparison, peaks for all three KRAB-ZFPs were called with the same parameters (p&lt;1e-10, peak enrichment &gt;20). The top panel shows a schematic of the arrangement of the contact amino acid composition of each zinc finger. Zinc fingers are grouped and colored according to similarity, with amino acid differences relative to the five consensus fingers highlighted in white.
+
+<!-- image -->
+
+Table 1. KRAB-ZFP genes clusters in the mouse genome that were investigated in this study. * Number of protein-coding KRAB-ZFP genes identified in a previously published screen (Imbeault et al., 2017) and the ChIP-seq data column indicates the number of KRAB-ZFPs for which ChIP-seq was performed in this study.
+
+| Cluster   | Location   | Size (Mb)   |   # of KRAB-ZFPs* |   ChIP-seq data |
+|-----------|------------|-------------|-------------------|-----------------|
+| Chr2      | Chr2 qH4   | 3.1         |                40 |              17 |
+| Chr4      | Chr4 qE1   | 2.3         |                21 |              19 |
+| Chr10     | Chr10 qC1  | 0.6         |                 6 |               1 |
+| Chr13.1   | Chr13 qB3  | 1.2         |                 6 |               2 |
+| Chr13.2   | Chr13 qB3  | 0.8         |                26 |              12 |
+| Chr8      | Chr8 qB3.3 | 0.1         |                 4 |               4 |
+| Chr9      | Chr9 qA3   | 0.1         |                 4 |               2 |
+| Other     | -          | -           |               248 |               4 |
+
 We generally observed that KRAB-ZFPs present exclusively in mouse target TEs that are restricted to the mouse genome, indicating KRAB-ZFPs and their targets emerged together. For example, several mouse-specific KRAB-ZFPs in Chr2-cl and Chr4-cl target IAP and ETn elements which are only found in the mouse genome and are highly active. This is the strongest data to date supporting that recent KRAB-ZFP expansions in these young clusters is a response to recent TE activity. Likewise, ZFP599 and ZFP617, both conserved in Muroidea, bind to various ORR1-type LTRs which are present in the rat genome (Supplementary file 1). However, ZFP961, a KRAB-ZFP encoded on a small gene cluster on chromosome 8 that is conserved in Muroidea targets TEs that are only found in the mouse genome (e.g. ETn), a paradox we have previously observed with ZFP809, which also targets TEs that are evolutionarily younger than itself (Wolf et al., 2015b). The ZFP961 binding site is located at the 5’ end of the internal region of ETn and ETnERV elements, a sequence that usually contains the primer binding site (PBS), which is required to prime retroviral reverse transcription. Indeed, the ZFP961 motif closely resembles the PBSLys1,2 (Figure 1—figure supplement 3A), which had been previously identified as a KAP1-dependent target of retroviral repression (Yamauchi et al., 1995; Wolf et al., 2008). Repression of the PBSLys1,2 by ZFP961 was also confirmed in reporter assays (Figure 1—figure supplement 2B), indicating that ZFP961 is likely responsible for this silencing effect.

 To further test the hypothesis that KRAB-ZFPs target sites necessary for retrotransposition, we utilized previously generated ETn and ETnERV retrotransposition reporters in which we mutated KRAB-ZFP binding sites (Ribet et al., 2004). Whereas the ETnERV reporters are sufficient for retrotransposition, the ETn reporter requires ETnERV genes supplied in trans. We tested and confirmed that the REX2/ZFP600 and GM13051 binding sites within these TEs are required for efficient retrotransposition (Figure 1—figure supplement 3B). REX2 and ZFP600 both bind a target about 200 bp from the start of the internal region (Figure 1B), a region that often encodes the packaging signal. GM13051 binds a target coding for part of a highly structured mRNA export signal (Legiewicz et al., 2010) near the 3’ end of the internal region of ETn (Figure 1—figure supplement 3C). Both signals are characterized by stem-loop intramolecular base-pairing in which a single mutation can disrupt loop formation. This indicates that at least some KRAB-ZFPs evolved to bind functionally essential target sequences which cannot easily evade repression by mutation.
@ -30,10 +49,18 @@ Our KRAB-ZFP ChIP-seq dataset also provided unique insights into the emergence o

 The majority of KRAB-ZFP genes are harbored in large, highly repetitive clusters that have formed by successive complex segmental duplications (Kauzlaric et al., 2017), rendering them inaccessible to conventional gene targeting. We therefore developed a strategy to delete entire KRAB-ZFP gene clusters in ES cells (including the Chr2-cl and Chr4-cl as well as two clusters on chromosome 13 and a cluster on chromosome 10) using two CRISPR/Cas9 gRNAs targeting unique regions flanking each cluster, and short single-stranded repair oligos with homologies to both sides of the projected cut sites. Using this approach, we generated five cluster KO ES cell lines in at least two biological replicates and performed RNA sequencing (RNA-seq) to determine TE expression levels. Strikingly, four of the five cluster KO ES cells exhibited distinct TE reactivation phenotypes (Figure 2A). Chr2-cl KO resulted in reactivation of several L1 subfamilies as well as RLTR10 (up to more than 100-fold as compared to WT) and IAPEz ERVs. In contrast, the most strongly upregulated TEs in Chr4-cl KO cells were ETn/ETnERV (up to 10-fold as compared to WT), with several other ERV groups modestly reactivated. ETn/ETnERV elements were also upregulated in Chr13.2-cl KO ES cells while the only upregulated ERVs in Chr13.1-cl KO ES cells were MMERVK10C elements (Figure 2A). Most reactivated retrotransposons were targeted by at least one KRAB-ZFP that was encoded in the deleted cluster (Figure 2A and Supplementary file 1), indicating a direct effect of these KRAB-ZFPs on TE expression levels. Furthermore, we observed a loss of KAP1 binding and H3K9me3 at several TE subfamilies that are targeted by at least one KRAB-ZFP within the deleted Chr2-cl and Chr4-cl (Figure 2B, Figure 2—figure supplement 1A), including L1, ETn and IAPEz elements. 
Using reduced representation bisulfite sequencing (RRBS-seq), we found that a subset of KRAB-ZFP bound TEs were partially hypomethylated in Chr4-cl KO ES cells, but only when grown in genome-wide hypomethylation-inducing conditions (Blaschke et al., 2013; Figure 2C and Supplementary file 2). These data are consistent with the hypothesis that KRAB-ZFPs/KAP1 are not required to establish DNA methylation, but under certain conditions they protect specific TEs and imprint control regions from genome-wide demethylation (Leung et al., 2014; Deniz et al., 2018).

+Figure 2. Retrotransposon reactivation in KRAB-ZFP cluster KO ES cells. (A) RNA-seq analysis of TE expression in five KRAB-ZFP cluster KO ES cells. Green and grey squares on top of the panel represent KRAB-ZFPs with or without ChIP-seq data, respectively, within each deleted gene cluster. Reactivated TEs that are bound by one or several KRAB-ZFPs are indicated by green squares in the panel. Significantly up- and downregulated elements (adjusted p-value&lt;0.05) are highlighted in red and green, respectively. (B) Differential KAP1 binding and H3K9me3 enrichment at TE groups (summarized across all insertions) in Chr2-cl and Chr4-cl KO ES cells. TE groups targeted by one or several KRAB-ZFPs encoded within the deleted clusters are highlighted in blue (differential enrichment over the entire TE sequences) and red (differential enrichment at TE regions that overlap with KRAB-ZFP ChIP-seq peaks). (C) DNA methylation status of CpG sites at indicated TE groups in WT and Chr4-cl KO ES cells grown in serum containing media or in hypomethylation-inducing media (2i + Vitamin C). P-values were calculated using paired t-test.
+
+<!-- image -->
+
 ### KRAB-ZFP cluster deletions license TE-borne enhancers

 We next used our RNA-seq datasets to determine the effect of KRAB-ZFP cluster deletions on gene expression. We identified 195 significantly upregulated and 130 downregulated genes in Chr4-cl KO ES cells, and 108 upregulated and 59 downregulated genes in Chr2-cl KO ES cells (excluding genes on the deleted cluster) (Figure 3A). To address whether gene deregulation in Chr2-cl and Chr4-cl KO ES cells is caused by nearby TE reactivation, we determined whether genes near certain TE subfamilies are more frequently deregulated than random genes. We found a strong correlation of gene upregulation and TE proximity for several TE subfamilies, of which many became transcriptionally activated themselves (Figure 3B). For example, nearly 10% of genes that are located within 100 kb (up- or downstream of the TSS) of an ETn element are upregulated in Chr4-cl KO ES cells, as compared to 0.8% of all genes. In Chr2-cl KO ES cells, upregulated genes were significantly enriched near various LINE groups but also IAPEz-int and RLTR10-int elements, indicating that TE-binding KRAB-ZFPs in these clusters limit the potential activating effects of TEs on nearby genes.

+Figure 3. TE-dependent gene activation in KRAB-ZFP cluster KO ES cells. (A) Differential gene expression in Chr2-cl and Chr4-cl KO ES cells. Significantly up- and downregulated genes (adjusted p-value&lt;0.05) are highlighted in red and green, respectively, KRAB-ZFP genes within the deleted clusters are shown in blue. (B) Correlation of TEs and gene deregulation. Plots show enrichment of TE groups within 100 kb of up- and downregulated genes relative to all genes. Significantly overrepresented LTR and LINE groups (adjusted p-value&lt;0.1) are highlighted in blue and red, respectively. (C) Schematic view of the downstream region of Chst1 where a 5’ truncated ETn insertion is located. ChIP-seq (Input subtracted from ChIP) data for overexpressed epitope-tagged Gm13051 (a Chr4-cl KRAB-ZFP) in F9 EC cells, and re-mapped KAP1 (GEO accession: GSM1406445) and H3K9me3 (GEO accession: GSM1327148) in WT ES cells are shown together with RNA-seq data from Chr4-cl WT and KO ES cells (mapped using Bowtie (-a -m 1 --strata -v 2) to exclude reads that cannot be uniquely mapped). (D) RT-qPCR analysis of Chst1 mRNA expression in Chr4-cl WT and KO ES cells with or without the CRISPR/Cas9 deleted ETn insertion near Chst1. Values represent mean expression (normalized to Gapdh) from three biological replicates per sample (each performed in three technical replicates) in arbitrary units. Error bars represent standard deviation and asterisks indicate significance (p&lt;0.01, Student’s t-test). n.s.: not significant. (E) Mean coverage of ChIP-seq data (Input subtracted from ChIP) in Chr4-cl WT and KO ES cells over 127 full-length ETn insertions. The binding sites of the Chr4-cl KRAB-ZFPs Rex2 and Gm13051 are indicated by dashed lines.
+
+<!-- image -->
+
 While we generally observed that TE-associated gene reactivation is not caused by elongated or spliced transcription starting at the retrotransposons, we did observe that the strength of the effect of ETn elements on gene expression is stronger on genes in closer proximity. About 25% of genes located within 20 kb of an ETn element, but only 5% of genes located at a distance between 50 and 100 kb from the nearest ETn insertion, become upregulated in Chr4-cl KO ES cells. Importantly however, the correlation is still significant for genes that are located at distances between 50 and 100 kb from the nearest ETn insertion, indicating that ETn elements can act as long-range enhancers of gene expression in the absence of KRAB-ZFPs that target them. To confirm that Chr4-cl KRAB-ZFPs such as GM13051 block ETn-borne enhancers, we tested the ability of a putative ETn enhancer to activate transcription in a reporter assay. For this purpose, we cloned a 5 kb fragment spanning from the GM13051 binding site within the internal region of a truncated ETn insertion to the first exon of the Cd59a gene, which is strongly activated in Chr4-cl KO ES cells (Figure 2—figure supplement 1B). We observed strong transcriptional activity of this fragment which was significantly higher in Chr4-cl KO ES cells. Surprisingly, this activity was reduced to background when the internal segment of the ETn element was not included in the fragment, suggesting the internal segment of the ETn element, but not its LTR, contains a Chr4-cl KRAB-ZFP sensitive enhancer. To further corroborate these findings, we genetically deleted an ETn element that is located about 60 kb from the TSS of Chst1, one of the top-upregulated genes in Chr4-cl KO ES cells (Figure 3C). RT-qPCR analysis revealed that the Chst1 upregulation phenotype in Chr4-cl KO ES cells diminishes when the ETn insertion is absent, providing direct evidence that a KRAB-ZFP controlled ETn-borne enhancer regulates Chst1 expression (Figure 3D). 
Furthermore, ChIP-seq confirmed a general increase of H3K4me3, H3K4me1 and H3K27ac marks at ETn elements in Chr4-cl KO ES cells (Figure 3E). Notably, enhancer marks were most pronounced around the GM13051 binding site near the 3’ end of the internal region, confirming that the enhancer activity of ETn is located on the internal region and not on the LTR.

 ### ETn retrotransposition in Chr4-cl KO and WT mice
@ -44,6 +71,10 @@ We reasoned that retrotransposon activation could account for the reduced viabil

 Using this dataset, we first confirmed the polymorphic nature of both ETn and MuLV retrotransposons in laboratory mouse strains (Figure 4—figure supplement 2A), highlighting the potential of these elements to retrotranspose. To identify novel insertions, we filtered out insertions that were supported by ETn/MuLV-paired reads in more than one animal. While none of the 54 ancestry-controlled mice showed a single novel MuLV insertion, we observed greatly varying numbers of up to 80 novel ETn insertions in our pedigree (Figure 4A).

+Figure 4. ETn retrotransposition in Chr4-cl KO mice. (A) Pedigree of mice used for transposon insertion screening by capture-seq in mice of different strain backgrounds. The number of novel ETn insertions (only present in one animal) is indicated. For animals whose direct ancestors have not been screened, the ETn insertions are shown in parentheses since parental inheritance cannot be excluded in that case. Germ line insertions are indicated by asterisks. All DNA samples were prepared from tail tissues unless noted (-S: spleen, -E: ear, -B: blood). (B) Statistical analysis of ETn insertion frequency in tail tissue from 30 Chr4-cl KO, KO/WT and WT mice that were derived from one Chr4-cl KO x KO/WT and two Chr4-cl KO/WT x KO/WT matings. Only DNA samples that were collected from juvenile tails were considered for this analysis. P-values were calculated using one-sided Wilcoxon Rank Sum Test. In the last panel, KO, WT and KO/WT mice derived from all matings were combined for the statistical analysis.
+
+<!-- image -->
+
 To validate some of the novel ETn insertions, we designed specific PCR primers for five of the insertions and screened genomic DNA of the mice in which they were identified as well as their parents. For all tested insertions, we were able to amplify their flanking sequence and show that these insertions are absent in their parents (Figure 4—figure supplement 3A). To confirm their identity, we amplified and sequenced three of the novel full-length ETn insertions. Two of these elements (Genbank accession: MH449667-68) resembled typical ETnII elements with identical 5’ and 3’ LTRs and target site duplications (TSD) of 4 or 6 bp, respectively. The third sequenced element (MH449669) represented a hybrid element that contains both ETnI and MusD (ETnERV) sequences. Similar insertions can be found in the B6 reference genome; however, the identified novel insertion has a 2.5 kb deletion of the 5’ end of the internal region. Additionally, the 5’ and 3’ LTR of this element differ in one nucleotide near the start site and contain an unusually large 248 bp TSD (containing a SINE repeat) indicating that an improper integration process might have truncated this element.

 Besides novel ETn insertions that were only identified in one specific animal, we also observed three ETn insertions that could be detected in several siblings but not in their parents or any of the other screened mice. This strongly indicates that these retrotransposition events occurred in the germ line of the parents from which they were passed on to some of their offspring. One of these germ line insertions was evidently passed on from the offspring to the next generation (Figure 4A). As expected, the read numbers supporting these novel germ line insertions were comparable to the read numbers that were found in the flanking regions of annotated B6 ETn insertions (Figure 4—figure supplement 3B). In contrast, virtually all novel insertions that were only found in one animal were supported by significantly fewer reads (Figure 4—figure supplement 3B). This indicates that these elements resulted from retrotransposition events in the developing embryo and not in the zygote or parental germ cells. Indeed, we detected different sets of insertions in various tissues from the same animal (Figure 4—figure supplement 3C). Even between tail samples that were collected from the same animal at different ages, only a fraction of the new insertions were present in both samples, while technical replicates from the same genomic DNA samples showed a nearly complete overlap in insertions (Figure 4—figure supplement 3D).
@ -58,6 +89,41 @@ Despite a lack of widespread ETn activation in Chr4-cl KO mice, it still remains

 ## Materials and methods

+Key resources table
+
+| Reagent type (species) or resource       | Designation                            | Source or reference               | Identifiers                         | Additional information                               |
+|------------------------------------------|----------------------------------------|-----------------------------------|-------------------------------------|------------------------------------------------------|
+| Strain, strain background (Mus musculus) | 129 × 1/SvJ                            | The Jackson Laboratory            | 000691                              | Mice used to generate mixed strain Chr4-cl KO mice   |
+| Cell line (Homo sapiens)                 | HeLa                                   | ATCC                              | ATCC CCL-2                          |                                                      |
+| Cell line (Mus musculus)                 | JM8A3.N1 C57BL/6N-Atm1Brd              | KOMP Repository                   | PL236745                            | B6 ES cells used to generate KO cell lines and mice  |
+| Cell line (Mus musculus)                 | B6;129‐ Gt(ROSA)26Sortm1(cre/ERT)Nat/J | The Jackson Laboratory            | 004847                              | ES cells used to generate KO cell lines and mice     |
+| Cell line (Mus musculus)                 | R1 ES cells                            | Andras Nagy lab                   | R1                                  | 129 ES cells used to generate KO cell lines and mice |
+| Cell line (Mus musculus)                 | F9 Embryonic carcinoma cells           | ATCC                              | ATCC CRL-1720                       |                                                      |
+| Antibody                                 | Mouse monoclonal ANTI-FLAG M2 antibody | Sigma-Aldrich                     | Cat# F1804, RRID:AB\_262044          | ChIP (1 µg/10⁷ cells)                                |
+| Antibody                                 | Rabbit polyclonal anti-HA              | Abcam                             | Cat# ab9110, RRID:AB\_307019         | ChIP (1 µg/10⁷ cells)                                |
+| Antibody                                 | Mouse monoclonal anti-HA               | Covance                           | Cat# MMS-101P-200, RRID:AB\_10064068 |                                                      |
+| Antibody                                 | Rabbit polyclonal anti-H3K9me3         | Active Motif                      | Cat# 39161, RRID:AB\_2532132         | ChIP (3 µl/10⁷ cells)                                |
+| Antibody                                 | Rabbit polyclonal anti-GFP             | Thermo Fisher Scientific          | Cat# A-11122, RRID:AB\_221569        | ChIP (1 µg/10⁷ cells)                                |
+| Antibody                                 | Rabbit polyclonal anti-H3K4me3         | Abcam                             | Cat# ab8580, RRID:AB\_306649         | ChIP (1 µg/10⁷ cells)                                |
+| Antibody                                 | Rabbit polyclonal anti-H3K4me1         | Abcam                             | Cat# ab8895, RRID:AB\_306847         | ChIP (1 µg/10⁷ cells)                                |
+| Antibody                                 | Rabbit polyclonal anti-H3K27ac         | Abcam                             | Cat# ab4729, RRID:AB\_2118291        | ChIP (1 µg/10⁷ cells)                                |
+| Recombinant DNA reagent                  | pCW57.1                                | Addgene                           | RRID:Addgene\_41393                  | Inducible lentiviral expression vector               |
+| Recombinant DNA reagent                  | pX330-U6-Chimeric\_BB-CBh-hSpCas9       | Addgene                           | RRID:Addgene\_42230                  | CRISPR/Cas9 expression construct                     |
+| Sequence-based reagent                   | Chr2-cl KO gRNA.1                      | This paper                        | Cas9 gRNA                           | GCCGTTGCTCAGTCCAAATG                                 |
+| Sequence-based reagent                   | Chr2-cl KO gRNA.2                      | This paper                        | Cas9 gRNA                           | GATACCAGAGGTGGCCGCAAG                                |
+| Sequence-based reagent                   | Chr4-cl KO gRNA.1                      | This paper                        | Cas9 gRNA                           | GCAAAGGGGCTCCTCGATGGA                                |
+| Sequence-based reagent                   | Chr4-cl KO gRNA.2                      | This paper                        | Cas9 gRNA                           | GTTTATGGCCGTGCTAAGGTC                                |
+| Sequence-based reagent                   | Chr10-cl KO gRNA.1                     | This paper                        | Cas9 gRNA                           | GTTGCCTTCATCCCACCGTG                                 |
+| Sequence-based reagent                   | Chr10-cl KO gRNA.2                     | This paper                        | Cas9 gRNA                           | GAAGTTCGACTTGGACGGGCT                                |
+| Sequence-based reagent                   | Chr13.1-cl KO gRNA.1                   | This paper                        | Cas9 gRNA                           | GTAACCCATCATGGGCCCTAC                                |
+| Sequence-based reagent                   | Chr13.1-cl KO gRNA.2                   | This paper                        | Cas9 gRNA                           | GGACAGGTTATAGGTTTGAT                                 |
+| Sequence-based reagent                   | Chr13.2-cl KO gRNA.1                   | This paper                        | Cas9 gRNA                           | GGGTTTCTGAGAAACGTGTA                                 |
+| Sequence-based reagent                   | Chr13.2-cl KO gRNA.2                   | This paper                        | Cas9 gRNA                           | GTGTAATGAGTTCTTATATC                                 |
+| Commercial assay or kit                  | SureSelectQXT Target Enrichment kit    | Agilent                           | G9681-90000                         |                                                      |
+| Software, algorithm                      | Bowtie                                 | http://bowtie-bio.sourceforge.net | RRID:SCR\_005476                     |                                                      |
+| Software, algorithm                      | MACS14                                 | https://bio.tools/macs            | RRID:SCR\_013291                     |                                                      |
+| Software, algorithm                      | Tophat                                 | https://ccb.jhu.edu               | RRID:SCR\_013035                     |                                                      |
+
 ### Cell lines and transgenic mice

 Mouse ES cells and F9 EC cells were cultivated as described previously (Wolf et al., 2015b) unless stated otherwise. Chr4-cl KO ES cells originate from B6;129‐ Gt(ROSA)26Sortm1(cre/ERT)Nat/J mice (Jackson lab), all other KRAB-ZFP cluster KO ES cell lines originate from JM8A3.N1 C57BL/6N-Atm1Brd ES cells (KOMP Repository). Chr2-cl KO and WT ES cells were initially grown in serum-containing media (Wolf et al., 2015b) but changed to 2i media (De Iaco et al., 2017) for several weeks before analysis. To generate Chr4-cl and Chr2-cl KO mice, the cluster deletions were repeated in B6 ES (KOMP repository) or R1 (Nagy lab) ES cells, respectively, and heterozygous clones were injected into B6 albino blastocysts. Chr2-cl KO mice were therefore kept on a mixed B6/Svx129/Sv-CP strain background while Chr4-cl KO mice were initially derived on a pure C57BL/6 background. For capture-seq screens, Chr4-cl KO mice were crossed with 129 × 1/SvJ mice (Jackson lab) to produce the founder mice for Chr4-cl KO and WT (B6/129 F1) offspring. Chr4-cl KO/WT (B6/129 F1) were also crossed with 129 × 1/SvJ mice to get Chr4-cl KO/WT (B6/129 F1) mice, which were intercrossed to give rise to the parents of Chr4-cl KO/KO and KO/WT (B6/129 F2) offspring.
@ -96,173 +162,99 @@ The retrotransposition vectors pCMV-MusD2, pCMV-MusD2-neoTNF and pCMV-ETnI1-neoT

 To identify novel retrotransposon insertions, genomic DNA from various tissues (Supplementary file 4) was purified and used for library construction with target enrichment using the SureSelectQXT Target Enrichment kit (Agilent). Custom RNA capture probes were designed to hybridize with the 120 bp 5’ ends of the 5’ LTRs and the 120 bp 3’ ends of the 3’ LTR of about 600 intact (internal region flanked by two LTRs) MMETn/RLTRETN retrotransposons or of 140 RLTR4\_MM/RLTR4 retrotransposons that were upregulated in Chr4-cl KO ES cells (Figure 4—source data 2). Enriched libraries were sequenced on an Illumina HiSeq as paired-end 50 bp reads. R1 and R2 reads were mapped to the mm9 genome separately, using settings that only allow non-duplicated, uniquely mappable reads (Bowtie -m 1 --best --strata; samtools rmdup -s) and under settings that allow multimapping and duplicated reads (Bowtie --best). Of the latter, only reads that overlap (min. 50% of read) with RLTRETN, MMETn-int, ETnERV-int, ETnERV2-int or ETnERV3-int repeats (ETn) or RLTR4, RLTR4\_MM-int or MuLV-int repeats (RLTR4) were kept. Only uniquely mappable reads whose paired reads were overlapping with the repeats mentioned above were used for further analysis. All ETn- and RLTR4-paired reads were then clustered (as bed files) using BEDTools (bedtools merge -i -n -d 1000) to receive a list of all potential annotated and non-annotated new ETn or RLTR4 insertion sites and all overlapping ETn- or RLTR4-paired reads were counted for each sample at each locus. Finally, all regions that were located within 1 kb of an annotated RLTRETN, MMETn-int, ETnERV-int, ETnERV2-int or ETnERV3-int repeat as well as regions overlapping with previously identified polymorphic ETn elements (Nellåker et al., 2012) were removed. Genomic loci with at least 10 reads per million unique ETn- or RLTR4-paired reads were considered as insertion sites. 
To qualify for a de-novo insertion, we allowed no called insertions in any of the other screened mice at the locus and not a single read at the locus in the ancestors of the mouse. Insertions at the same locus in at least two siblings from the same offspring were considered as germ line insertions, if the insertion was absent in the parents and mice who were not direct descendants from these siblings. Full-length sequencing of new ETn insertions was done by Sanger sequencing of short PCR products in combination with Illumina sequencing of a large PCR product (Supplementary file 3), followed by de-novo assembly using the Unicycler software.

-## Tables
+## Funding Information

-Table 1.: * Number of protein-coding KRAB-ZFP genes identified in a previously published screen (Imbeault et al., 2017) and the ChIP-seq data column indicates the number of KRAB-ZFPs for which ChIP-seq was performed in this study.
+This paper was supported by the following grants:

-| Cluster   | Location   | Size (Mb)   |   # of KRAB-ZFPs* |   ChIP-seq data |
-|-----------|------------|-------------|-------------------|-----------------|
-| Chr2      | Chr2 qH4   | 3.1         |                40 |              17 |
-| Chr4      | Chr4 qE1   | 2.3         |                21 |              19 |
-| Chr10     | Chr10 qC1  | 0.6         |                 6 |               1 |
-| Chr13.1   | Chr13 qB3  | 1.2         |                 6 |               2 |
-| Chr13.2   | Chr13 qB3  | 0.8         |                26 |              12 |
-| Chr8      | Chr8 qB3.3 | 0.1         |                 4 |               4 |
-| Chr9      | Chr9 qA3   | 0.1         |                 4 |               2 |
-| Other     | -          | -           |               248 |               4 |
+- http://dx.doi.org/10.13039/100009633 Eunice Kennedy Shriver National Institute of Child Health and Human Development 1ZIAHD008933 to Todd S Macfarlan.
+- http://dx.doi.org/10.13039/501100001711 Swiss National Science Foundation 310030\_152879 to Didier Trono.
+- http://dx.doi.org/10.13039/501100001711 Swiss National Science Foundation 310030B\_173337 to Didier Trono.
+- http://dx.doi.org/10.13039/501100000781 European Research Council No. 268721 to Didier Trono.
+- http://dx.doi.org/10.13039/501100000781 European Research Council No. 694658 to Didier Trono.

-Key resources table: 
+## Acknowledgements

-| Reagent type (species) or resource       | Designation                            | Source or reference               | Identifiers                         | Additional information                               |
-|------------------------------------------|----------------------------------------|-----------------------------------|-------------------------------------|------------------------------------------------------|
-| Strain, strain background (Mus musculus) | 129 × 1/SvJ                            | The Jackson Laboratory            | 000691                              | Mice used to generate mixed strain Chr4-cl KO mice   |
-| Cell line (Homo-sapiens)                 | HeLa                                   | ATCC                              | ATCC CCL-2                          |                                                      |
-| Cell line (Mus musculus)                 | JM8A3.N1 C57BL/6N-Atm1Brd              | KOMP Repository                   | PL236745                            | B6 ES cells used to generate KO cell lines and mice  |
-| Cell line (Mus musculus)                 | B6;129‐ Gt(ROSA)26Sortm1(cre/ERT)Nat/J | The Jackson Laboratory            | 004847                              | ES cells used to generate KO cell lines and mice     |
-| Cell line (Mus musculus)                 | R1 ES cells                            | Andras Nagy lab                   | R1                                  | 129 ES cells used to generate KO cell lines and mice |
-| Cell line (Mus musculus)                 | F9 Embryonic carcinoma cells           | ATCC                              | ATCC CRL-1720                       |                                                      |
-| Antibody                                 | Mouse monoclonal ANTI-FLAG M2 antibody | Sigma-Aldrich                     | Cat# F1804, RRID:AB\_262044          | ChIP (1 µg/107 cells)                                |
-| Antibody                                 | Rabbit polyclonal anti-HA              | Abcam                             | Cat# ab9110, RRID:AB\_307019         | ChIP (1 µg/107 cells)                                |
-| Antibody                                 | Mouse monoclonal anti-HA               | Covance                           | Cat# MMS-101P-200, RRID:AB\_10064068 |                                                      |
-| Antibody                                 | Rabbit polyclonal anti-H3K9me3         | Active Motif                      | Cat# 39161, RRID:AB\_2532132         | ChIP (3 µl/107 cells)                                |
-| Antibody                                 | Rabbit polyclonal anti-GFP             | Thermo Fisher Scientific          | Cat# A-11122, RRID:AB\_221569        | ChIP (1 µg/107 cells)                                |
-| Antibody                                 | Rabbit polyclonal anti- H3K4me3        | Abcam                             | Cat# ab8580, RRID:AB\_306649         | ChIP (1 µg/107 cells)                                |
-| Antibody                                 | Rabbit polyclonal anti- H3K4me1        | Abcam                             | Cat# ab8895, RRID:AB\_306847         | ChIP (1 µg/107 cells)                                |
-| Antibody                                 | Rabbit polyclonal anti- H3K27ac        | Abcam                             | Cat# ab4729, RRID:AB\_2118291        | ChIP (1 µg/107 cells)                                |
-| Recombinant DNA reagent                  | pCW57.1                                | Addgene                           | RRID:Addgene\_41393                  | Inducible lentiviral expression vector               |
-| Recombinant DNA reagent                  | pX330-U6-Chimeric\_BB-CBh-hSpCas9       | Addgene                           | RRID:Addgene\_42230                  | CRISPR/Cas9 expression construct                     |
-| Sequence-based reagent                   | Chr2-cl KO gRNA.1                      | This paper                        | Cas9 gRNA                           | GCCGTTGCTCAGTCCAAATG                                 |
-| Sequenced-based reagent                  | Chr2-cl KO gRNA.2                      | This paper                        | Cas9 gRNA                           | GATACCAGAGGTGGCCGCAAG                                |
-| Sequenced-based reagent                  | Chr4-cl KO gRNA.1                      | This paper                        | Cas9 gRNA                           | GCAAAGGGGCTCCTCGATGGA                                |
-| Sequence-based reagent                   | Chr4-cl KO gRNA.2                      | This paper                        | Cas9 gRNA                           | GTTTATGGCCGTGCTAAGGTC                                |
-| Sequenced-based reagent                  | Chr10-cl KO gRNA.1                     | This paper                        | Cas9 gRNA                           | GTTGCCTTCATCCCACCGTG                                 |
-| Sequenced-based reagent                  | Chr10-cl KO gRNA.2                     | This paper                        | Cas9 gRNA                           | GAAGTTCGACTTGGACGGGCT                                |
-| Sequenced-based reagent                  | Chr13.1-cl KO gRNA.1                   | This paper                        | Cas9 gRNA                           | GTAACCCATCATGGGCCCTAC                                |
-| Sequenced-based reagent                  | Chr13.1-cl KO gRNA.2                   | This paper                        | Cas9 gRNA                           | GGACAGGTTATAGGTTTGAT                                 |
-| Sequenced-based reagent                  | Chr13.2-cl KO gRNA.1                   | This paper                        | Cas9 gRNA                           | GGGTTTCTGAGAAACGTGTA                                 |
-| Sequenced-based reagent                  | Chr13.2-cl KO gRNA.2                   | This paper                        | Cas9 gRNA                           | GTGTAATGAGTTCTTATATC                                 |
-| Commercial assay or kit                  | SureSelectQXT Target Enrichment kit    | Agilent                           | G9681-90000                         |                                                      |
-| Software, algorithm                      | Bowtie                                 | http://bowtie-bio.sourceforge.net | RRID:SCR\_005476                     |                                                      |
-| Software, algorithm                      | MACS14                                 | https://bio.tools/macs            | RRID:SCR\_013291                     |                                                      |
-| Software, algorithm                      | Tophat                                 | https://ccb.jhu.edu               | RRID:SCR\_013035                     |                                                      |
+We thank Alex Grinberg, Jeanne Yimdjo and Victoria Carter for generating and maintaining transgenic mice. We also thank members of the Macfarlan and Trono labs for useful discussion, Steven Coon, James Iben, Tianwei Li and Anna Malawska for NGS and computational support. This work was supported by NIH grant 1ZIAHD008933 and the NIH DDIR Innovation Award program (TSM), and by subsidies from the Swiss National Science Foundation (310030\_152879 and 310030B\_173337) and the European Research Council (KRABnKAP, No. 268721; Transpos-X, No. 694658) (DT).

-## Figures
+## Additional information

-Figure 1.: Genome-wide binding patterns of mouse KRAB-ZFPs.
-(A) Probability heatmap of KRAB-ZFP binding to TEs. Blue color intensity (main field) corresponds to -log10 (adjusted p-value) enrichment of ChIP-seq peak overlap with TE groups (Fisher’s exact test). The green/red color intensity (top panel) represents mean KAP1 (GEO accession: GSM1406445) and H3K9me3 (GEO accession: GSM1327148) enrichment (respectively) at peaks overlapping significantly targeted TEs (adjusted p-value&lt;1e-5) in WT ES cells. (B) Summarized ChIP-seq signal for indicated KRAB-ZFPs and previously published KAP1 and H3K9me3 in WT ES cells across 127 intact ETn elements. (C) Heatmaps of KRAB-ZFP ChIP-seq signal at ChIP-seq peaks. For better comparison, peaks for all three KRAB-ZFPs were called with the same parameters (p&lt;1e-10, peak enrichment &gt;20). The top panel shows a schematic of the arrangement of the contact amino acid composition of each zinc finger. Zinc fingers are grouped and colored according to similarity, with amino acid differences relative to the five consensus fingers highlighted in white.
-Figure 1—source data 1.KRAB-ZFP expression in 40 mouse tissues and cell lines (ENCODE).Mean values of replicates are shown as log2 transcripts per million.
-Figure 1—source data 2.Probability heatmap of KRAB-ZFP binding to TEs.Values corresponds to -log10 (adjusted p-value) enrichment of ChIP-seq peak overlap with TE groups (Fisher’s exact test).
+## Additional files

-<!-- image -->
+## Data availability

-Figure 1—figure supplement 1.: ES cell-specific expression of KRAB-ZFP gene clusters.
-(A) Heatmap showing expression patterns of mouse KRAB-ZFPs in 40 mouse tissues and cell lines (ENCODE). Heatmap colors indicate gene expression levels in log2 transcripts per million (TPM). The asterisk indicates a group of 30 KRAB-ZFPs that are exclusively expressed in ES cells. (B) Physical location of the genes encoding for the 30 KRAB-ZFPs that are exclusively expressed in ES cells. (C) Phylogenetic (Maximum likelihood) tree of the KRAB domains of mouse KRAB-ZFPs. KRAB-ZFPs encoded on the gene clusters on chromosome 2 and 4 are highlighted. The scale bar at the bottom indicates amino acid substitutions per site.
+All NGS data have been deposited in GEO (GSE115291). Sequences of full-length de novo ETn insertions have been deposited in the GenBank database (MH449667-MH449669).

-<!-- image -->
+The following datasets were generated:

-Figure 1—figure supplement 2.: KRAB-ZFP binding motifs and their repression activity.
-(A) Comparison of computationally predicted (bottom) and experimentally determined (top) KRAB-ZFP binding motifs. Only significant pairs are shown (FDR &lt; 0.1). (B) Luciferase reporter assays to confirm KRAB-ZFP repression of the identified target sites. Bars show the luciferase activity (normalized to Renilla luciferase) of reporter plasmids containing the indicated target sites cloned upstream of the SV40 promoter. Reporter plasmids were co-transfected into 293 T cells with a Renilla luciferase plasmid for normalization and plasmids expressing the targeting KRAB-ZFP. Normalized mean luciferase activity (from three replicates) is shown relative to luciferase activity of the reporter plasmid co-transfected with an empty pcDNA3.1 vector.
+Wolf G. Retrotransposon reactivation and mobilization upon deletions of megabase scale KRAB zinc finger gene clusters in mice. NCBI Gene Expression Omnibus (2019). NCBI: GSE115291

-<!-- image -->
+Wolf G. Mus musculus musculus strain C57BL/6x129X1/SvJ retrotransposon MMETn-int, complete sequence. NCBI GenBank (2019). NCBI: MH449667

-Figure 1—figure supplement 3.: KRAB-ZFP binding to ETn retrotransposons.
-(A) Comparison of the PBSLys1,2 sequence with Zfp961 binding motifs in nonrepetitive peaks (Nonrep) and peaks at ETn elements. (B) Retrotransposition assays of original (ETnI1-neoTNF and MusD2-neoTNF Ribet et al., 2004) and modified reporter vectors where the Rex2 or Gm13051 binding motifs where removed. Schematic of reporter vectors are displayed at the top. HeLa cells were transfected as described in the Materials and Methods section and neo-resistant colonies, indicating retrotransposition events, were selected and stained. (C) Stem-loop structure of the ETn RNA export signal, the Gm13051 motif on the corresponding DNA is marked with red circles, the part of the motif that was deleted is indicated with grey crosses (adapted from Legiewicz et al., 2010).
+Wolf G. Mus musculus musculus strain C57BL/6x129X1/SvJ retrotransposon MMETn-int, complete sequence. NCBI GenBank (2019). NCBI: MH449668

-<!-- image -->
+Wolf G. Mus musculus musculus strain C57BL/6x129X1/SvJ retrotransposon MMETn-int, complete sequence. NCBI GenBank (2019). NCBI: MH449669

-Figure 2.: Retrotransposon reactivation in KRAB-ZFP cluster KO ES cells.
-(A) RNA-seq analysis of TE expression in five KRAB-ZFP cluster KO ES cells. Green and grey squares on top of the panel represent KRAB-ZFPs with or without ChIP-seq data, respectively, within each deleted gene cluster. Reactivated TEs that are bound by one or several KRAB-ZFPs are indicated by green squares in the panel. Significantly up- and downregulated elements (adjusted p-value&lt;0.05) are highlighted in red and green, respectively. (B) Differential KAP1 binding and H3K9me3 enrichment at TE groups (summarized across all insertions) in Chr2-cl and Chr4-cl KO ES cells. TE groups targeted by one or several KRAB-ZFPs encoded within the deleted clusters are highlighted in blue (differential enrichment over the entire TE sequences) and red (differential enrichment at TE regions that overlap with KRAB-ZFP ChIP-seq peaks). (C) DNA methylation status of CpG sites at indicated TE groups in WT and Chr4-cl KO ES cells grown in serum containing media or in hypomethylation-inducing media (2i + Vitamin C). P-values were calculated using paired t-test.
-Figure 2—source data 1.Differential H3K9me3 and KAP1 distribution in WT and KRAB-ZFP cluster KO ES cells at TE families and KRAB-ZFP bound TE insertions.Differential read counts and statistical testing were determined by DESeq2.
+The following previously published datasets were used:

-<!-- image -->
+Castro-Diaz N, Ecco G, Coluccio A, Kapopoulou A, Duc J, Trono D. Evolutionally dynamic L1 regulation in embryonic stem cells. NCBI Gene Expression Omnibus (2014). NCBI: GSM1406445

-Figure 2—figure supplement 1.: Epigenetic changes at TEs and TE-borne enhancers in KRAB-ZFP cluster KO ES cells.
-(A) Differential analysis of summative (all individual insertions combined) H3K9me3 enrichment at TE groups in Chr10-cl, Chr13.1-cl and Chr13.2-cl KO ES cells. TE groups targeted by one or several KRAB-ZFPs encoded within the deleted clusters are highlighted in orange (differential enrichment over the entire TE sequences) and red (differential enrichment at TE regions that overlap with KRAB-ZFP ChIP-seq peaks). (B) Top: Schematic view of the Cd59a/Cd59b locus with a 5’ truncated ETn insertion. ChIP-seq (Input subtracted from ChIP) data for overexpressed epitope-tagged Gm13051 (a Chr4-cl KRAB-ZFP) in F9 EC cells, and re-mapped KAP1 (GEO accession: GSM1406445) and H3K9me3 (GEO accession: GSM1327148) in WT ES cells are shown together with RNA-seq data from Chr4-cl WT and KO ES cells (mapped using Bowtie (-a -m 1 --strata -v 2) to exclude reads that cannot be uniquely mapped). Bottom: Transcriptional activity of a 5 kb fragment with or without fragments of the ETn insertion was tested by luciferase reporter assay in Chr4-cl WT and KO ES cells.
-
-<!-- image -->
-
-Figure 3.: TE-dependent gene activation in KRAB-ZFP cluster KO ES cells.
-(A) Differential gene expression in Chr2-cl and Chr4-cl KO ES cells. Significantly up- and downregulated genes (adjusted p-value&lt;0.05) are highlighted in red and green, respectively, KRAB-ZFP genes within the deleted clusters are shown in blue. (B) Correlation of TEs and gene deregulation. Plots show enrichment of TE groups within 100 kb of up- and downregulated genes relative to all genes. Significantly overrepresented LTR and LINE groups (adjusted p-value&lt;0.1) are highlighted in blue and red, respectively. (C) Schematic view of the downstream region of Chst1 where a 5’ truncated ETn insertion is located. ChIP-seq (Input subtracted from ChIP) data for overexpressed epitope-tagged Gm13051 (a Chr4-cl KRAB-ZFP) in F9 EC cells, and re-mapped KAP1 (GEO accession: GSM1406445) and H3K9me3 (GEO accession: GSM1327148) in WT ES cells are shown together with RNA-seq data from Chr4-cl WT and KO ES cells (mapped using Bowtie (-a -m 1 --strata -v 2) to exclude reads that cannot be uniquely mapped). (D) RT-qPCR analysis of Chst1 mRNA expression in Chr4-cl WT and KO ES cells with or without the CRISPR/Cas9 deleted ETn insertion near Chst1. Values represent mean expression (normalized to Gapdh) from three biological replicates per sample (each performed in three technical replicates) in arbitrary units. Error bars represent standard deviation and asterisks indicate significance (p&lt;0.01, Student’s t-test). n.s.: not significant. (E) Mean coverage of ChIP-seq data (Input subtracted from ChIP) in Chr4-cl WT and KO ES cells over 127 full-length ETn insertions. The binding sites of the Chr4-cl KRAB-ZFPs Rex2 and Gm13051 are indicated by dashed lines.
-
-<!-- image -->
-
-Figure 4.: ETn retrotransposition in Chr4-cl KO mice.
-(A) Pedigree of mice used for transposon insertion screening by capture-seq in mice of different strain backgrounds. The number of novel ETn insertions (only present in one animal) is indicated. For animals whose direct ancestors have not been screened, the ETn insertions are shown in parentheses since parental inheritance cannot be excluded in that case. Germ line insertions are indicated by asterisks. All DNA samples were prepared from tail tissues unless noted (-S: spleen, -E: ear, -B: blood). (B) Statistical analysis of ETn insertion frequency in tail tissue from 30 Chr4-cl KO, KO/WT and WT mice that were derived from one Chr4-cl KO x KO/WT and two Chr4-cl KO/WT x KO/WT matings. Only DNA samples that were collected from juvenile tails were considered for this analysis. P-values were calculated using one-sided Wilcoxon Rank Sum Test. In the last panel, KO, WT and KO/WT mice derived from all matings were combined for the statistical analysis.
-Figure 4—source data 1. Coordinates of identified novel ETn insertions and supporting capture-seq read counts. Genomic regions indicate clusters of supporting reads.
-Figure 4—source data 2. Sequences of capture-seq probes used to enrich genomic DNA for ETn and MuLV (RLTR4) insertions.
-
-<!-- image -->
-
-Figure 4—figure supplement 1.: Birth statistics of KRAB-ZFP cluster KO mice and TE reactivation in adult tissues.
-(A) Birth statistics of Chr4- and Chr2-cl mice derived from KO/WT x KO/WT matings in different strain backgrounds. (B) RNA-seq analysis of TE expression in Chr2- (left) and Chr4-cl (right) KO tissues. TE groups with the highest reactivation phenotype in ES cells are shown separately. Significantly up- and downregulated elements (adjusted p-value&lt;0.05) are highlighted in red and green, respectively. Experiments were performed in at least two biological replicates.
-
-<!-- image -->
-
-Figure 4—figure supplement 2.: Identification of polymorphic ETn and MuLV retrotransposon insertions in Chr4-cl KO and WT mice.
-Heatmaps show normalized capture-seq read counts in RPM (Reads Per Million) for identified polymorphic ETn (A) and MuLV (B) loci in different mouse strains. Only loci with strong support for germ line ETn or MuLV insertions (at least 100 or 3000 ETn or MuLV RPM, respectively) in at least two animals are shown. Non-polymorphic insertion loci with high read counts in all screened mice were excluded for better visibility. The sample information (sample name and cell type/tissue) is annotated at the bottom, with the strain information indicated by color at the top. The color gradient indicates log10(RPM+1).
-
-<!-- image -->
-
-Figure 4—figure supplement 3.: Confirmation of novel ETn insertions identified by capture-seq.
-(A) PCR validation of novel ETn insertions in genomic DNA of three littermates (IDs: T09673, T09674 and T00436) and their parents (T3913 and T3921). Primer sequences are shown in Supplementary file 3. (B) ETn capture-seq read counts (RPM) at putative novel somatic (loci identified exclusively in one single animal), novel germ line (loci identified in several littermates) insertions, and at B6 reference ETn elements. (C) Heatmap shows capture-seq read counts (RPM) of a Chr4-cl KO mouse (ID: C6733) as determined in different tissues. Each row represents a novel ETn locus that was identified in at least one tissue. The color gradient indicates log10(RPM+1). (D) Heatmap shows the capture-seq RPM in technical replicates using the same Chr4-cl KO DNA sample (rep1/rep2) or replicates with DNA samples prepared from different sections of the tail from the same mouse at different ages (tail1/tail2). Each row represents a novel ETn locus that was identified in at least one of the displayed samples. The color gradient indicates log10(RPM+1).
-
-<!-- image -->
+Andrew ZX. H3K9me3\_ChIPSeq (Ctrl). NCBI Gene Expression Omnibus (2014). NCBI: GSM1327148

 ## References

- TL Bailey; M Boden; FA Buske; M Frith; CE Grant; L Clementi; J Ren; WW Li; WS Noble. MEME SUITE: tools for motif discovery and searching. Nucleic Acids Research (2009)
- C Baust; L Gagnier; GJ Baillie; MJ Harris; DM Juriloff; DL Mager. Structure and expression of mobile ETnII retroelements and their coding-competent MusD relatives in the mouse. Journal of Virology (2003)
- K Blaschke; KT Ebata; MM Karimi; JA Zepeda-Martínez; P Goyal; S Mahapatra; A Tam; DJ Laird; M Hirst; A Rao; MC Lorincz; M Ramalho-Santos. Vitamin C induces Tet-dependent DNA demethylation and a blastocyst-like state in ES cells. Nature (2013)
- A Brodziak; E Ziółko; M Muc-Wierzgoń; E Nowakowska-Zajdel; T Kokot; K Klakla. The role of human endogenous retroviruses in the pathogenesis of autoimmune diseases. Medical Science Monitor : International Medical Journal of Experimental and Clinical Research (2012)
- N Castro-Diaz; G Ecco; A Coluccio; A Kapopoulou; B Yazdanpanah; M Friedli; J Duc; SM Jang; P Turelli; D Trono. Evolutionally dynamic L1 regulation in embryonic stem cells. Genes &amp; Development (2014)
- EB Chuong; NC Elde; C Feschotte. Regulatory evolution of innate immunity through co-option of endogenous retroviruses. Science (2016)
- J Dan; Y Liu; N Liu; M Chiourea; M Okuka; T Wu; X Ye; C Mou; L Wang; L Wang; Y Yin; J Yuan; B Zuo; F Wang; Z Li; X Pan; Z Yin; L Chen; DL Keefe; S Gagos; A Xiao; L Liu. Rif1 maintains telomere length homeostasis of ESCs by mediating heterochromatin silencing. Developmental Cell (2014)
- A De Iaco; E Planet; A Coluccio; S Verp; J Duc; D Trono. DUX-family transcription factors regulate zygotic genome activation in placental mammals. Nature Genetics (2017)
- Ö Deniz; L de la Rica; KCL Cheng; D Spensberger; MR Branco. SETDB1 prevents TET2-dependent activation of IAP retroelements in naïve embryonic stem cells. Genome Biology (2018)
- M Dewannieux; T Heidmann. Endogenous retroviruses: acquisition, amplification and taming of genome invaders. Current Opinion in Virology (2013)
- G Ecco; M Cassano; A Kauzlaric; J Duc; A Coluccio; S Offner; M Imbeault; HM Rowe; P Turelli; D Trono. Transposable elements and their KRAB-ZFP controllers regulate gene expression in adult tissues. Developmental Cell (2016)
- G Ecco; M Imbeault; D Trono. KRAB zinc finger proteins. Development (2017)
- JA Frank; C Feschotte. Co-option of endogenous viral sequences for host cell function. Current Opinion in Virology (2017)
- L Gagnier; VP Belancio; DL Mager. Mouse germ line mutations due to retrotransposon insertions. Mobile DNA (2019)
- AC Groner; S Meylan; A Ciuffi; N Zangger; G Ambrosini; N Dénervaud; P Bucher; D Trono. KRAB-zinc finger proteins and KAP1 can mediate long-range transcriptional repression through heterochromatin spreading. PLOS Genetics (2010)
- DC Hancks; HH Kazazian. Roles for retrotransposon insertions in human disease. Mobile DNA (2016)
- M Imbeault; PY Helleboid; D Trono. KRAB zinc-finger proteins contribute to the evolution of gene regulatory networks. Nature (2017)
- FM Jacobs; D Greenberg; N Nguyen; M Haeussler; AD Ewing; S Katzman; B Paten; SR Salama; D Haussler. An evolutionary arms race between KRAB zinc-finger genes ZNF91/93 and SVA/L1 retrotransposons. Nature (2014)
- H Kano; H Kurahashi; T Toda. Genetically regulated epigenetic transcriptional activation of retrotransposon insertion confers mouse dactylaplasia phenotype. PNAS (2007)
- MM Karimi; P Goyal; IA Maksakova; M Bilenky; D Leung; JX Tang; Y Shinkai; DL Mager; S Jones; M Hirst; MC Lorincz. DNA methylation and SETDB1/H3K9me3 regulate predominantly distinct sets of genes, retroelements, and chimeric transcripts in mESCs. Cell Stem Cell (2011)
- A Kauzlaric; G Ecco; M Cassano; J Duc; M Imbeault; D Trono. The mouse genome displays highly dynamic populations of KRAB-zinc finger protein genes and related genetic units. PLOS ONE (2017)
- PP Khil; F Smagulova; KM Brick; RD Camerini-Otero; GV Petukhova. Sensitive mapping of recombination hotspots using sequencing-based detection of ssDNA. Genome Research (2012)
- F Krueger; SR Andrews. Bismark: a flexible aligner and methylation caller for Bisulfite-Seq applications. Bioinformatics (2011)
- B Langmead; SL Salzberg. Fast gapped-read alignment with bowtie 2. Nature Methods (2012)
- M Legiewicz; AS Zolotukhin; GR Pilkington; KJ Purzycka; M Mitchell; H Uranishi; J Bear; GN Pavlakis; SF Le Grice; BK Felber. The RNA transport element of the murine  musD  retrotransposon requires long-range intramolecular interactions for function. Journal of Biological Chemistry (2010)
- JA Lehoczky; PE Thomas; KM Patrie; KM Owens; LM Villarreal; K Galbraith; J Washburn; CN Johnson; B Gavino; AD Borowsky; KJ Millen; P Wakenight; W Law; ML Van Keuren; G Gavrilina; ED Hughes; TL Saunders; L Brihn; JH Nadeau; JW Innis. A novel intergenic ETnII-β insertion mutation causes multiple malformations in Polypodia mice. PLOS Genetics (2013)
- D Leung; T Du; U Wagner; W Xie; AY Lee; P Goyal; Y Li; KE Szulwach; P Jin; MC Lorincz; B Ren. Regulation of DNA methylation turnover at LTR retrotransposons and imprinted loci by the histone methyltransferase Setdb1. PNAS (2014)
- J Lilue; AG Doran; IT Fiddes; M Abrudan; J Armstrong; R Bennett; W Chow; J Collins; S Collins; A Czechanski; P Danecek; M Diekhans; DD Dolle; M Dunn; R Durbin; D Earl; A Ferguson-Smith; P Flicek; J Flint; A Frankish; B Fu; M Gerstein; J Gilbert; L Goodstadt; J Harrow; K Howe; X Ibarra-Soria; M Kolmogorov; CJ Lelliott; DW Logan; J Loveland; CE Mathews; R Mott; P Muir; S Nachtweide; FCP Navarro; DT Odom; N Park; S Pelan; SK Pham; M Quail; L Reinholdt; L Romoth; L Shirley; C Sisu; M Sjoberg-Herrera; M Stanke; C Steward; M Thomas; G Threadgold; D Thybert; J Torrance; K Wong; J Wood; B Yalcin; F Yang; DJ Adams; B Paten; TM Keane. Sixteen diverse laboratory mouse reference genomes define strain-specific haplotypes and novel functional loci. Nature Genetics (2018)
- S Liu; J Brind'Amour; MM Karimi; K Shirane; A Bogutz; L Lefebvre; H Sasaki; Y Shinkai; MC Lorincz. Setdb1  is required for germline development and silencing of H3K9me3-marked endogenous retroviruses in primordial germ cells. Genes &amp; Development (2014)
- MI Love; W Huber; S Anders. Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biology (2014)
- F Lugani; R Arora; N Papeta; A Patel; Z Zheng; R Sterken; RA Singer; G Caridi; C Mendelsohn; L Sussel; VE Papaioannou; AG Gharavi. A retrotransposon insertion in the 5' regulatory domain of Ptf1a results in ectopic gene expression and multiple congenital defects in Danforth's short tail mouse. PLOS Genetics (2013)
- TS Macfarlan; WD Gifford; S Driscoll; K Lettieri; HM Rowe; D Bonanomi; A Firth; O Singer; D Trono; SL Pfaff. Embryonic stem cell potency fluctuates with endogenous retrovirus activity. Nature (2012)
- IA Maksakova; MT Romanish; L Gagnier; CA Dunn; LN van de Lagemaat; DL Mager. Retroviral elements and their hosts: insertional mutagenesis in the mouse germ line. PLOS Genetics (2006)
- T Matsui; D Leung; H Miyashita; IA Maksakova; H Miyachi; H Kimura; M Tachibana; MC Lorincz; Y Shinkai. Proviral silencing in embryonic stem cells requires the histone methyltransferase ESET. Nature (2010)
- HS Najafabadi; S Mnaimneh; FW Schmitges; M Garton; KN Lam; A Yang; M Albu; MT Weirauch; E Radovani; PM Kim; J Greenblatt; BJ Frey; TR Hughes. C2H2 zinc finger proteins greatly expand the human regulatory lexicon. Nature Biotechnology (2015)
- C Nellåker; TM Keane; B Yalcin; K Wong; A Agam; TG Belgard; J Flint; DJ Adams; WN Frankel; CP Ponting. The genomic landscape shaped by selection on transposable elements across 18 mouse strains. Genome Biology (2012)
- H O'Geen; S Frietze; PJ Farnham. Using ChIP-seq technology to identify targets of zinc finger transcription factors. Methods in Molecular Biology (2010)
- A Patel; P Yang; M Tinkham; M Pradhan; M-A Sun; Y Wang; D Hoang; G Wolf; JR Horton; X Zhang; T Macfarlan; X Cheng. DNA conformation induces adaptable binding by tandem zinc finger proteins. Cell (2018)
- D Ribet; M Dewannieux; T Heidmann. An active murine transposon family pair: retrotransposition of "master" MusD copies and ETn trans-mobilization. Genome Research (2004)
- SR Richardson; P Gerdes; DJ Gerhardt; FJ Sanchez-Luque; GO Bodea; M Muñoz-Lopez; JS Jesuadian; MHC Kempen; PE Carreira; JA Jeddeloh; JL Garcia-Perez; HH Kazazian; AD Ewing; GJ Faulkner. Heritable L1 retrotransposition in the mouse primordial germline and early embryo. Genome Research (2017)
- HM Rowe; J Jakobsson; D Mesnard; J Rougemont; S Reynard; T Aktas; PV Maillard; H Layard-Liesching; S Verp; J Marquis; F Spitz; DB Constam; D Trono. KAP1 controls endogenous retroviruses in embryonic stem cells. Nature (2010)
- HM Rowe; A Kapopoulou; A Corsinotti; L Fasching; TS Macfarlan; Y Tarabay; S Viville; J Jakobsson; SL Pfaff; D Trono. TRIM28 repression of retrotransposon-based enhancers is necessary to preserve transcriptional dynamics in embryonic stem cells. Genome Research (2013)
- SN Schauer; PE Carreira; R Shukla; DJ Gerhardt; P Gerdes; FJ Sanchez-Luque; P Nicoli; M Kindlova; S Ghisletti; AD Santos; D Rapoud; D Samuel; J Faivre; AD Ewing; SR Richardson; GJ Faulkner. L1 retrotransposition is a common feature of mammalian hepatocarcinogenesis. Genome Research (2018)
- DC Schultz; K Ayyanathan; D Negorev; GG Maul; FJ Rauscher. SETDB1: a novel KAP-1-associated histone H3, lysine 9-specific methyltransferase that contributes to HP1-mediated silencing of euchromatic genes by KRAB zinc-finger proteins. Genes &amp; Development (2002)
- K Semba; K Araki; K Matsumoto; H Suda; T Ando; A Sei; H Mizuta; K Takagi; M Nakahara; M Muta; G Yamada; N Nakagata; A Iida; S Ikegawa; Y Nakamura; M Araki; K Abe; K Yamamura. Ectopic expression of Ptf1a induces spinal defects, urogenital defects, and anorectal malformations in Danforth's short tail mice. PLOS Genetics (2013)
- SP Sripathy; J Stevens; DC Schultz. The KAP1 corepressor functions to coordinate the assembly of de novo HP1-demarcated microenvironments of heterochromatin required for KRAB zinc finger protein-mediated transcriptional repression. Molecular and Cellular Biology (2006)
- JH Thomas; S Schneider. Coevolution of retroelements and tandem zinc finger genes. Genome Research (2011)
- PJ Thompson; TS Macfarlan; MC Lorincz. Long terminal repeats: from parasitic elements to building blocks of the transcriptional regulatory repertoire. Molecular Cell (2016)
- RS Treger; SD Pope; Y Kong; M Tokuyama; M Taura; A Iwasaki. The lupus susceptibility locus Sgp3 encodes the suppressor of endogenous retrovirus expression SNERV. Immunity (2019)
- CN Vlangos; AN Siuniak; D Robinson; AM Chinnaiyan; RH Lyons; JD Cavalcoli; CE Keegan. Next-generation sequencing identifies the Danforth's short tail mouse mutation as a retrotransposon insertion affecting Ptf1a expression. PLOS Genetics (2013)
- J Wang; G Xie; M Singh; AT Ghanbarian; T Raskó; A Szvetnik; H Cai; D Besser; A Prigione; NV Fuchs; GG Schumann; W Chen; MC Lorincz; Z Ivics; LD Hurst; Z Izsvák. Primate-specific endogenous retrovirus-driven transcription defines naive-like stem cells. Nature (2014)
- D Wolf; K Hug; SP Goff. TRIM28 mediates primer binding site-targeted silencing of Lys1,2 tRNA-utilizing retroviruses in embryonic cells. PNAS (2008)
- G Wolf; D Greenberg; TS Macfarlan. Spotting the enemy within: targeted silencing of foreign DNA in mammalian genomes by the Krüppel-associated box zinc finger protein family. Mobile DNA (2015a)
- G Wolf; P Yang; AC Füchtbauer; EM Füchtbauer; AM Silva; C Park; W Wu; AL Nielsen; FS Pedersen; TS Macfarlan. The KRAB zinc finger protein ZFP809 is required to initiate epigenetic silencing of endogenous retroviruses. Genes &amp; Development (2015b)
- M Yamauchi; B Freitag; C Khan; B Berwin; E Barklis. Stem cell factor binding to retrovirus primer binding site silencers. Journal of Virology (1995)
- Y Zhang; T Liu; CA Meyer; J Eeckhoute; DS Johnson; BE Bernstein; C Nusbaum; RM Myers; M Brown; W Li; XS Liu. Model-based analysis of ChIP-Seq (MACS). Genome Biology (2008)
+- Bailey TL, Boden M, Buske FA, Frith M, Grant CE, Clementi L, Ren J, Li WW, Noble WS. MEME SUITE: tools for motif discovery and searching. Nucleic Acids Research 37:W202–W208 (2009). DOI: 10.1093/nar/gkp335, PMID: 19458158
+- Baust C, Gagnier L, Baillie GJ, Harris MJ, Juriloff DM, Mager DL. Structure and expression of mobile ETnII retroelements and their coding-competent MusD relatives in the mouse. Journal of Virology 77:11448–11458 (2003). DOI: 10.1128/JVI.77.21.11448-11458.2003, PMID: 14557630
+- Blaschke K, Ebata KT, Karimi MM, Zepeda-Martínez JA, Goyal P, Mahapatra S, Tam A, Laird DJ, Hirst M, Rao A, Lorincz MC, Ramalho-Santos M. Vitamin C induces Tet-dependent DNA demethylation and a blastocyst-like state in ES cells. Nature 500:222–226 (2013). DOI: 10.1038/nature12362, PMID: 23812591
+- Brodziak A, Ziółko E, Muc-Wierzgoń M, Nowakowska-Zajdel E, Kokot T, Klakla K. The role of human endogenous retroviruses in the pathogenesis of autoimmune diseases. Medical Science Monitor : International Medical Journal of Experimental and Clinical Research 18:RA80–RA88 (2012). DOI: 10.12659/msm.882892, PMID: 22648263
+- Castro-Diaz N, Ecco G, Coluccio A, Kapopoulou A, Yazdanpanah B, Friedli M, Duc J, Jang SM, Turelli P, Trono D. Evolutionally dynamic L1 regulation in embryonic stem cells. Genes &amp; Development 28:1397–1409 (2014). DOI: 10.1101/gad.241661.114, PMID: 24939876
+- Chuong EB, Elde NC, Feschotte C. Regulatory evolution of innate immunity through co-option of endogenous retroviruses. Science 351:1083–1087 (2016). DOI: 10.1126/science.aad5497, PMID: 26941318
+- Dan J, Liu Y, Liu N, Chiourea M, Okuka M, Wu T, Ye X, Mou C, Wang L, Wang L, Yin Y, Yuan J, Zuo B, Wang F, Li Z, Pan X, Yin Z, Chen L, Keefe DL, Gagos S, Xiao A, Liu L. Rif1 maintains telomere length homeostasis of ESCs by mediating heterochromatin silencing. Developmental Cell 29:7–19 (2014). DOI: 10.1016/j.devcel.2014.03.004, PMID: 24735877
+- De Iaco A, Planet E, Coluccio A, Verp S, Duc J, Trono D. DUX-family transcription factors regulate zygotic genome activation in placental mammals. Nature Genetics 49:941–945 (2017). DOI: 10.1038/ng.3858, PMID: 28459456
+- Deniz Ö, de la Rica L, Cheng KCL, Spensberger D, Branco MR. SETDB1 prevents TET2-dependent activation of IAP retroelements in naïve embryonic stem cells. Genome Biology 19:6 (2018). DOI: 10.1186/s13059-017-1376-y, PMID: 29351814
+- Dewannieux M, Heidmann T. Endogenous retroviruses: acquisition, amplification and taming of genome invaders. Current Opinion in Virology 3:646–656 (2013). DOI: 10.1016/j.coviro.2013.08.005, PMID: 24004725
+- Ecco G, Cassano M, Kauzlaric A, Duc J, Coluccio A, Offner S, Imbeault M, Rowe HM, Turelli P, Trono D. Transposable elements and their KRAB-ZFP controllers regulate gene expression in adult tissues. Developmental Cell 36:611–623 (2016). DOI: 10.1016/j.devcel.2016.02.024, PMID: 27003935
+- Ecco G, Imbeault M, Trono D. KRAB zinc finger proteins. Development 144:2719–2729 (2017). DOI: 10.1242/dev.132605, PMID: 28765213
+- Frank JA, Feschotte C. Co-option of endogenous viral sequences for host cell function. Current Opinion in Virology 25:81–89 (2017). DOI: 10.1016/j.coviro.2017.07.021, PMID: 28818736
+- Gagnier L, Belancio VP, Mager DL. Mouse germ line mutations due to retrotransposon insertions. Mobile DNA 10:15 (2019). DOI: 10.1186/s13100-019-0157-4, PMID: 31011371
+- Groner AC, Meylan S, Ciuffi A, Zangger N, Ambrosini G, Dénervaud N, Bucher P, Trono D. KRAB-zinc finger proteins and KAP1 can mediate long-range transcriptional repression through heterochromatin spreading. PLOS Genetics 6:e1000869 (2010). DOI: 10.1371/journal.pgen.1000869, PMID: 20221260
+- Hancks DC, Kazazian HH. Roles for retrotransposon insertions in human disease. Mobile DNA 7:9 (2016). DOI: 10.1186/s13100-016-0065-9, PMID: 27158268
+- Imbeault M, Helleboid PY, Trono D. KRAB zinc-finger proteins contribute to the evolution of gene regulatory networks. Nature 543:550–554 (2017). DOI: 10.1038/nature21683, PMID: 28273063
+- Jacobs FM, Greenberg D, Nguyen N, Haeussler M, Ewing AD, Katzman S, Paten B, Salama SR, Haussler D. An evolutionary arms race between KRAB zinc-finger genes ZNF91/93 and SVA/L1 retrotransposons. Nature 516:242–245 (2014). DOI: 10.1038/nature13760, PMID: 25274305
+- Kano H, Kurahashi H, Toda T. Genetically regulated epigenetic transcriptional activation of retrotransposon insertion confers mouse dactylaplasia phenotype. PNAS 104:19034–19039 (2007). DOI: 10.1073/pnas.0705483104, PMID: 17984064
+- Karimi MM, Goyal P, Maksakova IA, Bilenky M, Leung D, Tang JX, Shinkai Y, Mager DL, Jones S, Hirst M, Lorincz MC. DNA methylation and SETDB1/H3K9me3 regulate predominantly distinct sets of genes, retroelements, and chimeric transcripts in mESCs. Cell Stem Cell 8:676–687 (2011). DOI: 10.1016/j.stem.2011.04.004, PMID: 21624812
+- Kauzlaric A, Ecco G, Cassano M, Duc J, Imbeault M, Trono D. The mouse genome displays highly dynamic populations of KRAB-zinc finger protein genes and related genetic units. PLOS ONE 12:e0173746 (2017). DOI: 10.1371/journal.pone.0173746, PMID: 28334004
+- Khil PP, Smagulova F, Brick KM, Camerini-Otero RD, Petukhova GV. Sensitive mapping of recombination hotspots using sequencing-based detection of ssDNA. Genome Research 22:957–965 (2012). DOI: 10.1101/gr.130583.111, PMID: 22367190
+- Krueger F, Andrews SR. Bismark: a flexible aligner and methylation caller for Bisulfite-Seq applications. Bioinformatics 27:1571–1572 (2011). DOI: 10.1093/bioinformatics/btr167, PMID: 21493656
+- Langmead B, Salzberg SL. Fast gapped-read alignment with bowtie 2. Nature Methods 9:357–359 (2012). DOI: 10.1038/nmeth.1923, PMID: 22388286
+- Legiewicz M, Zolotukhin AS, Pilkington GR, Purzycka KJ, Mitchell M, Uranishi H, Bear J, Pavlakis GN, Le Grice SF, Felber BK. The RNA transport element of the murine musD retrotransposon requires long-range intramolecular interactions for function. Journal of Biological Chemistry 285:42097–42104 (2010). DOI: 10.1074/jbc.M110.182840, PMID: 20978285
+- Lehoczky JA, Thomas PE, Patrie KM, Owens KM, Villarreal LM, Galbraith K, Washburn J, Johnson CN, Gavino B, Borowsky AD, Millen KJ, Wakenight P, Law W, Van Keuren ML, Gavrilina G, Hughes ED, Saunders TL, Brihn L, Nadeau JH, Innis JW. A novel intergenic ETnII-β insertion mutation causes multiple malformations in Polypodia mice. PLOS Genetics 9:e1003967 (2013). DOI: 10.1371/journal.pgen.1003967, PMID: 24339789
+- Leung D, Du T, Wagner U, Xie W, Lee AY, Goyal P, Li Y, Szulwach KE, Jin P, Lorincz MC, Ren B. Regulation of DNA methylation turnover at LTR retrotransposons and imprinted loci by the histone methyltransferase Setdb1. PNAS 111:6690–6695 (2014). DOI: 10.1073/pnas.1322273111, PMID: 24757056
+- Lilue J, Doran AG, Fiddes IT, Abrudan M, Armstrong J, Bennett R, Chow W, Collins J, Collins S, Czechanski A, Danecek P, Diekhans M, Dolle DD, Dunn M, Durbin R, Earl D, Ferguson-Smith A, Flicek P, Flint J, Frankish A, Fu B, Gerstein M, Gilbert J, Goodstadt L, Harrow J, Howe K, Ibarra-Soria X, Kolmogorov M, Lelliott CJ, Logan DW, Loveland J, Mathews CE, Mott R, Muir P, Nachtweide S, Navarro FCP, Odom DT, Park N, Pelan S, Pham SK, Quail M, Reinholdt L, Romoth L, Shirley L, Sisu C, Sjoberg-Herrera M, Stanke M, Steward C, Thomas M, Threadgold G, Thybert D, Torrance J, Wong K, Wood J, Yalcin B, Yang F, Adams DJ, Paten B, Keane TM. Sixteen diverse laboratory mouse reference genomes define strain-specific haplotypes and novel functional loci. Nature Genetics 50:1574–1583 (2018). DOI: 10.1038/s41588-018-0223-8, PMID: 30275530
+- Liu S, Brind'Amour J, Karimi MM, Shirane K, Bogutz A, Lefebvre L, Sasaki H, Shinkai Y, Lorincz MC. Setdb1 is required for germline development and silencing of H3K9me3-marked endogenous retroviruses in primordial germ cells. Genes &amp; Development 28:2041–2055 (2014). DOI: 10.1101/gad.244848.114, PMID: 25228647
+- Love MI, Huber W, Anders S. Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biology 15:550 (2014). DOI: 10.1186/s13059-014-0550-8, PMID: 25516281
+- Lugani F, Arora R, Papeta N, Patel A, Zheng Z, Sterken R, Singer RA, Caridi G, Mendelsohn C, Sussel L, Papaioannou VE, Gharavi AG. A retrotransposon insertion in the 5' regulatory domain of Ptf1a results in ectopic gene expression and multiple congenital defects in Danforth's short tail mouse. PLOS Genetics 9:e1003206 (2013). DOI: 10.1371/journal.pgen.1003206, PMID: 23437001
+- Macfarlan TS, Gifford WD, Driscoll S, Lettieri K, Rowe HM, Bonanomi D, Firth A, Singer O, Trono D, Pfaff SL. Embryonic stem cell potency fluctuates with endogenous retrovirus activity. Nature 487:57–63 (2012). DOI: 10.1038/nature11244, PMID: 22722858
+- Maksakova IA, Romanish MT, Gagnier L, Dunn CA, van de Lagemaat LN, Mager DL. Retroviral elements and their hosts: insertional mutagenesis in the mouse germ line. PLOS Genetics 2:e2 (2006). DOI: 10.1371/journal.pgen.0020002, PMID: 16440055
+- Matsui T, Leung D, Miyashita H, Maksakova IA, Miyachi H, Kimura H, Tachibana M, Lorincz MC, Shinkai Y. Proviral silencing in embryonic stem cells requires the histone methyltransferase ESET. Nature 464:927–931 (2010). DOI: 10.1038/nature08858, PMID: 20164836
+- Najafabadi HS, Mnaimneh S, Schmitges FW, Garton M, Lam KN, Yang A, Albu M, Weirauch MT, Radovani E, Kim PM, Greenblatt J, Frey BJ, Hughes TR. C2H2 zinc finger proteins greatly expand the human regulatory lexicon. Nature Biotechnology 33:555–562 (2015). DOI: 10.1038/nbt.3128, PMID: 25690854
+- Nellåker C, Keane TM, Yalcin B, Wong K, Agam A, Belgard TG, Flint J, Adams DJ, Frankel WN, Ponting CP. The genomic landscape shaped by selection on transposable elements across 18 mouse strains. Genome Biology 13:R45 (2012). DOI: 10.1186/gb-2012-13-6-r45, PMID: 22703977
+- O'Geen H, Frietze S, Farnham PJ. Using ChIP-seq technology to identify targets of zinc finger transcription factors. Methods in Molecular Biology 649:437–455 (2010). DOI: 10.1007/978-1-60761-753-2\_27, PMID: 20680851
+- Patel A, Yang P, Tinkham M, Pradhan M, Sun M-A, Wang Y, Hoang D, Wolf G, Horton JR, Zhang X, Macfarlan T, Cheng X. DNA conformation induces adaptable binding by tandem zinc finger proteins. Cell 173:221–233 (2018). DOI: 10.1016/j.cell.2018.02.058, PMID: 29551271
+- Ribet D, Dewannieux M, Heidmann T. An active murine transposon family pair: retrotransposition of "master" MusD copies and ETn trans-mobilization. Genome Research 14:2261–2267 (2004). DOI: 10.1101/gr.2924904, PMID: 15479948
+- Richardson SR, Gerdes P, Gerhardt DJ, Sanchez-Luque FJ, Bodea GO, Muñoz-Lopez M, Jesuadian JS, Kempen MHC, Carreira PE, Jeddeloh JA, Garcia-Perez JL, Kazazian HH, Ewing AD, Faulkner GJ. Heritable L1 retrotransposition in the mouse primordial germline and early embryo. Genome Research 27:1395–1405 (2017). DOI: 10.1101/gr.219022.116, PMID: 28483779
+- Rowe HM, Jakobsson J, Mesnard D, Rougemont J, Reynard S, Aktas T, Maillard PV, Layard-Liesching H, Verp S, Marquis J, Spitz F, Constam DB, Trono D. KAP1 controls endogenous retroviruses in embryonic stem cells. Nature 463:237–240 (2010). DOI: 10.1038/nature08674, PMID: 20075919
+- Rowe HM, Kapopoulou A, Corsinotti A, Fasching L, Macfarlan TS, Tarabay Y, Viville S, Jakobsson J, Pfaff SL, Trono D. TRIM28 repression of retrotransposon-based enhancers is necessary to preserve transcriptional dynamics in embryonic stem cells. Genome Research 23:452–461 (2013). DOI: 10.1101/gr.147678.112, PMID: 23233547
+- Schauer SN, Carreira PE, Shukla R, Gerhardt DJ, Gerdes P, Sanchez-Luque FJ, Nicoli P, Kindlova M, Ghisletti S, Santos AD, Rapoud D, Samuel D, Faivre J, Ewing AD, Richardson SR, Faulkner GJ. L1 retrotransposition is a common feature of mammalian hepatocarcinogenesis. Genome Research 28:639–653 (2018). DOI: 10.1101/gr.226993.117, PMID: 29643204
+- Schultz DC, Ayyanathan K, Negorev D, Maul GG, Rauscher FJ. SETDB1: a novel KAP-1-associated histone H3, lysine 9-specific methyltransferase that contributes to HP1-mediated silencing of euchromatic genes by KRAB zinc-finger proteins. Genes &amp; Development 16:919–932 (2002). DOI: 10.1101/gad.973302, PMID: 11959841
+- Semba K, Araki K, Matsumoto K, Suda H, Ando T, Sei A, Mizuta H, Takagi K, Nakahara M, Muta M, Yamada G, Nakagata N, Iida A, Ikegawa S, Nakamura Y, Araki M, Abe K, Yamamura K. Ectopic expression of Ptf1a induces spinal defects, urogenital defects, and anorectal malformations in Danforth's short tail mice. PLOS Genetics 9:e1003204 (2013). DOI: 10.1371/journal.pgen.1003204, PMID: 23436999
+- Sripathy SP, Stevens J, Schultz DC. The KAP1 corepressor functions to coordinate the assembly of de novo HP1-demarcated microenvironments of heterochromatin required for KRAB zinc finger protein-mediated transcriptional repression. Molecular and Cellular Biology 26:8623–8638 (2006). DOI: 10.1128/MCB.00487-06, PMID: 16954381
+- Thomas JH, Schneider S. Coevolution of retroelements and tandem zinc finger genes. Genome Research 21:1800–1812 (2011). DOI: 10.1101/gr.121749.111, PMID: 21784874
+- Thompson PJ, Macfarlan TS, Lorincz MC. Long terminal repeats: from parasitic elements to building blocks of the transcriptional regulatory repertoire. Molecular Cell 62:766–776 (2016). DOI: 10.1016/j.molcel.2016.03.029, PMID: 27259207
+- Treger RS, Pope SD, Kong Y, Tokuyama M, Taura M, Iwasaki A. The lupus susceptibility locus Sgp3 encodes the suppressor of endogenous retrovirus expression SNERV. Immunity 50:334–347 (2019). DOI: 10.1016/j.immuni.2018.12.022, PMID: 30709743
+- Vlangos CN, Siuniak AN, Robinson D, Chinnaiyan AM, Lyons RH, Cavalcoli JD, Keegan CE. Next-generation sequencing identifies the Danforth's short tail mouse mutation as a retrotransposon insertion affecting Ptf1a expression. PLOS Genetics 9:e1003205 (2013). DOI: 10.1371/journal.pgen.1003205, PMID: 23437000
+- Wang J, Xie G, Singh M, Ghanbarian AT, Raskó T, Szvetnik A, Cai H, Besser D, Prigione A, Fuchs NV, Schumann GG, Chen W, Lorincz MC, Ivics Z, Hurst LD, Izsvák Z. Primate-specific endogenous retrovirus-driven transcription defines naive-like stem cells. Nature 516:405–409 (2014). DOI: 10.1038/nature13804, PMID: 25317556
+- Wolf D, Hug K, Goff SP. TRIM28 mediates primer binding site-targeted silencing of Lys1,2 tRNA-utilizing retroviruses in embryonic cells. PNAS 105:12521–12526 (2008). DOI: 10.1073/pnas.0805540105, PMID: 18713861
+- Wolf G, Greenberg D, Macfarlan TS. Spotting the enemy within: targeted silencing of foreign DNA in mammalian genomes by the Krüppel-associated box zinc finger protein family. Mobile DNA 6:17 (2015a). DOI: 10.1186/s13100-015-0050-8, PMID: 26435754
+- Wolf G, Yang P, Füchtbauer AC, Füchtbauer EM, Silva AM, Park C, Wu W, Nielsen AL, Pedersen FS, Macfarlan TS. The KRAB zinc finger protein ZFP809 is required to initiate epigenetic silencing of endogenous retroviruses. Genes &amp; Development 29:538–554 (2015b). DOI: 10.1101/gad.252767.114, PMID: 25737282
+- Yamauchi M, Freitag B, Khan C, Berwin B, Barklis E. Stem cell factor binding to retrovirus primer binding site silencers. Journal of Virology 69:1142–1149 (1995). DOI: 10.1128/JVI.69.2.1142-1149.1995, PMID: 7529329
+- Zhang Y, Liu T, Meyer CA, Eeckhoute J, Johnson DS, Bernstein BE, Nusbaum C, Myers RM, Brown M, Li W, Liu XS. Model-based analysis of ChIP-Seq (MACS). Genome Biology 9:R137 (2008). DOI: 10.1186/gb-2008-9-9-r137, PMID: 18798982
--- a/tests/data/groundtruth/docling_v2/picture_classification.doctags.txt
+++ b/tests/data/groundtruth/docling_v2/picture_classification.doctags.txt
@ -1,15 +1,11 @@
-<document>
-<section_header_level_1><location><page_1><loc_22><loc_83><loc_41><loc_84></location>Figures Example</section_header_level_1>
-<text><location><page_1><loc_22><loc_63><loc_78><loc_81></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</text>
-<figure>
-<location><page_1><loc_22><loc_36><loc_78><loc_62></location>
-<caption>Figure 1: This is an example image.</caption>
-</figure>
-<text><location><page_1><loc_22><loc_15><loc_78><loc_30></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua.</text>
-<text><location><page_2><loc_22><loc_66><loc_78><loc_84></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</text>
-<figure>
-<location><page_2><loc_36><loc_36><loc_64><loc_65></location>
-<caption>Figure 2: This is an example image.</caption>
-</figure>
-<text><location><page_2><loc_22><loc_15><loc_78><loc_31></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.</text>
-</document>
+<doctag><section_header_level_1><loc_109><loc_79><loc_206><loc_87>Figures Example</section_header_level_1>
+<text><loc_109><loc_94><loc_390><loc_183>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</text>
+<picture><loc_110><loc_192><loc_389><loc_322><caption><loc_185><loc_334><loc_314><loc_340>Figure 1: This is an example image.</caption></picture>
+<text><loc_109><loc_349><loc_390><loc_423>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua.</text>
+<page_footer><loc_248><loc_439><loc_252><loc_445>1</page_footer>
+<page_break>
+<text><loc_109><loc_81><loc_390><loc_169>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</text>
+<picture><loc_179><loc_176><loc_320><loc_321><caption><loc_185><loc_330><loc_314><loc_336>Figure 2: This is an example image.</caption></picture>
+<text><loc_109><loc_345><loc_390><loc_426>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.</text>
+<page_footer><loc_248><loc_439><loc_252><loc_445>2</page_footer>
+</doctag>
--- a/tests/data/groundtruth/docling_v2/pnas_sample.xml.itxt
+++ b/tests/data/groundtruth/docling_v2/pnas_sample.xml.itxt
@ -0,0 +1,148 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: title: The coreceptor mutation CCR5Δ32  ... V epidemics and is selected for by HIV
+    item-2 at level 2: paragraph: Amy D. Sullivan, Janis Wigginton, Denise Kirschner
+    item-3 at level 2: paragraph: Department of Microbiology and I ... dical School, Ann Arbor, MI 48109-0620
+    item-4 at level 2: section_header: Abstract
+      item-5 at level 3: text: We explore the impact of a host  ... creasing the frequency of this allele.
+    item-6 at level 2: text: Nineteen million people have die ...  factors such as host genetics (4, 5).
+    item-7 at level 2: text: To exemplify the contribution of ...  follow the CCR5Δ32 allelic frequency.
+    item-8 at level 2: text: We hypothesize that CCR5Δ32 limi ... g the frequency of this mutant allele.
+    item-9 at level 2: text: CCR5 is a host-cell chemokine re ... iral strain (such as X4 or R5X4) (30).
+    item-10 at level 2: section_header: The Model
+      item-11 at level 3: text: Because we are most concerned wi ... t both economic and social conditions.
+      item-12 at level 3: picture
+        item-12 at level 4: caption: Figure 1 A schematic representation of the basic compartmental HIV epidemic model. The criss-cross lines indicate the sexual mixing between different compartments. Each of these interactions has a positive probability of taking place; they also incorporate individual rates of transmission indicated as λ, but in full notation is λ î,ĵ,k̂→i,j, where i,j,k is the phenotype of the infected partner and î,ĵ is the phenotype of the susceptible partner. Also shown are the different rates of disease progression, γ i,j,k , that vary according to genotype, gender, and stage. Thus, the interactions between different genotypes, genders, and stages are associated with a unique probability of HIV infection. M, male; F, female.
+      item-13 at level 3: table with [6x5]
+        item-13 at level 4: caption: Table 1 Children's genotype
+      item-14 at level 3: section_header: Parameter Estimates for the Model.
+        item-15 at level 4: text: Estimates for rates that govern  ... d in Fig. 1 are summarized as follows:
+        item-16 at level 4: formula:  \frac{dS_{i,j}(t)}{dt}={\chi}_{ ... ,\hat {k}{\rightarrow}i,j}S_{i,j}(t), 
+        item-17 at level 4: formula:  \hspace{1em}\hspace{1em}\hspace ... j,A}(t)-{\gamma}_{i,j,A}I_{i,j,A}(t), 
+        item-18 at level 4: formula:  \frac{dI_{i,j,B}(t)}{dt}={\gamm ... j,B}(t)-{\gamma}_{i,j,B}I_{i,j,B}(t), 
+        item-19 at level 4: formula:  \frac{dA(t)}{dt}={\gamma}_{i,j, ...  \right) -{\mu}_{A}A(t)-{\delta}A(t), 
+        item-20 at level 4: text: where, in addition to previously ... on of the infected partner, and j ≠ ĵ.
+        item-21 at level 4: table with [14x5]
+          item-21 at level 5: caption: Table 2 Transmission probabilities
+        item-22 at level 4: table with [8x3]
+          item-22 at level 5: caption: Table 3 Progression rates
+        item-23 at level 4: table with [20x3]
+          item-23 at level 5: caption: Table 4 Parameter values
+        item-24 at level 4: text: The effects of the CCR5 W/Δ32 an ... nting this probability of infection is
+        item-25 at level 4: formula:  {\lambda}_{\hat {i},\hat {j},\h ... \hat {i},\hat {j},\hat {k}} \right] , 
+        item-26 at level 4: text: where j ≠ ĵ is either male or fe ... e those with AIDS in the simulations).
+        item-27 at level 4: text: The average rate of partner acqu ... owing the male rates to vary (36, 37).
+        item-28 at level 4: section_header: Transmission probabilities.
+          item-29 at level 5: text: The effect of a genetic factor i ... reported; ref. 42) (ref. 43, Table 2).
+          item-30 at level 5: text: Given the assumption of no treat ... ases during the end stage of disease).
+        item-31 at level 4: section_header: Disease progression.
+          item-32 at level 5: text: We assume three stages of HIV in ... ssion rates are summarized in Table 3.
+      item-33 at level 3: section_header: Demographic Setting.
+        item-34 at level 4: text: Demographic parameters are based ... [suppressing (t) notation]: χ1,j 1,j =
+        item-35 at level 4: formula:  B_{r}\hspace{.167em}{ \,\substa ... }+I_{2,M,k})}{N_{M}} \right] + \right 
+        item-36 at level 4: formula:  p_{v} \left \left( \frac{(I_{1, ... ght] \right) \right] ,\hspace{.167em} 
+        item-37 at level 4: text: where the probability of HIV ver ... heir values are summarized in Table 4.
+    item-38 at level 2: section_header: Prevalence of HIV
+      item-39 at level 3: section_header: Demographics and Model Validation.
+        item-40 at level 4: text: The model was validated by using ... 5% to capture early epidemic behavior.
+        item-41 at level 4: text: In deciding on our initial value ... n within given subpopulations (2, 49).
+        item-42 at level 4: text: In the absence of HIV infection, ... those predicted by our model (Fig. 2).
+        item-43 at level 4: picture
+          item-43 at level 5: caption: Figure 2 Model simulation of HIV infection in a population lacking the protective CCR5Δ32 allele compared with national data from Kenya (healthy adults) and Mozambique (blood donors, ref. 17). The simulated population incorporates parameter estimates from sub-Saharan African demographics. Note the two outlier points from the Mozambique data were likely caused by underreporting in the early stages of the epidemic.
+      item-44 at level 3: section_header: Effects of the Allele on Prevalence.
+        item-45 at level 4: text: After validating the model in th ... among adults for total HIV/AIDS cases.
+        item-46 at level 4: text: Although CCR5Δ32/Δ32 homozygosit ... frequency of the mutation as 0.105573.
+        item-47 at level 4: text: Fig. 3 shows the prevalence of H ... mic, reaching 18% before leveling off.
+        item-48 at level 4: picture
+          item-48 at level 5: caption: Figure 3 Prevalence of HIV/AIDS in the adult population as predicted by the model. The top curve (○) indicates prevalence in a population lacking the protective allele. We compare that to a population with 19% heterozygous and 1% homozygous for the allele (implying an allelic frequency of 0.105573). Confidence interval bands (light gray) are shown around the median simulation () providing a range of uncertainty in evaluating parameters for the effect of the mutation on the infectivity and the duration of asymptomatic HIV for heterozygotes.
+        item-49 at level 4: text: In contrast, when a proportion o ... gins to decline slowly after 70 years.
+        item-50 at level 4: text: In the above simulations we assu ...  in the presence of the CCR5 mutation.
+        item-51 at level 4: text: Because some parameters (e.g., r ... s a major influence on disease spread.
+    item-52 at level 2: section_header: HIV Induces Selective Pressure on Genotype Frequency
+      item-53 at level 3: text: To observe changes in the freque ...  for ≈1,600 years before leveling off.
+      item-54 at level 3: picture
+        item-54 at level 4: caption: Figure 4 Effects of HIV-1 on selection of the CCR5Δ32 allele. The Hardy-Weinberg equilibrium level is represented in the no-infection simulation (solid lines) for each population. Divergence from the original Hardy-Weinberg equilibrium is shown to occur in the simulations that include HIV infection (dashed lines). Fraction of the total subpopulations are presented: (A) wild types (W/W), (B) heterozygotes (W/Δ32), and (C) homozygotes (Δ32/Δ32). Note that we initiate this simulation with a much lower allelic frequency (0.00105) than used in the rest of the study to better exemplify the actual selective effect over a 1,000-year time scale. (D) The allelic selection effect over a 2,000-year time scale.
+    item-55 at level 2: section_header: Discussion
+      item-56 at level 3: text: This study illustrates how popul ... pulations where the allele is present.
+      item-57 at level 3: text: We also observed that HIV can pr ... is) have been present for much longer.
+      item-58 at level 3: text: Two mathematical models have con ... ce of the pathogen constant over time.
+      item-59 at level 3: text: Even within our focus on host pr ... f a protective allele such as CCR5Δ32.
+      item-60 at level 3: text: Although our models demonstrate  ... f the population to epidemic HIV (16).
+      item-61 at level 3: text: In assessing the HIV/AIDS epidem ... for education and prevention programs.
+    item-62 at level 2: section_header: Acknowledgments
+      item-63 at level 3: text: We thank Mark Krosky, Katia Koel ... ers for extremely insightful comments.
+    item-64 at level 2: section_header: References
+      item-65 at level 3: list: group list
+        item-66 at level 4: list_item: Weiss HA, Hawkes S. Leprosy Rev 72:92–98 (2001). PMID: 11355525
+        item-67 at level 4: list_item: Taha TE, Dallabetta GA, Hoover D ...  AIDS 12:197–203 (1998). PMID: 9468369
+        item-68 at level 4: list_item: AIDS Epidemic Update. Geneva: World Health Organization1–17 (1998).
+        item-69 at level 4: list_item: D'Souza MP, Harden VA. Nat Med 2:1293–1300 (1996). PMID: 8946819
+        item-70 at level 4: list_item: Martinson JJ, Chapman NH, Rees D ... Genet 16:100–103 (1997). PMID: 9140404
+        item-71 at level 4: list_item: Roos MTL, Lange JMA, deGoede REY ...  Dis 165:427–432 (1992). PMID: 1347054
+        item-72 at level 4: list_item: Garred P, Eugen-Olsen J, Iversen ...  Lancet 349:1884 (1997). PMID: 9217763
+        item-73 at level 4: list_item: Katzenstein TL, Eugen-Olsen J, H ... rovirol 16:10–14 (1997). PMID: 9377119
+        item-74 at level 4: list_item: deRoda H, Meyer K, Katzenstain W ... ce 273:1856–1862 (1996). PMID: 8791590
+        item-75 at level 4: list_item: Meyer L, Magierowska M, Hubert J ...  AIDS 11:F73–F78 (1997). PMID: 9302436
+        item-76 at level 4: list_item: Smith MW, Dean M, Carrington M,  ... ence 277:959–965 (1997). PMID: 9252328
+        item-77 at level 4: list_item: Samson M, Libert F, Doranz BJ, R ... don) 382:722–725 (1996). PMID: 8751444
+        item-78 at level 4: list_item: McNicholl JM, Smith DK, Qari SH, ... ct Dis 3:261–271 (1997). PMID: 9284370
+        item-79 at level 4: list_item: Michael NL, Chang G, Louie LG, M ... at Med 3:338–340 (1997). PMID: 9055864
+        item-80 at level 4: list_item: Mayaud P, Mosha F, Todd J, Balir ... IDS 11:1873–1880 (1997). PMID: 9412707
+        item-81 at level 4: list_item: Hoffman IF, Jere CS, Taylor TE,  ... li P, Dyer JR. AIDS 13:487–494 (1998).
+        item-82 at level 4: list_item: HIV/AIDS Surveillance Database.  ...  International Programs Center (1999).
+        item-83 at level 4: list_item: Anderson RM, May RM, McLean AR.  ... don) 332:228–234 (1988). PMID: 3279320
+        item-84 at level 4: list_item: Berger EA, Doms RW, Fenyo EM, Ko ... (London) 391:240 (1998). PMID: 9440686
+        item-85 at level 4: list_item: Alkhatib G, Broder CC, Berger EA ... rol 70:5487–5494 (1996). PMID: 8764060
+        item-86 at level 4: list_item: Choe H, Farzan M, Sun Y, Sulliva ... ell 85:1135–1148 (1996). PMID: 8674119
+        item-87 at level 4: list_item: Deng H, Liu R, Ellmeier W, Choe  ... don) 381:661–666 (1996). PMID: 8649511
+        item-88 at level 4: list_item: Doranz BJ, Rucker J, Yi Y, Smyth ... ell 85:1149–1158 (1996). PMID: 8674120
+        item-89 at level 4: list_item: Dragic T, Litwin V, Allaway GP,  ... don) 381:667–673 (1996). PMID: 8649512
+        item-90 at level 4: list_item: Zhu T, Mo H, Wang N, Nam DS, Cao ... ce 261:1179–1181 (1993). PMID: 8356453
+        item-91 at level 4: list_item: Bjorndal A, Deng H, Jansson M, F ... rol 71:7478–7487 (1997). PMID: 9311827
+        item-92 at level 4: list_item: Conner RI, Sheridan KE, Ceradini ...  Med 185:621–628 (1997). PMID: 9034141
+        item-93 at level 4: list_item: Liu R, Paxton WA, Choe S, Ceradi ...  Cell 86:367–377 (1996). PMID: 8756719
+        item-94 at level 4: list_item: Mussico M, Lazzarin A, Nicolosi  ... w) 154:1971–1976 (1994). PMID: 8074601
+        item-95 at level 4: list_item: Michael NL, Nelson JA, KewalRama ... rol 72:6040–6047 (1998). PMID: 9621067
+        item-96 at level 4: list_item: Hethcote HW, Yorke JA. Gonorrhea ...  and Control. Berlin: Springer (1984).
+        item-97 at level 4: list_item: Anderson RM, May RM. Nature (London) 333:514–522 (1988). PMID: 3374601
+        item-98 at level 4: list_item: Asiimwe-Okiror G, Opio AA, Musin ... IDS 11:1757–1763 (1997). PMID: 9386811
+        item-99 at level 4: list_item: Carael M, Cleland J, Deheneffe J ... AIDS 9:1171–1175 (1995). PMID: 8519454
+        item-100 at level 4: list_item: Blower SM, Boe C. J AIDS 6:1347–1352 (1993). PMID: 8254474
+        item-101 at level 4: list_item: Kirschner D. J Appl Math 56:143–166 (1996).
+        item-102 at level 4: list_item: Le Pont F, Blower S. J AIDS 4:987–999 (1991). PMID: 1890608
+        item-103 at level 4: list_item: Kim MY, Lagakos SW. Ann Epidemiol 1:117–128 (1990). PMID: 1669741
+        item-104 at level 4: list_item: Anderson RM, May RM. Infectious  ... ol. Oxford: Oxford Univ. Press (1992).
+        item-105 at level 4: list_item: Ragni MV, Faruki H, Kingsley LA. ... ed Immune Defic Syndr 17:42–45 (1998).
+        item-106 at level 4: list_item: Kaplan JE, Khabbaz RF, Murphy EL ... virol 12:193–201 (1996). PMID: 8680892
+        item-107 at level 4: list_item: Padian NS, Shiboski SC, Glass SO ... nghoff E. Am J Edu 146:350–357 (1997).
+        item-108 at level 4: list_item: Leynaert B, Downs AM, de Vincenzi I. Am J Edu 148:88–96 (1998).
+        item-109 at level 4: list_item: Garnett GP, Anderson RM. J Acquired Immune Defic Syndr 9:500–513 (1995).
+        item-110 at level 4: list_item: Stigum H, Magnus P, Harris JR, S ... eteig LS. Am J Edu 145:636–643 (1997).
+        item-111 at level 4: list_item: Ho DD, Neumann AU, Perelson AS,  ... don) 373:123–126 (1995). PMID: 7816094
+        item-112 at level 4: list_item: World Resources (1998–1999). Oxford: Oxford Univ. Press (1999).
+        item-113 at level 4: list_item: Kostrikis LG, Neumann AU, Thomso ...  73:10264–10271 (1999). PMID: 10559343
+        item-114 at level 4: list_item: Low-Beer D, Stoneburner RL, Muku ... at Med 3:553–557 (1997). PMID: 9142126
+        item-115 at level 4: list_item: Grosskurth H, Mosha F, Todd J, S ... . AIDS 9:927–934 (1995). PMID: 7576329
+        item-116 at level 4: list_item: Melo J, Beby-Defaux A, Faria C,  ... AIDS 23:203–204 (2000). PMID: 10737436
+        item-117 at level 4: list_item: Iman RL, Helton JC, Campbell JE. J Quality Technol 13:174–183 (1981).
+        item-118 at level 4: list_item: Iman RL, Helton JC, Campbell JE. J Quality Technol 13:232–240 (1981).
+        item-119 at level 4: list_item: Blower SM, Dowlatabadi H. Int Stat Rev 62:229–243 (1994).
+        item-120 at level 4: list_item: Porco TC, Blower SM. Theor Popul Biol 54:117–132 (1998). PMID: 9733654
+        item-121 at level 4: list_item: Blower SM, Porco TC, Darby G. Nat Med 4:673–678 (1998). PMID: 9623975
+        item-122 at level 4: list_item: Libert F, Cochaux P, Beckman G,  ...  Genet 7:399–406 (1998). PMID: 9466996
+        item-123 at level 4: list_item: Lalani AS, Masters J, Zeng W, Ba ... e 286:1968–1971 (1999). PMID: 10583963
+        item-124 at level 4: list_item: Kermack WO, McKendrick AG. Proc R Soc London 261:700–721 (1927).
+        item-125 at level 4: list_item: Gupta S, Hill AVS. Proc R Soc London Ser B 260:271–277 (1995).
+        item-126 at level 4: list_item: Ruwende C, Khoo SC, Snow RW, Yat ... don) 376:246–249 (1995). PMID: 7617034
+        item-127 at level 4: list_item: McDermott DH, Zimmerman PA, Guig ... ncet 352:866–870 (1998). PMID: 9742978
+        item-128 at level 4: list_item: Kostrikis LG, Huang Y, Moore JP, ... at Med 4:350–353 (1998). PMID: 9500612
+        item-129 at level 4: list_item: Winkler C, Modi W, Smith MW, Nel ... ence 279:389–393 (1998). PMID: 9430590
+        item-130 at level 4: list_item: Martinson JJ, Hong L, Karanicola ... AIDS 14:483–489 (2000). PMID: 10780710
+        item-131 at level 4: list_item: Vernazza PL, Eron JJ, Fiscus SA, ... AIDS 13:155–166 (1999). PMID: 10202821
+  item-132 at level 1: caption: Figure 1 A schematic representat ...  of HIV infection. M, male; F, female.
+  item-133 at level 1: caption: Table 1 Children's genotype
+  item-134 at level 1: caption: Table 2 Transmission probabilities
+  item-135 at level 1: caption: Table 3 Progression rates
+  item-136 at level 1: caption: Table 4 Parameter values
+  item-137 at level 1: caption: Figure 2 Model simulation of HIV ... g in the early stages of the epidemic.
+  item-138 at level 1: caption: Figure 3 Prevalence of HIV/AIDS  ... of asymptomatic HIV for heterozygotes.
+  item-139 at level 1: caption: Figure 4 Effects of HIV-1 on sel ... n effect over a 2,000-year time scale.
--- a/tests/data/groundtruth/docling_v2/pnas_sample.xml.json
+++ b/tests/data/groundtruth/docling_v2/pnas_sample.xml.json
--- a/tests/data/groundtruth/docling_v2/pnas_sample.xml.md
+++ b/tests/data/groundtruth/docling_v2/pnas_sample.xml.md
@ -0,0 +1,258 @@
+# The coreceptor mutation CCR5Δ32 influences the dynamics of HIV epidemics and is selected for by HIV
+
+Amy D. Sullivan, Janis Wigginton, Denise Kirschner
+
+Department of Microbiology and Immunology, University of Michigan Medical School, Ann Arbor, MI 48109-0620
+
+## Abstract
+
+We explore the impact of a host genetic factor on heterosexual HIV epidemics by using a deterministic mathematical model. A protective allele unequally distributed across populations is exemplified in our models by the 32-bp deletion in the host-cell chemokine receptor CCR5, CCR5Δ32. Individuals homozygous for CCR5Δ32 are protected against HIV infection whereas those heterozygous for CCR5Δ32 have lower pre-AIDS viral loads and delayed progression to AIDS. CCR5Δ32 may limit HIV spread by decreasing the probability of both risk of infection and infectiousness. In this work, we characterize epidemic HIV within three dynamic subpopulations: CCR5/CCR5 (homozygous, wild type), CCR5/CCR5Δ32 (heterozygous), and CCR5Δ32/CCR5Δ32 (homozygous, mutant). Our results indicate that prevalence of HIV/AIDS is greater in populations lacking the CCR5Δ32 alleles (homozygous wild types only) as compared with populations that include people heterozygous or homozygous for CCR5Δ32. Also, we show that HIV can provide selective pressure for CCR5Δ32, increasing the frequency of this allele.
+
+Nineteen million people have died of AIDS since the discovery of HIV in the 1980s. In 1999 alone, 5.4 million people were newly infected with HIV (ref. 1 and http://www.unaids.org/epidemicupdate/report/Epireport.html). (For brevity, HIV-1 is referred to as HIV in this paper.) Sub-Saharan Africa has been hardest hit, with more than 20% of the general population HIV-positive in some countries (2, 3). In comparison, heterosexual epidemics in developed, market-economy countries have not reached such severe levels. Factors contributing to the severity of the epidemic in economically developing countries abound, including economic, health, and social differences such as high levels of sexually transmitted diseases and a lack of prevention programs. However, the staggering rate at which the epidemic has spread in sub-Saharan Africa has not been adequately explained. The rate and severity of this epidemic also could indicate a greater underlying susceptibility to HIV attributable not only to sexually transmitted disease, economics, etc., but also to other more ubiquitous factors such as host genetics (4, 5).
+
+To exemplify the contribution of such a host genetic factor to HIV prevalence trends, we consider a well-characterized 32-bp deletion in the host-cell chemokine receptor CCR5, CCR5Δ32. When HIV binds to host cells, it uses the CD4 receptor on the surface of host immune cells together with a coreceptor, mainly the CCR5 and CXCR4 chemokine receptors (6). Homozygous mutations for this 32-bp deletion offer almost complete protection from HIV infection, and heterozygous mutations are associated with lower pre-AIDS viral loads and delayed progression to AIDS (7–14). CCR5Δ32 generally is found in populations of European descent, with allelic frequencies ranging from 0 to 0.29 (13). African and Asian populations studied outside the United States or Europe appear to lack the CCR5Δ32 allele, with an allelic frequency of almost zero (5, 13). Thus, to understand the effects of a protective allele, we use a mathematical model to track prevalence of HIV in populations with or without CCR5Δ32 heterozygous and homozygous people and also to follow the CCR5Δ32 allelic frequency.
+
+We hypothesize that CCR5Δ32 limits epidemic HIV by decreasing infection rates, and we evaluate the relative contributions to this by the probability of infection and duration of infectivity. To capture HIV infection as a chronic infectious disease together with vertical transmission occurring in untreated mothers, we model a dynamic population (i.e., populations that vary in growth rates because of fluctuations in birth or death rates) based on realistic demographic characteristics (18). This scenario also allows tracking of the allelic frequencies over time. This work considers how a specific host genetic factor affecting HIV infectivity and viremia at the individual level might influence the epidemic in a dynamic population and how HIV exerts selective pressure, altering the frequency of this mutant allele.
+
+CCR5 is a host-cell chemokine receptor, which is also used as a coreceptor by R5 strains of HIV that are generally acquired during sexual transmission (6, 19–25). As infection progresses to AIDS the virus expands its repertoire of potential coreceptors to include other CC-family and CXC-family receptors in roughly 50% of patients (19, 26, 27). CCR5Δ32 was identified in HIV-resistant people (28). Benefits to individuals from the mutation in this allele are as follows. Persons homozygous for the CCR5Δ32 mutation are almost nonexistent in HIV-infected populations (11, 12) (see ref. 13 for review). Persons heterozygous for the mutant allele (CCR5 W/Δ32) tend to have lower pre-AIDS viral loads. Aside from the beneficial effects that lower viral loads may have for individuals, there is also an altruistic effect, as transmission rates are reduced for individuals with low viral loads (as compared with, for example, AZT and other studies; ref. 29). Finally, individuals heterozygous for the mutant allele (CCR5 W/Δ32) also have a slower progression to AIDS than those homozygous for the wild-type allele (CCR5 W/W) (7–10), remaining in the population 2 years longer, on average. Interestingly, the dearth of information on HIV disease progression in people homozygous for the CCR5Δ32 allele (CCR5 Δ32/Δ32) stems from the rarity of HIV infection in this group (4, 12, 28). However, in case reports of HIV-infected CCR5 Δ32/Δ32 homozygotes, a rapid decline in CD4+ T cells and a high viremia are observed, likely because of initial infection with a more aggressive viral strain (such as X4 or R5X4) (30).
+
+## The Model
+
+Because we are most concerned with understanding the severity of the epidemic in developing countries where the majority of infection is heterosexual, we consider a purely heterosexual model. To model the effects of the allele in the population, we examine the rate of HIV spread by using an enhanced susceptible-infected-AIDS model of epidemic HIV (for review see ref. 31). Our model compares two population scenarios: a CCR5 wild-type population and one with CCR5Δ32 heterozygotes and homozygotes in addition to the wild type. To model the scenario where there are only wild-type individuals present in the population (i.e., CCR5 W/W), we track the sexually active susceptibles at time t [Si,j (t)], where i = 1 refers to genotype (CCR5 W/W only in this case) and j is either the male or female subpopulation. We also track those who are HIV-positive at time t not yet having AIDS in Ii,j,k (t) where k refers to stage of HIV infection [primary (A) or asymptomatic (B)]. The total number of individuals with AIDS at time t are tracked in A(t). The source population are children, χ i,j (t), who mature into the sexually active population at time t (Fig. 1, Table 1). We compare the model of a population lacking the CCR5Δ32 allele to a demographically similar population with a high frequency of the allele. When genetic heterogeneity is included, male and female subpopulations are each further divided into three distinct genotypic groups, yielding six susceptible subpopulations, [Si,j (t), where i ranges from 1 to 3, where 1 = CCR5W/W; 2 = CCR5 W/Δ32; 3 = CCR5 Δ32/Δ32]. The infected classes, Ii,j,k (t), also increase in number to account for these new genotype compartments. In both settings we assume there is no treatment available and no knowledge of HIV status by people in the early acute and middle asymptomatic stages (both conditions exist in much of sub-Saharan Africa). 
In addition, we assume that sexual mixing in the population occurs randomly with respect to genotype and HIV disease status, all HIV-infected people eventually progress to AIDS, and no barrier contraceptives are used. These last assumptions reflect both economic and social conditions.
+
+Figure 1 A schematic representation of the basic compartmental HIV epidemic model. The criss-cross lines indicate the sexual mixing between different compartments. Each of these interactions has a positive probability of taking place; they also incorporate individual rates of transmission indicated as λ, but in full notation is λ î,ĵ,k̂→i,j, where î,ĵ,k̂ is the phenotype of the infected partner and i,j is the phenotype of the susceptible partner. Also shown are the different rates of disease progression, γ i,j,k , that vary according to genotype, gender, and stage. Thus, the interactions between different genotypes, genders, and stages are associated with a unique probability of HIV infection. M, male; F, female.
+
+<!-- image -->
+
+Table 1 Children's genotype
+
+| Parents   | Mother   | Mother             | Mother                       | Mother             |
+|-----------|----------|--------------------|------------------------------|--------------------|
+|           |          |                    |                              |                    |
+| Father    |          | W/W                | W/Δ32                        | Δ32/Δ32            |
+|           | W/W      | χ1,j               | χ1,j, χ2,j                   | χ2,j               |
+|           | W/Δ32    | χ1,j, χ2,j         | χ1,j, χ2,j, χ3,j             | χ2,j, χ3,j         |
+|           | Δ32/Δ32  | χ2,j               | χ2,j, χ3,j                   | χ3,j               |
+
+### Parameter Estimates for the Model.
+
+Estimates for rates that govern the interactions depicted in Fig. 1 were derived from the extensive literature on HIV. Our parameters and their estimates are summarized in Tables 2–4. The general form of the equations describing the rates of transition between population classes as depicted in Fig. 1 are summarized as follows:
+
+$$ \frac{dS_{i,j}(t)}{dt}={\chi}_{i,j}(t)-{\mu}_{j}S_{i,j}(t)-{\lambda}_{\hat {\imath},\hat {\jmath},\hat {k}{\rightarrow}i,j}S_{i,j}(t), $$
+
+$$ \hspace{1em}\hspace{1em}\hspace{.167em}\frac{dI_{i,j,A}(t)}{dt}={\lambda}_{\hat {\imath},\hat {\jmath},\hat {k}{\rightarrow}i,j}S_{i,j}(t)-{\mu}_{j}I_{i,j,A}(t)-{\gamma}_{i,j,A}I_{i,j,A}(t), $$
+
+$$ \frac{dI_{i,j,B}(t)}{dt}={\gamma}_{i,j,A}I_{i,j,A}(t)-{\mu}_{j}I_{i,j,B}(t)-{\gamma}_{i,j,B}I_{i,j,B}(t), $$
+
+$$ \frac{dA(t)}{dt}={\gamma}_{i,j,B} \left( \sum_{i=1}^{3} I_{i,F,B}(t)+I_{i,M,B}(t) \right) -{\mu}_{A}A(t)-{\delta}A(t), $$
+
+where, in addition to previously defined populations and rates (with i equals genotype, j equals gender, and k equals stage of infection, either A or B), μ j represents the non-AIDS (natural) death rate for males and females respectively, and μA is estimated by the average (μF + μM)/2. This approximation allows us to simplify the model (only one AIDS compartment) without compromising the results, as most people with AIDS die of AIDS (δAIDS) and very few of other causes (μA). These estimates include values that affect infectivity (λ î,ĵ,k̂→i,j ), transmission (β î,ĵ,k̂→i,j ), and disease progression (γ i,j,k ) where the î,ĵ,k̂ notation represents the genotype, gender, and stage of infection of the infected partner, and j ≠ ĵ.
+
+Table 2 Transmission probabilities
+
+| HIV-infected partner (î, ĵ, k̂)                | Susceptible partner (i, j)   | Susceptible partner (i, j)   | Susceptible partner (i, j)   | Susceptible partner (i, j)   |
+|-----------------------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
+| HIV-infected partner (î, ĵ, k̂)                |                              |                              |                              |                              |
+| HIV-infected partner (î, ĵ, k̂)                | (ĵ to j)                     | W/W                          | W/Δ32                        | Δ32/Δ32                      |
+|                                               |                              |                              |                              |                              |
+| Acute/primary                                 |                              |                              |                              |                              |
+| W/W or Δ32/Δ32                                | M to F                       | 0.040                        | 0.040                        | 0.00040                      |
+|                                               | F to M                       | 0.020                        | 0.020                        | 0.00020                      |
+| W/Δ32                                         | M to F                       | 0.030                        | 0.030                        | 0.00030                      |
+|                                               | F to M                       | 0.015                        | 0.015                        | 0.00015                      |
+| Asymptomatic                                  |                              |                              |                              |                              |
+| W/W or Δ32/Δ32                                | M to F                       | 0.0010                       | 0.0010                       | 10 × 10−6                    |
+|                                               | F to M                       | 0.0005                       | 0.0005                       | 5 × 10−6                     |
+| W/Δ32                                         | M to F                       | 0.0005                       | 0.0005                       | 5 × 10−6                     |
+|                                               | F to M                       | 0.00025                      | 0.00025                      | 2.5 × 10−6                   |
+
+Table 3 Progression rates
+
+| Genotype   | Disease stage   | Males/females    |
+|------------|-----------------|------------------|
+|            |                 |                  |
+| W/W        | A               | 3.5              |
+|            | B               | 0.16667          |
+| W/Δ32      | A               | 3.5              |
+|            | B               | 0.125            |
+| Δ32/Δ32    | A               | 3.5              |
+|            | B               | 0.16667          |
+
+Table 4 Parameter values
+
+| Parameter                               | Definition                                               | Value                   |
+|-----------------------------------------|----------------------------------------------------------|-------------------------|
+|                                         |                                                          |                         |
+| μF, μM                                  | All-cause mortality for adult females (males)            | 0.015 (0.016) per year  |
+| μχ                                      | All-cause childhood mortality (&lt;15 years of age)         | 0.01 per year           |
+| Br                                      | Birthrate                                                | 0.25 per woman per year |
+| SA F                                    | Percent females acquiring new partners (sexual activity) | 10%                     |
+| SA M                                    | Percent males acquiring new partners (sexual activity)   | 25%                     |
+| m F(ς$$ {\mathrm{_{{F}}^{{2}}}} $$)     | Mean (variance) no. of new partners for females          | 1.8 (1.2) per year      |
+| ς$$ {\mathrm{_{{M}}^{{2}}}} $$          | Variance in no. of new partners for males                | 5.5 per year            |
+| 1 − pv                                  | Probability of vertical transmission                     | 0.30 per birth          |
+| I i,j,k(0)                              | Initial total population HIV-positive                    | 0.50%                   |
+| χ i,j(0)                                | Initial total children in population (&lt;15 years of age)  | 45%                     |
+| W/W (0)                                 | Initial total wild types (W/W) in population             | 80%                     |
+| W/Δ32(0)                                | Initial total heterozygotes (W/Δ32) in population        | 19%                     |
+| Δ32/Δ32(0)                              | Initial total homozygotes (Δ32/Δ32) in population        | 1%                      |
+| r M (r F)                               | Initial percent males (females) in total population      | 49% (51%)               |
+| ϕ F, ϕ M                                | Number of sexual contacts a female (male) has            | 30 (24) per partner     |
+| ɛ i,j,k                                 | % effect of mutation on transmission rates (see Table 2) | 0 &lt; ɛ i,j,k &lt; 1   |
+| δ                                       | Death rate for AIDS population                           | 1.0 per year            |
+| q                                       | Allelic frequency of Δ32 allele                          | 0.105573                |
+
+The effects of the CCR5 W/Δ32 and CCR5 Δ32/Δ32 genotypes are included in our model through both the per-capita probabilities of infection, λ î,ĵ,k̂→i,j , and the progression rates, γ i,j,k . The infectivity coefficients, λ î,ĵ,k̂→i,j , are calculated for each population subgroup based on the following: likelihood of HIV transmission in a sexual encounter between a susceptible and an infected (β î,ĵ,k̂→i,j ) person; formation of new partnerships (c j ); number of contacts in a given partnership (ϕ j ); and probability of encountering an infected individual (I î,ĵ,k̂ /N ĵ ). The formula representing this probability of infection is
+
+$$ {\lambda}_{\hat {i},\hat {j},\hat {k}{\rightarrow}i,j}=\frac{C_{j}{\cdot}{\phi}_{j}}{N_{\hat {j}}}\hspace{.167em} \left[ \sum_{\hat {i},\hat {k}} {\beta}_{\hat {i},\hat {j},\hat {k}{\rightarrow}i,j}{\cdot}I_{\hat {i},\hat {j},\hat {k}} \right] , $$
+
+where j ≠ ĵ is either male or female. N ĵ represents the total population of gender ĵ (this does not include those with AIDS in the simulations).
+
+The average rate of partner acquisition, cj , includes the mean plus the variance to mean ratio of the relevant distribution of partner-change rates to capture the small number of high-risk people: cj = mj + (ς²j /mj ) where the mean (mj ) and variance (ς²j ) are annual figures for new partnerships only (32). These means are estimated from Ugandan data for the number of heterosexual partners in the past year (33) and the number of nonregular heterosexual partners (i.e., spouses or long-term partners) in the past year (34). In these sexual activity surveys, men invariably have more new partnerships; thus, we assumed that they would have fewer average contacts per partnership than women (a higher rate of new partner acquisition means fewer sexual contacts with a given partner; ref. 35). To incorporate this assumption in our model, the male contacts/partnership, ϕ M , was reduced by 20%. In a given population, the numbers of heterosexual interactions must equate between males and females. The balancing equation applied here is SA F·m F·N F = SA M·m M·N M, where SAj are the percent sexually active and Nj are the total in the populations for gender j. To specify changes in partner acquisition, we apply a male flexibility mechanism, holding the female rate of acquisition constant and allowing the male rates to vary (36, 37).
+
+#### Transmission probabilities.
+
+The effect of a genetic factor in a model of HIV transmission can be included by reducing the transmission coefficient. The probabilities of transmission per contact with an infected partner, β î,ĵ,k̂→i,j , have been estimated in the literature (see ref. 38 for estimates in minimally treated groups). We want to capture a decreased risk in transmission based on genotype (ref. 39, Table 2). No studies have directly evaluated differences in infectivity between HIV-infected CCR5 W/Δ32 heterozygotes and HIV-infected CCR5 wild types. Thus, we base estimates for reduced transmission on studies of groups with various HIV serum viral loads (40), HTLV-I/II viral loads (41), and a study of the effect of AZT treatment on transmission (29). We decrease transmission probabilities for infecting CCR5Δ32/Δ32 persons by 100-fold to reflect the rarity of infections in these persons. However, we assume that infected CCR5Δ32/Δ32 homozygotes can infect susceptibles at a rate similar to CCR5W/W homozygotes, as the former generally have high viremias (ref. 30, Table 2). We also assume that male-to-female transmission is twice as efficient as female-to-male transmission (up to a 9-fold difference has been reported; ref. 42) (ref. 43, Table 2).
+
+Given the assumption of no treatment, the high burden of disease in people with AIDS is assumed to greatly limit their sexual activity. Our initial model excludes people with AIDS from the sexually active groups. Subsequently, we allow persons with AIDS to be sexually active, fixing their transmission rates (βAIDS) to be the same across all CCR5 genotypes, and lower than transmission rates for primary-stage infection (as the viral burden on average is not as high as during the acute phase), and larger than transmission rates for asymptomatic-stage infection (as the viral burden characteristically increases during the end stage of disease).
+
+#### Disease progression.
+
+We assume three stages of HIV infection: primary (acute, stage A), asymptomatic HIV (stage B), and AIDS. The rates of transition through the first two stages are denoted by γ i,j,k, where i represents genotype, j is male/female, and k represents either stage A or stage B. Transition rates through each of these stages are assumed to be inversely proportional to the duration of that stage; however, other distributions are possible (31, 44, 45). Although viral loads generally peak in the first 2 months of infection, steady-state viral loads are established several months beyond this (46). For group A, the primary HIV-infecteds, duration is assumed to be 3.5 months. Based on results from European cohort studies (7–10), the beneficial effects of the CCR5 W/Δ32 genotype are observed mainly in the asymptomatic years of HIV infection; ≈7 years after seroconversion survival rates appear to be quite similar between heterozygous and homozygous individuals. We also assume that CCR5Δ32/Δ32-infected individuals and wild-type individuals progress similarly, and that men and women progress through each disease stage at the same rate. Given these observations, and that survival after infection may be shorter in untreated populations, we choose the duration time in stage B to be 6 years for wild-type individuals and 8 years for heterozygous individuals. Transition through AIDS, δAIDS, is inversely proportional to the duration of AIDS. We estimate this value to be 1 year for the time from onset of AIDS to death. The progression rates are summarized in Table 3.
+
+### Demographic Setting.
+
+Demographic parameters are based on data from Malawi, Zimbabwe, and Botswana (3, 47). Estimated birth and child mortality rates are used to calculate the annual numbers of children (χ i,j) maturing into the potentially sexually active, susceptible group at the age of 15 years (3). For example, in the case where the mother is CCR5 wild type and the father is CCR5 wild type or heterozygous, the number of CCR5 W/W children is calculated as follows [suppressing (t) notation]: χ1,j =
+
+$$ B_{r}\hspace{.167em}\sum_{k} \left[ S_{1,F}\frac{(S_{1,M}+I_{1,M,k})}{N_{M}}+ \left[ (0.5)S_{1,F}\frac{(S_{2,M}+I_{2,M,k})}{N_{M}} \right] + \right. $$
+
+$$ p_{v} \left. \left( \frac{(I_{1,F,k}(S_{1,M}+I_{1,M,k}))}{N_{M}}+ \left[ (0.5)I_{1,F,k}\frac{(S_{2,M}+I_{2,M,k})}{N_{M}} \right] \right) \right] ,\hspace{.167em} $$
+
+where the probability of HIV vertical transmission, 1 − pv , and the birthrate, Br , are both included in the equations together with the Mendelian inheritance values as presented in Table 1. The generalized version of this equation (i.e., χ i,j) can account for six categories of children (including gender and genotype). We assume that all children of all genotypes are at risk, although we can relax this condition if data become available to support vertical protection (e.g., ref. 48). All infected children are assumed to die before age 15. Before entering the susceptible group at age 15, there is additional loss because of mortality from all non-AIDS causes occurring less than 15 years of age at a rate of μχ × χ i,j (where μχ is the mortality under 15 years of age). Children then enter the population as susceptibles at an annual rate, ς j × χ i,j/15, where ς j distributes the children 51% females and 49% males. All parameters and their values are summarized in Table 4.
+
+## Prevalence of HIV
+
+### Demographics and Model Validation.
+
+The model was validated by using parameters estimated from available demographic data. Simulations were run in the absence of HIV infection to compare the model with known population growth rates. Infection was subsequently introduced with an initial low HIV prevalence of 0.5% to capture early epidemic behavior.
+
+In deciding on our initial values for parameters during infection, we use Joint United Nations Programme on HIV/AIDS national prevalence data for Malawi, Zimbabwe, and Botswana. Nationwide seroprevalence of HIV in these countries varies from ≈11% to over 20% (3), although there may be considerable variation within given subpopulations (2, 49).
+
+In the absence of HIV infection, the annual percent population growth rate in the model is ≈2.5%, predicting the present-day values for an average of sub-Saharan African cities (data not shown). To validate the model with HIV infection, we compare our simulation of the HIV epidemic to existing prevalence data for Kenya and Mozambique (http://www.who.int/emc-hiv/fact-sheets/pdfs/kenya.pdf and ref. 51). Prevalence data collected from these countries follow similar trajectories to those predicted by our model (Fig. 2).
+
+Figure 2 Model simulation of HIV infection in a population lacking the protective CCR5Δ32 allele compared with national data from Kenya (healthy adults) and Mozambique (blood donors, ref. 17). The simulated population incorporates parameter estimates from sub-Saharan African demographics. Note the two outlier points from the Mozambique data were likely caused by underreporting in the early stages of the epidemic.
+
+<!-- image -->
+
+### Effects of the Allele on Prevalence.
+
+After validating the model in the wild type-only population, both CCR5Δ32 heterozygous and homozygous people are included. Parameter values for HIV transmission, duration of illness, and numbers of contacts per partner are assumed to be the same within both settings. We then calculate HIV/AIDS prevalence among adults for total HIV/AIDS cases.
+
+Although CCR5Δ32/Δ32 homozygosity is rarely seen in HIV-positive populations (prevalence ranges between 0 and 0.004%), 1–20% of people in HIV-negative populations of European descent are homozygous. Thus, to evaluate the potential impact of CCR5Δ32, we estimate there are 19% CCR5 W/Δ32 heterozygous and 1% CCR5 Δ32/Δ32 homozygous people in our population. These values are in Hardy-Weinberg equilibrium with an allelic frequency of the mutation as 0.105573.
+
+Fig. 3 shows the prevalence of HIV in two populations: one lacking the mutant CCR5 allele and another carrying that allele. In the population lacking the protective mutation, prevalence increases logarithmically for the first 35 years of the epidemic, reaching 18% before leveling off.
+
+Figure 3 Prevalence of HIV/AIDS in the adult population as predicted by the model. The top curve (○) indicates prevalence in a population lacking the protective allele. We compare that to a population with 19% heterozygous and 1% homozygous for the allele (implying an allelic frequency of 0.105573). Confidence interval bands (light gray) are shown around the median simulation () providing a range of uncertainty in evaluating parameters for the effect of the mutation on the infectivity and the duration of asymptomatic HIV for heterozygotes.
+
+<!-- image -->
+
+In contrast, when a proportion of the population carries the CCR5Δ32 allele, the epidemic increases more slowly, but still logarithmically, for the first 50 years, and HIV/AIDS prevalence reaches ≈12% (Fig. 3). Prevalence begins to decline slowly after 70 years.
+
+In the above simulations we assume that people with AIDS are not sexually active. However, when these individuals are included in the sexually active population the severity of the epidemic increases considerably (data not shown). Consistent with our initial simulations, prevalences are still relatively lower in the presence of the CCR5 mutation.
+
+Because some parameters (e.g., rate constants) are difficult to estimate based on available data, we implement an uncertainty analysis to assess the variability in the model outcomes caused by any inaccuracies in estimates of the parameter values with regard to the effect of the allelic mutation. For these analyses we use Latin hypercube sampling, as described in refs. 52–56. Our uncertainty and sensitivity analyses focus on infectivity vs. duration of infectiousness. To this end, we assess the effects on the dynamics of the epidemic for a range of values of the parameters governing transmission and progression rates: β î,ĵ,k̂→i,j and γ i,j,k. All other parameters are held constant. These results are presented as an interval band about the average simulation for the population carrying the CCR5Δ32 allele (Fig. 3). Although there is variability in the model outcomes, the analysis indicates that the overall model predictions are consistent for a wide range of transmission and progression rates. Further, most of the variation observed in the outcome is because of the transmission rates for both heterosexual males and females in the primary stage of infection (β2,M,A→i,F, β2,F,A→i,M). As mentioned above, we assume lower viral loads correlate with reduced infectivity; thus, the reduction in viral load in heterozygotes has a major influence on disease spread.
+
+## HIV Induces Selective Pressure on Genotype Frequency
+
+To observe changes in the frequency of the CCR5Δ32 allele in a setting with HIV infection as compared with the Hardy-Weinberg equilibrium in the absence of HIV, we follow changes in the total number of CCR5Δ32 heterozygotes and homozygotes over 1,000 years (Fig. 4). We initially perform simulations in the absence of HIV infection as a negative control to show there is not significant selection of the allele in the absence of infection. To determine how long it would take for the allelic frequency to reach present-day levels (e.g., q = 0.105573), we initiate this simulation for 1,000 years with a very small allelic frequency (q = 0.00105). In the absence of HIV, the allelic frequency is maintained in equilibrium as shown by the constant proportions of CCR5Δ32 heterozygotes and homozygotes (Fig. 4, solid lines). The selection for CCR5Δ32 in the presence of HIV is seen in comparison (Fig. 4, dashed lines). We expand the time frame of this simulation to 2,000 years to view the point at which the frequency reaches present levels (where q ∼0.105573 at year = 1200). Note that the allelic frequency increases for ≈1,600 years before leveling off.
+
+Figure 4 Effects of HIV-1 on selection of the CCR5Δ32 allele. The Hardy-Weinberg equilibrium level is represented in the no-infection simulation (solid lines) for each population. Divergence from the original Hardy-Weinberg equilibrium is shown to occur in the simulations that include HIV infection (dashed lines). Fraction of the total subpopulations are presented: (A) wild types (W/W), (B) heterozygotes (W/Δ32), and (C) homozygotes (Δ32/Δ32). Note that we initiate this simulation with a much lower allelic frequency (0.00105) than used in the rest of the study to better exemplify the actual selective effect over a 1,000-year time scale. (D) The allelic selection effect over a 2,000-year time scale.
+
+<!-- image -->
+
+## Discussion
+
+This study illustrates how populations can differ in susceptibility to epidemic HIV/AIDS depending on a ubiquitous attribute such as a prevailing genotype. We have examined heterosexual HIV epidemics by using mathematical models to assess HIV transmission in dynamic populations either with or without CCR5Δ32 heterozygous and homozygous persons. The most susceptible population lacks the protective mutation in CCR5. In less susceptible populations, the majority of persons carrying the CCR5Δ32 allele are heterozygotes. We explore the hypothesis that lower viral loads (CCR5Δ32 heterozygotes) or resistance to infection (CCR5Δ32 homozygotes) observed in persons with this coreceptor mutation ultimately can influence HIV epidemic trends. Two contrasting influences of the protective CCR5 allele are conceivable: it may limit the epidemic by decreasing the probability of infection because of lower viral loads in infected heterozygotes, or it may exacerbate the epidemic by extending the time that infectious individuals remain in the sexually active population. Our results strongly suggest the former. Thus, the absence of this allele in Africa could explain the severity of HIV disease as compared with populations where the allele is present.
+
+We also observed that HIV can provide selective pressure for the CCR5Δ32 allele within a population, increasing the allelic frequency. Other influences may have additionally selected for this allele. Infectious diseases such as plague and smallpox have been postulated to select for CCR5Δ32 (57, 58). For plague, relatively high levels of CCR5Δ32 are believed to have arisen within ≈4,000 years, accounting for the prevalence of the mutation only in populations of European descent. Smallpox virus uses the CC-coreceptor, indicating that direct selection for mutations in CCR5 may have offered resistance to smallpox. Given the differences in the epidemic rates of plague (59), smallpox, and HIV, it is difficult to directly compare our results to these findings. However, our model suggests that the CCR5Δ32 mutation could have reached its present allelic frequency in Northern Europe within this time frame if selected for by a disease with virulence patterns similar to HIV. Our results further support the idea that HIV has been only recently introduced as a pathogen into African populations, as the frequency of the protective allele is almost zero, and our model predicts that selection of the mutant allele in this population by HIV alone takes at least 1,000 years. This prediction is distinct from the frequency of the CCR5Δ32 allele in European populations, where pathogens that may have influenced its frequency (e.g., Yersinia pestis) have been present for much longer.
+
+Two mathematical models have considered the role of parasite and host genetic heterogeneity with regard to susceptibility to another pathogen, namely malaria (60, 61). In each it was determined that heterogeneity of host resistance facilitates the maintenance of diversity in parasite virulence. Given our underlying interest in the coevolution of pathogen and host, we focus on changes in a host protective mutation, holding the virulence of the pathogen constant over time.
+
+Even within our focus on host protective mutations, numerous genetic factors, beneficial or detrimental, could potentially influence epidemics. Other genetically determined host factors affecting HIV susceptibility and disease progression include a CCR5 A/A to G/G promoter polymorphism (62), a CCR2 point mutation (11, 63), and a mutation in the CXCR4 ligand (64). The CCR2b mutation, CCR264I, is found in linkage with at least one CCR5 promoter polymorphism (65) and is prevalent in populations where CCR5Δ32 is nonexistent, such as sub-Saharan Africa (63). However, as none of these mutations have been consistently shown to be as protective as the CCR5Δ32 allele, we simplified our model to incorporate only the effect of CCR5Δ32. Subsequent models could be constructed from our model to account for the complexity of multiple protective alleles. It is interesting to note that our model predicts that even if CCR264I is present at high frequencies in Africa, its protective effects may not augment the lack of a protective allele such as CCR5Δ32.
+
+Although our models demonstrate that genetic factors can contribute to the high prevalence of HIV in sub-Saharan Africa, demographic factors are also clearly important in this region. Our models explicitly incorporated such factors, for example, lack of treatment availability. Additional factors were implicitly controlled for by varying only the presence of the CCR5Δ32 allele. More complex models eventually could include interactions with infectious diseases that serve as cofactors in HIV transmission. The role of high sexually transmitted disease prevalences in HIV infection has long been discussed, especially in relation to core populations (15, 50, 66). Malaria, too, might influence HIV transmission, as it is associated with transient increases in semen HIV viral loads and thus could increase the susceptibility of the population to epidemic HIV (16).
+
+In assessing the HIV/AIDS epidemic, considerable attention has been paid to the influence of core groups in driving sexually transmitted disease epidemics. Our results also highlight how characteristics more uniformly distributed in a population can affect susceptibility. We observed that the genotypic profile of a population affects its susceptibility to epidemic HIV/AIDS. Additional studies are needed to better characterize the influence of these genetic determinants on HIV transmission, as they may be crucial in estimating the severity of the epidemic in some populations. This information can influence the design of treatment strategies as well as point to the urgency for education and prevention programs.
+
+## Acknowledgments
+
+We thank Mark Krosky, Katia Koelle, and Kevin Chung for programming and technical assistance. We also thank Drs. V. J. DiRita, P. Kazanjian, and S. M. Blower for helpful comments and discussions. We thank the reviewers for extremely insightful comments.
+
+## References
+
+- Weiss HA, Hawkes S. Leprosy Rev 72:92–98 (2001). PMID: 11355525
+- Taha TE, Dallabetta GA, Hoover DR, Chiphangwi JD, Mtimavalye LAR. AIDS 12:197–203 (1998). PMID: 9468369
+- AIDS Epidemic Update. Geneva: World Health Organization 1–17 (1998).
+- D'Souza MP, Harden VA. Nat Med 2:1293–1300 (1996). PMID: 8946819
+- Martinson JJ, Chapman NH, Rees DC, Liu YT, Clegg JB. Nat Genet 16:100–103 (1997). PMID: 9140404
+- Roos MTL, Lange JMA, deGoede REY, Miedema PT, Tersmette F, Coutinho M, Schellekens RA. J Infect Dis 165:427–432 (1992). PMID: 1347054
+- Garred P, Eugen-Olsen J, Iversen AKN, Benfield TL, Svejgaard A, Hofmann B. Lancet 349:1884 (1997). PMID: 9217763
+- Katzenstein TL, Eugen-Olsen J, Hofman B, Benfield T, Pedersen C, Iversen AK, Sorensen AM, Garred P, Koppelhus U, Svejgaard A, Gerstoft J. J Acquired Immune Defic Syndr Hum Retrovirol 16:10–14 (1997). PMID: 9377119
+- deRoda H, Meyer K, Katzenstain W, Dean M. Science 273:1856–1862 (1996). PMID: 8791590
+- Meyer L, Magierowska M, Hubert JB, Rouzioux C, Deveau C, Sanson F, Debre P, Delfraissy JF, Theodorou I. AIDS 11:F73–F78 (1997). PMID: 9302436
+- Smith MW, Dean M, Carrington M, Winkler C, Huttley DA, Lomb GA, Goedert JJ, O'Brien TR, Jacobson LP, Kaslow R, et al. Science 277:959–965 (1997). PMID: 9252328
+- Samson M, Libert F, Doranz BJ, Rucker J, Liesnard C, Farber CM, Saragosti S, Lapoumeroulie C, Cognaux J, Forceille C, et al. Nature (London) 382:722–725 (1996). PMID: 8751444
+- McNicholl JM, Smith DK, Qari SH, Hodge T. Emerging Infect Dis 3:261–271 (1997). PMID: 9284370
+- Michael NL, Chang G, Louie LG, Mascola JR, Dondero D, Birx DL, Sheppard HW. Nat Med 3:338–340 (1997). PMID: 9055864
+- Mayaud P, Mosha F, Todd J, Balira R, Mgara J, West B, Rusizoka M, Mwijarubi E, Gabone R, Gavyole A, et al. AIDS 11:1873–1880 (1997). PMID: 9412707
+- Hoffman IF, Jere CS, Taylor TE, Munthali P, Dyer JR. AIDS 13:487–494 (1998).
+- HIV/AIDS Surveillance Database. Washington, DC: Population Division, International Programs Center (1999).
+- Anderson RM, May RM, McLean AR. Nature (London) 332:228–234 (1988). PMID: 3279320
+- Berger EA, Doms RW, Fenyo EM, Korber BT, Littman DR, Moore JP, Sattentau QJ, Schuitemaker H, Sodroski J, Weiss RA. Nature (London) 391:240 (1998). PMID: 9440686
+- Alkhatib G, Broder CC, Berger EA. J Virol 70:5487–5494 (1996). PMID: 8764060
+- Choe H, Farzan M, Sun Y, Sullivan N, Rollins B, Ponath PD, Wu L, Mackay CR, LaRosa G, Newman W, et al. Cell 85:1135–1148 (1996). PMID: 8674119
+- Deng H, Liu R, Ellmeier W, Choe S, Unutmaz D, Burkhart M, Di Marzio P, Marmon S, Sutton RE, Hill CM, et al. Nature (London) 381:661–666 (1996). PMID: 8649511
+- Doranz BJ, Rucker J, Yi Y, Smyth RJ, Samsom M, Peiper M, Parmentier SC, Collman RG, Doms RW. Cell 85:1149–1158 (1996). PMID: 8674120
+- Dragic T, Litwin V, Allaway GP, Martin SR, Huang Y, Nagashima KA, Cayanan C, Maddon PJ, Koup RA, Moore JP, Paxton WA. Nature (London) 381:667–673 (1996). PMID: 8649512
+- Zhu T, Mo H, Wang N, Nam DS, Cao Y, Koup RA, Ho DD. Science 261:1179–1181 (1993). PMID: 8356453
+- Bjorndal A, Deng H, Jansson M, Fiore JR, Colognesi C, Karlsson A, Albert J, Scarlatti G, Littman DR, Fenyo EM. J Virol 71:7478–7487 (1997). PMID: 9311827
+- Conner RI, Sheridan KE, Ceradinin D, Choe S, Landau NR. J Exp Med 185:621–628 (1997). PMID: 9034141
+- Liu R, Paxton WA, Choe S, Ceradini D, Martin SR, Horuk R, MacDonald ME, Stuhlmann H, Koup RA, Landau NR. Cell 86:367–377 (1996). PMID: 8756719
+- Mussico M, Lazzarin A, Nicolosi A, Gasparini M, Costigliola P, Arici C, Saracco A. Arch Intern Med (Moscow) 154:1971–1976 (1994). PMID: 8074601
+- Michael NL, Nelson JA, KewalRamani VN, Chang G, O'Brien SJ, Mascola JR, Volsky B, Louder M, White GC, Littman DR, et al. J Virol 72:6040–6047 (1998). PMID: 9621067
+- Hethcote HW, Yorke JA. Gonorrhea Transmission Dynamics and Control. Berlin: Springer (1984).
+- Anderson RM, May RM. Nature (London) 333:514–522 (1988). PMID: 3374601
+- Asiimwe-Okiror G, Opio AA, Musinguzi J, Madraa E, Tembo G, Carael M. AIDS 11:1757–1763 (1997). PMID: 9386811
+- Carael M, Cleland J, Deheneffe JC, Ferry B, Ingham R. AIDS 9:1171–1175 (1995). PMID: 8519454
+- Blower SM, Boe C. J AIDS 6:1347–1352 (1993). PMID: 8254474
+- Kirschner D. J Appl Math 56:143–166 (1996).
+- Le Pont F, Blower S. J AIDS 4:987–999 (1991). PMID: 1890608
+- Kim MY, Lagakos SW. Ann Epidemiol 1:117–128 (1990). PMID: 1669741
+- Anderson RM, May RM. Infectious Disease of Humans: Dynamics and Control. Oxford: Oxford Univ. Press (1992).
+- Ragni MV, Faruki H, Kingsley LA. J Acquired Immune Defic Syndr 17:42–45 (1998).
+- Kaplan JE, Khabbaz RF, Murphy EL, Hermansen S, Roberts C, Lal R, Heneine W, Wright D, Matijas L, Thomson R, et al. J Acquired Immune Defic Syndr Hum Retrovirol 12:193–201 (1996). PMID: 8680892
+- Padian NS, Shiboski SC, Glass SO, Vittinghoff E. Am J Epidemiol 146:350–357 (1997).
+- Leynaert B, Downs AM, de Vincenzi I. Am J Epidemiol 148:88–96 (1998).
+- Garnett GP, Anderson RM. J Acquired Immune Defic Syndr 9:500–513 (1995).
+- Stigum H, Magnus P, Harris JR, Samualson SO, Bakketeig LS. Am J Epidemiol 145:636–643 (1997).
+- Ho DD, Neumann AU, Perelson AS, Chen W, Leonard JM, Markowitz M. Nature (London) 373:123–126 (1995). PMID: 7816094
+- World Resources (1998–1999). Oxford: Oxford Univ. Press (1999).
+- Kostrikis LG, Neumann AU, Thomson B, Korber BT, McHardy P, Karanicolas R, Deutsch L, Huang Y, Lew JF, McIntosh K, et al. J Virol 73:10264–10271 (1999). PMID: 10559343
+- Low-Beer D, Stoneburner RL, Mukulu A. Nat Med 3:553–557 (1997). PMID: 9142126
+- Grosskurth H, Mosha F, Todd J, Senkoro K, Newell J, Klokke A, Changalucha J, West B, Mayaud P, Gavyole A. AIDS 9:927–934 (1995). PMID: 7576329
+- Melo J, Beby-Defaux A, Faria C, Guiraud G, Folgosa E, Barreto A, Agius G. J AIDS 23:203–204 (2000). PMID: 10737436
+- Iman RL, Helton JC, Campbell JE. J Quality Technol 13:174–183 (1981).
+- Iman RL, Helton JC, Campbell JE. J Quality Technol 13:232–240 (1981).
+- Blower SM, Dowlatabadi H. Int Stat Rev 62:229–243 (1994).
+- Porco TC, Blower SM. Theor Popul Biol 54:117–132 (1998). PMID: 9733654
+- Blower SM, Porco TC, Darby G. Nat Med 4:673–678 (1998). PMID: 9623975
+- Libert F, Cochaux P, Beckman G, Samson M, Aksenova M, Cao A, Czeizel A, Claustres M, de la Rua C, Ferrari M, et al. Hum Mol Genet 7:399–406 (1998). PMID: 9466996
+- Lalani AS, Masters J, Zeng W, Barrett J, Pannu R, Everett H, Arendt CW, McFadden G. Science 286:1968–1971 (1999). PMID: 10583963
+- Kermack WO, McKendrick AG. Proc R Soc London 261:700–721 (1927).
+- Gupta S, Hill AVS. Proc R Soc London Ser B 260:271–277 (1995).
+- Ruwende C, Khoo SC, Snow RW, Yates SNR, Kwiatkowski D, Gupta S, Warn P, Allsopp CE, Gilbert SC, Peschu N. Nature (London) 376:246–249 (1995). PMID: 7617034
+- McDermott DH, Zimmerman PA, Guignard F, Kleeberger CA, Leitman SF, Murphy PM. Lancet 352:866–870 (1998). PMID: 9742978
+- Kostrikis LG, Huang Y, Moore JP, Wolinsky SM, Zhang L, Guo Y, Deutsch L, Phair J, Neumann AU, Ho DD. Nat Med 4:350–353 (1998). PMID: 9500612
+- Winkler C, Modi W, Smith MW, Nelson GW, Wu X, Carrington M, Dean M, Honjo T, Tashiro K, Yabe D, et al. Science 279:389–393 (1998). PMID: 9430590
+- Martinson JJ, Hong L, Karanicolas R, Moore JP, Kostrikis LG. AIDS 14:483–489 (2000). PMID: 10780710
+- Vernazza PL, Eron JJ, Fiscus SA, Cohen MS. AIDS 13:155–166 (1999). PMID: 10202821
--- a/tests/data/groundtruth/docling_v2/pntd.0008301.xml.itxt
+++ b/tests/data/groundtruth/docling_v2/pntd.0008301.xml.itxt
@ -1,132 +1,135 @@
 item-0 at level 0: unspecified: group _root_
  item-1 at level 1: title: Risk factors associated with fai ... s: Results of a multi-country analysis
-    item-2 at level 2: paragraph: Burgert-Brucker Clara R.; 1: Glo ... shington, DC, United States of America
-    item-3 at level 2: section_header: Abstract
-      item-4 at level 3: text: Achieving elimination of lymphat ... ine prevalence and/or lower elevation.
-    item-5 at level 2: section_header: Introduction
-      item-6 at level 3: text: Lymphatic filariasis (LF), a dis ... 8 countries remain endemic for LF [3].
-      item-7 at level 3: text: The road to elimination as a pub ... t elimination be officially validated.
-      item-8 at level 3: text: Pre-TAS include at least one sen ... me of day that blood can be taken [5].
-      item-9 at level 3: text: When a country fails to meet the ... o ensure rounds of MDA are not missed.
-      item-10 at level 3: text: This study aims to understand wh ... e of limited LF elimination resources.
-    item-11 at level 2: section_header: Methods
-      item-12 at level 3: text: This is a secondary data analysi ... rch; no ethical approval was required.
-      item-13 at level 3: text: Building on previous work, we de ... available global geospatial data sets.
-      item-14 at level 3: section_header: Data sources
-        item-15 at level 4: text: Information on baseline prevalen ...  publicly available sources (Table 1).
-      item-16 at level 3: section_header: Outcome and covariate variables
-        item-17 at level 4: text: The outcome of interest for this ... r than or equal to 1% Mf or 2% Ag [4].
-        item-18 at level 4: text: Potential covariates were derive ... is and the final categorizations used.
-        item-19 at level 4: section_header: Baseline prevalence
-          item-20 at level 5: text: Baseline prevalence can be assum ... (2) using the cut-off of <10% or ≥10%.
-        item-21 at level 4: section_header: Agent
-          item-22 at level 5: text: In terms of differences in trans ... dazole (DEC-ALB)] from the MDA domain.
-        item-23 at level 4: section_header: Environment
-          item-24 at level 5: text: LF transmission intensity is inf ... dicates a higher level of “greenness.”
-          item-25 at level 5: text: We included the socio-economic v ...  proxy for socio-economic status [33].
-          item-26 at level 5: text: Finally, all or parts of distric ... s were co-endemic with onchocerciasis.
-        item-27 at level 4: section_header: MDA
-          item-28 at level 5: text: Treatment effectiveness depends  ... esent a threat to elimination [41,42].
-          item-29 at level 5: text: We considered three approaches w ... unds ever documented in that district.
-        item-30 at level 4: section_header: Pre-TAS implementation
-          item-31 at level 5: text: Pre-TAS results can be influence ... d throughout the time period of study.
-      item-32 at level 3: section_header: Data inclusion criteria
-        item-33 at level 4: text: The dataset, summarized at the d ... al analysis dataset had 554 districts.
-      item-34 at level 3: section_header: Statistical analysis and modeling
-        item-35 at level 4: text: Statistical analysis and modelin ... d the number of variables accordingly.
-        item-36 at level 4: text: Sensitivity analysis was perform ... ot have been truly LF-endemic [43,44].
-    item-37 at level 2: section_header: Results
-      item-38 at level 3: text: The overall pre-TAS pass rate fo ... ts had baseline prevalences below 20%.
-      item-39 at level 3: text: Fig 3 shows the unadjusted analy ... overage, and sufficient rounds of MDA.
-      item-40 at level 3: text: The final log-binomial model inc ... igh baseline and diagnostic test used.
-      item-41 at level 3: text: Fig 4 shows the risk ratio resul ... of failing pre-TAS (95% CI 1.95–4.83).
-      item-42 at level 3: text: Sensitivity analyses were conduc ... gnified by large confidence intervals.
-      item-43 at level 3: text: Overall 74 districts in the data ... or 51% of all the failures (38 of 74).
-    item-44 at level 2: section_header: Discussion
-      item-45 at level 3: text: This paper reports for the first ... ctors associated with TAS failure [7].
-      item-46 at level 3: text: Though diagnostic test used was  ...  FTS was more sensitive than ICT [45].
-      item-47 at level 3: text: Elevation was the only environme ... ich impact vector chances of survival.
-      item-48 at level 3: text: The small number of failures ove ... search has shown the opposite [15,16].
-      item-49 at level 3: text: All other variables included in  ... are not necessary to lower prevalence.
-      item-50 at level 3: text: Limitations to this study includ ...  reducing LF prevalence [41,48,51–53].
-      item-51 at level 3: text: Fourteen districts were excluded ... ta to extreme outliners in a district.
-      item-52 at level 3: text: As this analysis used data acros ... of individuals included in the survey.
-      item-53 at level 3: text: This paper provides evidence fro ... th high baseline and/or low elevation.
-    item-54 at level 2: section_header: Tables
-      item-55 at level 3: table with [18x8]
-        item-55 at level 4: caption: Table 1: Categorization of potential factors influencing pre-TAS results.
-      item-56 at level 3: table with [11x6]
-        item-56 at level 4: caption: Table 2: Adjusted risk ratios for pre-TAS failure from log-binomial model sensitivity analysis.
-    item-57 at level 2: section_header: Figures
-      item-58 at level 3: picture
-        item-58 at level 4: caption: Fig 1: Number of pre-TAS by country.
-      item-59 at level 3: picture
-        item-59 at level 4: caption: Fig 2: District-level baseline prevalence by country.
-      item-60 at level 3: picture
-        item-60 at level 4: caption: Fig 3: Percent pre-TAS failure by each characteristic (unadjusted).
-      item-61 at level 3: picture
-        item-61 at level 4: caption: Fig 4: Adjusted risk ratios for pre-TAS failure with 95% Confidence Interval from log-binomial model.
-      item-62 at level 3: picture
-        item-62 at level 4: caption: Fig 5: Analysis of failures by model combinations.
-    item-63 at level 2: section_header: References
-      item-64 at level 3: list: group list
-        item-65 at level 4: list_item: World Health Organization. Lymph ... rategic plan 2010–2020. Geneva; 2010. 
-        item-66 at level 4: list_item: World Health Organization. Valid ...  public health problem. Geneva; 2017. 
-        item-67 at level 4: list_item: Global programme to eliminate ly ... eport, 2018. Wkly Epidemiol Rec (2019)
-        item-68 at level 4: list_item: World Health Organization. Globa ... ss drug administration. Geneva; 2011. 
-        item-69 at level 4: list_item: World Health Organization. Stren ... isease-specific Indicators. 2016; 42. 
-        item-70 at level 4: list_item: Kyelem D; Biswas G; Bockarie MJ; ... search needs. Am J Trop Med Hyg (2008)
-        item-71 at level 4: list_item: Goldberg EM; King JD; Mupfasoni  ... c filariasis. Am J Trop Med Hyg (2019)
-        item-72 at level 4: list_item: Cano J; Rebollo MP; Golding N; P ...  present. Parasites and Vectors (2014)
-        item-73 at level 4: list_item: CGIAR-CSI. CGIAR-CSI SRTM 90m DEM Digital Elevation Database. In: . 
-        item-74 at level 4: list_item: USGS NASA. Vegetation indices 16 ... et]. [cited 1 May 2018]. Available: . 
-        item-75 at level 4: list_item: Funk C; Peterson P; Landsfeld M; ... r monitoring extremes. Sci Data (2015)
-        item-76 at level 4: list_item: Lloyd CT; Sorichetta A; Tatem AJ ... in population studies. Sci Data (2017)
-        item-77 at level 4: list_item: Elvidge CD; Baugh KE; Zhizhin M; ... hts. Proc Asia-Pacific Adv Netw (2013)
-        item-78 at level 4: list_item: Jambulingam P; Subramanian S; De ... dicators. Parasites and Vectors (2016)
-        item-79 at level 4: list_item: Michael E; Malecela-Lazaro MN; S ... c filariasis. Lancet Infect Dis (2004)
-        item-80 at level 4: list_item: Stolk WA; Swaminathan S; van Oor ...  simulation study. J Infect Dis (2003)
-        item-81 at level 4: list_item: Grady CA; De Rochars MB; Direny  ... asis programs. Emerg Infect Dis (2007)
-        item-82 at level 4: list_item: Evans D; McFarland D; Adamani W; ... Nigeria. Ann Trop Med Parasitol (2011)
-        item-83 at level 4: list_item: Richards FO; Eigege A; Miri ES;  ...  in Nigeria. PLoS Negl Trop Dis (2011)
-        item-84 at level 4: list_item: Biritwum NK; Yikpotey P; Marfo B ... Ghana. Trans R Soc Trop Med Hyg (2016)
-        item-85 at level 4: list_item: Moraga P; Cano J; Baggaley RF; G ... odelling. Parasites and Vectors (2015)
-        item-86 at level 4: list_item: Irvine MA; Njenga SM; Gunawarden ... ction. Trans R Soc Trop Med Hyg (2016)
-        item-87 at level 4: list_item: Ottesen EA. Efficacy of diethylc ... ariae in humans. Rev Infect Dis (1985)
-        item-88 at level 4: list_item: Gambhir M; Bockarie M; Tisch D;  ...  lymphatic filariasis. BMC Biol (2010)
-        item-89 at level 4: list_item: World Health Organization. Globa ... al entomology handbook. Geneva; 2013. 
-        item-90 at level 4: list_item: Slater H; Michael E. Predicting  ... gical niche modelling. PLoS One (2012)
-        item-91 at level 4: list_item: Slater H; Michael E. Mapping, Ba ...  prevalence in Africa. PLoS One (2013)
-        item-92 at level 4: list_item: Sabesan S; Raju KHK; Subramanian ... odel. Vector-Borne Zoonotic Dis (2013)
-        item-93 at level 4: list_item: Stanton MC; Molyneux DH; Kyelem  ... in Burkina Faso. Geospat Health (2013)
-        item-94 at level 4: list_item: Manhenje I; Teresa Galán-Puchade ... hern Mozambique. Geospat Health (2013)
-        item-95 at level 4: list_item: Ngwira BM; Tambala P; Perez a M; ...  infection in Malawi. Filaria J (2007)
-        item-96 at level 4: list_item: Simonsen PE; Mwakitalu ME. Urban ... hatic filariasis. Parasitol Res (2013)
-        item-97 at level 4: list_item: Proville J; Zavala-Araiza D; Wag ... socio-economic trends. PLoS One (2017)
-        item-98 at level 4: list_item: Endeshaw T; Taye A; Tadesse Z; K ... st Ethiopia. Pathog Glob Health (2015)
-        item-99 at level 4: list_item: Richards FO; Eigege A; Pam D; Ka ... eas of co-endemicity. Filaria J (2005)
-        item-100 at level 4: list_item: Kyelem D; Sanou S; Boatin B a; M ... cations. Ann Trop Med Parasitol (2003)
-        item-101 at level 4: list_item: Weil GJ; Lammie PJ; Richards FO; ... ne and ivermectin. J Infect Dis (1991)
-        item-102 at level 4: list_item: Kumar A; Sachan P. Measuring imp ... rug administration. Trop Biomed (2014)
-        item-103 at level 4: list_item: Njenga SM; Mwandawiro CS; Wamae  ...  control. Parasites and Vectors (2011)
-        item-104 at level 4: list_item: Boyd A; Won KY; McClintock SK; D ... gane, Haiti. PLoS Negl Trop Dis (2010)
-        item-105 at level 4: list_item: Irvine MA; Reimer LJ; Njenga SM; ... mination. Parasites and Vectors (2015)
-        item-106 at level 4: list_item: Irvine MA; Stolk WA; Smith ME; S ... elling study. Lancet Infect Dis (2017)
-        item-107 at level 4: list_item: Pion SD; Montavon C; Chesnais CB ... crofilaremia. Am J Trop Med Hyg (2016)
-        item-108 at level 4: list_item: Wanji S; Esum ME; Njouendou AJ;  ... in Cameroon. PLoS Negl Trop Dis (2018)
-        item-109 at level 4: list_item: Chesnais CB; Awaca-Uvon NP; Bola ... a in Africa. PLoS Negl Trop Dis (2017)
-        item-110 at level 4: list_item: Silumbwe A; Zulu JM; Halwindi H; ... haran Africa. BMC Public Health (2017)
-        item-111 at level 4: list_item: Adams AM; Vuckovic M; Birch E; B ... nistration. Trop Med Infect Dis (2018)
-        item-112 at level 4: list_item: Rao RU; Samarasekera SD; Nagodav ... n Sri Lanka. PLoS Negl Trop Dis (2017)
-        item-113 at level 4: list_item: Xu Z; Graves PM; Lau CL; Clement ... is in American Samoa. Epidemics (2018)
-        item-114 at level 4: list_item: Id CM; Tettevi EJ; Mechan F; Idu ... rural Ghana. PLoS Negl Trop Dis (2019)
-        item-115 at level 4: list_item: Eigege A; Kal A; Miri E; Sallau  ...  in Nigeria. PLoS Negl Trop Dis (2013)
-        item-116 at level 4: list_item: Van den Berg H; Kelly-Hope LA; L ... r management. Lancet Infect Dis (2013)
-        item-117 at level 4: list_item: Webber R.. Eradication of Wucher ... ntrol. Trans R Soc Trop Med Hyg (1979)
-  item-118 at level 1: caption: Table 1: Categorization of potential factors influencing pre-TAS results.
-  item-119 at level 1: caption: Table 2: Adjusted risk ratios fo ... g-binomial model sensitivity analysis.
-  item-120 at level 1: caption: Fig 1: Number of pre-TAS by country.
-  item-121 at level 1: caption: Fig 2: District-level baseline prevalence by country.
-  item-122 at level 1: caption: Fig 3: Percent pre-TAS failure by each characteristic (unadjusted).
-  item-123 at level 1: caption: Fig 4: Adjusted risk ratios for  ... ence Interval from log-binomial model.
-  item-124 at level 1: caption: Fig 5: Analysis of failures by model combinations.
+    item-2 at level 2: paragraph: Clara R. Burgert-Brucker, Kathry ... garet Baker, John Kraemer, Molly Brady
+    item-3 at level 2: paragraph: Global Health Division, RTI Inte ... shington, DC, United States of America
+    item-4 at level 2: section_header: Abstract
+      item-5 at level 3: text: Achieving elimination of lymphat ... as at highest risk of failing pre-TAS.
+    item-6 at level 2: section_header: Author summary
+      item-7 at level 3: text: Achieving elimination of lymphat ... ine prevalence and/or lower elevation.
+    item-8 at level 2: section_header: Introduction
+      item-9 at level 3: text: Lymphatic filariasis (LF), a dis ... 8 countries remain endemic for LF [3].
+      item-10 at level 3: text: The road to elimination as a pub ... t elimination be officially validated.
+      item-11 at level 3: text: Pre-TAS include at least one sen ... me of day that blood can be taken [5].
+      item-12 at level 3: text: When a country fails to meet the ... o ensure rounds of MDA are not missed.
+      item-13 at level 3: text: This study aims to understand wh ... e of limited LF elimination resources.
+    item-14 at level 2: section_header: Methods
+      item-15 at level 3: text: This is a secondary data analysi ... rch; no ethical approval was required.
+      item-16 at level 3: text: Building on previous work, we de ... available global geospatial data sets.
+      item-17 at level 3: table with [18x8]
+        item-17 at level 4: caption: Table 1 Categorization of potential factors influencing pre-TAS results.
+      item-18 at level 3: section_header: Data sources
+        item-19 at level 4: text: Information on baseline prevalen ...  publicly available sources (Table 1).
+      item-20 at level 3: section_header: Outcome and covariate variables
+        item-21 at level 4: text: The outcome of interest for this ... r than or equal to 1% Mf or 2% Ag [4].
+        item-22 at level 4: text: Potential covariates were derive ... is and the final categorizations used.
+        item-23 at level 4: section_header: Baseline prevalence
+          item-24 at level 5: text: Baseline prevalence can be assum ... (2) using the cut-off of <10% or ≥10%.
+        item-25 at level 4: section_header: Agent
+          item-26 at level 5: text: In terms of differences in trans ... dazole (DEC-ALB)] from the MDA domain.
+        item-27 at level 4: section_header: Environment
+          item-28 at level 5: text: LF transmission intensity is inf ... dicates a higher level of “greenness.”
+          item-29 at level 5: text: We included the socio-economic v ...  proxy for socio-economic status [33].
+          item-30 at level 5: text: Finally, all or parts of distric ... s were co-endemic with onchocerciasis.
+        item-31 at level 4: section_header: MDA
+          item-32 at level 5: text: Treatment effectiveness depends  ... esent a threat to elimination [41,42].
+          item-33 at level 5: text: We considered three approaches w ... unds ever documented in that district.
+        item-34 at level 4: section_header: Pre-TAS implementation
+          item-35 at level 5: text: Pre-TAS results can be influence ... d throughout the time period of study.
+      item-36 at level 3: section_header: Data inclusion criteria
+        item-37 at level 4: text: The dataset, summarized at the d ... al analysis dataset had 554 districts.
+      item-38 at level 3: section_header: Statistical analysis and modeling
+        item-39 at level 4: text: Statistical analysis and modelin ... d the number of variables accordingly.
+        item-40 at level 4: text: Sensitivity analysis was perform ... ot have been truly LF-endemic [43,44].
+    item-41 at level 2: section_header: Results
+      item-42 at level 3: text: The overall pre-TAS pass rate fo ... ts had baseline prevalences below 20%.
+      item-43 at level 3: picture
+        item-43 at level 4: caption: Fig 1 Number of pre-TAS by country.
+      item-44 at level 3: picture
+        item-44 at level 4: caption: Fig 2 District-level baseline prevalence by country.
+      item-45 at level 3: text: Fig 3 shows the unadjusted analy ... overage, and sufficient rounds of MDA.
+      item-46 at level 3: picture
+        item-46 at level 4: caption: Fig 3 Percent pre-TAS failure by each characteristic (unadjusted).
+      item-47 at level 3: text: The final log-binomial model inc ... igh baseline and diagnostic test used.
+      item-48 at level 3: text: Fig 4 shows the risk ratio resul ... of failing pre-TAS (95% CI 1.95–4.83).
+      item-49 at level 3: picture
+        item-49 at level 4: caption: Fig 4 Adjusted risk ratios for pre-TAS failure with 95% Confidence Interval from log-binomial model.
+      item-50 at level 3: text: Sensitivity analyses were conduc ... gnified by large confidence intervals.
+      item-51 at level 3: table with [11x6]
+        item-51 at level 4: caption: Table 2 Adjusted risk ratios for pre-TAS failure from log-binomial model sensitivity analysis.
+      item-52 at level 3: text: Overall 74 districts in the data ... or 51% of all the failures (38 of 74).
+      item-53 at level 3: picture
+        item-53 at level 4: caption: Fig 5 Analysis of failures by model combinations.
+    item-54 at level 2: section_header: Discussion
+      item-55 at level 3: text: This paper reports for the first ... ctors associated with TAS failure [7].
+      item-56 at level 3: text: Though diagnostic test used was  ...  FTS was more sensitive than ICT [45].
+      item-57 at level 3: text: Elevation was the only environme ... ich impact vector chances of survival.
+      item-58 at level 3: text: The small number of failures ove ... search has shown the opposite [15,16].
+      item-59 at level 3: text: All other variables included in  ... are not necessary to lower prevalence.
+      item-60 at level 3: text: Limitations to this study includ ...  reducing LF prevalence [41,48,51–53].
+      item-61 at level 3: text: Fourteen districts were excluded ... ta to extreme outliners in a district.
+      item-62 at level 3: text: As this analysis used data acros ... of individuals included in the survey.
+      item-63 at level 3: text: This paper provides evidence fro ... th high baseline and/or low elevation.
+    item-64 at level 2: section_header: Acknowledgments
+      item-65 at level 3: text: The authors would like to thank  ... e surveys financially and technically.
+    item-66 at level 2: section_header: References
+      item-67 at level 3: list: group list
+        item-68 at level 4: list_item: World Health Organization. Lymph ... trategic plan 2010–2020. Geneva; 2010.
+        item-69 at level 4: list_item: World Health Organization. Valid ... a public health problem. Geneva; 2017.
+        item-70 at level 4: list_item: World Health Organization. Globa ...  Wkly Epidemiol Rec. 2019;94: 457–472.
+        item-71 at level 4: list_item: World Health Organization. Globa ... ass drug administration. Geneva; 2011.
+        item-72 at level 4: list_item: World Health Organization. Stren ... Disease-specific Indicators. 2016; 42.
+        item-73 at level 4: list_item: KyelemD, BiswasG, BockarieMJ, Br ... Trop Med Hyg. 2008;79: 480–4. 18840733
+        item-74 at level 4: list_item: GoldbergEM, KingJD, MupfasoniD,  ... . 2019; 10.4269/ajtmh.18-0721 31115301
+        item-75 at level 4: list_item: CanoJ, RebolloMP, GoldingN, Pull ... : 1–19. 10.1186/1756-3305-7-1 24411014
+        item-76 at level 4: list_item: CGIAR-CSI. CGIAR-CSI SRTM 90m DE ...  Available: http://srtm.csi.cgiar.org/
+        item-77 at level 4: list_item: USGS NASA. Vegetation indices 16 ... /lpdaac.usgs.gov/products/myd13a1v006/
+        item-78 at level 4: list_item: FunkC, PetersonP, LandsfeldM, Pe ...  2015;2 10.1038/sdata.2015.66 26646728
+        item-79 at level 4: list_item: LloydCT, SorichettaA, TatemAJ. H ... : 170001 10.1038/sdata.2017.1 28140386
+        item-80 at level 4: list_item: ElvidgeCD, BaughKE, ZhizhinM, Hs ... Network; 2013;35: 62 10.7125/apan.35.7
+        item-81 at level 4: list_item: JambulingamP, SubramanianS, De V ... 18. 10.1186/s13071-015-1291-6 26728523
+        item-82 at level 4: list_item: MichaelE, Malecela-LazaroMN, Sim ... 10.1016/S1473-3099(04)00973-9 15050941
+        item-83 at level 4: list_item: StolkWA, SwaminathanS, van Oortm ... ;188: 1371–81. 10.1086/378354 14593597
+        item-84 at level 4: list_item: GradyCA, De RocharsMB, DirenyAN, ... 8–610. 10.3201/eid1304.061063 17553278
+        item-85 at level 4: list_item: EvansD, McFarlandD, AdamaniW, Ei ... 0.1179/2047773211Y.0000000010 22325813
+        item-86 at level 4: list_item: RichardsFO, EigegeA, MiriES, Kal ...  10.1371/journal.pntd.0001346 22022627
+        item-87 at level 4: list_item: BiritwumNK, YikpoteyP, MarfoBK,  ... 90–695. 10.1093/trstmh/trx007 28938053
+        item-88 at level 4: list_item: MoragaP, CanoJ, BaggaleyRF, Gyap ... 16. 10.1186/s13071-014-0608-1 25561160
+        item-89 at level 4: list_item: IrvineMA, NjengaSM, GunawardenaS ... 18–124. 10.1093/trstmh/trv096 26822604
+        item-90 at level 4: list_item: OttesenEA. Efficacy of diethylca ... iae in humans. Rev Infect Dis. 1985;7.
+        item-91 at level 4: list_item: GambhirM, BockarieM, TischD, Kaz ... 2010;8 10.1186/1741-7007-8-22 20236528
+        item-92 at level 4: list_item: World Health Organization. Globa ... cal entomology handbook. Geneva; 2013.
+        item-93 at level 4: list_item: SlaterH, MichaelE. Predicting th ...  10.1371/journal.pone.0032202 22359670
+        item-94 at level 4: list_item: SlaterH, MichaelE. Mapping, Baye ...  10.1371/journal.pone.0071574 23951194
+        item-95 at level 4: list_item: SabesanS, RajuKHK, SubramanianS, ... 57–665. 10.1089/vbz.2012.1238 23808973
+        item-96 at level 4: list_item: StantonMC, MolyneuxDH, KyelemD,  ... : 159–173. 10.4081/gh.2013.63 24258892
+        item-97 at level 4: list_item: ManhenjeI, Teresa Galán-Puchades ... : 391–398. 10.4081/gh.2013.96 23733300
+        item-98 at level 4: list_item: NgwiraBM, TambalaP, Perez aM, Bo ... ;6: 12 10.1186/1475-2883-6-12 18047646
+        item-99 at level 4: list_item: SimonsenPE, MwakitaluME. Urban l ... 44. 10.1007/s00436-012-3226-x 23239094
+        item-100 at level 4: list_item: ProvilleJ, Zavala-AraizaD, Wagne ...  10.1371/journal.pone.0174610 28346500
+        item-101 at level 4: list_item: EndeshawT, TayeA, TadesseZ, Kata ... 10.1080/20477724.2015.1103501 26878935
+        item-102 at level 4: list_item: RichardsFO, EigegeA, PamD, KalA, ... 4: 3–5. 10.1186/1475-2883-4-3 15916708
+        item-103 at level 4: list_item: KyelemD, SanouS, BoatinB a., Med ... 8. 10.1179/000349803225002462 14754495
+        item-104 at level 4: list_item: WeilGJ, LammiePJ, RichardsFO, Eb ... –816. 10.1093/infdis/164.4.814 1894943
+        item-105 at level 4: list_item: KumarA, SachanP. Measuring impac ... rop Biomed. 2014;31: 225–229. 25134891
+        item-106 at level 4: list_item: NjengaSM, MwandawiroCS, WamaeCN, ... 4: 1–9. 10.1186/1756-3305-4-1 21205315
+        item-107 at level 4: list_item: BoydA, WonKY, McClintockSK, Dono ...  10.1371/journal.pntd.0000640 20351776
+        item-108 at level 4: list_item: IrvineMA, ReimerLJ, NjengaSM, Gu ... 19. 10.1186/s13071-014-0608-1 25561160
+        item-109 at level 4: list_item: IrvineMA, StolkWA, SmithME, Subr ... 10.1016/S1473-3099(16)30467-4 28012943
+        item-110 at level 4: list_item: PionSD, MontavonC, ChesnaisCB, K ... 7–1423. 10.4269/ajtmh.16-0547 27729568
+        item-111 at level 4: list_item: WanjiS, EsumME, NjouendouAJ, Mbe ...  10.1371/journal.pntd.0007192 30849120
+        item-112 at level 4: list_item: ChesnaisCB, Awaca-UvonNP, BolayF ...  10.1371/journal.pntd.0005703 28892473
+        item-113 at level 4: list_item: SilumbweA, ZuluJM, HalwindiH, Ja ... 15. 10.1186/s12889-017-4414-5 28532397
+        item-114 at level 4: list_item: AdamsAM, VuckovicM, BirchE, Bran ... ;3 10.3390/tropicalmed3040122 30469342
+        item-115 at level 4: list_item: RaoRU, SamarasekeraSD, Nagodavit ...  10.1371/journal.pntd.0006066 29084213
+        item-116 at level 4: list_item: XuZ, GravesPM, LauCL, ClementsA, ...  10.1016/j.epidem.2018.12.003 30611745
+        item-117 at level 4: list_item: IdCM, TetteviEJ, MechanF, IdunB, ... Ghana. PLoS Negl Trop Dis. 2019; 1–17.
+        item-118 at level 4: list_item: EigegeA, KalA, MiriE, SallauA, U ...  10.1371/journal.pntd.0002508 24205421
+        item-119 at level 4: list_item: Van den BergH, Kelly-HopeLA, Lin ... 10.1016/S1473-3099(12)70148-2 23084831
+        item-120 at level 4: list_item: WebberR. Eradication of Wucherer ... ol. Trans R Soc Trop Med Hyg. 1979;73.
+  item-121 at level 1: caption: Table 1 Categorization of potential factors influencing pre-TAS results.
+  item-122 at level 1: caption: Fig 1 Number of pre-TAS by country.
+  item-123 at level 1: caption: Fig 2 District-level baseline prevalence by country.
+  item-124 at level 1: caption: Fig 3 Percent pre-TAS failure by each characteristic (unadjusted).
+  item-125 at level 1: caption: Fig 4 Adjusted risk ratios for p ... ence Interval from log-binomial model.
+  item-126 at level 1: caption: Table 2 Adjusted risk ratios for ... g-binomial model sensitivity analysis.
+  item-127 at level 1: caption: Fig 5 Analysis of failures by model combinations.
--- a/tests/data/groundtruth/docling_v2/pntd.0008301.xml.json
+++ b/tests/data/groundtruth/docling_v2/pntd.0008301.xml.json
--- a/tests/data/groundtruth/docling_v2/pntd.0008301.xml.md
+++ b/tests/data/groundtruth/docling_v2/pntd.0008301.xml.md
@ -1,10 +1,16 @@
 # Risk factors associated with failing pre-transmission assessment surveys (pre-TAS) in lymphatic filariasis elimination programs: Results of a multi-country analysis

-Burgert-Brucker Clara R.; 1: Global Health Division, RTI International, Washington, DC, United States of America; Zoerhoff Kathryn L.; 1: Global Health Division, RTI International, Washington, DC, United States of America; Headland Maureen; 1: Global Health Division, RTI International, Washington, DC, United States of America, 2: Global Health, Population, and Nutrition, FHI 360, Washington, DC, United States of America; Shoemaker Erica A.; 1: Global Health Division, RTI International, Washington, DC, United States of America; Stelmach Rachel; 1: Global Health Division, RTI International, Washington, DC, United States of America; Karim Mohammad Jahirul; 3: Department of Disease Control, Ministry of Health and Family Welfare, Dhaka, Bangladesh; Batcho Wilfrid; 4: National Control Program of Communicable Diseases, Ministry of Health, Cotonou, Benin; Bougouma Clarisse; 5: Lymphatic Filariasis Elimination Program, Ministère de la Santé, Ouagadougou, Burkina Faso; Bougma Roland; 5: Lymphatic Filariasis Elimination Program, Ministère de la Santé, Ouagadougou, Burkina Faso; Benjamin Didier Biholong; 6: National Onchocerciasis and Lymphatic Filariasis Control Program, Ministry of Health, Yaounde, Cameroon; Georges Nko'Ayissi; 6: National Onchocerciasis and Lymphatic Filariasis Control Program, Ministry of Health, Yaounde, Cameroon; Marfo Benjamin; 7: Neglected Tropical Diseases Programme, Ghana Health Service, Accra, Ghana; Lemoine Jean Frantz; 8: Ministry of Health, Port-au-Prince, Haiti; Pangaribuan Helena Ullyartha; 9: National Institute Health Research &amp; Development, Ministry of Health, Jakarta, Indonesia; Wijayanti Eksi; 9: National Institute Health Research &amp; Development, Ministry of Health, Jakarta, Indonesia; Coulibaly Yaya Ibrahim; 10: Filariasis Unit, International Center of Excellence in Research, Faculty of Medicine and Odontostomatology, Bamako, Mali; Doumbia Salif Seriba; 10: Filariasis Unit, International Center of Excellence in Research, Faculty of 
Medicine and Odontostomatology, Bamako, Mali; Rimal Pradip; 11: Epidemiology and Disease Control Division, Department of Health Service, Kathmandu, Nepal; Salissou Adamou Bacthiri; 12: Programme Onchocercose et Filariose Lymphatique, Ministère de la Santé, Niamey, Niger; Bah Yukaba; 13: National Neglected Tropical Disease Program, Ministry of Health and Sanitation, Freetown, Sierra Leone; Mwingira Upendo; 14: Neglected Tropical Disease Control Programme, National Institute for Medical Research, Dar es Salaam, Tanzania; Nshala Andreas; 15: IMA World Health/Tanzania NTD Control Programme, Uppsala University, &amp; TIBA Fellow, Dar es Salaam, Tanzania; Muheki Edridah; 16: Programme to Eliminate Lymphatic Filariasis, Ministry of Health, Kampala, Uganda; Shott Joseph; 17: Division of Neglected Tropical Diseases, Office of Infectious Diseases, Bureau for Global Health, USAID, Washington, DC, United States of America; Yevstigneyeva Violetta; 17: Division of Neglected Tropical Diseases, Office of Infectious Diseases, Bureau for Global Health, USAID, Washington, DC, United States of America; Ndayishimye Egide; 2: Global Health, Population, and Nutrition, FHI 360, Washington, DC, United States of America; Baker Margaret; 1: Global Health Division, RTI International, Washington, DC, United States of America; Kraemer John; 1: Global Health Division, RTI International, Washington, DC, United States of America, 18: Georgetown University, Washington, DC, United States of America; Brady Molly; 1: Global Health Division, RTI International, Washington, DC, United States of America
+Clara R. Burgert-Brucker, Kathryn L. Zoerhoff, Maureen Headland, Erica A. Shoemaker, Rachel Stelmach, Mohammad Jahirul Karim, Wilfrid Batcho, Clarisse Bougouma, Roland Bougma, Biholong Benjamin Didier, Nko'Ayissi Georges, Benjamin Marfo, Jean Frantz Lemoine, Helena Ullyartha Pangaribuan, Eksi Wijayanti, Yaya Ibrahim Coulibaly, Salif Seriba Doumbia, Pradip Rimal, Adamou Bacthiri Salissou, Yukaba Bah, Upendo Mwingira, Andreas Nshala, Edridah Muheki, Joseph Shott, Violetta Yevstigneyeva, Egide Ndayishimye, Margaret Baker, John Kraemer, Molly Brady
+
+Global Health Division, RTI International, Washington, DC, United States of America; Global Health, Population, and Nutrition, FHI 360, Washington, DC, United States of America; Department of Disease Control, Ministry of Health and Family Welfare, Dhaka, Bangladesh; National Control Program of Communicable Diseases, Ministry of Health, Cotonou, Benin; Lymphatic Filariasis Elimination Program, Ministère de la Santé, Ouagadougou, Burkina Faso; National Onchocerciasis and Lymphatic Filariasis Control Program, Ministry of Health, Yaounde, Cameroon; Neglected Tropical Diseases Programme, Ghana Health Service, Accra, Ghana; Ministry of Health, Port-au-Prince, Haiti; National Institute Health Research &amp; Development, Ministry of Health, Jakarta, Indonesia; Filariasis Unit, International Center of Excellence in Research, Faculty of Medicine and Odontostomatology, Bamako, Mali; Epidemiology and Disease Control Division, Department of Health Service, Kathmandu, Nepal; Programme Onchocercose et Filariose Lymphatique, Ministère de la Santé, Niamey, Niger; National Neglected Tropical Disease Program, Ministry of Health and Sanitation, Freetown, Sierra Leone; Neglected Tropical Disease Control Programme, National Institute for Medical Research, Dar es Salaam, Tanzania; IMA World Health/Tanzania NTD Control Programme, Uppsala University, &amp; TIBA Fellow, Dar es Salaam, Tanzania; Programme to Eliminate Lymphatic Filariasis, Ministry of Health, Kampala, Uganda; Division of Neglected Tropical Diseases, Office of Infectious Diseases, Bureau for Global Health, USAID, Washington, DC, United States of America; Georgetown University, Washington, DC, United States of America

 ## Abstract

-Achieving elimination of lymphatic filariasis (LF) as a public health problem requires a minimum of five effective rounds of mass drug administration (MDA) and demonstrating low prevalence in subsequent assessments. The first assessments recommended by the World Health Organization (WHO) are sentinel and spot-check sites—referred to as pre-transmission assessment surveys (pre-TAS)—in each implementation unit after MDA. If pre-TAS shows that prevalence in each site has been lowered to less than 1% microfilaremia or less than 2% antigenemia, the implementation unit conducts a TAS to determine whether MDA can be stopped. Failure to pass pre-TAS means that further rounds of MDA are required. This study aims to understand factors influencing pre-TAS results using existing programmatic data from 554 implementation units, of which 74 (13%) failed, in 13 countries. Secondary data analysis was completed using existing data from Bangladesh, Benin, Burkina Faso, Cameroon, Ghana, Haiti, Indonesia, Mali, Nepal, Niger, Sierra Leone, Tanzania, and Uganda. Additional covariate data were obtained from spatial raster data sets. Bivariate analysis and multilinear regression were performed to establish potential relationships between variables and the pre-TAS result. Higher baseline prevalence and lower elevation were significant in the regression model. Variables statistically significantly associated with failure (p-value ≤0.05) in the bivariate analyses included baseline prevalence at or above 5% or 10%, use of Filariasis Test Strips (FTS), primary vector of Culex, treatment with diethylcarbamazine-albendazole, higher elevation, higher population density, higher enhanced vegetation index (EVI), higher annual rainfall, and 6 or more rounds of MDA. This paper reports for the first time factors associated with pre-TAS results from a multi-country analysis. 
This information can help countries more effectively forecast program activities, such as the potential need for more rounds of MDA, and prioritize resources to ensure adequate coverage of all persons in areas at highest risk of failing pre-TAS.Author summaryAchieving elimination of lymphatic filariasis (LF) as a public health problem requires a minimum of five rounds of mass drug administration (MDA) and being able to demonstrate low prevalence in several subsequent assessments. LF elimination programs implement sentinel and spot-check site assessments, called pre-TAS, to determine whether districts are eligible to implement more rigorous population-based surveys to determine whether MDA can be stopped or if further rounds are required. Reasons for failing pre-TAS are not well understood and have not previously been examined with data compiled from multiple countries. For this analysis, we analyzed data from routine USAID and WHO reports from Bangladesh, Benin, Burkina Faso, Cameroon, Ghana, Haiti, Indonesia, Mali, Nepal, Niger, Sierra Leone, Tanzania, and Uganda. In a model that included multiple variables, high baseline prevalence and lower elevation were significant. In models comparing only one variable to the outcome, the following were statistically significantly associated with failure: higher baseline prevalence at or above 5% or 10%, use of the FTS, primary vector of Culex, treatment with diethylcarbamazine-albendazole, lower elevation, higher population density, higher Enhanced Vegetation Index, higher annual rainfall, and six or more rounds of mass drug administration. These results can help national programs plan MDA more effectively, e.g., by focusing resources on areas with higher baseline prevalence and/or lower elevation.
+Achieving elimination of lymphatic filariasis (LF) as a public health problem requires a minimum of five effective rounds of mass drug administration (MDA) and demonstrating low prevalence in subsequent assessments. The first assessments recommended by the World Health Organization (WHO) are sentinel and spot-check sites—referred to as pre-transmission assessment surveys (pre-TAS)—in each implementation unit after MDA. If pre-TAS shows that prevalence in each site has been lowered to less than 1% microfilaremia or less than 2% antigenemia, the implementation unit conducts a TAS to determine whether MDA can be stopped. Failure to pass pre-TAS means that further rounds of MDA are required. This study aims to understand factors influencing pre-TAS results using existing programmatic data from 554 implementation units, of which 74 (13%) failed, in 13 countries. Secondary data analysis was completed using existing data from Bangladesh, Benin, Burkina Faso, Cameroon, Ghana, Haiti, Indonesia, Mali, Nepal, Niger, Sierra Leone, Tanzania, and Uganda. Additional covariate data were obtained from spatial raster data sets. Bivariate analysis and multilinear regression were performed to establish potential relationships between variables and the pre-TAS result. Higher baseline prevalence and lower elevation were significant in the regression model. Variables statistically significantly associated with failure (p-value ≤0.05) in the bivariate analyses included baseline prevalence at or above 5% or 10%, use of Filariasis Test Strips (FTS), primary vector of Culex, treatment with diethylcarbamazine-albendazole, higher elevation, higher population density, higher enhanced vegetation index (EVI), higher annual rainfall, and 6 or more rounds of MDA. This paper reports for the first time factors associated with pre-TAS results from a multi-country analysis. 
This information can help countries more effectively forecast program activities, such as the potential need for more rounds of MDA, and prioritize resources to ensure adequate coverage of all persons in areas at highest risk of failing pre-TAS.
+
+## Author summary
+
+Achieving elimination of lymphatic filariasis (LF) as a public health problem requires a minimum of five rounds of mass drug administration (MDA) and being able to demonstrate low prevalence in several subsequent assessments. LF elimination programs implement sentinel and spot-check site assessments, called pre-TAS, to determine whether districts are eligible to implement more rigorous population-based surveys to determine whether MDA can be stopped or if further rounds are required. Reasons for failing pre-TAS are not well understood and have not previously been examined with data compiled from multiple countries. For this analysis, we analyzed data from routine USAID and WHO reports from Bangladesh, Benin, Burkina Faso, Cameroon, Ghana, Haiti, Indonesia, Mali, Nepal, Niger, Sierra Leone, Tanzania, and Uganda. In a model that included multiple variables, high baseline prevalence and lower elevation were significant. In models comparing only one variable to the outcome, the following were statistically significantly associated with failure: higher baseline prevalence at or above 5% or 10%, use of the FTS, primary vector of Culex, treatment with diethylcarbamazine-albendazole, lower elevation, higher population density, higher Enhanced Vegetation Index, higher annual rainfall, and six or more rounds of mass drug administration. These results can help national programs plan MDA more effectively, e.g., by focusing resources on areas with higher baseline prevalence and/or lower elevation.

 ## Introduction

@ -24,6 +30,28 @@ This is a secondary data analysis using existing data, collected for programmati

 Building on previous work, we delineated five domains of variables that could influence pre-TAS outcomes: prevalence, agent, environment, MDA, and pre-TAS implementation (Table 1) [6–8]. We prioritized key concepts that could be measured through our data or captured through publicly available global geospatial data sets.

+Table 1 Categorization of potential factors influencing pre-TAS results.
+
+| Domain                 | Factor                | Covariate                     | Description                                                     | Reference Group      | Summary statistic   | Temporal Resolution   | Source             |
+|------------------------|-----------------------|-------------------------------|-----------------------------------------------------------------|----------------------|---------------------|-----------------------|--------------------|
+| Prevalence             | Baseline prevalence   | 5% cut off                    | Maximum reported mapping or baseline sentinel site prevalence   | &lt;5%                  | Maximum             | Varies                | Programmatic data  |
+| Prevalence             | Baseline prevalence   | 10% cut off                   | Maximum reported mapping or baseline sentinel site prevalence   | &lt;10%                 | Maximum             | Varies                | Programmatic data  |
+| Agent                  | Parasite              | Parasite                      | Predominate parasite in district                                | W. bancrofti &amp; mixed | Binary value        | 2018                  | Programmatic data  |
+| Environment            | Vector                | Vector                        | Predominate vector in district                                  | Anopheles &amp; Mansonia | Binary value        | 2018                  | Country expert     |
+| Environment            | Geography             | Elevation                     | Elevation measured in meters                                    | &gt;350                 | Mean                | 2000                  | CGIAR-CSI SRTM [9] |
+| Environment            | Geography             | District area                 | Area measured in km2                                            | &gt;2,500               | Maximum sum         | Static                | Programmatic data  |
+| Environment            | Climate               | EVI                           | Enhanced vegetation index                                       | &gt; 0.3                | Mean                | 2015                  | MODIS [10]         |
+| Environment            | Climate               | Rainfall                      | Annual rainfall measured in mm                                  | ≤ 700                | Mean                | 2015                  | CHIRPS [11]        |
+| Environment            | Socio-economic        | Population density            | Number of people per km2                                        | ≤ 100                | Mean                | 2015                  | WorldPop [12]      |
+| Environment            | Socio-economic        | Nighttime lights              | Nighttime light index from 0 to 63                              | &gt;1.5                 | Mean                | 2015                  | VIIRS [13]         |
+| Environment            | Co-endemicity         | Co-endemic for onchocerciasis | Part or all of district is also endemic for onchocerciases      | Non-endemic          | Binary value        | 2018                  | Programmatic data  |
+| MDA                    | Drug efficacy         | Drug package                  | DEC-ALB or IVM-ALB                                              | DEC-ALB              | Binary value        | 2018                  | Programmatic data  |
+| MDA                    | Implementation of MDA | Coverage                      | Median MDA coverage for last 5 rounds                           | ≥ 65%                | Median              | Varies                | Programmatic data  |
+| MDA                    | Implementation of MDA | Sufficient rounds             | Number of rounds of sufficient (≥ 65% coverage) in last 5 years | ≥ 3                  | Count               | Varies                | Programmatic data  |
+| MDA                    | Implementation of MDA | Number of rounds              | Maximum number of recorded rounds of MDA                        | ≥ 6                  | Maximum             | Varies                | Programmatic data  |
+| Pre-TAS implementation | Quality of survey     | Diagnostic method             | Using Mf or Ag                                                  | Mf                   | Binary value        | Varies                | Programmatic data  |
+| Pre-TAS implementation | Quality of survey     | Diagnostic test               | Using Mf, ICT, or FTS                                           | Mf                   | Categorical         | Varies                | Programmatic data  |
+
 ### Data sources

 Information on baseline prevalence, MDA coverage, the number of MDA rounds, and pre-TAS information (month and year of survey, district, site name, and outcome) was gathered through regular reporting for the USAID-funded NTD programs (ENVISION, END in Africa, and END in Asia). These data were augmented by other reporting data such as the country’s dossier data annexes, the WHO Preventive Chemotherapy and Transmission Control Databank, and WHO reporting forms. Data were then reviewed by country experts, including the Ministry of Health program staff and implementing program staff, and updated as necessary. Data on vectors were also obtained from country experts. The district geographic boundaries were matched to geospatial shapefiles from the ENVISION project geospatial data repository, while other geospatial data were obtained through publicly available sources (Table 1).
@ -74,16 +102,51 @@ Sensitivity analysis was performed for the final log-binomial model to test for

 The overall pre-TAS pass rate for the districts included in this analysis was 87% (74 failures in 554 districts). Nearly 40% of the 554 districts were from Cameroon (134) and Tanzania (87) (Fig 1). No districts in Bangladesh, Cameroon, Mali, or Uganda failed a pre-TAS in this data set; over 25% of districts in Burkina Faso, Ghana, Haiti, Nepal, and Sierra Leone failed pre-TAS in this data set. Baseline prevalence varied widely within and between the 13 countries. Fig 2 shows the highest, lowest, and median baseline prevalence in the study districts by country. Burkina Faso had the highest median baseline prevalence at 52% and Burkina Faso, Tanzania, and Ghana all had at least one district with a very high baseline of over 70%. In Mali, Indonesia, Benin, and Bangladesh, all districts had baseline prevalences below 20%.

+Fig 1 Number of pre-TAS by country.
+
+<!-- image -->
+
+Fig 2 District-level baseline prevalence by country.
+
+<!-- image -->
+
 Fig 3 shows the unadjusted analysis for key variables by pre-TAS result. Variables statistically significantly associated with failure (p-value ≤0.05) included higher baseline prevalence at or above 5% or 10%, FTS diagnostic test, primary vector of Culex, treatment with DEC-ALB, higher elevation, higher population density, higher EVI, higher annual rainfall, and six or more rounds of MDA. Variables that were not significantly associated with pre-TAS failure included diagnostic method used (Ag or Mf), parasite, co-endemicity for onchocerciasis, median MDA coverage, and sufficient rounds of MDA.

+Fig 3 Percent pre-TAS failure by each characteristic (unadjusted).
+
+<!-- image -->
+
 The final log-binomial model included the variables of baseline prevalence ≥10%, the diagnostic test used (FTS and ICT), and elevation. The final model also included a significant interaction term between high baseline and diagnostic test used.

 Fig 4 shows the risk ratio results with their corresponding confidence intervals. In a model with interaction between baseline and diagnostic test the baseline parameter was significant while diagnostic test and the interaction term were not. Districts with high baseline had a statistically significant (p-value ≤0.05) 2.52 times higher risk of failure (95% CI 1.37–4.64) compared to those with low baseline prevalence. The FTS diagnostic test or ICT diagnostic test alone were not significant nor was the interaction term. Additionally, districts with an elevation below 350 meters had a statistically significant (p-value ≤0.05) 3.07 times higher risk of failing pre-TAS (95% CI 1.95–4.83).

+Fig 4 Adjusted risk ratios for pre-TAS failure with 95% Confidence Interval from log-binomial model.
+
+<!-- image -->
+
 Sensitivity analyses were conducted using the same model with different subsets of the dataset including (1) all districts except for districts in Cameroon (134 total with no failures), (2) only districts in Africa, (3) only districts with W. bancrofti, and (4) only districts with Anopheles as primary vector. The results of the sensitivity models (Table 2) indicate an overall robust model. High baseline and lower elevation remained significant across all the models. The ICT diagnostic test used remains insignificant across all models. The FTS diagnostic test was positively significant in model 1 and negatively significant in model 4. The interaction term of baseline prevalence and FTS diagnostic test was significant in three models though the estimate was unstable in the W. bancrofti-only and Anopheles-only models (models 3 and 4 respectively), as signified by large confidence intervals.

+Table 2 Adjusted risk ratios for pre-TAS failure from log-binomial model sensitivity analysis.
+
+|                                             |                  | (1)                        | (2)                      | (3)                                  | (4)                             |
+|---------------------------------------------|------------------|----------------------------|--------------------------|--------------------------------------|---------------------------------|
+|                                             | Full Model       | Without Cameroon districts | Only districts in Africa | Only W. bancrofti parasite districts | Only Anopheles vector districts |
+| Number of Failures                          | 74               | 74                         | 44                       | 72                                   | 46                              |
+| Number of total districts                   | (N = 554)        | (N = 420)                  | (N = 407)                | (N = 518)                            | (N = 414)                       |
+| Covariate                                   | RR (95% CI)      | RR (95% CI)                | RR (95% CI)              | RR (95% CI)                          | RR (95% CI)                     |
+| Baseline prevalence &gt; = 10% &amp; used FTS test | 2.38 (0.96–5.90) | 1.23 (0.52–2.92)           | 14.52 (1.79–117.82)      | 2.61 (1.03–6.61)                     | 15.80 (1.95–127.67)             |
+| Baseline prevalence &gt; = 10% &amp; used ICT test | 0.80 (0.20–3.24) | 0.42 (0.11–1.68)           | 1.00 (0.00–0.00)         | 0.88 (0.21–3.60)                     | 1.00 (0.00–0.00)                |
+| +Used FTS test                              | 1.16 (0.52–2.59) | 2.40 (1.12–5.11)           | 0.15 (0.02–1.11)         | 1.03 (0.45–2.36)                     | 0.13 (0.02–0.96)                |
+| +Used ICT test                              | 0.92 (0.32–2.67) | 1.47 (0.51–4.21)           | 0.33 (0.04–2.54)         | 0.82 (0.28–2.43)                     | 0.27 (0.03–2.04)                |
+| +Baseline prevalence &gt; = 10%                | 2.52 (1.37–4.64) | 2.42 (1.31–4.47)           | 2.03 (1.06–3.90)         | 2.30 (1.21–4.36)                     | 2.01 (1.07–3.77)                |
+| Elevation &lt; 350m                            | 3.07 (1.95–4.83) | 2.21 (1.42–3.43)           | 4.68 (2.22–9.87)         | 3.04 (1.93–4.79)                     | 3.76 (1.92–7.37)                |
+
 Overall 74 districts in the dataset failed pre-TAS. Fig 5 summarizes the likelihood of failure by variable combinations identified in the log-binomial model. For those districts with a baseline prevalence ≥10% that used a FTS diagnostic test and have an average elevation below 350 meters (Combination C01), 87% of the 23 districts failed. Of districts with high baseline that used an ICT diagnostic test and have a low average elevation (C02) 45% failed. Overall, combinations with high baseline and low elevation C01, C02, and C04 accounted for 51% of all the failures (38 of 74).

+Fig 5 Analysis of failures by model combinations.
+
+<!-- image -->
+
 ## Discussion

 This paper reports for the first time factors associated with pre-TAS results from a multi-country analysis. Variables significantly associated with failure were higher baseline prevalence and lower elevation. Districts with a baseline prevalence of 10% or more were at 2.52 times higher risk to fail pre-TAS in the final log-binomial model. In the bivariate analysis, baseline prevalence above 5% was also significantly more likely to fail compared to lower baselines, which indicates that the threshold for higher baseline prevalence may be as little as 5%, similar to what was found in Goldberg et al., which explored ecological and socioeconomic factors associated with TAS failure [7].
@ -104,119 +167,62 @@ As this analysis used data across a variety of countries and epidemiological sit

 This paper provides evidence from analysis of 554 districts and 13 countries on the factors associated with pre-TAS results. Baseline prevalence, elevation, vector, population density, EVI, rainfall, and number of MDA rounds were all significant in either bivariate or multivariate analyses. This information along with knowledge of local context can help countries more effectively plan pre-TAS and forecast program activities, such as the potential need for more than five rounds of MDA in areas with high baseline and/or low elevation.

-## Tables
+## Acknowledgments

-Table 1: Categorization of potential factors influencing pre-TAS results.
-
-| Domain                 | Factor                | Covariate                     | Description                                                     | Reference Group      | Summary statistic   | Temporal Resolution   | Source             |
-|------------------------|-----------------------|-------------------------------|-----------------------------------------------------------------|----------------------|---------------------|-----------------------|--------------------|
-| Prevalence             | Baseline prevalence   | 5% cut off                    | Maximum reported mapping or baseline sentinel site prevalence   | &lt;5%                  | Maximum             | Varies                | Programmatic data  |
-| Prevalence             | Baseline prevalence   | 10% cut off                   | Maximum reported mapping or baseline sentinel site prevalence   | &lt;10%                 | Maximum             | Varies                | Programmatic data  |
-| Agent                  | Parasite              | Parasite                      | Predominate parasite in district                                | W. bancrofti &amp; mixed | Binary value        | 2018                  | Programmatic data  |
-| Environment            | Vector                | Vector                        | Predominate vector in district                                  | Anopheles &amp; Mansonia | Binary value        | 2018                  | Country expert     |
-| Environment            | Geography             | Elevation                     | Elevation measured in meters                                    | &gt;350                 | Mean                | 2000                  | CGIAR-CSI SRTM [9] |
-| Environment            | Geography             | District area                 | Area measured in km2                                            | &gt;2,500               | Maximum sum         | Static                | Programmatic data  |
-| Environment            | Climate               | EVI                           | Enhanced vegetation index                                       | &gt; 0.3                | Mean                | 2015                  | MODIS [10]         |
-| Environment            | Climate               | Rainfall                      | Annual rainfall measured in mm                                  | ≤ 700                | Mean                | 2015                  | CHIRPS [11]        |
-| Environment            | Socio-economic        | Population density            | Number of people per km2                                        | ≤ 100                | Mean                | 2015                  | WorldPop [12]      |
-| Environment            | Socio-economic        | Nighttime lights              | Nighttime light index from 0 to 63                              | &gt;1.5                 | Mean                | 2015                  | VIIRS [13]         |
-| Environment            | Co-endemicity         | Co-endemic for onchocerciasis | Part or all of district is also endemic for onchocerciases      | Non-endemic          | Binary value        | 2018                  | Programmatic data  |
-| MDA                    | Drug efficacy         | Drug package                  | DEC-ALB or IVM-ALB                                              | DEC-ALB              | Binary value        | 2018                  | Programmatic data  |
-| MDA                    | Implementation of MDA | Coverage                      | Median MDA coverage for last 5 rounds                           | ≥ 65%                | Median              | Varies                | Programmatic data  |
-| MDA                    | Implementation of MDA | Sufficient rounds             | Number of rounds of sufficient (≥ 65% coverage) in last 5 years | ≥ 3                  | Count               | Varies                | Programmatic data  |
-| MDA                    | Implementation of MDA | Number of rounds              | Maximum number of recorded rounds of MDA                        | ≥ 6                  | Maximum             | Varies                | Programmatic data  |
-| Pre-TAS implementation | Quality of survey     | Diagnostic method             | Using Mf or Ag                                                  | Mf                   | Binary value        | Varies                | Programmatic data  |
-| Pre-TAS implementation | Quality of survey     | Diagnostic test               | Using Mf, ICT, or FTS                                           | Mf                   | Categorical         | Varies                | Programmatic data  |
-
-Table 2: Adjusted risk ratios for pre-TAS failure from log-binomial model sensitivity analysis.
-
-|                                             |                  | (1)                        | (2)                      | (3)                                  | (4)                             |
-|---------------------------------------------|------------------|----------------------------|--------------------------|--------------------------------------|---------------------------------|
-|                                             | Full Model       | Without Cameroon districts | Only districts in Africa | Only W. bancrofti parasite districts | Only Anopheles vector districts |
-| Number of Failures                          | 74               | 74                         | 44                       | 72                                   | 46                              |
-| Number of total districts                   | (N = 554)        | (N = 420)                  | (N = 407)                | (N = 518)                            | (N = 414)                       |
-| Covariate                                   | RR (95% CI)      | RR (95% CI)                | RR (95% CI)              | RR (95% CI)                          | RR (95% CI)                     |
-| Baseline prevalence &gt; = 10% &amp; used FTS test | 2.38 (0.96–5.90) | 1.23 (0.52–2.92)           | 14.52 (1.79–117.82)      | 2.61 (1.03–6.61)                     | 15.80 (1.95–127.67)             |
-| Baseline prevalence &gt; = 10% &amp; used ICT test | 0.80 (0.20–3.24) | 0.42 (0.11–1.68)           | 1.00 (0.00–0.00)         | 0.88 (0.21–3.60)                     | 1.00 (0.00–0.00)                |
-| +Used FTS test                              | 1.16 (0.52–2.59) | 2.40 (1.12–5.11)           | 0.15 (0.02–1.11)         | 1.03 (0.45–2.36)                     | 0.13 (0.02–0.96)                |
-| +Used ICT test                              | 0.92 (0.32–2.67) | 1.47 (0.51–4.21)           | 0.33 (0.04–2.54)         | 0.82 (0.28–2.43)                     | 0.27 (0.03–2.04)                |
-| +Baseline prevalence &gt; = 10%                | 2.52 (1.37–4.64) | 2.42 (1.31–4.47)           | 2.03 (1.06–3.90)         | 2.30 (1.21–4.36)                     | 2.01 (1.07–3.77)                |
-| Elevation &lt; 350m                            | 3.07 (1.95–4.83) | 2.21 (1.42–3.43)           | 4.68 (2.22–9.87)         | 3.04 (1.93–4.79)                     | 3.76 (1.92–7.37)                |
-
-## Figures
-
-Fig 1: Number of pre-TAS by country.
-
-<!-- image -->
-
-Fig 2: District-level baseline prevalence by country.
-
-<!-- image -->
-
-Fig 3: Percent pre-TAS failure by each characteristic (unadjusted).
-
-<!-- image -->
-
-Fig 4: Adjusted risk ratios for pre-TAS failure with 95% Confidence Interval from log-binomial model.
-
-<!-- image -->
-
-Fig 5: Analysis of failures by model combinations.
-
-<!-- image -->
+The authors would like to thank all those involved from the Ministries of Health, volunteers and community members in the sentinel and spot-check site surveys for their tireless commitment to ridding the world of LF. In addition, gratitude is given to Joseph Koroma and all the partners, including USAID, RTI International, FHI 360, IMA World Health, and Helen Keller International, who supported the surveys financially and technically.

 ## References

- World Health Organization. Lymphatic filariasis: progress report 2000–2009 and strategic plan 2010–2020. Geneva; 2010. 
- World Health Organization. Validation of elimination of lymphatic filariasis as a public health problem. Geneva; 2017. 
- Global programme to eliminate lymphatic filariasis: progress report, 2018. Wkly Epidemiol Rec (2019)
- World Health Organization. Global programme to eliminate lymphatic filariasis: monitoring and epidemiological assessment of mass drug administration. Geneva; 2011. 
- World Health Organization. Strengthening the assessment of lymphatic filariasis transmission and documenting the achievement of elimination—Meeting of the Neglected Tropical Diseases Strategic and Technical Advisory Group’s Monitoring and Evaluation Subgroup on Disease-specific Indicators. 2016; 42. 
- Kyelem D; Biswas G; Bockarie MJ; Bradley MH; El-Setouhy M; Fischer PU. Determinants of success in national programs to eliminate lymphatic filariasis: a perspective identifying essential elements and research needs. Am J Trop Med Hyg (2008)
- Goldberg EM; King JD; Mupfasoni D; Kwong K; Hay SI; Pigott DM. Ecological and socioeconomic predictors of transmission assessment survey failure for lymphatic filariasis. Am J Trop Med Hyg (2019)
- Cano J; Rebollo MP; Golding N; Pullan RL; Crellen T; Soler A. The global distribution and transmission limits of lymphatic filariasis: past and present. Parasites and Vectors (2014)
- CGIAR-CSI. CGIAR-CSI SRTM 90m DEM Digital Elevation Database. In: . 
- USGS NASA. Vegetation indices 16-DAy L3 global 500 MOD13A1 dataset [Internet]. [cited 1 May 2018]. Available: . 
- Funk C; Peterson P; Landsfeld M; Pedreros D; Verdin J; Shukla S. The climate hazards infrared precipitation with stations—A new environmental record for monitoring extremes. Sci Data (2015)
- Lloyd CT; Sorichetta A; Tatem AJ. High resolution global gridded data for use in population studies. Sci Data (2017)
- Elvidge CD; Baugh KE; Zhizhin M; Hsu F-C. Why VIIRS data are superior to DMSP for mapping nighttime lights. Proc Asia-Pacific Adv Netw (2013)
- Jambulingam P; Subramanian S; De Vlas SJ; Vinubala C; Stolk WA. Mathematical modelling of lymphatic filariasis elimination programmes in India: required duration of mass drug administration and post-treatment level of infection indicators. Parasites and Vectors (2016)
- Michael E; Malecela-Lazaro MN; Simonsen PE; Pedersen EM; Barker G; Kumar A. Mathematical modelling and the control of lymphatic filariasis. Lancet Infect Dis (2004)
- Stolk WA; Swaminathan S; van Oortmarssen GJ; Das PK; Habbema JDF. Prospects for elimination of bancroftian filariasis by mass drug treatment in Pondicherry, India: a simulation study. J Infect Dis (2003)
- Grady CA; De Rochars MB; Direny AN; Orelus JN; Wendt J; Radday J. Endpoints for lymphatic filariasis programs. Emerg Infect Dis (2007)
- Evans D; McFarland D; Adamani W; Eigege A; Miri E; Schulz J. Cost-effectiveness of triple drug administration (TDA) with praziquantel, ivermectin and albendazole for the prevention of neglected tropical diseases in Nigeria. Ann Trop Med Parasitol (2011)
- Richards FO; Eigege A; Miri ES; Kal A; Umaru J; Pam D. Epidemiological and entomological evaluations after six years or more of mass drug administration for lymphatic filariasis elimination in Nigeria. PLoS Negl Trop Dis (2011)
- Biritwum NK; Yikpotey P; Marfo BK; Odoom S; Mensah EO; Asiedu O. Persistent “hotspots” of lymphatic filariasis microfilaraemia despite 14 years of mass drug administration in Ghana. Trans R Soc Trop Med Hyg (2016)
- Moraga P; Cano J; Baggaley RF; Gyapong JO; Njenga SM; Nikolay B. Modelling the distribution and transmission intensity of lymphatic filariasis in sub-Saharan Africa prior to scaling up interventions: integrated use of geostatistical and mathematical modelling. Parasites and Vectors (2015)
- Irvine MA; Njenga SM; Gunawardena S; Wamae CN; Cano J; Brooker SJ. Understanding the relationship between prevalence of microfilariae and antigenaemia using a model of lymphatic filariasis infection. Trans R Soc Trop Med Hyg (2016)
- Ottesen EA. Efficacy of diethylcarbamazine in eradicating infection with lymphatic-dwelling filariae in humans. Rev Infect Dis (1985)
- Gambhir M; Bockarie M; Tisch D; Kazura J; Remais J; Spear R. Geographic and ecologic heterogeneity in elimination thresholds for the major vector-borne helminthic disease, lymphatic filariasis. BMC Biol (2010)
- World Health Organization. Global programme to eliminate lymphatic filariasis: practical entomology handbook. Geneva; 2013. 
- Slater H; Michael E. Predicting the current and future potential distributions of lymphatic filariasis in Africa using maximum entropy ecological niche modelling. PLoS One (2012)
- Slater H; Michael E. Mapping, Bayesian geostatistical analysis and spatial prediction of lymphatic filariasis prevalence in Africa. PLoS One (2013)
- Sabesan S; Raju KHK; Subramanian S; Srivastava PK; Jambulingam P. Lymphatic filariasis transmission risk map of India, based on a geo-environmental risk model. Vector-Borne Zoonotic Dis (2013)
- Stanton MC; Molyneux DH; Kyelem D; Bougma RW; Koudou BG; Kelly-Hope LA. Baseline drivers of lymphatic filariasis in Burkina Faso. Geospat Health (2013)
- Manhenje I; Teresa Galán-Puchades M; Fuentes M V. Socio-environmental variables and transmission risk of lymphatic filariasis in central and northern Mozambique. Geospat Health (2013)
- Ngwira BM; Tambala P; Perez a M; Bowie C; Molyneux DH. The geographical distribution of lymphatic filariasis infection in Malawi. Filaria J (2007)
- Simonsen PE; Mwakitalu ME. Urban lymphatic filariasis. Parasitol Res (2013)
- Proville J; Zavala-Araiza D; Wagner G. Night-time lights: a global, long term look at links to socio-economic trends. PLoS One (2017)
- Endeshaw T; Taye A; Tadesse Z; Katabarwa MN; Shafi O; Seid T. Presence of Wuchereria bancrofti microfilaremia despite seven years of annual ivermectin monotherapy mass drug administration for onchocerciasis control: a study in north-west Ethiopia. Pathog Glob Health (2015)
- Richards FO; Eigege A; Pam D; Kal A; Lenhart A; Oneyka JOA. Mass ivermectin treatment for onchocerciasis: lack of evidence for collateral impact on transmission of Wuchereria bancrofti in areas of co-endemicity. Filaria J (2005)
- Kyelem D; Sanou S; Boatin B a; Medlock J; Couibaly S; Molyneux DH. Impact of long-term ivermectin (Mectizan) on Wuchereria bancrofti and Mansonella perstans infections in Burkina Faso: strategic and policy implications. Ann Trop Med Parasitol (2003)
- Weil GJ; Lammie PJ; Richards FO; Eberhard ML. Changes in circulating parasite antigen levels after treatment of bancroftian filariasis with diethylcarbamazine and ivermectin. J Infect Dis (1991)
- Kumar A; Sachan P. Measuring impact on filarial infection status in a community study: role of coverage of mass drug administration. Trop Biomed (2014)
- Njenga SM; Mwandawiro CS; Wamae CN; Mukoko DA; Omar AA; Shimada M. Sustained reduction in prevalence of lymphatic filariasis infection in spite of missed rounds of mass drug administration in an area under mosquito nets for malaria control. Parasites and Vectors (2011)
- Boyd A; Won KY; McClintock SK; Donovan C V; Laney SJ; Williams SA. A community-based study of factors associated with continuing transmission of lymphatic filariasis in Leogane, Haiti. PLoS Negl Trop Dis (2010)
- Irvine MA; Reimer LJ; Njenga SM; Gunawardena S; Kelly-Hope L; Bockarie M. Modelling strategies to break transmission of lymphatic filariasis—aggregation, adherence and vector competence greatly alter elimination. Parasites and Vectors (2015)
- Irvine MA; Stolk WA; Smith ME; Subramanian S; Singh BK; Weil GJ. Effectiveness of a triple-drug regimen for global elimination of lymphatic filariasis: a modelling study. Lancet Infect Dis (2017)
- Pion SD; Montavon C; Chesnais CB; Kamgno J; Wanji S; Klion AD. Positivity of antigen tests used for diagnosis of lymphatic filariasis in individuals without Wuchereria bancrofti infection but with high loa loa microfilaremia. Am J Trop Med Hyg (2016)
- Wanji S; Esum ME; Njouendou AJ; Mbeng AA; Chounna Ndongmo PW; Abong RA. Mapping of lymphatic filariasis in loiasis areas: a new strategy shows no evidence for Wuchereria bancrofti endemicity in Cameroon. PLoS Negl Trop Dis (2018)
- Chesnais CB; Awaca-Uvon NP; Bolay FK; Boussinesq M; Fischer PU; Gankpala L. A multi-center field study of two point-of-care tests for circulating Wuchereria bancrofti antigenemia in Africa. PLoS Negl Trop Dis (2017)
- Silumbwe A; Zulu JM; Halwindi H; Jacobs C; Zgambo J; Dambe R. A systematic review of factors that shape implementation of mass drug administration for lymphatic filariasis in sub-Saharan Africa. BMC Public Health (2017)
- Adams AM; Vuckovic M; Birch E; Brant TA; Bialek S; Yoon D. Eliminating neglected tropical diseases in urban areas: a review of challenges, strategies and research directions for successful mass drug administration. Trop Med Infect Dis (2018)
- Rao RU; Samarasekera SD; Nagodavithana KC; Dassanayaka TDM; Punchihewa MW; Ranasinghe USB. Reassessment of areas with persistent lymphatic filariasis nine years after cessation of mass drug administration in Sri Lanka. PLoS Negl Trop Dis (2017)
- Xu Z; Graves PM; Lau CL; Clements A; Geard N; Glass K. GEOFIL: a spatially-explicit agent-based modelling framework for predicting the long-term transmission dynamics of lymphatic filariasis in American Samoa. Epidemics (2018)
- Id CM; Tettevi EJ; Mechan F; Idun B; Biritwum N; Osei-atweneboana MY. Elimination within reach: a cross-sectional study highlighting the factors that contribute to persistent lymphatic filariasis in eight communities in rural Ghana. PLoS Negl Trop Dis (2019)
- Eigege A; Kal A; Miri E; Sallau A; Umaru J; Mafuyai H. Long-lasting insecticidal nets are synergistic with mass drug administration for interruption of lymphatic filariasis transmission in Nigeria. PLoS Negl Trop Dis (2013)
- Van den Berg H; Kelly-Hope LA; Lindsay SW. Malaria and lymphatic filariasis: The case for integrated vector management. Lancet Infect Dis (2013)
- Webber R.. Eradication of Wuchereria bancrofti infection through vector control. Trans R Soc Trop Med Hyg (1979)
+- World Health Organization. Lymphatic filariasis: progress report 2000–2009 and strategic plan 2010–2020. Geneva; 2010.
+- World Health Organization. Validation of elimination of lymphatic filariasis as a public health problem. Geneva; 2017.
+- World Health Organization. Global programme to eliminate lymphatic filariasis: progress report, 2018. Wkly Epidemiol Rec. 2019;94: 457–472.
+- World Health Organization. Global programme to eliminate lymphatic filariasis: monitoring and epidemiological assessment of mass drug administration. Geneva; 2011.
+- World Health Organization. Strengthening the assessment of lymphatic filariasis transmission and documenting the achievement of elimination—Meeting of the Neglected Tropical Diseases Strategic and Technical Advisory Group’s Monitoring and Evaluation Subgroup on Disease-specific Indicators. 2016; 42.
+- Kyelem D, Biswas G, Bockarie MJ, Bradley MH, El-Setouhy M, Fischer PU, et al. Determinants of success in national programs to eliminate lymphatic filariasis: a perspective identifying essential elements and research needs. Am J Trop Med Hyg. 2008;79: 480–4. 18840733
+- Goldberg EM, King JD, Mupfasoni D, Kwong K, Hay SI, Pigott DM, et al. Ecological and socioeconomic predictors of transmission assessment survey failure for lymphatic filariasis. Am J Trop Med Hyg. 2019; 10.4269/ajtmh.18-0721 31115301
+- Cano J, Rebollo MP, Golding N, Pullan RL, Crellen T, Soler A, et al. The global distribution and transmission limits of lymphatic filariasis: past and present. Parasites and Vectors. 2014;7: 1–19. 10.1186/1756-3305-7-1 24411014
+- CGIAR-CSI. CGIAR-CSI SRTM 90m DEM Digital Elevation Database. In: http://Srtm.Csi.Cgiar.Org/ [Internet]. 2008 [cited 1 May 2018]. Available: http://srtm.csi.cgiar.org/
+- USGS NASA. Vegetation indices 16-DAy L3 global 500 MOD13A1 dataset [Internet]. [cited 1 May 2018]. Available: https://lpdaac.usgs.gov/products/myd13a1v006/
+- Funk C, Peterson P, Landsfeld M, Pedreros D, Verdin J, Shukla S, et al. The climate hazards infrared precipitation with stations—A new environmental record for monitoring extremes. Sci Data. Nature Publishing Groups; 2015;2 10.1038/sdata.2015.66 26646728
+- LloydCT, SorichettaA, TatemAJ. High resolution global gridded data for use in population studies. Sci Data. 2017;4: 170001 10.1038/sdata.2017.1 28140386
+- ElvidgeCD, BaughKE, ZhizhinM, HsuF-C. Why VIIRS data are superior to DMSP for mapping nighttime lights. Proc Asia-Pacific Adv Netw. Proceedings of the Asia-Pacific Advanced Network; 2013;35: 62 10.7125/apan.35.7
+- Jambulingam P, Subramanian S, De Vlas SJ, Vinubala C, Stolk WA. Mathematical modelling of lymphatic filariasis elimination programmes in India: required duration of mass drug administration and post-treatment level of infection indicators. Parasites and Vectors. 2016;9: 1–18. 10.1186/s13071-015-1291-6 26728523
+- MichaelE, Malecela-LazaroMN, SimonsenPE, PedersenEM, BarkerG, KumarA, et al Mathematical modelling and the control of lymphatic filariasis. Lancet Infect Dis. 2004;4: 223–234. 10.1016/S1473-3099(04)00973-9 15050941
+- Stolk WA, Swaminathan S, van Oortmarssen GJ, Das PK, Habbema JDF. Prospects for elimination of bancroftian filariasis by mass drug treatment in Pondicherry, India: a simulation study. J Infect Dis. 2003;188: 1371–81. 10.1086/378354 14593597
+- Grady CA, De Rochars MB, Direny AN, Orelus JN, Wendt J, Radday J, et al. Endpoints for lymphatic filariasis programs. Emerg Infect Dis. 2007;13: 608–610. 10.3201/eid1304.061063 17553278
+- EvansD, McFarlandD, AdamaniW, EigegeA, MiriE, SchulzJ, et al Cost-effectiveness of triple drug administration (TDA) with praziquantel, ivermectin and albendazole for the prevention of neglected tropical diseases in Nigeria. Ann Trop Med Parasitol. 2011;105: 537–47. 10.1179/2047773211Y.0000000010 22325813
+- RichardsFO, EigegeA, MiriES, KalA, UmaruJ, PamD, et al Epidemiological and entomological evaluations after six years or more of mass drug administration for lymphatic filariasis elimination in Nigeria. PLoS Negl Trop Dis. 2011;5: e1346 10.1371/journal.pntd.0001346 22022627
+- BiritwumNK, YikpoteyP, MarfoBK, OdoomS, MensahEO, AsieduO, et al Persistent “hotspots” of lymphatic filariasis microfilaraemia despite 14 years of mass drug administration in Ghana. Trans R Soc Trop Med Hyg. 2016;110: 690–695. 10.1093/trstmh/trx007 28938053
+- MoragaP, CanoJ, BaggaleyRF, GyapongJO, NjengaSM, NikolayB, et al Modelling the distribution and transmission intensity of lymphatic filariasis in sub-Saharan Africa prior to scaling up interventions: integrated use of geostatistical and mathematical modelling. Parasites and Vectors. 2015;8: 1–16. 10.1186/s13071-014-0608-1 25561160
+- IrvineMA, NjengaSM, GunawardenaS, WamaeCN, CanoJ, BrookerSJ, et al Understanding the relationship between prevalence of microfilariae and antigenaemia using a model of lymphatic filariasis infection. Trans R Soc Trop Med Hyg. 2016;110: 118–124. 10.1093/trstmh/trv096 26822604
+- Ottesen EA. Efficacy of diethylcarbamazine in eradicating infection with lymphatic-dwelling filariae in humans. Rev Infect Dis. 1985;7.
+- GambhirM, BockarieM, TischD, KazuraJ, RemaisJ, SpearR, et al Geographic and ecologic heterogeneity in elimination thresholds for the major vector-borne helminthic disease, lymphatic filariasis. BMC Biol. 2010;8 10.1186/1741-7007-8-22 20236528
+- World Health Organization. Global programme to eliminate lymphatic filariasis: practical entomology handbook. Geneva; 2013.
+- SlaterH, MichaelE. Predicting the current and future potential distributions of lymphatic filariasis in Africa using maximum entropy ecological niche modelling. PLoS One. 2012;7: e32202 10.1371/journal.pone.0032202 22359670
+- SlaterH, MichaelE. Mapping, Bayesian geostatistical analysis and spatial prediction of lymphatic filariasis prevalence in Africa. PLoS One. 2013;8: 28–32. 10.1371/journal.pone.0071574 23951194
+- SabesanS, RajuKHK, SubramanianS, SrivastavaPK, JambulingamP. Lymphatic filariasis transmission risk map of India, based on a geo-environmental risk model. Vector-Borne Zoonotic Dis. 2013;13: 657–665. 10.1089/vbz.2012.1238 23808973
+- StantonMC, MolyneuxDH, KyelemD, BougmaRW, KoudouBG, Kelly-HopeLA. Baseline drivers of lymphatic filariasis in Burkina Faso. Geospat Health. 2013;8: 159–173. 10.4081/gh.2013.63 24258892
+- Manhenje I, Teresa Galán-Puchades M, Fuentes MV. Socio-environmental variables and transmission risk of lymphatic filariasis in central and northern Mozambique. Geospat Health. 2013;7: 391–398. 10.4081/gh.2013.96 23733300
+- Ngwira BM, Tambala P, Perez AM, Bowie C, Molyneux DH. The geographical distribution of lymphatic filariasis infection in Malawi. Filaria J. 2007;6: 12 10.1186/1475-2883-6-12 18047646
+- SimonsenPE, MwakitaluME. Urban lymphatic filariasis. Parasitol Res. 2013;112: 35–44. 10.1007/s00436-012-3226-x 23239094
+- ProvilleJ, Zavala-AraizaD, WagnerG. Night-time lights: a global, long term look at links to socio-economic trends. PLoS One. Public Library of Science; 2017;12 10.1371/journal.pone.0174610 28346500
+- EndeshawT, TayeA, TadesseZ, KatabarwaMN, ShafiO, SeidT, et al Presence of Wuchereria bancrofti microfilaremia despite seven years of annual ivermectin monotherapy mass drug administration for onchocerciasis control: a study in north-west Ethiopia. Pathog Glob Health. 2015;109: 344–351. 10.1080/20477724.2015.1103501 26878935
+- RichardsFO, EigegeA, PamD, KalA, LenhartA, OneykaJOA, et al Mass ivermectin treatment for onchocerciasis: lack of evidence for collateral impact on transmission of Wuchereria bancrofti in areas of co-endemicity. Filaria J. 2005;4: 3–5. 10.1186/1475-2883-4-3 15916708
+- Kyelem D, Sanou S, Boatin BA, Medlock J, Couibaly S, Molyneux DH. Impact of long-term ivermectin (Mectizan) on Wuchereria bancrofti and Mansonella perstans infections in Burkina Faso: strategic and policy implications. Ann Trop Med Parasitol. 2003;97: 827–38. 10.1179/000349803225002462 14754495
+- WeilGJ, LammiePJ, RichardsFO, EberhardML. Changes in circulating parasite antigen levels after treatment of bancroftian filariasis with diethylcarbamazine and ivermectin. J Infect Dis. 1991;164: 814–816. 10.1093/infdis/164.4.814 1894943
+- KumarA, SachanP. Measuring impact on filarial infection status in a community study: role of coverage of mass drug administration. Trop Biomed. 2014;31: 225–229. 25134891
+- NjengaSM, MwandawiroCS, WamaeCN, MukokoDA, OmarAA, ShimadaM, et al Sustained reduction in prevalence of lymphatic filariasis infection in spite of missed rounds of mass drug administration in an area under mosquito nets for malaria control. Parasites and Vectors. 2011;4: 1–9. 10.1186/1756-3305-4-1 21205315
+- Boyd A, Won KY, McClintock SK, Donovan CV, Laney SJ, Williams SA, et al. A community-based study of factors associated with continuing transmission of lymphatic filariasis in Leogane, Haiti. PLoS Negl Trop Dis. 2010;4: 1–10. 10.1371/journal.pntd.0000640 20351776
+- IrvineMA, ReimerLJ, NjengaSM, GunawardenaS, Kelly-HopeL, BockarieM, et al Modelling strategies to break transmission of lymphatic filariasis—aggregation, adherence and vector competence greatly alter elimination. Parasites and Vectors. 2015;8: 1–19. 10.1186/s13071-014-0608-1 25561160
+- IrvineMA, StolkWA, SmithME, SubramanianS, SinghBK, WeilGJ, et al Effectiveness of a triple-drug regimen for global elimination of lymphatic filariasis: a modelling study. Lancet Infect Dis. 2017;17: 451–458. 10.1016/S1473-3099(16)30467-4 28012943
+- PionSD, MontavonC, ChesnaisCB, KamgnoJ, WanjiS, KlionAD, et al Positivity of antigen tests used for diagnosis of lymphatic filariasis in individuals without Wuchereria bancrofti infection but with high loa loa microfilaremia. Am J Trop Med Hyg. 2016;95: 1417–1423. 10.4269/ajtmh.16-0547 27729568
+- Wanji S, Esum ME, Njouendou AJ, Mbeng AA, Chounna Ndongmo PW, Abong RA, et al. Mapping of lymphatic filariasis in loiasis areas: a new strategy shows no evidence for Wuchereria bancrofti endemicity in Cameroon. PLoS Negl Trop Dis. 2018;13: 1–15. 10.1371/journal.pntd.0007192 30849120
+- ChesnaisCB, Awaca-UvonNP, BolayFK, BoussinesqM, FischerPU, GankpalaL, et al A multi-center field study of two point-of-care tests for circulating Wuchereria bancrofti antigenemia in Africa. PLoS Negl Trop Dis. 2017;11: 1–15. 10.1371/journal.pntd.0005703 28892473
+- SilumbweA, ZuluJM, HalwindiH, JacobsC, ZgamboJ, DambeR, et al A systematic review of factors that shape implementation of mass drug administration for lymphatic filariasis in sub-Saharan Africa. BMC Public Health; 2017; 1–15. 10.1186/s12889-017-4414-5 28532397
+- AdamsAM, VuckovicM, BirchE, BrantTA, BialekS, YoonD, et al Eliminating neglected tropical diseases in urban areas: a review of challenges, strategies and research directions for successful mass drug administration. Trop Med Infect Dis. 2018;3 10.3390/tropicalmed3040122 30469342
+- RaoRU, SamarasekeraSD, NagodavithanaKC, DassanayakaTDM, PunchihewaMW, RanasingheUSB, et al Reassessment of areas with persistent lymphatic filariasis nine years after cessation of mass drug administration in Sri Lanka. PLoS Negl Trop Dis. 2017;11: 1–17. 10.1371/journal.pntd.0006066 29084213
+- XuZ, GravesPM, LauCL, ClementsA, GeardN, GlassK. GEOFIL: a spatially-explicit agent-based modelling framework for predicting the long-term transmission dynamics of lymphatic filariasis in American Samoa. Epidemics. 2018; 10.1016/j.epidem.2018.12.003 30611745
+- IdCM, TetteviEJ, MechanF, IdunB, BiritwumN, Osei-atweneboanaMY, et al Elimination within reach: a cross-sectional study highlighting the factors that contribute to persistent lymphatic filariasis in eight communities in rural Ghana. PLoS Negl Trop Dis. 2019; 1–17.
+- EigegeA, KalA, MiriE, SallauA, UmaruJ, MafuyaiH, et al Long-lasting insecticidal nets are synergistic with mass drug administration for interruption of lymphatic filariasis transmission in Nigeria. PLoS Negl Trop Dis. 2013;7: 7–10. 10.1371/journal.pntd.0002508 24205421
+- Van den Berg H, Kelly-Hope LA, Lindsay SW. Malaria and lymphatic filariasis: The case for integrated vector management. Lancet Infect Dis. 2013;13: 89–94. 10.1016/S1473-3099(12)70148-2 23084831
+- Webber R. Eradication of Wuchereria bancrofti infection through vector control. Trans R Soc Trop Med Hyg. 1979;73.
--- a/tests/data/groundtruth/docling_v2/pone.0234687.xml.itxt
+++ b/tests/data/groundtruth/docling_v2/pone.0234687.xml.itxt
@ -1,177 +1,176 @@
 item-0 at level 0: unspecified: group _root_
  item-1 at level 1: title: Potential to reduce greenhouse g ...  cattle systems in subtropical regions
-    item-2 at level 2: paragraph: Ribeiro-Filho Henrique M. N.; 1: ... , California, United States of America
-    item-3 at level 2: section_header: Abstract
-      item-4 at level 3: text: Carbon (C) footprint of dairy pr ... uce the C footprint to a small extent.
-    item-5 at level 2: section_header: Introduction
-      item-6 at level 3: text: Greenhouse gas (GHG) emissions f ... suitable for food crop production [4].
-      item-7 at level 3: text: Considering the key role of live ... anagement to mitigate the C footprint.
-      item-8 at level 3: text: In subtropical climate zones, co ... t in tropical pastures (e.g. [17–19]).
-      item-9 at level 3: text: It has been shown that dairy cow ... sions from crop and reduced DM intake.
-      item-10 at level 3: text: The aim of this work was to quan ... uring lactation periods was evaluated.
-    item-11 at level 2: section_header: Materials and methods
-      item-12 at level 3: text: An LCA was developed according t ... 90816 - https://www.udesc.br/cav/ceua.
-      item-13 at level 3: section_header: System boundary
-        item-14 at level 4: text: The goal of the study was to ass ... n were outside of the system boundary.
-      item-15 at level 3: section_header: Functional unit
-        item-16 at level 4: text: The functional unit was one kilo ... tein according to NRC [20] as follows:
-        item-17 at level 4: text: ECM = Milk production × (0.0929  ...  characteristics described in Table 1.
-      item-18 at level 3: section_header: Data sources and livestock system description
-        item-19 at level 4: text: The individual feed requirements ... ed to the ad libitum TMR intake group.
-        item-20 at level 4: text: Using experimental data, three s ... med during an entire lactation period.
-      item-21 at level 3: section_header: Impact assessment
-        item-22 at level 4: text: The CO2e emissions were calculat ... 65 for CO2, CH4 and N2O, respectively.
-      item-23 at level 3: section_header: Feed production
-        item-24 at level 4: section_header: Diets composition
-          item-25 at level 5: text: The DM intake of each ingredient ...  collected throughout the experiments.
-        item-26 at level 4: section_header: GHG emissions from crop and pasture production
-          item-27 at level 5: text: GHG emission factors used for of ... onsume 70% of pastures during grazing.
-          item-28 at level 5: text: Emissions from on-farm feed prod ... factors described by Rotz et al. [42].
-      item-29 at level 3: section_header: Animal husbandry
-        item-30 at level 4: text: The CH4 emissions from enteric f ... 1) = 13.8 + 0.185 × NDF (% DM intake).
-      item-31 at level 3: section_header: Manure from confined cows and urine and dung from grazing animals
-        item-32 at level 4: text: The CH4 emission from manure (kg ... for dietary GE per kg of DM (MJ kg-1).
-        item-33 at level 4: text: The OM digestibility was estimat ... h were 31%, 26% and 46%, respectively.
-        item-34 at level 4: text: The N2O-N emissions from urine a ...  using the IPCC [38] emission factors.
-      item-35 at level 3: section_header: Farm management
-        item-36 at level 4: text: Emissions due to farm management ...  crop and pasture production’ section.
-        item-37 at level 4: text: The amount of fuel use for manur ... me that animals stayed on confinement.
-        item-38 at level 4: text: The emissions from fuel were est ...  × kg CO2e (kg machinery mass)-1 [42].
-        item-39 at level 4: text: Emissions from electricity for m ... ws in naturally ventilated barns [47].
-        item-40 at level 4: text: The lower impact of emissions fr ...  greater than 5% of total C footprint.
-        item-41 at level 4: text: Emissions from farm management d ...  gas and hard coal, respectively [46].
-      item-42 at level 3: section_header: Co-product allocation
-        item-43 at level 4: text: The C footprint for milk produce ...  directly assigned to milk production.
-      item-44 at level 3: section_header: Sensitivity analysis
-        item-45 at level 4: text: A sensitivity index was calculat ... ses a similar change in the footprint.
-    item-46 at level 2: section_header: Results and discussion
-      item-47 at level 3: text: The study has assessed the impac ... , feed production and electricity use.
-      item-48 at level 3: section_header: Greenhouse gas emissions
-        item-49 at level 4: text: Depending on emission factors us ... more than 5% of overall GHG emissions.
-        item-50 at level 4: text: Considering IPCC emission factor ...  the C footprint of the dairy systems.
-        item-51 at level 4: text: The similarity of C footprint be ... of TMR was replaced by pasture access.
-        item-52 at level 4: text: The lower C footprint in scenari ... r, averaging 0.004 kg N2O-N kg-1 [37].
-      item-53 at level 3: section_header: Methane emissions
-        item-54 at level 4: text: The enteric CH4 intensity was si ... ], which did not happen in this study.
-        item-55 at level 4: text: The lack of difference in enteri ...  same scenarios as in this study [26].
-      item-56 at level 3: section_header: Emissions from excreta and feed production
-        item-57 at level 4: text: Using IPCC emission factors for  ...  may not be captured by microbes [65].
-        item-58 at level 4: text: Using local emission factors for ... be revised for the subtropical region.
-        item-59 at level 4: text: Emissions for feed production de ... act, particularly in confinements [9].
-      item-60 at level 3: section_header: Assumptions and limitations
-        item-61 at level 4: text: The milk production and composit ... ions as a function of soil management.
-      item-62 at level 3: section_header: Further considerations
-        item-63 at level 4: text: The potential for using pasture  ... g ECM)-1 in case of foot lesions [72].
-        item-64 at level 4: text: Grazing lands may also improve b ... hange of CO2 would be negligible [76].
-    item-65 at level 2: section_header: Conclusions
-      item-66 at level 3: text: This study assessed the C footpr ... on with or without access to pastures.
-    item-67 at level 2: section_header: Tables
-      item-68 at level 3: table with [13x3]
-        item-68 at level 4: caption: Table 1: Descriptive characteristics of the herd.
-      item-69 at level 3: table with [21x11]
-        item-69 at level 4: caption: Table 2: Dairy cows’ diets in different scenariosa.
-      item-70 at level 3: table with [9x5]
-        item-70 at level 4: caption: Table 3: GHG emission factors for Off- and On-farm feed production.
-      item-71 at level 3: table with [28x5]
-        item-71 at level 4: caption: Table 4: GHG emissions from On-farm feed production.
-      item-72 at level 3: table with [12x4]
-        item-72 at level 4: caption: Table 5: Factors for major resource inputs in farm management.
-    item-73 at level 2: section_header: Figures
-      item-74 at level 3: picture
-        item-74 at level 4: caption: Fig 1: Overview of the milk production system boundary considered in the study.
-      item-75 at level 3: picture
-        item-75 at level 4: caption: Fig 2: Overall greenhouse gas emissions in dairy cattle systems under various scenarios.
-TMR = ad libitum TMR intake, 75TMR = 75% of ad libitum TMR intake with access to pasture, 50TMR = 50% of ad libitum TMR intake with access to pasture. (a) N2O emission factors for urine and dung from IPCC [38], feed production emission factors from Table 3 without accounting for sequestered CO2-C from perennial pasture, production of electricity = 0.73 kg CO2e kWh-1 [41]. (b) N2O emission factors for urine and dung from IPCC [38], feed production emission factors from Table 3 without accounting for sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46]; (c) N2O emission factors for urine and dung from local data [37], feed production EF from Table 4 without accounting for sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46]. (d) N2O emission factors for urine and dung from local data [37], feed production emission factors from Table 4 accounting for sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46].
-      item-76 at level 3: picture
-        item-76 at level 4: caption: Fig 3: Sensitivity of the C footprint.
-Sensitivity index = percentage change in C footprint for a 10% change in the given emission source divided by 10% of. (a) N2O emission factors for urine and dung from IPCC [38], feed production emission factors from Table 3, production of electricity = 0.73 kg CO2e kWh-1 [41]. (b) N2O emission factors for urine and dung from IPCC [38], feed production emission factors from Table 3, production of electricity = 0.205 kg CO2e kWh-1 [46]; (c) N2O emission factors for urine and dung from local data [37], feed production EF from Table 4 without accounting sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46]. (d) N2O emission factors for urine and dung from local data [37], feed production emission factors from Table 4 accounting sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46].
-      item-77 at level 3: picture
-        item-77 at level 4: caption: Fig 4: Greenhouse gas emissions (GHG) from manure and feed production in dairy cattle systems.
-TMR = ad libitum TMR intake, 75TMR = 75% of ad libitum TMR intake with access to pasture, 50TMR = 50% of ad libitum TMR intake with access to pasture. (a) N2O emission factors for urine and dung from IPCC [38]. (b) Feed production emission factors from Table 3. (c) N2O emission factors for urine and dung from local data [37]. (d) Feed production emission factors from Table 4 accounting sequestered CO2-C from perennial pasture.
-    item-78 at level 2: section_header: References
-      item-79 at level 3: list: group list
-        item-80 at level 4: list_item: Climate Change and Land. Chapter 5: Food Security (2019)
-        item-81 at level 4: list_item: Herrero M; Henderson B; Havlík P ... ivestock sector. Nat Clim Chang (2016)
-        item-82 at level 4: list_item: Rivera-Ferre MG; López-i-Gelats  ... iley Interdiscip Rev Clim Chang (2016)
-        item-83 at level 4: list_item: van Zanten HHE; Mollenhorst H; K ... ystems. Int J Life Cycle Assess (2016)
-        item-84 at level 4: list_item: Hristov AN; Oh J; Firkins L; Dij ...  mitigation options. J Anim Sci (2013)
-        item-85 at level 4: list_item: Hristov AN; Ott T; Tricarico J;  ...  mitigation options. J Anim Sci (2013)
-        item-86 at level 4: list_item: Montes F; Meinen R; Dell C; Rotz ...  mitigation options. J Anim Sci (2013)
-        item-87 at level 4: list_item: Ledgard SF; Wei S; Wang X; Falco ...  mitigations. Agric Water Manag (2019)
-        item-88 at level 4: list_item: O’Brien D; Shalloo L; Patton J;  ... inement dairy farms. Agric Syst (2012)
-        item-89 at level 4: list_item: Salou T; Le Mouël C; van der Wer ... nal unit matters!. J Clean Prod (2017)
-        item-90 at level 4: list_item: Lizarralde C; Picasso V; Rotz CA ... Case Studies. Sustain Agric Res (2014)
-        item-91 at level 4: list_item: Clark CEF; Kaur R; Millapan LO;  ... ction and behavior. J Dairy Sci (2018)
-        item-92 at level 4: list_item: FAOSTAT.  (2017)
-        item-93 at level 4: list_item: Vogeler I; Mackay A; Vibart R; R ... ms modelling. Sci Total Environ (2016)
-        item-94 at level 4: list_item: Wilkinson JM; Lee MRF; Rivero MJ ... ate pastures. Grass Forage Sci. (2020)
-        item-95 at level 4: list_item: Wales WJ; Marett LC; Greenwood J ... ons of Australia. Anim Prod Sci (2013)
-        item-96 at level 4: list_item: Bargo F; Muller LD; Delahoy JE;  ... otal mixed rations. J Dairy Sci (2002)
-        item-97 at level 4: list_item: Vibart RE; Fellner V; Burns JC;  ... ration and pasture. J Dairy Res (2008)
-        item-98 at level 4: list_item: Mendoza A; Cajarville C; Repetto ... total mixed ration. J Dairy Sci (2016)
-        item-99 at level 4: list_item: Nutrient Requirements of Dairy Cattle (2001)
-        item-100 at level 4: list_item: Noizère P; Sauvant D; Delaby L.  (2018)
-        item-101 at level 4: list_item: Lorenz H; Reinsch T; Hess S; Tau ... roduction systems. J Clean Prod (2019)
-        item-102 at level 4: list_item: INTERNATIONAL STANDARD—Environme ... ent—Requirements and guidelines (2006)
-        item-103 at level 4: list_item: Environmental management—Life cy ... ciples and framework. Iso 14040 (2006)
-        item-104 at level 4: list_item: FAO. Environmental Performance o ... ains: Guidelines for assessment (2016)
-        item-105 at level 4: list_item: Civiero M; Ribeiro-Filho HMN; Sc ... ture Conference,. Foz do Iguaçu (2019)
-        item-106 at level 4: list_item: IPCC—Intergovernmental Panel on  ... d Version). 2014. Available: ttps://. 
-        item-107 at level 4: list_item: INRA. Alimentation des bovins, o ... nra 2007. 4th ed. INRA, editor. 2007. 
-        item-108 at level 4: list_item: Delagarde R; Faverdin P; Baratte ... ng management. Grass Forage Sci (2011)
-        item-109 at level 4: list_item: Ma BL; Liang BC; Biswas DK; Morr ... tions. Nutr Cycl Agroecosystems (2012)
-        item-110 at level 4: list_item: Rauccci GS; Moreira CS; Alves PS ... Mato Grosso State. J Clean Prod (2015)
-        item-111 at level 4: list_item: Camargo GGT; Ryan MR; Richard TL ... nergy Analysis Tool. Bioscience (2013)
-        item-112 at level 4: list_item: da Silva MSJ; Jobim CC; Poppi EC ... outhern Brazil. Rev Bras Zootec (2015)
-        item-113 at level 4: list_item: Duchini PGPG Guzatti GCGC; Ribei ...  monocultures. Crop Pasture Sci (2016)
-        item-114 at level 4: list_item: Scaravelli LFB; Pereira LET; Oli ... om vacas leiteiras. Cienc Rural (2007)
-        item-115 at level 4: list_item: Sbrissia AF; Duchini PG; Zanini  ... ge of grazing heights. Crop Sci (2018)
-        item-116 at level 4: list_item: Almeida JGR; Dall-Orsoletta AC;  ... grazing temperate grass. Animal (2020)
-        item-117 at level 4: list_item: Eggleston H.S.; Buendia L.; Miwa ... nal greenhouse gas inventories. (2006)
-        item-118 at level 4: list_item: Ramalho B; Dieckow J; Barth G; S ... mbric Ferralsol. Eur J Soil Sci (2020)
-        item-119 at level 4: list_item: Fernandes HC; da Silveira JCM; R ... nizadas. Cienc e Agrotecnologia (2008)
-        item-120 at level 4: list_item: Wang M Q. GREET 1.8a Spreadsheet Model. 2007. Available: . 
-        item-121 at level 4: list_item: Rotz CAA; Montes F; Chianese DS; ... e cycle assessment. J Dairy Sci (2010)
-        item-122 at level 4: list_item: Niu M; Kebreab E; Hristov AN; Oh ... ental database. Glob Chang Biol (2018)
-        item-123 at level 4: list_item: Eugène M; Sauvant D; Nozière P;  ... for ruminants. J Environ Manage (2019)
-        item-124 at level 4: list_item: Reed KF; Moraes LE; Casper DP; K ... retion from cattle. J Dairy Sci (2015)
-        item-125 at level 4: list_item: Barros MV; Piekarski CM; De Fran ...  the 2016–2026 period. Energies (2018)
-        item-126 at level 4: list_item: Ludington D; Johnson E. Dairy Fa ...  York State Energy Res Dev Auth (2003)
-        item-127 at level 4: list_item: Thoma G; Jolliet O; Wang Y. A bi ... ply chain analysis. Int Dairy J (2013)
-        item-128 at level 4: list_item: Naranjo A; Johnson A; Rossow H.  ...  dairy industry over 50 years.  (2020)
-        item-129 at level 4: list_item: Jayasundara S; Worden D; Weersin ... roduction systems. J Clean Prod (2019)
-        item-130 at level 4: list_item: Williams SRO; Fisher PD; Berrisf ... ssions. Int J Life Cycle Assess (2014)
-        item-131 at level 4: list_item: Gollnow S; Lundie S; Moore AD; M ...  cows in Australia. Int Dairy J (2014)
-        item-132 at level 4: list_item: O’Brien D; Capper JL; Garnsworth ... -based dairy farms. J Dairy Sci (2014)
-        item-133 at level 4: list_item: Chobtang J; McLaren SJ; Ledgard  ... Region, New Zealand. J Ind Ecol (2017)
-        item-134 at level 4: list_item: Garg MR; Phondba BT; Sherasia PL ... cycle assessment. Anim Prod Sci (2016)
-        item-135 at level 4: list_item: de Léis CM; Cherubini E; Ruviaro ...  study. Int J Life Cycle Assess (2015)
-        item-136 at level 4: list_item: O’Brien D; Geoghegan A; McNamara ... otprint of milk?. Anim Prod Sci (2016)
-        item-137 at level 4: list_item: O’Brien D; Brennan P; Humphreys  ... dology. Int J Life Cycle Assess (2014)
-        item-138 at level 4: list_item: Baek CY; Lee KM; Park KH. Quanti ...  dairy cow system. J Clean Prod (2014)
-        item-139 at level 4: list_item: Dall-Orsoletta AC; Almeida JGR;  ...  to late lactation. J Dairy Sci (2016)
-        item-140 at level 4: list_item: Dall-Orsoletta AC; Oziemblowski  ... entation. Anim Feed Sci Technol (2019)
-        item-141 at level 4: list_item: Niu M; Appuhamy JADRN; Leytem AB ... s simultaneously. Anim Prod Sci (2016)
-        item-142 at level 4: list_item: Waghorn GC; Law N; Bryant M; Pac ... with fodder beet. Anim Prod Sci (2019)
-        item-143 at level 4: list_item: Dickhoefer U; Glowacki S; Gómez  ...  protein and starch. Livest Sci (2018)
-        item-144 at level 4: list_item: Schwab CG; Broderick GA. A 100-Y ... tion in dairy cows. J Dairy Sci (2017)
-        item-145 at level 4: list_item: Sordi A; Dieckow J; Bayer C; Alb ... tureland. Agric Ecosyst Environ (2014)
-        item-146 at level 4: list_item: Simon PL; Dieckow J; de Klein CA ... pastures. Agric Ecosyst Environ (2018)
-        item-147 at level 4: list_item: Wang X; Ledgard S; Luo J; Guo Y; ... e assessment. Sci Total Environ (2018)
-        item-148 at level 4: list_item: Pirlo G; Lolli S. Environmental  ...  Lombardy (Italy). J Clean Prod (2019)
-        item-149 at level 4: list_item: Herzog A; Winckler C; Zollitsch  ... tigation. Agric Ecosyst Environ (2018)
-        item-150 at level 4: list_item: Mostert PF; van Middelaar CE; Bo ... f milk production. J Clean Prod (2018)
-        item-151 at level 4: list_item: Mostert PF; van Middelaar CE; de ...  of milk production. Agric Syst (2018)
-        item-152 at level 4: list_item: Foley JA; Ramankutty N; Brauman  ... for a cultivated planet. Nature (2011)
-        item-153 at level 4: list_item: Lal R.. Soil Carbon Sequestratio ... nd Food Security. Science (80-) (2004)
-        item-154 at level 4: list_item: Boddey RM; Jantalia CP; Conceiça ... al agriculture. Glob Chang Biol (2010)
-        item-155 at level 4: list_item: McConkey B; Angers D; Bentham M; ... he LULUCF sector for NIR 2014.  (2014)
-  item-156 at level 1: caption: Table 1: Descriptive characteristics of the herd.
-  item-157 at level 1: caption: Table 2: Dairy cows’ diets in different scenariosa.
-  item-158 at level 1: caption: Table 3: GHG emission factors for Off- and On-farm feed production.
-  item-159 at level 1: caption: Table 4: GHG emissions from On-farm feed production.
-  item-160 at level 1: caption: Table 5: Factors for major resource inputs in farm management.
-  item-161 at level 1: caption: Fig 1: Overview of the milk prod ... stem boundary considered in the study.
-  item-162 at level 1: caption: Fig 2: Overall greenhouse gas em ... lectricity = 0.205 kg CO2e kWh-1 [46].
-  item-163 at level 1: caption: Fig 3: Sensitivity of the C foot ... lectricity = 0.205 kg CO2e kWh-1 [46].
-  item-164 at level 1: caption: Fig 4: Greenhouse gas emissions  ... uestered CO2-C from perennial pasture.
+    item-2 at level 2: paragraph: Henrique M. N. Ribeiro-Filho, Maurício Civiero, Ermias Kebreab
+    item-3 at level 2: paragraph: Department of Animal Science, Un ... atarina, Lages, Santa Catarina, Brazil
+    item-4 at level 2: section_header: Abstract
+      item-5 at level 3: text: Carbon (C) footprint of dairy pr ... uce the C footprint to a small extent.
+    item-6 at level 2: section_header: Introduction
+      item-7 at level 3: text: Greenhouse gas (GHG) emissions f ... suitable for food crop production [4].
+      item-8 at level 3: text: Considering the key role of live ... anagement to mitigate the C footprint.
+      item-9 at level 3: text: In subtropical climate zones, co ... t in tropical pastures (e.g. [17–19]).
+      item-10 at level 3: text: It has been shown that dairy cow ... sions from crop and reduced DM intake.
+      item-11 at level 3: text: The aim of this work was to quan ... uring lactation periods was evaluated.
+    item-12 at level 2: section_header: Materials and methods
+      item-13 at level 3: text: An LCA was developed according t ... 90816 - https://www.udesc.br/cav/ceua.
+      item-14 at level 3: section_header: System boundary
+        item-15 at level 4: text: The goal of the study was to ass ... n were outside of the system boundary.
+        item-16 at level 4: picture
+          item-16 at level 5: caption: Fig 1 Overview of the milk production system boundary considered in the study.
+      item-17 at level 3: section_header: Functional unit
+        item-18 at level 4: text: The functional unit was one kilo ... tein according to NRC [20] as follows:
+        item-19 at level 4: text: ECM = Milk production × (0.0929  ...  characteristics described in Table 1.
+        item-20 at level 4: table with [13x3]
+          item-20 at level 5: caption: Table 1 Descriptive characteristics of the herd.
+      item-21 at level 3: section_header: Data sources and livestock system description
+        item-22 at level 4: text: The individual feed requirements ... ed to the ad libitum TMR intake group.
+        item-23 at level 4: text: Using experimental data, three s ... med during an entire lactation period.
+      item-24 at level 3: section_header: Impact assessment
+        item-25 at level 4: text: The CO2e emissions were calculat ... 65 for CO2, CH4 and N2O, respectively.
+      item-26 at level 3: section_header: Feed production
+        item-27 at level 4: section_header: Diets composition
+          item-28 at level 5: text: The DM intake of each ingredient ...  collected throughout the experiments.
+          item-29 at level 5: table with [21x11]
+            item-29 at level 6: caption: Table 2 Dairy cows’ diets in different scenariosa.
+        item-30 at level 4: section_header: GHG emissions from crop and pasture production
+          item-31 at level 5: text: GHG emission factors used for of ... onsume 70% of pastures during grazing.
+          item-32 at level 5: table with [9x5]
+            item-32 at level 6: caption: Table 3 GHG emission factors for Off- and On-farm feed production.
+          item-33 at level 5: text: Emissions from on-farm feed prod ... factors described by Rotz et al. [42].
+          item-34 at level 5: table with [28x5]
+            item-34 at level 6: caption: Table 4 GHG emissions from On-farm feed production.
+      item-35 at level 3: section_header: Animal husbandry
+        item-36 at level 4: text: The CH4 emissions from enteric f ... 1) = 13.8 + 0.185 × NDF (% DM intake).
+      item-37 at level 3: section_header: Manure from confined cows and urine and dung from grazing animals
+        item-38 at level 4: text: The CH4 emission from manure (kg ... for dietary GE per kg of DM (MJ kg-1).
+        item-39 at level 4: text: The OM digestibility was estimat ... h were 31%, 26% and 46%, respectively.
+        item-40 at level 4: text: The N2O-N emissions from urine a ...  using the IPCC [38] emission factors.
+      item-41 at level 3: section_header: Farm management
+        item-42 at level 4: text: Emissions due to farm management ...  crop and pasture production’ section.
+        item-43 at level 4: table with [12x4]
+          item-43 at level 5: caption: Table 5 Factors for major resource inputs in farm management.
+        item-44 at level 4: text: The amount of fuel use for manur ... me that animals stayed on confinement.
+        item-45 at level 4: text: The emissions from fuel were est ...  × kg CO2e (kg machinery mass)-1 [42].
+        item-46 at level 4: text: Emissions from electricity for m ... ws in naturally ventilated barns [47].
+      item-47 at level 3: section_header: Co-product allocation
+        item-48 at level 4: text: The C footprint for milk produce ...  directly assigned to milk production.
+      item-49 at level 3: section_header: Sensitivity analysis
+        item-50 at level 4: text: A sensitivity index was calculat ... ses a similar change in the footprint.
+    item-51 at level 2: section_header: Results and discussion
+      item-52 at level 3: text: The study has assessed the impac ... , feed production and electricity use.
+      item-53 at level 3: section_header: Greenhouse gas emissions
+        item-54 at level 4: text: Depending on emission factors us ... more than 5% of overall GHG emissions.
+        item-55 at level 4: picture
+          item-55 at level 5: caption: Fig 2 Overall greenhouse gas emissions in dairy cattle systems under various scenarios. TMR = ad libitum TMR intake, 75TMR = 75% of ad libitum TMR intake with access to pasture, 50TMR = 50% of ad libitum TMR intake with access to pasture. (a) N2O emission factors for urine and dung from IPCC [38], feed production emission factors from Table 3 without accounting for sequestered CO2-C from perennial pasture, production of electricity = 0.73 kg CO2e kWh-1 [41]. (b) N2O emission factors for urine and dung from IPCC [38], feed production emission factors from Table 3 without accounting for sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46]; (c) N2O emission factors for urine and dung from local data [37], feed production EF from Table 4 without accounting for sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46]. (d) N2O emission factors for urine and dung from local data [37], feed production emission factors from Table 4 accounting for sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46].
+        item-56 at level 4: text: Considering IPCC emission factor ...  the C footprint of the dairy systems.
+        item-57 at level 4: text: The similarity of C footprint be ... of TMR was replaced by pasture access.
+        item-58 at level 4: text: The lower C footprint in scenari ... r, averaging 0.004 kg N2O-N kg-1 [37].
+      item-59 at level 3: section_header: Methane emissions
+        item-60 at level 4: text: The enteric CH4 intensity was si ... ], which did not happen in this study.
+        item-61 at level 4: picture
+          item-61 at level 5: caption: Fig 3 Sensitivity of the C footprint. Sensitivity index = percentage change in C footprint for a 10% change in the given emission source divided by 10% of. (a) N2O emission factors for urine and dung from IPCC [38], feed production emission factors from Table 3, production of electricity = 0.73 kg CO2e kWh-1 [41]. (b) N2O emission factors for urine and dung from IPCC [38], feed production emission factors from Table 3, production of electricity = 0.205 kg CO2e kWh-1 [46]; (c) N2O emission factors for urine and dung from local data [37], feed production EF from Table 4 without accounting sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46]. (d) N2O emission factors for urine and dung from local data [37], feed production emission factors from Table 4 accounting sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46].
+        item-62 at level 4: text: The lack of difference in enteri ...  same scenarios as in this study [26].
+      item-63 at level 3: section_header: Emissions from excreta and feed production
+        item-64 at level 4: text: Using IPCC emission factors for  ...  may not be captured by microbes [65].
+        item-65 at level 4: picture
+          item-65 at level 5: caption: Fig 4 Greenhouse gas emissions (GHG) from manure and feed production in dairy cattle systems. TMR = ad libitum TMR intake, 75TMR = 75% of ad libitum TMR intake with access to pasture, 50TMR = 50% of ad libitum TMR intake with access to pasture. (a) N2O emission factors for urine and dung from IPCC [38]. (b) Feed production emission factors from Table 3. (c) N2O emission factors for urine and dung from local data [37]. (d) Feed production emission factors from Table 4 accounting sequestered CO2-C from perennial pasture.
+        item-66 at level 4: text: Using local emission factors for ... be revised for the subtropical region.
+        item-67 at level 4: text: Emissions for feed production de ... act, particularly in confinements [9].
+      item-68 at level 3: section_header: Farm management
+        item-69 at level 4: text: The lower impact of emissions fr ...  greater than 5% of total C footprint.
+        item-70 at level 4: text: Emissions from farm management d ...  gas and hard coal, respectively [46].
+      item-71 at level 3: section_header: Assumptions and limitations
+        item-72 at level 4: text: The milk production and composit ... ions as a function of soil management.
+      item-73 at level 3: section_header: Further considerations
+        item-74 at level 4: text: The potential for using pasture  ... g ECM)-1 in case of foot lesions [72].
+        item-75 at level 4: text: Grazing lands may also improve b ... hange of CO2 would be negligible [76].
+    item-76 at level 2: section_header: Conclusions
+      item-77 at level 3: text: This study assessed the C footpr ... on with or without access to pastures.
+    item-78 at level 2: section_header: Acknowledgments
+      item-79 at level 3: text: Thanks to Anna Naranjo for helpf ...  of the herd considered in this study.
+    item-80 at level 2: section_header: References
+      item-81 at level 3: list: group list
+        item-82 at level 4: list_item: IPCC. Climate Change and Land. Chapter 5: Food Security. 2019.
+        item-83 at level 4: list_item: HerreroM, HendersonB, HavlíkP, T ...  2016;6: 452–461. 10.1038/nclimate2925
+        item-84 at level 4: list_item: Rivera-FerreMG, López-i-GelatsF, ... hang. 2016;7: 869–892. 10.1002/wcc.421
+        item-85 at level 4: list_item: van ZantenHHE, MollenhorstH, Klo ... 21: 747–758. 10.1007/s11367-015-0944-1
+        item-86 at level 4: list_item: HristovAN, OhJ, FirkinsL, Dijkst ... 5–5069. 10.2527/jas.2013-6583 24045497
+        item-87 at level 4: list_item: HristovAN, OttT, TricaricoJ, Rot ... 5–5113. 10.2527/jas.2013-6585 24045470
+        item-88 at level 4: list_item: MontesF, MeinenR, DellC, RotzA,  ... 0–5094. 10.2527/jas.2013-6584 24045493
+        item-89 at level 4: list_item: LedgardSF, WeiS, WangX, Falconer ... : 155–163. 10.1016/j.agwat.2018.10.009
+        item-90 at level 4: list_item: O’BrienD, ShallooL, PattonJ, Buc ... 107: 33–46. 10.1016/j.agsy.2011.11.004
+        item-91 at level 4: list_item: SalouT, Le MouëlC, van der WerfH ... od. 2017 10.1016/j.jclepro.2016.05.019
+        item-92 at level 4: list_item: LizarraldeC, PicassoV, RotzCA, C ... gric Res. 2014;3: 1 10.5539/sar.v3n2p1
+        item-93 at level 4: list_item: ClarkCEF, KaurR, MillapanLO, Gol ... –5465. 10.3168/jds.2017-13388 29550132
+        item-94 at level 4: list_item: Food and Agriculture Organization. FAOSTAT. 2017.
+        item-95 at level 4: list_item: VogelerI, MackayA, VibartR, Rend ... .1016/j.scitotenv.2016.05.006 27203517
+        item-96 at level 4: list_item: WilkinsonJM, LeeMRF, RiveroMJ, C ... 0;75: 1–17. 10.1111/gfs.12458 32109974
+        item-97 at level 4: list_item: WalesWJ, MarettLC, GreenwoodJS,  ... i. 2013;53: 1167–1178. 10.1071/AN13207
+        item-98 at level 4: list_item: BargoF, MullerLD, DelahoyJE, Cas ... 168/jds.S0022-0302(02)74381-6 12487461
+        item-99 at level 4: list_item: VibartRE, FellnerV, BurnsJC, Hun ... 80. 10.1017/S0022029908003361 18701000
+        item-100 at level 4: list_item: MendozaA, CajarvilleC, RepettoJL ... –1944. 10.3168/jds.2015-10257 26778319
+        item-101 at level 4: list_item: NRC. Nutrient Requirements of Da ... gton DC: National Academy Press; 2001.
+        item-102 at level 4: list_item: INRA. INRA Feeding System for Ru ... shiers; 2018 10.3920/978-90-8686-872-8
+        item-103 at level 4: list_item: LorenzH, ReinschT, HessS, TaubeF ... 161–170. 10.1016/j.jclepro.2018.11.113
+        item-104 at level 4: list_item: ISO 14044. INTERNATIONAL STANDAR ... rements and guidelines. 2006;2006: 46.
+        item-105 at level 4: list_item: ISO 14040. The International Sta ... ;2006: 1–28. 10.1136/bmj.332.7550.1107
+        item-106 at level 4: list_item: FAO. Environmental Performance o ... nerships/leap/resources/guidelines/en/
+        item-107 at level 4: list_item: CivieroM, Ribeiro-FilhoHMN, Scha ... nce,. Foz do Iguaçu; 2019 pp. 141–141.
+        item-108 at level 4: list_item: IPCC—Intergovernmental Panel on  ... /2018/05/SYR_AR5_FINAL_full_wcover.pdf
+        item-109 at level 4: list_item: INRA. Alimentation des bovins, o ... Inra 2007. 4th ed. INRA, editor. 2007.
+        item-110 at level 4: list_item: DelagardeR, FaverdinP, BaratteC, ... 5–60. 10.1111/j.1365-2494.2010.00770.x
+        item-111 at level 4: list_item: MaBL, LiangBC, BiswasDK, Morriso ... 2;94: 15–31. 10.1007/s10705-012-9522-0
+        item-112 at level 4: list_item: RauccciGS, MoreiraCS, AlvesPS, M ... State. J Clean Prod. 2015;96: 418–425.
+        item-113 at level 4: list_item: CamargoGGT, RyanMR, RichardTL. E ... 3;63: 263–273. 10.1525/bio.2013.63.4.6
+        item-114 at level 4: list_item: da SilvaMSJ, JobimCC, PoppiEC, T ... 3–313. 10.1590/S1806-92902015000900001
+        item-115 at level 4: list_item: Duchini PGPGGuzatti GCGC, Ribeir ... Sci. 2016;67: 574–581. 10.1071/CP15170
+        item-116 at level 4: list_item: ScaravelliLFB, PereiraLET, Olivo ... teiras. Cienc Rural. 2007;37: 841–846.
+        item-117 at level 4: list_item: SbrissiaAF, DuchiniPG, ZaniniGD, ... : 945–954. 10.2135/cropsci2017.07.0447
+        item-118 at level 4: list_item: AlmeidaJGR, Dall-OrsolettaAC, Oz ... 12. 10.1017/S1751731119003057 31907089
+        item-119 at level 4: list_item: Intergovernamental Panel on Clim ... Global Environmental Strategies; 2006.
+        item-120 at level 4: list_item: RamalhoB, DieckowJ, BarthG, Simo ... il Sci. 2020; 1–14. 10.1111/ejss.12933
+        item-121 at level 4: list_item: FernandesHC, da SilveiraJCM, Rin ... –1587. 10.1590/s1413-70542008000500034
+        item-122 at level 4: list_item: Wang M Q. GREET 1.8a Spreadsheet ... transportation.anl.gov/software/GREET/
+        item-123 at level 4: list_item: RotzCAA, MontesF, ChianeseDS, Ch ... 6–1282. 10.3168/jds.2009-2162 20172247
+        item-124 at level 4: list_item: NiuM, KebreabE, HristovAN, OhJ,  ...  3368–3389. 10.1111/gcb.14094 29450980
+        item-125 at level 4: list_item: EugèneM, SauvantD, NozièreP, Via ... 10.1016/j.jenvman.2018.10.086 30602259
+        item-126 at level 4: list_item: ReedKF, MoraesLE, CasperDP, Kebr ... 5–3035. 10.3168/jds.2014-8397 25747829
+        item-127 at level 4: list_item: BarrosMV, PiekarskiCM, De Franci ... . Energies. 2018;11 10.3390/en11061412
+        item-128 at level 4: list_item: LudingtonD, JohnsonE. Dairy Farm ...  York State Energy Res Dev Auth. 2003.
+        item-129 at level 4: list_item: ThomaG, JollietO, WangY. A bioph ...  2013;31 10.1016/j.idairyj.2012.08.012
+        item-130 at level 4: list_item: NaranjoA, JohnsonA, RossowH. Gre ... . 2020 10.3168/jds.2019-16576 32037166
+        item-131 at level 4: list_item: JayasundaraS, WordenD, WeersinkA ... 18–1028. 10.1016/j.jclepro.2019.04.013
+        item-132 at level 4: list_item: WilliamsSRO, FisherPD, Berrisfor ... 4;19: 69–78. 10.1007/s11367-013-0619-8
+        item-133 at level 4: list_item: GollnowS, LundieS, MooreAD, McLa ... : 31–38. 10.1016/j.idairyj.2014.02.005
+        item-134 at level 4: list_item: O’BrienD, CapperJL, GarnsworthyP ... i. 2014 10.3168/jds.2013-7174 24440256
+        item-135 at level 4: list_item: ChobtangJ, McLarenSJ, LedgardSF, ... 2017;21: 1139–1152. 10.1111/jiec.12484
+        item-136 at level 4: list_item: GargMR, PhondbaBT, SherasiaPL, M ... Sci. 2016;56: 423–436. 10.1071/AN15464
+        item-137 at level 4: list_item: de LéisCM, CherubiniE, RuviaroCF ... 5;20: 46–60. 10.1007/s11367-014-0813-3
+        item-138 at level 4: list_item: O’BrienD, GeogheganA, McNamaraK, ... Sci. 2016;56: 495–500. 10.1071/AN15490
+        item-139 at level 4: list_item: O’BrienD, BrennanP, HumphreysJ,  ... : 1469–1481. 10.1007/s11367-014-0755-9
+        item-140 at level 4: list_item: BaekCY, LeeKM, ParkKH. Quantific ... : 50–60. 10.1016/j.jclepro.2014.02.010
+        item-141 at level 4: list_item: Dall-OrsolettaAC, AlmeidaJGR, Ca ... –4383. 10.3168/jds.2015-10396 27016830
+        item-142 at level 4: list_item: Dall-OrsolettaAC, OziemblowskiMM ... 5–73. 10.1016/j.anifeedsci.2019.05.009
+        item-143 at level 4: list_item: NiuM, AppuhamyJADRN, LeytemAB, D ... Sci. 2016;56: 312–321. 10.1071/AN15498
+        item-144 at level 4: list_item: WaghornGC, LawN, BryantM, Pachec ... i. 2019;59: 1261–1270. 10.1071/AN18018
+        item-145 at level 4: list_item: DickhoeferU, GlowackiS, GómezCA, ...  109–118. 10.1016/j.livsci.2018.08.004
+        item-146 at level 4: list_item: SchwabCG, BroderickGA. A 100-Yea ... 10112. 10.3168/jds.2017-13320 29153157
+        item-147 at level 4: list_item: SordiA, DieckowJ, BayerC, Alburq ... 90: 94–103. 10.1016/j.agee.2013.09.004
+        item-148 at level 4: list_item: SimonPL, DieckowJ, de KleinCAM,  ... 267: 74–82. 10.1016/j.agee.2018.08.013
+        item-149 at level 4: list_item: WangX, LedgardS, LuoJ, GuoY, Zha ... .1016/j.scitotenv.2017.12.259 29291563
+        item-150 at level 4: list_item: PirloG, LolliS. Environmental im ... 962–971. 10.1016/j.jclepro.2018.11.070
+        item-151 at level 4: list_item: HerzogA, WincklerC, ZollitschW.  ... 7: 174–187. 10.1016/j.agee.2018.07.029
+        item-152 at level 4: list_item: MostertPF, van MiddelaarCE, Bokk ... od. 2018 10.1016/j.jclepro.2017.10.019
+        item-153 at level 4: list_item: MostertPF, van MiddelaarCE, de B ... 7: 206–212. 10.1016/j.agsy.2018.09.006
+        item-154 at level 4: list_item: FoleyJA, RamankuttyN, BraumanKA, ...  337–342. 10.1038/nature10452 21993620
+        item-155 at level 4: list_item: LalR. Soil Carbon Sequestration  ... 1627. 10.1126/science.1097396 15192216
+        item-156 at level 4: list_item: BoddeyRM, JantaliaCP, ConceiçaoP ... –795. 10.1111/j.1365-2486.2009.02020.x
+        item-157 at level 4: list_item: McConkeyB, AngersD, BenthamM, Bo ...  the LULUCF sector for NIR 2014. 2014.
+  item-158 at level 1: caption: Fig 1 Overview of the milk produ ... stem boundary considered in the study.
+  item-159 at level 1: caption: Table 1 Descriptive characteristics of the herd.
+  item-160 at level 1: caption: Table 2 Dairy cows’ diets in different scenariosa.
+  item-161 at level 1: caption: Table 3 GHG emission factors for Off- and On-farm feed production.
+  item-162 at level 1: caption: Table 4 GHG emissions from On-farm feed production.
+  item-163 at level 1: caption: Table 5 Factors for major resource inputs in farm management.
+  item-164 at level 1: caption: Fig 2 Overall greenhouse gas emi ... lectricity = 0.205 kg CO2e kWh-1 [46].
+  item-165 at level 1: caption: Fig 3 Sensitivity of the C footp ... lectricity = 0.205 kg CO2e kWh-1 [46].
+  item-166 at level 1: caption: Fig 4 Greenhouse gas emissions ( ... uestered CO2-C from perennial pasture.
--- a/tests/data/groundtruth/docling_v2/pone.0234687.xml.json
+++ b/tests/data/groundtruth/docling_v2/pone.0234687.xml.json
--- a/tests/data/groundtruth/docling_v2/pone.0234687.xml.md
+++ b/tests/data/groundtruth/docling_v2/pone.0234687.xml.md
@ -1,6 +1,8 @@
 # Potential to reduce greenhouse gas emissions through different dairy cattle systems in subtropical regions

-Ribeiro-Filho Henrique M. N.; 1: Department of Animal Science, University of California, Davis, California, United States of America, 2: Programa de Pós-graduação em Ciência Animal, Universidade do Estado de Santa Catarina, Lages, Santa Catarina, Brazil; Civiero Maurício; 2: Programa de Pós-graduação em Ciência Animal, Universidade do Estado de Santa Catarina, Lages, Santa Catarina, Brazil; Kebreab Ermias; 1: Department of Animal Science, University of California, Davis, California, United States of America
+Henrique M. N. Ribeiro-Filho, Maurício Civiero, Ermias Kebreab
+
+Department of Animal Science, University of California, Davis, California, United States of America; Programa de Pós-graduação em Ciência Animal, Universidade do Estado de Santa Catarina, Lages, Santa Catarina, Brazil

 ## Abstract

@ -26,12 +28,33 @@ An LCA was developed according to the ISO standards [23,24] and Food and Agricul

 The goal of the study was to assess the C footprint of annual tropical and temperate pastures in lactating dairy cow diets. The production system was divided into four main processes: (i) animal husbandry, (ii) manure management and urine and dung deposited by grazing animals, (iii) production of feed ingredients and (iv) farm management (Fig 1). The study boundary included all processes up to the animal farm gate (cradle to gate), including secondary sources such as GHG emissions during the production of fuel, electricity, machinery, manufacturing of fertilizer, pesticides, seeds and plastic used in silage production. Fuel combustion and machinery (manufacture and repairs) for manure handling and electricity for milking and confinement were accounted as emissions from farm management. Emissions post milk production were assumed to be similar for all scenarios, therefore, activities including milk processing, distribution, retail or consumption were outside of the system boundary.

+Fig 1 Overview of the milk production system boundary considered in the study.
+
+<!-- image -->
+
 ### Functional unit

 The functional unit was one kilogram of energy-corrected milk (ECM) at the farm gate. All processes in the system were calculated based on one kilogram ECM. The ECM was calculated by multiplying milk production by the ratio of the energy content of the milk to the energy content of standard milk with 4% fat and 3.3% true protein according to NRC [20] as follows:

 ECM = Milk production × (0.0929 × fat% + 0.0588× true protein% + 0.192) / (0.0929 × (4%) + 0.0588 × (3.3%) + 0.192), where fat% and protein% are fat and protein percentages in milk, respectively. The average milk production and composition were recorded from the University of Santa Catarina State (Brazil) herd, considering 165 lactations between 2009 and 2018. The herd is predominantly Holstein × Jersey cows, with key characteristics described in Table 1.

+Table 1 Descriptive characteristics of the herd.
+
+| Item                          | Unit      | Average   |
+|-------------------------------|-----------|-----------|
+| Milking cows                  | #         | 165       |
+| Milk production               | kg year-1 | 7,015     |
+| Milk fat                      | %         | 4.0       |
+| Milk protein                  | %         | 3.3       |
+| Length of lactation           | days      | 305       |
+| Body weight                   | kg        | 553       |
+| Lactations per cow            | #         | 4         |
+| Replacement rate              | %         | 25        |
+| Cull rate                     | %         | 25        |
+| First artificial insemination | months    | 16        |
+| Weaned                        | days      | 60        |
+| Mortality                     | %         | 3.0       |
+
 ### Data sources and livestock system description

 The individual feed requirements, as well as the milk production responses based on feed strategies were based on data recorded from the herd described above and two experiments performed using lactating cows from the same herd. Due to the variation on herbage production throughout the year, feed requirements were estimated taking into consideration that livestock systems have a calving period in April, which represents the beginning of fall season in the southern Hemisphere. The experiments have shown a 10% reduction in ECM production in dairy cows that received both 75 and 50% of ad libitum TMR intake with access to grazing a tropical pasture (pearl-millet, Pennisetum glaucum ‘Campeiro’) compared to cows receiving ad libitum TMR intake. Cows grazing on a temperate pasture (ryegrass, Lolium multiflorum ‘Maximus’) did not need changes to ECM production compared to the ad libitum TMR intake group.
@ -48,108 +71,7 @@ The CO2e emissions were calculated by multiplying the emissions of CO2, CH4 and

 The DM intake of each ingredient throughout the entire life of animals during lactation periods was calculated for each scenario: cows receiving only TMR, cows receiving 75% of TMR with annual pastures and cows receiving 50% of TMR with annual pastures (Table 2). In each of other phases of life (calf, heifer, dry cow), animals received the same diet, including a perennial tropical pasture (kikuyu grass, Pennisetum clandestinum). The DM intake of calves, heifers and dry cows was calculated assuming 2.8, 2.5 and 1.9% body weight, respectively [20]. In each case, the actual DM intake of concentrate and corn silage was recorded, and pasture DM intake was estimated by the difference between daily expected DM intake and actual DM intake of concentrate and corn silage. For lactating heifers and cows, TMR was formulated to meet the net energy for lactation (NEL) and metabolizable protein (MP) requirements of experimental animals, according to [28]. The INRA system was used because it is possible to estimate pasture DM intake taking into account the TMR intake, pasture management and the time of access to pasture using the GrazeIn model [29], which was integrated in the software INRAtion 4.07 (https://www.inration.educagri.fr/fr/forum.php). The nutrient intake was calculated as a product of TMR and pasture intake and the nutrient contents of TMR and pasture, respectively, which were determined in feed samples collected throughout the experiments.

-#### GHG emissions from crop and pasture production
-
-GHG emission factors used for off- and on-farm feed production were based on literature values, and are presented in Table 3. The emission factor used for corn grain is the average of emission factors observed in different levels of synthetic N fertilization [30]. The emission factor used for soybean is based on Brazilian soybean production [31]. The emissions used for corn silage, including feed processing (cutting, crushing and mixing), and annual or perennial grass productions were 3300 and 1500 kg CO2e ha-1, respectively [32]. The DM production (kg ha-1) of corn silage and pastures were based on regional and locally recorded data [33–36], assuming that animals are able to consume 70% of pastures during grazing.
-
-Emissions from on-farm feed production (corn silage and pasture) were estimated using primary and secondary sources based on the actual amount of each input (Table 4). Primary sources were direct and indirect N2O-N emissions from organic and synthetic fertilizers and crop/pasture residues, CO2-C emissions from lime and urea applications, as well as fuel combustion. The direct N2O-N emission factor (kg (kg N input)-1) is based on a local study performed previously [37]. For indirect N2O-N emissions (kg N2O-N (kg NH3-N + NOx)-1), as well as CO2-C emissions from lime + urea, default values proposed by IPCC [38] were used. For perennial pastures, a C sequestration of 0.57 t ha-1 was used based on a 9-year study conducted in southern Brazil [39]. Due to the use of conventional tillage, no C sequestration was considered for annual pastures. The amount of fuel required was 8.9 (no-tillage) and 14.3 L ha-1 (disking) for annual tropical and temperate pastures, respectively [40]. The CO2 from fuel combustion was 2.7 kg CO2 L-1 [41]. Secondary sources of emissions during the production of fuel, machinery, fertilizer, pesticides, seeds and plastic for ensilage were estimated using emission factors described by Rotz et al. [42].
-
-### Animal husbandry
-
-The CH4 emissions from enteric fermentation intensity (g (kg ECM)-1) was a function of estimated CH4 yield (g (kg DM intake)-1), actual DM intake and ECM. The enteric CH4 yield was estimated as a function of neutral detergent fiber (NDF) concentration on total DM intake, as proposed by Niu et al. [43], where: CH4 yield (g (kg DM intake)-1) = 13.8 + 0.185 × NDF (% DM intake).
-
-### Manure from confined cows and urine and dung from grazing animals
-
-The CH4 emission from manure (kg (kg ECM)-1) was a function of daily CH4 emission from manure (kg cow-1) and daily ECM (kg cow-1). The daily CH4 emission from manure was estimated according to IPCC [38], which considered daily volatile solid (VS) excreted (kg DM cow-1) in manure. The daily VS was estimated as proposed by Eugène et al. [44] as: VS = NDOMI + (UE × GE) × (OM/18.45), where: VS = volatile solid excretion on an organic matter (OM) basis (kg day-1), NDOMI = non-digestible OM intake (kg day-1): (1- OM digestibility) × OM intake, UE = urinary energy excretion as a fraction of GE (0.04), GE = gross energy intake (MJ day-1), OM = organic matter (g), 18.45 = conversion factor for dietary GE per kg of DM (MJ kg-1).
-
-The OM digestibility was estimated as a function of chemical composition, using equations published by INRA [21], which takes into account the effects of digestive interactions due to feeding level, the proportion of concentrate and rumen protein balance on OM digestibility. For scenarios where cows had access to grazing, the amount of calculated VS were corrected as a function of the time at pasture. The biodegradability of manure factor (0.13 for dairy cows in Latin America) and methane conversion factor (MCF) values were taken from IPCC [38]. The MCF values for pit storage below animal confinements (&gt; 1 month) were used for the calculation, taking into account the annual average temperature (16.6ºC) or the average temperatures during the growth period of temperate (14.4ºC) or tropical (21ºC) annual pastures, which were 31%, 26% and 46%, respectively.
-
-The N2O-N emissions from urine and feces were estimated considering the proportion of N excreted as manure and storage or as urine and dung deposited by grazing animals. These proportions were calculated based on the proportion of daily time that animals stayed on pasture (7 h/24 h = 0.29) or confinement (1−0.29 = 0.71). For lactating heifers and cows, the total amount of N excreted was calculated by the difference between N intake and milk N excretion. For heifers and non-lactating cows, urinary and fecal N excretion were estimated as proposed by Reed et al. [45] (Table 3: equations 10 and 12, respectively). The N2O emissions from stored manure as well as urine and dung during grazing were calculated based on the conversion of N2O-N emissions to N2O emissions, where N2O emissions = N2O-N emissions × 44/28. The emission factors were 0.002 kg N2O-N (kg N)-1 stored in a pit below animal confinements, and 0.02 kg N2O-N (kg of urine and dung)-1 deposited on pasture [38]. The indirect N2O emissions from storage manure and urine and dung deposits on pasture were also estimated using the IPCC [38] emission factors.
-
-### Farm management
-
-Emissions due to farm management included those from fuel and machinery for manure handling and electricity for milking and confinement (Table 5). Emissions due to feed processing such as cutting, crushing, mixing and distributing, as well as secondary sources of emissions during the production of fuel, machinery, fertilizer, pesticides, seeds and plastic for ensilage were included in ‘Emissions from crop and pasture production’ section.
-
-The amount of fuel used for manure handling was estimated taking into consideration the amount of manure produced per cow and the amount of fuel required for manure handling (L diesel t-1) [42]. The amount of manure was estimated from OM excretions (kg cow-1), assuming that the manure has 8% ash on a DM basis and 60% DM content. The OM excretions were calculated as NDOMI × days in confinement × proportion of daily time that animals stayed in confinement.
-
-The emissions from fuel were estimated considering the primary (emissions from fuel burned) and secondary (emissions for producing and transporting fuel) emissions. The primary emissions were calculated by the amount of fuel required for manure handling (L) × (kg CO2e L-1) [41]. The secondary emissions from fuel were calculated by the amount of fuel required for manure handling × emissions for production and transport of fuel (kg CO2e L-1) [41]. Emissions from manufacture and repair of machinery for manure handling were estimated by manure produced per cow (t) × (kg machinery mass (kg manure)-1 × 10−3) [42] × kg CO2e (kg machinery mass)-1 [42].
-
-Emissions from electricity for milking and confinement were estimated using two emission factors (kg CO2 kWh-1). The first one is based on United States electricity matrix [41], and was used as a reference of an electricity matrix with less hydroelectric power than the region under study. The second is based on the Brazilian electricity matrix [46]. The electricity required for milking activities is 0.06 kWh (kg milk produced)-1 [47]. The annual electricity use for lighting was 75 kWh cow-1, which is the value considered for lactating cows in naturally ventilated barns [47].
-
-The lower impact of emissions from farm management is in agreement with other studies conducted in Europe [9, 62] and USA [42, 55], where the authors found that most emissions in dairy production systems are from enteric fermentation, feed production and emissions from excreta. As emissions from fuel for on-farm feed production were accounted into the ‘emissions from crop and pasture production’, total emissions from farm management were not greater than 5% of total C footprint.
-
-Emissions from farm management dropped when the emission factor for electricity generation was based on the Brazilian matrix. In this case, the emission factor for electricity generation (0.205 kg CO2e kWh-1 [46]) is much lower than that in a LCA study conducted in US (0.73 kg CO2e kWh-1 [42]). This apparent discrepancy is explained because in 2016, almost 66% of the electricity generated in Brazil was from hydropower, which has an emission factor of 0.074 kg CO2e kWh-1 against 0.382 and 0.926 kg CO2e kWh-1 produced by natural gas and hard coal, respectively [46].
-
-### Co-product allocation
-
-The C footprint for milk produced in the system was calculated using a biophysical allocation approach, as recommended by the International Dairy Federation [49], and described by Thoma et al. [48]. Briefly, ARmilk = 1 − 6.04 × BMR, where: ARmilk is the allocation ratio for milk and BMR is cow BW at the time of slaughter (kg) + calf BW sold (kg) divided by the total ECM produced during the cow's entire life (kg). The ARmilk values were 0.854 and 0.849 for TMR and TMR with both pasture scenarios, respectively. The ARmilk was applied to all emissions, except for the electricity consumed for milking (milking parlor) and refrigerant loss, which were directly assigned to milk production.
-
-### Sensitivity analysis
-
-A sensitivity index was calculated as described by Rotz et al. [42]. The sensitivity index was defined for each emission source as the percentage change in the C footprint for a 10% change in the given emission source divided by 10%. Thus, a value near 0 indicates a low sensitivity, whereas an index near or greater than 1 indicates a high sensitivity because a change in this value causes a similar change in the footprint.
-
-## Results and discussion
-
-The study has assessed the impact of tropical and temperate pastures in dairy cows fed TMR on the C footprint of dairy production in the subtropics. Different factors were taken into consideration to estimate emissions from manure (or urine and dung) of grazing animals, feed production and electricity use.
-
-### Greenhouse gas emissions
-
-Depending on emission factors used for calculating emissions from urine and dung (IPCC or local data) and feed production (Tables 3 or 4), the C footprint was similar (Fig 2A and 2B) or decreased by 0.04 kg CO2e (kg ECM)-1 (Fig 2C and 2D) in scenarios that included pastures compared to ad libitum TMR intake. Due to differences in emission factors, the overall GHG emission values ranged from 0.92 to 1.04 kg CO2e (kg ECM)-1 for dairy cows receiving TMR exclusively, and from 0.88 to 1.04 kg CO2e (kg ECM)-1 for cows with access to pasture. Using IPCC emission factors [38], manure emissions increased as TMR intake went down (Fig 2A and 2B). However, using local emission factors for estimating N2O-N emissions [37], manure emissions decreased as TMR intake went down (Fig 2C and 2D). Regardless of emission factors used (Tables 3 or 4), emissions from feed production decreased to a small extent as the proportion of TMR intake decreased. Emissions from farm management did not contribute more than 5% of overall GHG emissions.
-
-Considering IPCC emission factors for N2O emissions from urine and dung [38] and those from Table 3, the C footprint ranged from 0.99 to 1.04 kg CO2e (kg ECM)-1, and was close to those reported under confinement-based systems in California [49], Canada [50], China [8], Ireland [9], different scenarios in Australia [51,52] and Uruguay [11], which ranged from 0.98 to 1.16 kg CO2e (kg ECM)-1. When local emission factors for N2O emissions from urine and dung [37] and those from Table 4 were taken into account, the C footprint for scenarios including pasture, without accounting for sequestered CO2-C from perennial pasture—0.91 kg CO2e (kg ECM)-1—was lower than the range of values described above. However, these values were still greater than those of high-performance confinement systems in the UK and USA [53] or grass-based dairy systems in Ireland [9,53] and New Zealand [8,54], which ranged from 0.52 to 0.89 kg CO2e (kg ECM)-1. Regardless of which emission factor was used, we found a lower C footprint in all conditions compared to scenarios with lower milk production per cow or in poor conditions of manure management, which ranged from 1.4 to 2.3 kg CO2e (kg ECM)-1 [8,55]. Thus, even though differences between studies may be partially explained by various assumptions (e.g., emission factors, co-product allocation, methane emissions estimation, sequestered CO2-C, etc.), herd productivity and manure management were systematically associated with the C footprint of the dairy systems.
-
-The similarity of C footprint between different scenarios using IPCC [38] for estimating emissions from manure and for emissions from feed production (Table 3) was a consequence of the trade-off between greater manure emissions and lower emissions to produce feed, as the proportion of pasture in diets increased. Additionally, the small negative effect of pasture on ECM production also contributed to the trade-off. The impact of milk production on the C footprint was reported in a meta-analysis comprising 30 studies from 15 different countries [22]. As observed in this study (Fig 2A and 2B) the authors reported no significant difference between the C footprint of pasture-based vs. confinement systems. However, they observed that an increase of 1000 kg cow-1 (5000 to 6000 kg ECM) reduced the C footprint by 0.12 kg CO2e (kg ECM)-1, which may explain an apparent discrepancy between our study and an LCA performed in south Brazilian conditions [56]. Their study compared a confinement and a grazing-based dairy system with annual average milk production of 7667 and 5535 kg cow, respectively. In this study, the same herd was used in all systems, with an annual average milk production of around 7000 kg cow-1. Experimental data showed a reduction not greater than 3% of ECM when 50% of TMR was replaced by pasture access.
-
-The lower C footprint in scenarios with access to pasture, when local emission factors [37] were used for N2O emissions from urine and dung and for feed production (Table 4), may also be partially attributed to the small negative effect of pasture on ECM production. Nevertheless, local emission factors for urine and dung had a great impact on scenarios including pastures compared to ad libitum TMR intake. Whereas the IPCC [38] considers an emission of 0.02 kg N2O-N (kg N)-1 for urine and dung from grazing animals, experimental evidence shows that it may be up to five times lower, averaging 0.004 kg N2O-N kg-1 [37].
-
-### Methane emissions
-
-The enteric CH4 intensity was similar between different scenarios (Fig 2), showing the greatest sensitivity index, with values ranging from 0.53 to 0.62, which indicate that for a 10% change in this source, the C footprint may change between 5.3 and 6.2% (Fig 3). The large effect of enteric CH4 emissions on the whole C footprint was expected, because the impact of enteric CH4 on GHG emissions of milk production in different dairy systems has been estimated to range from 44 to 60% of the total CO2e [50,52,57,58]. However, emissions in feed production may be the most important source of GHG when emission factors for producing concentrate feeds are greater than 0.7 kg CO2e kg-1 [59], which did not happen in this study.
-
-The lack of difference in enteric CH4 emissions in different systems can be explained by the narrow range of NDF content in diets (&lt;4% difference). This non-difference is due to the lower NDF content of annual temperate pastures (495 g (kg DM)-1) compared to corn silage (550 g (kg DM)-1). Hence, the expected increase in NDF content with decreased concentrate was partially offset by an increase in the proportion of pasture, which is relatively low in NDF. This is in agreement with studies conducted in southern Brazil, which have shown that the actual enteric CH4 emissions may decrease with inclusion of temperate pastures in cows receiving corn silage and soybean meal [60] or increase when dairy cows grazing a temperate pasture were supplemented with corn silage [61]. Additionally, enteric CH4 emissions did not differ between dairy cows receiving TMR exclusively or grazing a tropical pasture in the same scenarios as in this study [26].
-
-### Emissions from excreta and feed production
-
-Using IPCC emission factors for N2O emissions from urine and dung [38] and those from Table 3, CH4 emissions from manure decreased 0.07 kg CO2e (kg ECM)-1, but N2O emissions from manure increased 0.09 kg CO2e (kg ECM)-1, as TMR intake was restricted to 50% ad libitum (Fig 4A). Emissions for pastures increased by 0.06 kg CO2e (kg ECM)-1, whereas emissions for producing concentrate feeds and corn silage decreased by 0.09 kg CO2e (kg ECM)-1, as TMR intake decreased (Fig 4B). In this situation, the lack of difference in calculated C footprints of different systems was also due to the greater emissions from manure, and offset by lower emissions from feed production with inclusion of pasture in lactating dairy cow diets. The greater N2O-N emissions from manure with pasture was a consequence of higher N2O-N emissions due to greater CP content and N urine excretion, as pasture intake increased. The effect of CP content on urine N excretion has been shown by several authors in lactating dairy cows [62–64]. For instance, by decreasing CP content from 185 to 152 g (kg DM)-1, N intake decreased by 20% and urine N excretion by 60% [62]. In this study, the CP content for lactating dairy cows ranged from 150 g (kg DM)-1 on TMR system to 198 g (kg DM)-1 on 50% TMR with pasture. Additionally, greater urine N excretion is expected with greater use of pasture. This occurs because protein utilization in pastures is inefficient, as the protein in fresh forages is highly degradable in the rumen and may not be captured by microbes [65].
-
-Using local emission factors for N2O emissions from urine and dung [37] and those from Table 4, reductions in CH4 emissions from stocked manure, when pastures were included on diets, did not offset by increases in N2O emissions from excreta (Fig 4C). In this case, total emissions from manure (Fig 4C) and feed production (Fig 4D) decreased with the inclusion of pasture. The impact of greater CP content and N urine excretion with increased pasture intake was offset by the much lower emission factors used for N2O emissions from urine and dung. As suggested by other authors [66,67], these results show that IPCC default value may need to be revised for the subtropical region.
-
-Emissions for feed production decreased when pasture was included due to the greater emission factor for corn grain production compared to pastures. Emissions from concentrate and silage had at least twice the sensitivity index compared to emissions from pastures. The amount of grain required per cow in a lifetime decreased from 7,300 kg to 4,000 kg when 50% of TMR was replaced by pasture access. These results are in agreement with other studies which found lower C footprint, as concentrate use is reduced and/or pasture is included [9,68,69]. Moreover, it has been demonstrated that in intensive dairy systems, after enteric fermentation, feed production is the second main contributor to C footprint [50]. There is potential to decrease the environmental impact of dairy systems by reducing the use of concentrate ingredients with high environmental impact, particularly in confinements [9].
-
-### Assumptions and limitations
-
-The milk production and composition data are the average for a typical herd, which might have great animal-to-animal variability. Likewise, DM yield of crops and pastures were collected from experimental observations, and may change as a function of inter-annual variation, climatic conditions, soil type, fertilization level etc. The emission factors for direct and indirect N2O emissions from urine and dung were alternatively estimated using local data, but more experiments are necessary to reduce the uncertainty. The CO2 emitted from lime and urea application was estimated from IPCC default values, which may not represent emissions in subtropical conditions. This LCA may be improved by reducing the uncertainty of factors for estimating emissions from excreta and feed production, including the C sequestration or emissions as a function of soil management.
-
-### Further considerations
-
-The potential for using pasture can reduce the C footprint because milk production kept pace with animal confinement. However, if milk production is to decrease with lower TMR intake and inclusion of pasture [19], the C footprint would be expected to increase. Lorenz et al. [22] showed that an increase in milk yield from 5,000 to 6,000 kg ECM reduced the C footprint by 0.12 kg CO2e (kg ECM)-1, whereas an increase from 10,000 to 11,000 kg ECM reduced the C footprint by only 0.06 kg CO2e (kg ECM)-1. Hence, the impact of increasing milk production on decreasing C footprint is not linear, and mitigation measures, such as breeding for increased genetic yield potential and increasing concentrate ratio in the diet, are potentially harmful for animal’s health and welfare [70]. For instance, increasing concentrate ratio potentially increases the occurrence of subclinical ketosis and foot lesions, and C footprint may increase by 0.03 kg CO2e (kg ECM)-1 in subclinical ketosis [71] and by 0.02 kg CO2e (kg ECM)-1 in case of foot lesions [72].
-
-Grazing lands may also improve biodiversity [73]. Strategies such as zero tillage may increase stocks of soil C [74]. This study did not consider C sequestration during the growth of annual pastures, because it was assumed these grasses were planted with tillage, having a balance between C sequestration and C emissions [38]. Considering the C sequestration from no-tillage perennial pasture, the amount of C sequestration will more than compensates for C emitted. These results are in agreement with other authors who have shown that a reduction or elimination of soil tillage increases annual soil C sequestration in subtropical areas by 0.5 to 1.5 t ha-1 [75]. If 50% of tilled areas were under perennial grasslands, 1.0 t C ha-1 would be sequestered, further reducing the C footprint by 0.015 and 0.025 kg CO2e (kg ECM)-1 for the scenarios using 75 and 50% TMR, respectively. Eliminating tillage, the reduction on total GHG emissions would be 0.03 and 0.05 kg CO2e (kg ECM)-1 for 75 and 50% TMR, respectively. However, this approach may be controversial because lands which have been consistently managed for decades have approached steady state C storage, so that net exchange of CO2 would be negligible [76].
-
-## Conclusions
-
-This study assessed the C footprint of dairy cattle systems with or without access to pastures. Including pastures showed potential to maintain or decrease to a small extent the C footprint, which may be attributable to the evidence of low N2O emissions from urine and dung in dairy systems in subtropical areas. Even though the enteric CH4 intensity was the largest source of CO2e emissions, it did not change between different scenarios due to the narrow range of NDF content in diets and maintaining the same milk production with or without access to pastures.
-
-## Tables
-
-Table 1: Descriptive characteristics of the herd.
-
-| Item                          | Unit      | Average   |
-|-------------------------------|-----------|-----------|
-| Milking cows                  | #         | 165       |
-| Milk production               | kg year-1 | 7,015     |
-| Milk fat                      | %         | 4.0       |
-| Milk protein                  | %         | 3.3       |
-| Length of lactation           | days      | 305       |
-| Body weight                   | kg        | 553       |
-| Lactations per cow            | #         | 4         |
-| Replacement rate              | %         | 25        |
-| Cull rate                     | %         | 25        |
-| First artificial insemination | months    | 16        |
-| Weaned                        | days      | 60        |
-| Mortality                     | %         | 3.0       |
-
-Table 2: Dairy cows’ diets in different scenariosa.
+Table 2 Dairy cows’ diets in different scenariosa.

 |                                   | Calf                              | Calf                              | Pregnant/dry                      | Pregnant/dry                      | Lactation                         | Lactation                         | Lactation                         | Weighted average                  | Weighted average                  | Weighted average                  |
 |-----------------------------------|-----------------------------------|-----------------------------------|-----------------------------------|-----------------------------------|-----------------------------------|-----------------------------------|-----------------------------------|-----------------------------------|-----------------------------------|-----------------------------------|
@ -174,7 +96,11 @@ Table 2: Dairy cows’ diets in different scenariosa.
 | NEL, Mcal (kg DM)-1               | 1.96                              | 1.69                              | 1.63                              | 1.44                              | 1.81                              | 1.78                              | 1.74                              | 1.8                               | 1.8                               | 1.7                               |
 | MP, g (kg DM)-1                   | 111                               | 93.6                              | 97.6                              | 90.0                              | 95.0                              | 102                               | 102                               | 97.5                              | 102                               | 101                               |

-Table 3: GHG emission factors for Off- and On-farm feed production.
+#### GHG emissions from crop and pasture production
+
+GHG emission factors used for off- and on-farm feed production were based on literature values, and are presented in Table 3. The emission factor used for corn grain is the average of emission factors observed in different levels of synthetic N fertilization [30]. The emission factor used for soybean is based on Brazilian soybean production [31]. The emissions used for corn silage, including feed processing (cutting, crushing and mixing), and annual or perennial grass productions were 3300 and 1500 kg CO2e ha-1, respectively [32]. The DM production (kg ha-1) of corn silage and pastures were based on regional and locally recorded data [33–36], assuming that animals are able to consume 70% of pastures during grazing.
+
+Table 3 GHG emission factors for Off- and On-farm feed production.

 | Feed             | DM yield (kg ha-1)   | Emission factor   | Unita                | References   |
 |------------------|----------------------|-------------------|----------------------|--------------|
@ -187,7 +113,9 @@ Table 3: GHG emission factors for Off- and On-farm feed production.
 | Pearl milletd    | 11,000               | 0.195             | kg CO2e (kg DM)-1    | [32,35]      |
 | Kikuyu grasse    | 9,500                | 0.226             | kg CO2e (kg DM)-1    | [32,36]      |

-Table 4: GHG emissions from On-farm feed production.
+Emissions from on-farm feed production (corn silage and pasture) were estimated using primary and secondary sources based on the actual amount of each input (Table 4). Primary sources were direct and indirect N2O-N emissions from organic and synthetic fertilizers and crop/pasture residues, CO2-C emissions from lime and urea applications, as well as fuel combustion. The direct N2O-N emission factor (kg (kg N input)-1) is based on a local study performed previously [37]. For indirect N2O-N emissions (kg N2O-N (kg NH3-N + NOx)-1), as well as CO2-C emissions from lime + urea, default values proposed by IPCC [38] were used. For perennial pastures, a C sequestration of 0.57 t ha-1 was used based on a 9-year study conducted in southern Brazil [39]. Due to the use of conventional tillage, no C sequestration was considered for annual pastures. The amount of fuel required was 8.9 (no-tillage) and 14.3 L ha-1 (disking) for annual tropical and temperate pastures, respectively [40]. The CO2 from fuel combustion was 2.7 kg CO2 L-1 [41]. Secondary sources of emissions during the production of fuel, machinery, fertilizer, pesticides, seeds and plastic for ensilage were estimated using emission factors described by Rotz et al. [42].
+
+Table 4 GHG emissions from On-farm feed production.

 | Item                                      | Corn silage   | Annual temperate pasture   | Annual tropical pasture   | Perennial tropical pasture   |
 |-------------------------------------------|---------------|----------------------------|---------------------------|------------------------------|
@ -219,7 +147,23 @@ Table 4: GHG emissions from On-farm feed production.
 | kg CO2e ha-1 (emitted—sequestered)        | 1833          | 964                        | 1130                      | -245                         |
 | Emission factor, kg CO2e (kg DM)-1i       | 0.115         | 0.145                      | 0.147                     | -0.037                       |

-Table 5: Factors for major resource inputs in farm management.
+### Animal husbandry
+
+The CH4 emissions from enteric fermentation intensity (g (kg ECM)-1) was a function of estimated CH4 yield (g (kg DM intake)-1), actual DM intake and ECM. The enteric CH4 yield was estimated as a function of neutral detergent fiber (NDF) concentration on total DM intake, as proposed by Niu et al. [43], where: CH4 yield (g (kg DM intake)-1) = 13.8 + 0.185 × NDF (% DM intake).
+
+### Manure from confined cows and urine and dung from grazing animals
+
+The CH4 emission from manure (kg (kg ECM)-1) was a function of daily CH4 emission from manure (kg cow-1) and daily ECM (kg cow-1). The daily CH4 emission from manure was estimated according to IPCC [38], which considered daily volatile solid (VS) excreted (kg DM cow-1) in manure. The daily VS was estimated as proposed by Eugène et al. [44] as: VS = NDOMI + (UE × GE) × (OM/18.45), where: VS = volatile solid excretion on an organic matter (OM) basis (kg day-1), NDOMI = non-digestible OM intake (kg day-1): (1- OM digestibility) × OM intake, UE = urinary energy excretion as a fraction of GE (0.04), GE = gross energy intake (MJ day-1), OM = organic matter (g), 18.45 = conversion factor for dietary GE per kg of DM (MJ kg-1).
+
+The OM digestibility was estimated as a function of chemical composition, using equations published by INRA [21], which takes into account the effects of digestive interactions due to feeding level, the proportion of concentrate and rumen protein balance on OM digestibility. For scenarios where cows had access to grazing, the amount of calculated VS were corrected as a function of the time at pasture. The biodegradability of manure factor (0.13 for dairy cows in Latin America) and methane conversion factor (MCF) values were taken from IPCC [38]. The MCF values for pit storage below animal confinements (&gt; 1 month) were used for the calculation, taking into account the annual average temperature (16.6ºC) or the average temperatures during the growth period of temperate (14.4ºC) or tropical (21ºC) annual pastures, which were 31%, 26% and 46%, respectively.
+
+The N2O-N emissions from urine and feces were estimated considering the proportion of N excreted as manure and storage or as urine and dung deposited by grazing animals. These proportions were calculated based on the proportion of daily time that animals stayed on pasture (7 h/24 h = 0.29) or confinement (1−0.29 = 0.71). For lactating heifers and cows, the total amount of N excreted was calculated by the difference between N intake and milk N excretion. For heifers and non-lactating cows, urinary and fecal N excretion were estimated as proposed by Reed et al. [45] (Table 3: equations 10 and 12, respectively). The N2O emissions from stored manure as well as urine and dung during grazing were calculated based on the conversion of N2O-N emissions to N2O emissions, where N2O emissions = N2O-N emissions × 44/28. The emission factors were 0.002 kg N2O-N (kg N)-1 stored in a pit below animal confinements, and 0.02 kg N2O-N (kg of urine and dung)-1 deposited on pasture [38]. The indirect N2O emissions from storage manure and urine and dung deposits on pasture were also estimated using the IPCC [38] emission factors.
+
+### Farm management
+
+Emissions due to farm management included those from fuel and machinery for manure handling and electricity for milking and confinement (Table 5). Emissions due to feed processing such as cutting, crushing, mixing and distributing, as well as secondary sources of emissions during the production of fuel, machinery, fertilizer, pesticides, seeds and plastic for ensilage were included in ‘Emissions from crop and pasture production’ section.
+
+Table 5 Factors for major resource inputs in farm management.

 | Item                                     | Factor   | Unita             | References   |
 |------------------------------------------|----------|-------------------|--------------|
@ -235,102 +179,159 @@ Table 5: Factors for major resource inputs in farm management.
 | Electricity for milking                  | 0.06     | kWh (kg milk)-1   | [47]         |
 | Electricity for lightingd                | 75       | kWh cow-1         | [47]         |

-## Figures
+The amount of fuel used for manure handling was estimated taking into consideration the amount of manure produced per cow and the amounts of fuel required for manure handling (L diesel t-1) [42]. The amount of manure was estimated from OM excretions (kg cow-1), assuming that the manure has 8% ash on DM basis and 60% DM content. The OM excretions were calculated by NDOMI × days in confinement × proportion of daily time that animals stayed in confinement.

-Fig 1: Overview of the milk production system boundary considered in the study.
+The emissions from fuel were estimated considering the primary (emissions from fuel burned) and secondary (emissions for producing and transporting fuel) emissions. The primary emissions were calculated by the amount of fuel required for manure handling (L) × (kg CO2e L-1) [41]. The secondary emissions from fuel were calculated by the amount of fuel required for manure handling × emissions for production and transport of fuel (kg CO2e L-1) [41]. Emissions from manufacture and repair of machinery for manure handling were estimated by manure produced per cow (t) × (kg machinery mass (kg manure)-1 × 10−3) [42] × kg CO2e (kg machinery mass)-1 [42].
+
+Emissions from electricity for milking and confinement were estimated using two emission factors (kg CO2 kWh-1). The first one is based on United States electricity matrix [41], and was used as a reference of an electricity matrix with less hydroelectric power than the region under study. The second is based on the Brazilian electricity matrix [46]. The electricity required for milking activities is 0.06 kWh (kg milk produced)-1 [47]. The annual electricity use for lighting was 75 kWh cow-1, which is the value considered for lactating cows in naturally ventilated barns [47].
+
+### Co-product allocation
+
+The C footprint for milk produced in the system was calculated using a biophysical allocation approach, as recommended by the International Dairy Federation [49], and described by Thoma et al. [48]. Briefly, ARmilk = 1 − 6.04 × BMR, where: ARmilk is the allocation ratio for milk and BMR is cow BW at the time of slaughter (kg) + calf BW sold (kg) divided by the total ECM produced during the cow's entire life (kg). The ARmilk values were 0.854 and 0.849 for TMR and TMR with both pasture scenarios, respectively. The ARmilk was applied to the whole emissions, except for the electricity consumed for milking (milking parlor) and refrigerant loss, which was directly assigned to milk production.
+
+### Sensitivity analysis
+
+A sensitivity index was calculated as described by Rotz et al. [42]. The sensitivity index was defined for each emission source as the percentage change in the C footprint for a 10% change in the given emission source divided by 10%. Thus, a value near 0 indicates a low sensitivity, whereas an index near or greater than 1 indicates a high sensitivity because a change in this value causes a similar change in the footprint.
+
+## Results and discussion
+
+The study has assessed the impact of tropical and temperate pastures in dairy cows fed TMR on the C footprint of dairy production in subtropics. Different factors were taken into consideration to estimate emissions from manure (or urine and dung) of grazing animals, feed production and electricity use.
+
+### Greenhouse gas emissions
+
+Depending on emission factors used for calculating emissions from urine and dung (IPCC or local data) and feed production (Tables 3 or 4), the C footprint was similar (Fig 2A and 2B) or decreased by 0.04 kg CO2e (kg ECM)-1 (Fig 2C and 2D) in scenarios that included pastures compared to ad libitum TMR intake. Due to differences in emission factors, the overall GHG emission values ranged from 0.92 to 1.04 kg CO2e (kg ECM)-1 for dairy cows receiving TMR exclusively, and from 0.88 to 1.04 kg CO2e (kg ECM)-1 for cows with access to pasture. Using IPCC emission factors [38], manure emissions increased as TMR intake went down (Fig 2A and 2B). However, using local emission factors for estimating N2O-N emissions [37], manure emissions decreased as TMR intake went down (Fig 2C and 2D). Regardless of emission factors used (Tables 3 or 4), emissions from feed production decreased to a small extent as the proportion of TMR intake decreased. Emissions from farm management did not contribute more than 5% of overall GHG emissions.
+
+Fig 2 Overall greenhouse gas emissions in dairy cattle systems under various scenarios. TMR = ad libitum TMR intake, 75TMR = 75% of ad libitum TMR intake with access to pasture, 50TMR = 50% of ad libitum TMR intake with access to pasture. (a) N2O emission factors for urine and dung from IPCC [38], feed production emission factors from Table 3 without accounting for sequestered CO2-C from perennial pasture, production of electricity = 0.73 kg CO2e kWh-1 [41]. (b) N2O emission factors for urine and dung from IPCC [38], feed production emission factors from Table 3 without accounting for sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46]; (c) N2O emission factors for urine and dung from local data [37], feed production EF from Table 4 without accounting for sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46]. (d) N2O emission factors for urine and dung from local data [37], feed production emission factors from Table 4 accounting for sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46].

 <!-- image -->

-Fig 2: Overall greenhouse gas emissions in dairy cattle systems under various scenarios.
-TMR = ad libitum TMR intake, 75TMR = 75% of ad libitum TMR intake with access to pasture, 50TMR = 50% of ad libitum TMR intake with access to pasture. (a) N2O emission factors for urine and dung from IPCC [38], feed production emission factors from Table 3 without accounting for sequestered CO2-C from perennial pasture, production of electricity = 0.73 kg CO2e kWh-1 [41]. (b) N2O emission factors for urine and dung from IPCC [38], feed production emission factors from Table 3 without accounting for sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46]; (c) N2O emission factors for urine and dung from local data [37], feed production EF from Table 4 without accounting for sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46]. (d) N2O emission factors for urine and dung from local data [37], feed production emission factors from Table 4 accounting for sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46].
+Considering IPCC emission factors for N2O emissions from urine and dung [38] and those from Table 3, the C footprint ranged from 0.99 to 1.04 kg CO2e (kg ECM)-1, and was close to those reported under confined based systems in California [49], Canada [50], China [8], Ireland [9], different scenarios in Australia [51,52] and Uruguay [11], which ranged from 0.98 to 1.16 kg CO2e (kg ECM)-1. When local emission factors for N2O emissions from urine and dung [37] and those from Table 4 were taken into account, the C footprint for scenarios including pasture, without accounting for sequestered CO2-C from perennial pasture—0.91 kg CO2e (kg ECM)-1—was lower than the range of values described above. However, these values were still greater than high-performance confinement systems in UK and USA [53] or grass based dairy systems in Ireland [9,53] and New Zealand [8,54], which ranged from 0.52 to 0.89 kg CO2e (kg ECM)-1. Regardless of which emission factor was used, we found a lower C footprint in all conditions compared to scenarios with lower milk production per cow or in poor conditions of manure management, which ranged from 1.4 to 2.3 kg CO2e (kg ECM)-1 [8,55]. Thus, even though differences between studies may be partially explained by various assumptions (e.g., emission factors, co-product allocation, methane emissions estimation, sequestered CO2-C, etc.), herd productivity and manure management were systematically associated with the C footprint of the dairy systems.
+
+The similarity of C footprint between different scenarios using IPCC [38] for estimating emissions from manure and for emissions from feed production (Table 3) was a consequence of the trade-off between greater manure emissions and lower emissions to produce feed, as the proportion of pasture in diets increased. Additionally, the small negative effect of pasture on ECM production also contributed to the trade-off. The impact of milk production on the C footprint was reported in a meta-analysis comprising 30 studies from 15 different countries [22]. As observed in this study (Fig 2A and 2B) the authors reported no significant difference between the C footprint of pasture-based vs. confinement systems. However, they observed that an increase of 1000 kg cow-1 (5000 to 6000 kg ECM) reduced the C footprint by 0.12 kg CO2e (kg ECM)-1, which may explain an apparent discrepancy between our study and an LCA performed in south Brazilian conditions [56]. Their study compared a confinement and a grazing-based dairy system with annual average milk production of 7667 and 5535 kg cow-1, respectively. In this study, the same herd was used in all systems, with an annual average milk production of around 7000 kg cow-1. Experimental data showed a reduction not greater than 3% of ECM when 50% of TMR was replaced by pasture access.
+
+The lower C footprint in scenarios with access to pasture, when local emission factors [37] were used for N2O emissions from urine and dung and for feed production (Table 4), may also be partially attributed to the small negative effect of pasture on ECM production. Nevertheless, local emission factors for urine and dung had a great impact on scenarios including pastures compared to ad libitum TMR intake. Whereas the IPCC [38] considers an emission of 0.02 kg N2O-N (kg N)-1 for urine and dung from grazing animals, experimental evidence shows that it may be up to five times lower, averaging 0.004 kg N2O-N kg-1 [37].
+
+### Methane emissions
+
+The enteric CH4 intensity was similar between different scenarios (Fig 2), showing the greatest sensitivity index, with values ranging from 0.53 to 0.62, which indicate that for a 10% change in this source, the C footprint may change between 5.3 and 6.2% (Fig 3). The large effect of enteric CH4 emissions on the whole C footprint was expected, because the impact of enteric CH4 on GHG emissions of milk production in different dairy systems has been estimated to range from 44 to 60% of the total CO2e [50,52,57,58]. However, emissions in feed production may be the most important source of GHG when emission factors for producing concentrate feeds are greater than 0.7 kg CO2e kg-1 [59], which did not happen in this study.
+
+Fig 3 Sensitivity of the C footprint. Sensitivity index = percentage change in C footprint for a 10% change in the given emission source divided by 10%. (a) N2O emission factors for urine and dung from IPCC [38], feed production emission factors from Table 3, production of electricity = 0.73 kg CO2e kWh-1 [41]. (b) N2O emission factors for urine and dung from IPCC [38], feed production emission factors from Table 3, production of electricity = 0.205 kg CO2e kWh-1 [46]; (c) N2O emission factors for urine and dung from local data [37], feed production EF from Table 4 without accounting sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46]. (d) N2O emission factors for urine and dung from local data [37], feed production emission factors from Table 4 accounting sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46].

 <!-- image -->

-Fig 3: Sensitivity of the C footprint.
-Sensitivity index = percentage change in C footprint for a 10% change in the given emission source divided by 10% of. (a) N2O emission factors for urine and dung from IPCC [38], feed production emission factors from Table 3, production of electricity = 0.73 kg CO2e kWh-1 [41]. (b) N2O emission factors for urine and dung from IPCC [38], feed production emission factors from Table 3, production of electricity = 0.205 kg CO2e kWh-1 [46]; (c) N2O emission factors for urine and dung from local data [37], feed production EF from Table 4 without accounting sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46]. (d) N2O emission factors for urine and dung from local data [37], feed production emission factors from Table 4 accounting sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46].
+The lack of difference in enteric CH4 emissions in different systems can be explained by the narrow range of NDF content in diets (&lt;4% difference). This non-difference is due to the lower NDF content of annual temperate pastures (495 g (kg DM)-1) compared to corn silage (550 g (kg DM)-1). Hence, an expected increase in NDF content with decreased concentrate was partially offset by an increase in the pasture proportion relatively low in NDF. This is in agreement with studies conducted in southern Brazil, which have shown that the actual enteric CH4 emissions may decrease with inclusion of temperate pastures in cows receiving corn silage and soybean meal [60] or increase enteric CH4 emissions when dairy cows grazing a temperate pasture were supplemented with corn silage [61]. Additionally, enteric CH4 emissions did not differ between dairy cows receiving TMR exclusively or grazing a tropical pasture in the same scenarios as in this study [26].
+
+### Emissions from excreta and feed production
+
+Using IPCC emission factors for N2O emissions from urine and dung [38] and those from Table 3, CH4 emissions from manure decreased 0.07 kg CO2e (kg ECM)-1, but N2O emissions from manure increased 0.09 kg CO2e (kg ECM)-1, as TMR intake was restricted to 50% ad libitum (Fig 4A). Emissions for pastures increased by 0.06 kg CO2e (kg ECM)-1, whereas emissions for producing concentrate feeds and corn silage decreased by 0.09 kg CO2e (kg ECM)-1, as TMR intake decreased (Fig 4B). In this situation, the lack of difference in calculated C footprints of different systems was also due to the greater emissions from manure, and offset by lower emissions from feed production with inclusion of pasture in lactating dairy cow diets. The greater N2O-N emissions from manure with pasture was a consequence of higher N2O-N emissions due to greater CP content and N urine excretion, as pasture intake increased. The effect of CP content on urine N excretion has been shown by several authors in lactating dairy cows [62–64]. For instance, by decreasing CP content from 185 to 152 g (kg DM)-1, N intake decreased by 20% and urine N excretion by 60% [62]. In this study, the CP content for lactating dairy cows ranged from 150 g (kg DM)-1 on TMR system to 198 g (kg DM)-1 on 50% TMR with pasture. Additionally, greater urine N excretion is expected with greater use of pasture. This occurs because protein utilization in pastures is inefficient, as the protein in fresh forages is highly degradable in the rumen and may not be captured by microbes [65].
+
+Fig 4 Greenhouse gas emissions (GHG) from manure and feed production in dairy cattle systems. TMR = ad libitum TMR intake, 75TMR = 75% of ad libitum TMR intake with access to pasture, 50TMR = 50% of ad libitum TMR intake with access to pasture. (a) N2O emission factors for urine and dung from IPCC [38]. (b) Feed production emission factors from Table 3. (c) N2O emission factors for urine and dung from local data [37]. (d) Feed production emission factors from Table 4 accounting sequestered CO2-C from perennial pasture.

 <!-- image -->

-Fig 4: Greenhouse gas emissions (GHG) from manure and feed production in dairy cattle systems.
-TMR = ad libitum TMR intake, 75TMR = 75% of ad libitum TMR intake with access to pasture, 50TMR = 50% of ad libitum TMR intake with access to pasture. (a) N2O emission factors for urine and dung from IPCC [38]. (b) Feed production emission factors from Table 3. (c) N2O emission factors for urine and dung from local data [37]. (d) Feed production emission factors from Table 4 accounting sequestered CO2-C from perennial pasture.
+Using local emission factors for N2O emissions from urine and dung [37] and those from Table 4, reductions in CH4 emissions from stocked manure, when pastures were included in diets, were not offset by increases in N2O emissions from excreta (Fig 4C). In this case, total emissions from manure (Fig 4C) and feed production (Fig 4D) decreased with the inclusion of pasture. The impact of greater CP content and N urine excretion with increased pasture intake was offset by the much lower emission factors used for N2O emissions from urine and dung. As suggested by other authors [66,67], these results show that the IPCC default values may need to be revised for the subtropical region.

-<!-- image -->
+Emissions for feed production decreased when pasture was included due to the greater emission factor for corn grain production compared to pastures. Emissions from concentrate and silage had at least twice the sensitivity index compared to emissions from pastures. The amount of grain required per cow in a lifetime decreased from 7,300 kg to 4,000 kg when 50% of TMR was replaced by pasture access. These results are in agreement with other studies which found lower C footprint, as concentrate use is reduced and/or pasture is included [9,68,69]. Moreover, it has been demonstrated that in intensive dairy systems, after enteric fermentation, feed production is the second main contributor to C footprint [50]. There is potential to decrease the environmental impact of dairy systems by reducing the use of concentrate ingredients with high environmental impact, particularly in confinements [9].
+
+### Farm management
+
+The lower impact of emissions from farm management is in agreement with other studies conducted in Europe [9, 62] and USA [42, 55], where the authors found that most emissions in dairy production systems are from enteric fermentation, feed production and emissions from excreta. As emissions from fuel for on-farm feed production were accounted into the ‘emissions from crop and pasture production’, total emissions from farm management were not greater than 5% of total C footprint.
+
+Emissions from farm management dropped when the emission factor for electricity generation was based on the Brazilian matrix. In this case, the emission factor for electricity generation (0.205 kg CO2e kWh-1 [46]) is much lower than that in a LCA study conducted in US (0.73 kg CO2e kWh-1 [42]). This apparent discrepancy is explained because in 2016, almost 66% of the electricity generated in Brazil was from hydropower, which has an emission factor of 0.074 kg CO2e kWh-1 against 0.382 and 0.926 kg CO2e kWh-1 produced by natural gas and hard coal, respectively [46].
+
+### Assumptions and limitations
+
+The milk production and composition data are the average for a typical herd, which might have great animal-to-animal variability. Likewise, DM yield of crops and pastures were collected from experimental observations, and may change as a function of inter-annual variation, climatic conditions, soil type, fertilization level etc. The emission factors for direct and indirect N2O emissions from urine and dung were alternatively estimated using local data, but more experiments are necessary to reduce the uncertainty. The CO2 emitted from lime and urea application was estimated from IPCC default values, which may not represent emissions in subtropical conditions. This LCA may be improved by reducing the uncertainty of factors for estimating emissions from excreta and feed production, including the C sequestration or emissions as a function of soil management.
+
+### Further considerations
+
+The potential for using pasture can reduce the C footprint because milk production kept pace with animal confinement. However, if milk production is to decrease with lower TMR intake and inclusion of pasture [19], the C footprint would be expected to increase. Lorenz et al. [22] showed that an increase in milk yield from 5,000 to 6,000 kg ECM reduced the C footprint by 0.12 kg CO2e (kg ECM)-1, whereas an increase from 10,000 to 11,000 kg ECM reduced the C footprint by only 0.06 kg CO2e (kg ECM)-1. Hence, the impact of increasing milk production on decreasing C footprint is not linear, and mitigation measures, such as breeding for increased genetic yield potential and increasing concentrate ratio in the diet, are potentially harmful for animal’s health and welfare [70]. For instance, increasing concentrate ratio potentially increases the occurrence of subclinical ketosis and foot lesions, and C footprint may increase by 0.03 kg CO2e (kg ECM)-1 in subclinical ketosis [71] and by 0.02 kg CO2e (kg ECM)-1 in case of foot lesions [72].
+
+Grazing lands may also improve biodiversity [73]. Strategies such as zero tillage may increase stocks of soil C [74]. This study did not consider C sequestration during the growth of annual pastures, because it was assumed these grasses were planted with tillage, having a balance between C sequestration and C emissions [38]. Considering the C sequestration from no-tillage perennial pasture, the amount of C sequestration will more than compensate for the C emitted. These results are in agreement with other authors who have shown that a reduction or elimination of soil tillage increases annual soil C sequestration in subtropical areas by 0.5 to 1.5 t ha-1 [75]. If 50% of tilled areas were under perennial grasslands, 1.0 t C ha-1 would be sequestered, further reducing the C footprint by 0.015 and 0.025 kg CO2e (kg ECM)-1 for the scenarios using 75 and 50% TMR, respectively. Eliminating tillage, the reduction in total GHG emissions would be 0.03 and 0.05 kg CO2e (kg ECM)-1 for 75 and 50% TMR, respectively. However, this approach may be controversial because lands which have been consistently managed for decades have approached steady state C storage, so that net exchange of CO2 would be negligible [76].
+
+## Conclusions
+
+This study assessed the C footprint of dairy cattle systems with or without access to pastures. Including pastures showed potential to maintain or decrease to a small extent the C footprint, which may be attributable to the evidence of low N2O emissions from urine and dung in dairy systems in subtropical areas. Even though the enteric CH4 intensity was the largest source of CO2e emissions, it did not change between different scenarios due to the narrow range of NDF content in diets and maintaining the same milk production with or without access to pastures.
+
+## Acknowledgments
+
+Thanks to Anna Naranjo for helpful comments throughout the elaboration of this manuscript, and to André Thaler Neto and Roberto Kappes for providing the key characteristics of the herd considered in this study.

 ## References

- Climate Change and Land. Chapter 5: Food Security (2019)
- Herrero M; Henderson B; Havlík P; Thornton PK; Conant RT; Smith P. Greenhouse gas mitigation potentials in the livestock sector. Nat Clim Chang (2016)
- Rivera-Ferre MG; López-i-Gelats F; Howden M; Smith P; Morton JF; Herrero M. Re-framing the climate change debate in the livestock sector: mitigation and adaptation options. Wiley Interdiscip Rev Clim Chang (2016)
- van Zanten HHE; Mollenhorst H; Klootwijk CW; van Middelaar CE; de Boer IJM. Global food supply: land use efficiency of livestock systems. Int J Life Cycle Assess (2016)
- Hristov AN; Oh J; Firkins L; Dijkstra J; Kebreab E; Waghorn G. SPECIAL TOPICS—Mitigation of methane and nitrous oxide emissions from animal operations: I. A review of enteric methane mitigation options. J Anim Sci (2013)
- Hristov AN; Ott T; Tricarico J; Rotz A; Waghorn G; Adesogan A. SPECIAL TOPICS—Mitigation of methane and nitrous oxide emissions from animal operations: III. A review of animal management mitigation options. J Anim Sci (2013)
- Montes F; Meinen R; Dell C; Rotz A; Hristov AN; Oh J. SPECIAL TOPICS—Mitigation of methane and nitrous oxide emissions from animal operations: II. A review of manure management mitigation options. J Anim Sci (2013)
- Ledgard SF; Wei S; Wang X; Falconer S; Zhang N; Zhang X. Nitrogen and carbon footprints of dairy farm systems in China and New Zealand, as influenced by productivity, feed sources and mitigations. Agric Water Manag (2019)
- O’Brien D; Shalloo L; Patton J; Buckley F; Grainger C; Wallace M. A life cycle assessment of seasonal grass-based and confinement dairy farms. Agric Syst (2012)
- Salou T; Le Mouël C; van der Werf HMG. Environmental impacts of dairy system intensification: the functional unit matters!. J Clean Prod (2017)
- Lizarralde C; Picasso V; Rotz CA; Cadenazzi M; Astigarraga L. Practices to Reduce Milk Carbon Footprint on Grazing Dairy Farms in Southern Uruguay. Case Studies. Sustain Agric Res (2014)
- Clark CEF; Kaur R; Millapan LO; Golder HM; Thomson PC; Horadagoda A. The effect of temperate or tropical pasture grazing state and grain-based concentrate allocation on dairy cattle production and behavior. J Dairy Sci (2018)
- FAOSTAT.  (2017)
- Vogeler I; Mackay A; Vibart R; Rendel J; Beautrais J; Dennis S. Effect of inter-annual variability in pasture growth and irrigation response on farm productivity and profitability based on biophysical and farm systems modelling. Sci Total Environ (2016)
- Wilkinson JM; Lee MRF; Rivero MJ; Chamberlain AT. Some challenges and opportunities for grazing dairy cows on temperate pastures. Grass Forage Sci. (2020)
- Wales WJ; Marett LC; Greenwood JS; Wright MM; Thornhill JB; Jacobs JL. Use of partial mixed rations in pasture-based dairying in temperate regions of Australia. Anim Prod Sci (2013)
- Bargo F; Muller LD; Delahoy JE; Cassidy TW. Performance of high producing dairy cows with three different feeding systems combining pasture and total mixed rations. J Dairy Sci (2002)
- Vibart RE; Fellner V; Burns JC; Huntington GB; Green JT. Performance of lactating dairy cows fed varying levels of total mixed ration and pasture. J Dairy Res (2008)
- Mendoza A; Cajarville C; Repetto JL. Short communication: Intake, milk production, and milk fatty acid profile of dairy cows fed diets combining fresh forage with a total mixed ration. J Dairy Sci (2016)
- Nutrient Requirements of Dairy Cattle (2001)
- Noizère P; Sauvant D; Delaby L.  (2018)
- Lorenz H; Reinsch T; Hess S; Taube F. Is low-input dairy farming more climate friendly? A meta-analysis of the carbon footprints of different production systems. J Clean Prod (2019)
- INTERNATIONAL STANDARD—Environmental management—Life cycle assessment—Requirements and guidelines (2006)
- Environmental management—Life cycle assessment—Principles and framework. Iso 14040 (2006)
- FAO. Environmental Performance of Large Ruminant Supply Chains: Guidelines for assessment (2016)
- Civiero M; Ribeiro-Filho HMN; Schaitz LH. Pearl-millet grazing decreases daily methane emissions in dairy cows receiving total mixed ration. 7th Greenhouse Gas and Animal Agriculture Conference,. Foz do Iguaçu (2019)
- IPCC—Intergovernmental Panel on Climate Change. Climate Change 2014 Synthesis Report (Unedited Version). 2014. Available: ttps://. 
- INRA. Alimentation des bovins, ovins et caprins. Besoins des animaux—valeurs des aliments. Tables Inra 2007. 4th ed. INRA, editor. 2007. 
- Delagarde R; Faverdin P; Baratte C; Peyraud JL. GrazeIn: a model of herbage intake and milk production for grazing dairy cows. 2. Prediction of intake under rotational and continuously stocked grazing management. Grass Forage Sci (2011)
- Ma BL; Liang BC; Biswas DK; Morrison MJ; McLaughlin NB. The carbon footprint of maize production as affected by nitrogen fertilizer and maize-legume rotations. Nutr Cycl Agroecosystems (2012)
- Rauccci GS; Moreira CS; Alves PS; Mello FFC; Frazão LA; Cerri CEP. Greenhouse gas assessment of Brazilian soybean production: a case study of Mato Grosso State. J Clean Prod (2015)
- Camargo GGT; Ryan MR; Richard TL. Energy Use and Greenhouse Gas Emissions from Crop Production Using the Farm Energy Analysis Tool. Bioscience (2013)
- da Silva MSJ; Jobim CC; Poppi EC; Tres TT; Osmari MP. Production technology and quality of corn silage for feeding dairy cattle in Southern Brazil. Rev Bras Zootec (2015)
- Duchini PGPG Guzatti GCGC; Ribeiro-Filho HMNHMNN Sbrissia AFAFAF. Intercropping black oat (Avena strigosa) and annual ryegrass (Lolium multiflorum) can increase pasture leaf production compared with their monocultures. Crop Pasture Sci (2016)
- Scaravelli LFB; Pereira LET; Olivo CJ; Agnolin CA. Produção e qualidade de pastagens de Coastcross-1 e milheto utilizadas com vacas leiteiras. Cienc Rural (2007)
- Sbrissia AF; Duchini PG; Zanini GD; Santos GT; Padilha DA; Schmitt D. Defoliation strategies in pastures submitted to intermittent stocking method: Underlying mechanisms buffering forage accumulation over a range of grazing heights. Crop Sci (2018)
- Almeida JGR; Dall-Orsoletta AC; Oziemblowski MM; Michelon GM; Bayer C; Edouard N. Carbohydrate-rich supplements can improve nitrogen use efficiency and mitigate nitrogenous gas emissions from the excreta of dairy cows grazing temperate grass. Animal (2020)
- Eggleston H.S.; Buendia L.; Miwa K. IPCC guidlines for national greenhouse gas inventories. (2006)
- Ramalho B; Dieckow J; Barth G; Simon PL; Mangrich AS; Brevilieri RC. No-tillage and ryegrass grazing effects on stocks, stratification and lability of carbon and nitrogen in a subtropical Umbric Ferralsol. Eur J Soil Sci (2020)
- Fernandes HC; da Silveira JCM; Rinaldi PCN. Avaliação do custo energético de diferentes operações agrícolas mecanizadas. Cienc e Agrotecnologia (2008)
- Wang M Q. GREET 1.8a Spreadsheet Model. 2007. Available: . 
- Rotz CAA; Montes F; Chianese DS; Chiane DS. The carbon footprint of dairy production systems through partial life cycle assessment. J Dairy Sci (2010)
- Niu M; Kebreab E; Hristov AN; Oh J; Arndt C; Bannink A. Prediction of enteric methane production, yield, and intensity in dairy cattle using an intercontinental database. Glob Chang Biol (2018)
- Eugène M; Sauvant D; Nozière P; Viallard D; Oueslati K; Lherm M. A new Tier 3 method to calculate methane emission inventory for ruminants. J Environ Manage (2019)
- Reed KF; Moraes LE; Casper DP; Kebreab E. Predicting nitrogen excretion from cattle. J Dairy Sci (2015)
- Barros MV; Piekarski CM; De Francisco AC. Carbon footprint of electricity generation in Brazil: An analysis of the 2016–2026 period. Energies (2018)
- Ludington D; Johnson E. Dairy Farm Energy Audit Summary. New York State Energy Res Dev Auth (2003)
- Thoma G; Jolliet O; Wang Y. A biophysical approach to allocation of life cycle environmental burdens for fluid milk supply chain analysis. Int Dairy J (2013)
- Naranjo A; Johnson A; Rossow H. Greenhouse gas, water, and land footprint per unit of production of the California dairy industry over 50 years.  (2020)
- Jayasundara S; Worden D; Weersink A; Wright T; VanderZaag A; Gordon R. Improving farm profitability also reduces the carbon footprint of milk production in intensive dairy production systems. J Clean Prod (2019)
- Williams SRO; Fisher PD; Berrisford T; Moate PJ; Reynard K. Reducing methane on-farm by feeding diets high in fat may not always reduce life cycle greenhouse gas emissions. Int J Life Cycle Assess (2014)
- Gollnow S; Lundie S; Moore AD; McLaren J; van Buuren N; Stahle P. Carbon footprint of milk production from dairy cows in Australia. Int Dairy J (2014)
- O’Brien D; Capper JL; Garnsworthy PC; Grainger C; Shalloo L. A case study of the carbon footprint of milk from high-performing confinement and grass-based dairy farms. J Dairy Sci (2014)
- Chobtang J; McLaren SJ; Ledgard SF; Donaghy DJ. Consequential Life Cycle Assessment of Pasture-based Milk Production: A Case Study in the Waikato Region, New Zealand. J Ind Ecol (2017)
- Garg MR; Phondba BT; Sherasia PL; Makkar HPS. Carbon footprint of milk production under smallholder dairying in Anand district of Western India: A cradle-to-farm gate life cycle assessment. Anim Prod Sci (2016)
- de Léis CM; Cherubini E; Ruviaro CF; Prudêncio da Silva V; do Nascimento Lampert V; Spies A. Carbon footprint of milk production in Brazil: a comparative case study. Int J Life Cycle Assess (2015)
- O’Brien D; Geoghegan A; McNamara K; Shalloo L. How can grass-based dairy farmers reduce the carbon footprint of milk?. Anim Prod Sci (2016)
- O’Brien D; Brennan P; Humphreys J; Ruane E; Shalloo L. An appraisal of carbon footprint of milk from commercial grass-based dairy farms in Ireland according to a certified life cycle assessment methodology. Int J Life Cycle Assess (2014)
- Baek CY; Lee KM; Park KH. Quantification and control of the greenhouse gas emissions from a dairy cow system. J Clean Prod (2014)
- Dall-Orsoletta AC; Almeida JGR; Carvalho PCF; Savian J V. Ribeiro-Filho HMN. Ryegrass pasture combined with partial total mixed ration reduces enteric methane emissions and maintains the performance of dairy cows during mid to late lactation. J Dairy Sci (2016)
- Dall-Orsoletta AC; Oziemblowski MM; Berndt A; Ribeiro-Filho HMN. Enteric methane emission from grazing dairy cows receiving corn silage or ground corn supplementation. Anim Feed Sci Technol (2019)
- Niu M; Appuhamy JADRN; Leytem AB; Dungan RS; Kebreab E. Effect of dietary crude protein and forage contents on enteric methane emissions and nitrogen excretion from dairy cows simultaneously. Anim Prod Sci (2016)
- Waghorn GC; Law N; Bryant M; Pacheco D; Dalley D. Digestion and nitrogen excretion by Holstein-Friesian cows in late lactation offered ryegrass-based pasture supplemented with fodder beet. Anim Prod Sci (2019)
- Dickhoefer U; Glowacki S; Gómez CA; Castro-Montoya JM. Forage and protein use efficiency in dairy cows grazing a mixed grass-legume pasture and supplemented with different levels of protein and starch. Livest Sci (2018)
- Schwab CG; Broderick GA. A 100-Year Review: Protein and amino acid nutrition in dairy cows. J Dairy Sci (2017)
- Sordi A; Dieckow J; Bayer C; Alburquerque MA; Piva JT; Zanatta JA. Nitrous oxide emission factors for urine and dung patches in a subtropical Brazilian pastureland. Agric Ecosyst Environ (2014)
- Simon PL; Dieckow J; de Klein CAM; Zanatta JA; van der Weerden TJ; Ramalho B. Nitrous oxide emission factors from cattle urine and dung, and dicyandiamide (DCD) as a mitigation strategy in subtropical pastures. Agric Ecosyst Environ (2018)
- Wang X; Ledgard S; Luo J; Guo Y; Zhao Z; Guo L. Environmental impacts and resource use of milk production on the North China Plain, based on life cycle assessment. Sci Total Environ (2018)
- Pirlo G; Lolli S. Environmental impact of milk production from samples of organic and conventional farms in Lombardy (Italy). J Clean Prod (2019)
- Herzog A; Winckler C; Zollitsch W. In pursuit of sustainability in dairy farming: A review of interdependent effects of animal welfare improvement and environmental impact mitigation. Agric Ecosyst Environ (2018)
- Mostert PF; van Middelaar CE; Bokkers EAM; de Boer IJM. The impact of subclinical ketosis in dairy cows on greenhouse gas emissions of milk production. J Clean Prod (2018)
- Mostert PF; van Middelaar CE; de Boer IJM; Bokkers EAM. The impact of foot lesions in dairy cows on greenhouse gas emissions of milk production. Agric Syst (2018)
- Foley JA; Ramankutty N; Brauman KA; Cassidy ES; Gerber JS; Johnston M. Solutions for a cultivated planet. Nature (2011)
- Lal R.. Soil Carbon Sequestration Impacts on Global Climate Change and Food Security. Science (80-) (2004)
- Boddey RM; Jantalia CP; Conceiçao PC; Zanatta JA; Bayer C; Mielniczuk J. Carbon accumulation at depth in Ferralsols under zero-till subtropical agriculture. Glob Chang Biol (2010)
- McConkey B; Angers D; Bentham M; Boehm M; Brierley T; Cerkowniak D. Canadian agricultural greenhouse gas monitoring accounting and reporting system: methodology and greenhouse gas estimates for agricultural land in the LULUCF sector for NIR 2014.  (2014)
+- IPCC. Climate Change and Land. Chapter 5: Food Security. 2019.
+- HerreroM, HendersonB, HavlíkP, ThorntonPK, ConantRT, SmithP, et al Greenhouse gas mitigation potentials in the livestock sector. Nat Clim Chang. 2016;6: 452–461. 10.1038/nclimate2925
+- Rivera-FerreMG, López-i-GelatsF, HowdenM, SmithP, MortonJF, HerreroM. Re-framing the climate change debate in the livestock sector: mitigation and adaptation options. Wiley Interdiscip Rev Clim Chang. 2016;7: 869–892. 10.1002/wcc.421
+- van ZantenHHE, MollenhorstH, KlootwijkCW, van MiddelaarCE, de BoerIJM. Global food supply: land use efficiency of livestock systems. Int J Life Cycle Assess. 2016;21: 747–758. 10.1007/s11367-015-0944-1
+- HristovAN, OhJ, FirkinsL, DijkstraJ, KebreabE, WaghornG, et al SPECIAL TOPICS—Mitigation of methane and nitrous oxide emissions from animal operations: I. A review of enteric methane mitigation options. J Anim Sci. 2013;91: 5045–5069. 10.2527/jas.2013-6583 24045497
+- HristovAN, OttT, TricaricoJ, RotzA, WaghornG, AdesoganA, et al SPECIAL TOPICS—Mitigation of methane and nitrous oxide emissions from animal operations: III. A review of animal management mitigation options. J Anim Sci. 2013;91: 5095–5113. 10.2527/jas.2013-6585 24045470
+- MontesF, MeinenR, DellC, RotzA, HristovAN, OhJ, et al SPECIAL TOPICS—Mitigation of methane and nitrous oxide emissions from animal operations: II. A review of manure management mitigation options. J Anim Sci. 2013;91: 5070–5094. 10.2527/jas.2013-6584 24045493
+- LedgardSF, WeiS, WangX, FalconerS, ZhangN, ZhangX, et al Nitrogen and carbon footprints of dairy farm systems in China and New Zealand, as influenced by productivity, feed sources and mitigations. Agric Water Manag. 2019;213: 155–163. 10.1016/j.agwat.2018.10.009
+- O’BrienD, ShallooL, PattonJ, BuckleyF, GraingerC, WallaceM. A life cycle assessment of seasonal grass-based and confinement dairy farms. Agric Syst. 2012;107: 33–46. 10.1016/j.agsy.2011.11.004
+- SalouT, Le MouëlC, van der WerfHMG. Environmental impacts of dairy system intensification: the functional unit matters! J Clean Prod. 2017 10.1016/j.jclepro.2016.05.019
+- LizarraldeC, PicassoV, RotzCA, CadenazziM, AstigarragaL. Practices to Reduce Milk Carbon Footprint on Grazing Dairy Farms in Southern Uruguay: Case Studies. Sustain Agric Res. 2014;3: 1 10.5539/sar.v3n2p1
+- ClarkCEF, KaurR, MillapanLO, GolderHM, ThomsonPC, HoradagodaA, et al The effect of temperate or tropical pasture grazing state and grain-based concentrate allocation on dairy cattle production and behavior. J Dairy Sci. 2018;101: 5454–5465. 10.3168/jds.2017-13388 29550132
+- Food and Agriculture Organization. FAOSTAT. 2017.
+- VogelerI, MackayA, VibartR, RendelJ, BeautraisJ, DennisS. Effect of inter-annual variability in pasture growth and irrigation response on farm productivity and profitability based on biophysical and farm systems modelling. Sci Total Environ. 2016;565: 564–575. 10.1016/j.scitotenv.2016.05.006 27203517
+- WilkinsonJM, LeeMRF, RiveroMJ, ChamberlainAT. Some challenges and opportunities for grazing dairy cows on temperate pastures. Grass Forage Sci. 2020;75: 1–17. 10.1111/gfs.12458 32109974
+- WalesWJ, MarettLC, GreenwoodJS, WrightMM, ThornhillJB, JacobsJL, et al Use of partial mixed rations in pasture-based dairying in temperate regions of Australia. Anim Prod Sci. 2013;53: 1167–1178. 10.1071/AN13207
+- BargoF, MullerLD, DelahoyJE, CassidyTW. Performance of high producing dairy cows with three different feeding systems combining pasture and total mixed rations. J Dairy Sci. 2002;85: 2948–2963. 10.3168/jds.S0022-0302(02)74381-6 12487461
+- VibartRE, FellnerV, BurnsJC, HuntingtonGB, GreenJT. Performance of lactating dairy cows fed varying levels of total mixed ration and pasture. J Dairy Res. 2008;75: 471–480. 10.1017/S0022029908003361 18701000
+- MendozaA, CajarvilleC, RepettoJL. Short communication: Intake, milk production, and milk fatty acid profile of dairy cows fed diets combining fresh forage with a total mixed ration. J Dairy Sci. 2016;99: 1938–1944. 10.3168/jds.2015-10257 26778319
+- NRC. Nutrient Requirements of Dairy Cattle. 7th ed. Washington DC: National Academy Press; 2001.
+- INRA. INRA Feeding System for Ruminants. NoizèreP, SauvantD, DelabyL, editors. Wageningen: Wageningen Academic Publishiers; 2018 10.3920/978-90-8686-872-8
+- LorenzH, ReinschT, HessS, TaubeF. Is low-input dairy farming more climate friendly? A meta-analysis of the carbon footprints of different production systems. J Clean Prod. 2019;211: 161–170. 10.1016/j.jclepro.2018.11.113
+- ISO 14044. INTERNATIONAL STANDARD—Environmental management—Life cycle assessment—Requirements and guidelines. 2006;2006: 46.
+- ISO 14040. The International Standards Organisation. Environmental management—Life cycle assessment—Principles and framework. Iso 14040. 2006;2006: 1–28. 10.1136/bmj.332.7550.1107
+- FAO. Environmental Performance of Large Ruminant Supply Chains: Guidelines for assessment. Livestock Environmental Assessment and Performance Partnership, editor. Rome, Italy: FAO; 2016 Available: http://www.fao.org/partnerships/leap/resources/guidelines/en/
+- CivieroM, Ribeiro-FilhoHMN, SchaitzLH. Pearl-millet grazing decreases daily methane emissions in dairy cows receiving total mixed ration. 7th Greenhouse Gas and Animal Agriculture Conference,. Foz do Iguaçu; 2019 pp. 141–141.
+- IPCC—Intergovernmental Panel on Climate Change. Climate Change 2014 Synthesis Report (Unedited Version). 2014. Available: https://www.ipcc.ch/site/assets/uploads/2018/05/SYR\_AR5\_FINAL\_full\_wcover.pdf
+- INRA. Alimentation des bovins, ovins et caprins. Besoins des animaux—valeurs des aliments. Tables Inra 2007. 4th ed. INRA, editor. 2007.
+- DelagardeR, FaverdinP, BaratteC, PeyraudJL. GrazeIn: a model of herbage intake and milk production for grazing dairy cows. 2. Prediction of intake under rotational and continuously stocked grazing management. Grass Forage Sci. 2011;66: 45–60. 10.1111/j.1365-2494.2010.00770.x
+- MaBL, LiangBC, BiswasDK, MorrisonMJ, McLaughlinNB. The carbon footprint of maize production as affected by nitrogen fertilizer and maize-legume rotations. Nutr Cycl Agroecosystems. 2012;94: 15–31. 10.1007/s10705-012-9522-0
+- RauccciGS, MoreiraCS, AlvesPS, MelloFFC, FrazãoLA, CerriCEP, et al Greenhouse gas assessment of Brazilian soybean production: a case study of Mato Grosso State. J Clean Prod. 2015;96: 418–425.
+- CamargoGGT, RyanMR, RichardTL. Energy Use and Greenhouse Gas Emissions from Crop Production Using the Farm Energy Analysis Tool. Bioscience. 2013;63: 263–273. 10.1525/bio.2013.63.4.6
+- da SilvaMSJ, JobimCC, PoppiEC, TresTT, OsmariMP. Production technology and quality of corn silage for feeding dairy cattle in Southern Brazil. Rev Bras Zootec. 2015;44: 303–313. 10.1590/S1806-92902015000900001
+- Duchini PG, Guzatti GC, Ribeiro-Filho HMN, Sbrissia AF. Intercropping black oat (Avena strigosa) and annual ryegrass (Lolium multiflorum) can increase pasture leaf production compared with their monocultures. Crop Pasture Sci. 2016;67: 574–581. 10.1071/CP15170
+- ScaravelliLFB, PereiraLET, OlivoCJ, AgnolinCA. Produção e qualidade de pastagens de Coastcross-1 e milheto utilizadas com vacas leiteiras. Cienc Rural. 2007;37: 841–846.
+- SbrissiaAF, DuchiniPG, ZaniniGD, SantosGT, PadilhaDA, SchmittD. Defoliation strategies in pastures submitted to intermittent stocking method: Underlying mechanisms buffering forage accumulation over a range of grazing heights. Crop Sci. 2018;58: 945–954. 10.2135/cropsci2017.07.0447
+- AlmeidaJGR, Dall-OrsolettaAC, OziemblowskiMM, MichelonGM, BayerC, EdouardN, et al Carbohydrate-rich supplements can improve nitrogen use efficiency and mitigate nitrogenous gas emissions from the excreta of dairy cows grazing temperate grass. Animal. 2020; 1–12. 10.1017/S1751731119003057 31907089
+- Intergovernamental Panel on Climate Change (IPCC). IPCC guidlines for national greenhouse gas inventories. EgglestonH.S., BuendiaL., MiwaK. NT and TK, editor. Hayama, Kanagawa, Japan: Institute for Global Environmental Strategies; 2006.
+- RamalhoB, DieckowJ, BarthG, SimonPL, MangrichAS, BrevilieriRC. No-tillage and ryegrass grazing effects on stocks, stratification and lability of carbon and nitrogen in a subtropical Umbric Ferralsol. Eur J Soil Sci. 2020; 1–14. 10.1111/ejss.12933
+- FernandesHC, da SilveiraJCM, RinaldiPCN. Avaliação do custo energético de diferentes operações agrícolas mecanizadas. Cienc e Agrotecnologia. 2008;32: 1582–1587. 10.1590/s1413-70542008000500034
+- Wang M Q. GREET 1.8a Spreadsheet Model. 2007. Available: http://www.transportation.anl.gov/software/GREET/
+- RotzCAA, MontesF, ChianeseDS, ChianeDS. The carbon footprint of dairy production systems through partial life cycle assessment. J Dairy Sci. 2010;93: 1266–1282. 10.3168/jds.2009-2162 20172247
+- NiuM, KebreabE, HristovAN, OhJ, ArndtC, BanninkA, et al Prediction of enteric methane production, yield, and intensity in dairy cattle using an intercontinental database. Glob Chang Biol. 2018;24: 3368–3389. 10.1111/gcb.14094 29450980
+- EugèneM, SauvantD, NozièreP, ViallardD, OueslatiK, LhermM, et al A new Tier 3 method to calculate methane emission inventory for ruminants. J Environ Manage. 2019;231: 982–988. 10.1016/j.jenvman.2018.10.086 30602259
+- ReedKF, MoraesLE, CasperDP, KebreabE. Predicting nitrogen excretion from cattle. J Dairy Sci. 2015;98: 3025–3035. 10.3168/jds.2014-8397 25747829
+- BarrosMV, PiekarskiCM, De FranciscoAC. Carbon footprint of electricity generation in Brazil: An analysis of the 2016–2026 period. Energies. 2018;11 10.3390/en11061412
+- LudingtonD, JohnsonE. Dairy Farm Energy Audit Summary. New York State Energy Res Dev Auth. 2003.
+- ThomaG, JollietO, WangY. A biophysical approach to allocation of life cycle environmental burdens for fluid milk supply chain analysis. Int Dairy J. 2013;31 10.1016/j.idairyj.2012.08.012
+- NaranjoA, JohnsonA, RossowH. Greenhouse gas, water, and land footprint per unit of production of the California dairy industry over 50 years. 2020 10.3168/jds.2019-16576 32037166
+- JayasundaraS, WordenD, WeersinkA, WrightT, VanderZaagA, GordonR, et al Improving farm profitability also reduces the carbon footprint of milk production in intensive dairy production systems. J Clean Prod. 2019;229: 1018–1028. 10.1016/j.jclepro.2019.04.013
+- WilliamsSRO, FisherPD, BerrisfordT, MoatePJ, ReynardK. Reducing methane on-farm by feeding diets high in fat may not always reduce life cycle greenhouse gas emissions. Int J Life Cycle Assess. 2014;19: 69–78. 10.1007/s11367-013-0619-8
+- GollnowS, LundieS, MooreAD, McLarenJ, van BuurenN, StahleP, et al Carbon footprint of milk production from dairy cows in Australia. Int Dairy J. 2014;37: 31–38. 10.1016/j.idairyj.2014.02.005
+- O’BrienD, CapperJL, GarnsworthyPC, GraingerC, ShallooL. A case study of the carbon footprint of milk from high-performing confinement and grass-based dairy farms. J Dairy Sci. 2014 10.3168/jds.2013-7174 24440256
+- ChobtangJ, McLarenSJ, LedgardSF, DonaghyDJ. Consequential Life Cycle Assessment of Pasture-based Milk Production: A Case Study in the Waikato Region, New Zealand. J Ind Ecol. 2017;21: 1139–1152. 10.1111/jiec.12484
+- GargMR, PhondbaBT, SherasiaPL, MakkarHPS. Carbon footprint of milk production under smallholder dairying in Anand district of Western India: A cradle-to-farm gate life cycle assessment. Anim Prod Sci. 2016;56: 423–436. 10.1071/AN15464
+- de LéisCM, CherubiniE, RuviaroCF, Prudêncio da SilvaV, do Nascimento LampertV, SpiesA, et al Carbon footprint of milk production in Brazil: a comparative case study. Int J Life Cycle Assess. 2015;20: 46–60. 10.1007/s11367-014-0813-3
+- O’BrienD, GeogheganA, McNamaraK, ShallooL. How can grass-based dairy farmers reduce the carbon footprint of milk? Anim Prod Sci. 2016;56: 495–500. 10.1071/AN15490
+- O’BrienD, BrennanP, HumphreysJ, RuaneE, ShallooL. An appraisal of carbon footprint of milk from commercial grass-based dairy farms in Ireland according to a certified life cycle assessment methodology. Int J Life Cycle Assess. 2014;19: 1469–1481. 10.1007/s11367-014-0755-9
+- BaekCY, LeeKM, ParkKH. Quantification and control of the greenhouse gas emissions from a dairy cow system. J Clean Prod. 2014;70: 50–60. 10.1016/j.jclepro.2014.02.010
+- Dall-OrsolettaAC, AlmeidaJGR, CarvalhoPCF, Savian JV., Ribeiro-Filho HMN. Ryegrass pasture combined with partial total mixed ration reduces enteric methane emissions and maintains the performance of dairy cows during mid to late lactation. J Dairy Sci. 2016;99: 4374–4383. 10.3168/jds.2015-10396 27016830
+- Dall-OrsolettaAC, OziemblowskiMM, BerndtA, Ribeiro-FilhoHMN. Enteric methane emission from grazing dairy cows receiving corn silage or ground corn supplementation. Anim Feed Sci Technol. 2019;253: 65–73. 10.1016/j.anifeedsci.2019.05.009
+- NiuM, AppuhamyJADRN, LeytemAB, DunganRS, KebreabE. Effect of dietary crude protein and forage contents on enteric methane emissions and nitrogen excretion from dairy cows simultaneously. Anim Prod Sci. 2016;56: 312–321. 10.1071/AN15498
+- WaghornGC, LawN, BryantM, PachecoD, DalleyD. Digestion and nitrogen excretion by Holstein-Friesian cows in late lactation offered ryegrass-based pasture supplemented with fodder beet. Anim Prod Sci. 2019;59: 1261–1270. 10.1071/AN18018
+- DickhoeferU, GlowackiS, GómezCA, Castro-MontoyaJM. Forage and protein use efficiency in dairy cows grazing a mixed grass-legume pasture and supplemented with different levels of protein and starch. Livest Sci. 2018;216: 109–118. 10.1016/j.livsci.2018.08.004
+- SchwabCG, BroderickGA. A 100-Year Review: Protein and amino acid nutrition in dairy cows. J Dairy Sci. 2017;100: 10094–10112. 10.3168/jds.2017-13320 29153157
+- SordiA, DieckowJ, BayerC, AlburquerqueMA, PivaJT, ZanattaJA, et al Nitrous oxide emission factors for urine and dung patches in a subtropical Brazilian pastureland. Agric Ecosyst Environ. 2014;190: 94–103. 10.1016/j.agee.2013.09.004
+- SimonPL, DieckowJ, de KleinCAM, ZanattaJA, van der WeerdenTJ, RamalhoB, et al Nitrous oxide emission factors from cattle urine and dung, and dicyandiamide (DCD) as a mitigation strategy in subtropical pastures. Agric Ecosyst Environ. 2018;267: 74–82. 10.1016/j.agee.2018.08.013
+- WangX, LedgardS, LuoJ, GuoY, ZhaoZ, GuoL, et al Environmental impacts and resource use of milk production on the North China Plain, based on life cycle assessment. Sci Total Environ. 2018;625: 486–495. 10.1016/j.scitotenv.2017.12.259 29291563
+- PirloG, LolliS. Environmental impact of milk production from samples of organic and conventional farms in Lombardy (Italy). J Clean Prod. 2019;211: 962–971. 10.1016/j.jclepro.2018.11.070
+- HerzogA, WincklerC, ZollitschW. In pursuit of sustainability in dairy farming: A review of interdependent effects of animal welfare improvement and environmental impact mitigation. Agric Ecosyst Environ. 2018;267: 174–187. 10.1016/j.agee.2018.07.029
+- MostertPF, van MiddelaarCE, BokkersEAM, de BoerIJM. The impact of subclinical ketosis in dairy cows on greenhouse gas emissions of milk production. J Clean Prod. 2018 10.1016/j.jclepro.2017.10.019
+- MostertPF, van MiddelaarCE, de BoerIJM, BokkersEAM. The impact of foot lesions in dairy cows on greenhouse gas emissions of milk production. Agric Syst. 2018;167: 206–212. 10.1016/j.agsy.2018.09.006
+- FoleyJA, RamankuttyN, BraumanKA, CassidyES, GerberJS, JohnstonM, et al Solutions for a cultivated planet. Nature. 2011;478: 337–342. 10.1038/nature10452 21993620
+- LalR. Soil Carbon Sequestration Impacts on Global Climate Change and Food Security. Science (80-). 2004;304: 1623–1627. 10.1126/science.1097396 15192216
+- BoddeyRM, JantaliaCP, ConceiçaoPC, ZanattaJA, BayerC, MielniczukJ, et al Carbon accumulation at depth in Ferralsols under zero-till subtropical agriculture. Glob Chang Biol. 2010;16: 784–795. 10.1111/j.1365-2486.2009.02020.x
+- McConkeyB, AngersD, BenthamM, BoehmM, BrierleyT, CerkowniakD, et al Canadian agricultural greenhouse gas monitoring accounting and reporting system: methodology and greenhouse gas estimates for agricultural land in the LULUCF sector for NIR 2014. 2014.
--- a/tests/data/groundtruth/docling_v2/redp5110_sampled.doctags.txt
+++ b/tests/data/groundtruth/docling_v2/redp5110_sampled.doctags.txt
--- a/tests/data/groundtruth/docling_v2/redp5110_sampled.json
+++ b/tests/data/groundtruth/docling_v2/redp5110_sampled.json
--- a/tests/data/groundtruth/docling_v2/right_to_left_01.doctags.txt
+++ b/tests/data/groundtruth/docling_v2/right_to_left_01.doctags.txt
@ -1,9 +1,8 @@
-<document>
-<section_header_level_1><location><page_1><loc_37><loc_89><loc_85><loc_91></location>Pythonو R ةغلب ةجمربلا للاخ نم تلاكشملا لحو ةيجاتنلإا نيسحت</section_header_level_1>
-<text><location><page_1><loc_15><loc_80><loc_85><loc_87></location>Python و R ةغلب ةجمربلا ربتعت ةلاعف لولح داجيإ يف دعاستو ةيجاتنلإا ززعت نأ نكمي يتلا ةيوقلا تاودلأا نم ءاملعلاو نيللحملا ىلع لهسي امم ،تانايبلا ليلحتل ةيلاثم اهلعجت ةديرف تازيمPython و R نم لك كلتمي .تلاكشملل ناك اذإ .ةلاعفو ةعيرس ةقيرطب ةدقعم تلايلحت ءارجإ مهسي نأ نكمي تاغللا هذه مادختسا نإف ،ةيليلحت ةيلقع كيدل .لمعلا جئاتن نيسحت يف ريبك لكشب</text>
-<text><location><page_1><loc_34><loc_73><loc_34><loc_75></location>ً</text>
-<text><location><page_1><loc_16><loc_71><loc_85><loc_78></location>جارختساو تانايبلا نم ةلئاه تايمك ةجلاعم نكمملا نم حبصي ،ةجمربلا تاراهم عم يليلحتلا ريكفتلا عمتجي امدنع ذيفنتلPython و R مادختسا نيجمربملل نكمي .اهنم تاهجوتلاو طامنلأا ةجذمنلا لثم ،ةمدقتم ةيليلحت تايلمع ةقد رثكأ تارارق ذاختا ىلإ ا ضيأ يدؤي نأ نكمي لب ،تقولا رفوي طقف سيل اذه .ةريبكلا تانايبلا ليلحتو ةيئاصحلإا تانايبلا ىلع ةمئاق تاجاتنتسا ىلع ءانب .</text>
-<text><location><page_1><loc_83><loc_71><loc_83><loc_73></location>ً</text>
-<text><location><page_1><loc_15><loc_63><loc_85><loc_70></location>ليلحتلا نم ،تاقيبطتلا نم ةعساو ةعومجم معدت ةينغ تاودأو تابتكمPython و R نم لك رفوت ،كلذ ىلع ةولاع ىلع .ةفلتخملا تلاكشملل ةركتبم لولح ريوطتل تابتكملا هذه نم ةدافتسلاا نيمدختسملل نكمي .يللآا ملعتلا ىلإ ينايبلا R رفوت امنيب ،ةءافكب تانايبلا ةرادلإ Python يف pandas ةبتكم مادختسا نكمي ،لاثملا ليبس مسرلل ةيوق تاودأ .نيللحملاو نيثحابلل ةيلاثم اهلعجي امم ،يئاصحلإا ليلحتلاو ينايبلا</text>
-<text><location><page_1><loc_16><loc_56><loc_85><loc_61></location>Python و R ةغلب ةجمربلا يدؤت نأ نكمي ،ةياهنلا يف ةركتبم لولح ريفوتو ةيجاتنلإا نيسحت ىلإ ةيليلحت ةيلقع عم اهل نوكت نأ نكمي ةبسانملا ةيجمربلا بيلاسلأا قيبطتو لاعف لكشب تانايبلا ليلحت ىلع ةردقلا نإ .ةدقعملا تلاكشملل .ينهملاو يصخشلا ءادلأا ىلع ىدملا ةديعب ةيباجيإ تاريثأت</text>
-</document>
+<doctag><section_header_level_1><loc_183><loc_46><loc_426><loc_55>Pythonو R ةغلب ةجمربلا للاخ نم تلاكشملا لحو ةيجاتنلإا نيسحت</section_header_level_1>
+<text><loc_74><loc_64><loc_427><loc_99>Python و R ةغلب ةجمربلا ربتعت ةلاعف لولح داجيإ يف دعاستو ةيجاتنلإا ززعت نأ نكمي يتلا ةيوقلا تاودلأا نم ءاملعلاو نيللحملا ىلع لهسي امم ،تانايبلا ليلحتل ةيلاثم اهلعجت ةديرف تازيمPython و R نم لك كلتمي .تلاكشملل ناك اذإ .ةلاعفو ةعيرس ةقيرطب ةدقعم تلايلحت ءارجإ مهسي نأ نكمي تاغللا هذه مادختسا نإف ،ةيليلحت ةيلقع كيدل .لمعلا جئاتن نيسحت يف ريبك لكشب</text>
+<text><loc_170><loc_126><loc_170><loc_134>ً</text>
+<text><loc_82><loc_108><loc_427><loc_143>جارختساو تانايبلا نم ةلئاه تايمك ةجلاعم نكمملا نم حبصي ،ةجمربلا تاراهم عم يليلحتلا ريكفتلا عمتجي امدنع ذيفنتلPython و R مادختسا نيجمربملل نكمي .اهنم تاهجوتلاو طامنلأا ةجذمنلا لثم ،ةمدقتم ةيليلحت تايلمع ةقد رثكأ تارارق ذاختا ىلإ ا ضيأ يدؤي نأ نكمي لب ،تقولا رفوي طقف سيل اذه .ةريبكلا تانايبلا ليلحتو ةيئاصحلإا تانايبلا ىلع ةمئاق تاجاتنتسا ىلع ءانب .</text>
+<text><loc_416><loc_135><loc_416><loc_143>ً</text>
+<text><loc_76><loc_152><loc_427><loc_186>ليلحتلا نم ،تاقيبطتلا نم ةعساو ةعومجم معدت ةينغ تاودأو تابتكمPython و R نم لك رفوت ،كلذ ىلع ةولاع ىلع .ةفلتخملا تلاكشملل ةركتبم لولح ريوطتل تابتكملا هذه نم ةدافتسلاا نيمدختسملل نكمي .يللآا ملعتلا ىلإ ينايبلا R رفوت امنيب ،ةءافكب تانايبلا ةرادلإ Python يف pandas ةبتكم مادختسا نكمي ،لاثملا ليبس مسرلل ةيوق تاودأ .نيللحملاو نيثحابلل ةيلاثم اهلعجي امم ،يئاصحلإا ليلحتلاو ينايبلا</text>
+<text><loc_79><loc_195><loc_427><loc_221>Python و R ةغلب ةجمربلا يدؤت نأ نكمي ،ةياهنلا يف ةركتبم لولح ريفوتو ةيجاتنلإا نيسحت ىلإ ةيليلحت ةيلقع عم اهل نوكت نأ نكمي ةبسانملا ةيجمربلا بيلاسلأا قيبطتو لاعف لكشب تانايبلا ليلحت ىلع ةردقلا نإ .ةدقعملا تلاكشملل .ينهملاو يصخشلا ءادلأا ىلع ىدملا ةديعب ةيباجيإ تاريثأت</text>
+</doctag>
--- a/tests/data/groundtruth/docling_v2/right_to_left_01.json
+++ b/tests/data/groundtruth/docling_v2/right_to_left_01.json
--- a/tests/data/groundtruth/docling_v2/right_to_left_01.pages.json
+++ b/tests/data/groundtruth/docling_v2/right_to_left_01.pages.json
--- a/tests/data/groundtruth/docling_v2/right_to_left_02.doctags.txt
+++ b/tests/data/groundtruth/docling_v2/right_to_left_02.doctags.txt
@ -1,10 +1,7 @@
-<document>
-<text><location><page_1><loc_8><loc_3><loc_10><loc_4></location>11</text>
-<text><location><page_1><loc_11><loc_50><loc_73><loc_75></location>،هيلعو ملا ةوا رملا لاول خواهييع ووص عضت ةيرص م لا ةموكح لا نإف ةو اب لأا نم ددي قي حت ىاي لمعلخب خال ةير وام جلا سي ئر د يسلا فياكت ا دو ه :خاسعر ىاي ويولولأا ةومئخق سعر ىا ي يرصملا نخسنلإا ءخهب فام عضو ، تخ ووومن تحدووعم قووي حت ىوو اي لو وم علا ،ليوواعللاو ةحووصلا تحخووجم اووف ةووصخل ىوووواي خوووو حلا ا وووو و ،تخوووو ي خل لا فوووواذع اووووف ةامخوووو و ةمادلووووسمو ةوووويوق وو يلودلاو ةوويمياقلإا تخيدوو حلل ا ءوووض اووف يرووصملا امووو لا نووملأا تاددووحم ،ة وو ام ةووعبخلم رارملووساو ،ةيووسخيسلا ة رخوواملا ر ي وو و لت د ووواو ةاووصاومو تخ ايوووو لاو ةوووفخ لا تخووو ام ريوووولت ، خوووهرلإا ةوووحفخ كمو ر ار لوووسحاو نوووملأا لي هخووو م وووسري ي ووولا وووو حهل ا ىووواي لدووولعملا اهيدووو لا خووولبلاو ،اه،وووولا .اعملجملا ماسلاو ةه،اوملا</text>
-<text><location><page_1><loc_13><loc_45><loc_74><loc_48></location>رول لا لاول ةيرو ص م لا ةو موكحلا امخونرب دالوسي ،قبس خمل خً فوو 2024( -)2026 اتلآا وحهلا ىاي اهو ،ةسيئر ةيجيتارلسا اد هع ةعبرع قي حت :</text>
-<text><location><page_1><loc_12><loc_37><loc_73><loc_40></location>نــــــــم ما ةــــــــيا م رـ صم لا يم وقل ا اــــسن ا ءاــــ نب رــــــــــــــــــــصم لا عاـــــصت ا ءاـــــ نب يــــــــــــــــــــــسبا نت قتسظا ق يقحت را ر يــــــــــــــــــــــــساي سلا</text>
-<text><location><page_1><loc_12><loc_23><loc_73><loc_31></location>خهلوسحخب امخونرب لا ت خفدالوسم ديدحت لت دق هن ع ىلإ رخ لإا ردجت لكواب د روووصم ةو ووي ر تخ فدال ووو س م ىووو اي سيوووئر 2023 ر اوو وو حلا تخووو ساو تخيوووصوتو ، كيال ا تخ اووصيل اه،ووولا امخوونربلاو ،تارا ووو لا ت خ فدا لوو سمو ،اه،ووولا ،ةوو ي ا ةيه، ولا تخ ي جيتا رلسحا فالبمو .</text>
-<figure>
-<location><page_1><loc_75><loc_23><loc_100><loc_76></location>
-</figure>
-</document>
+<doctag><text><loc_40><loc_478><loc_49><loc_486>11</text>
+<text><loc_57><loc_125><loc_367><loc_249>،هيلعو ملا ةوا رملا لاول خواهييع ووص عضت ةيرص م لا ةموكح لا نإف ةو اب لأا نم ددي قي حت ىاي لمعلخب خال ةير وام جلا سي ئر د يسلا فياكت ا دو ه :خاسعر ىاي ويولولأا ةومئخق سعر ىا ي يرصملا نخسنلإا ءخهب فام عضو ، تخ ووومن تحدووعم قووي حت ىوو اي لو وم علا ،ليوواعللاو ةحووصلا تحخووجم اووف ةووصخل ىوووواي خوووو حلا ا وووو و ،تخوووو ي خل لا فوووواذع اووووف ةامخوووو و ةمادلووووسمو ةوووويوق وو يلودلاو ةوويمياقلإا تخيدوو حلل ا ءوووض اووف يرووصملا امووو لا نووملأا تاددووحم ،ة وو ام ةووعبخلم رارملووساو ،ةيووسخيسلا ة رخوواملا ر ي وو و لت د ووواو ةاووصاومو تخ ايوووو لاو ةوووفخ لا تخووو ام ريوووولت ، خوووهرلإا ةوووحفخ كمو ر ار لوووسحاو نوووملأا لي هخووو م وووسري ي ووولا وووو حهل ا ىووواي لدووولعملا اهيدووو لا خووولبلاو ،اه،وووولا .اعملجملا ماسلاو ةه،اوملا</text>
+<text><loc_63><loc_258><loc_370><loc_277>رول لا لاول ةيرو ص م لا ةو موكحلا امخونرب دالوسي ،قبس خمل خً فوو 2024( -)2026 اتلآا وحهلا ىاي اهو ،ةسيئر ةيجيتارلسا اد هع ةعبرع قي حت :</text>
+<text><loc_58><loc_301><loc_367><loc_317>نــــــــم ما ةــــــــيا م رـ صم لا يم وقل ا اــــسن ا ءاــــ نب رــــــــــــــــــــصم لا عاـــــصت ا ءاـــــ نب يــــــــــــــــــــــسبا نت قتسظا ق يقحت را ر يــــــــــــــــــــــــساي سلا</text>
+<text><loc_61><loc_344><loc_367><loc_385>خهلوسحخب امخونرب لا ت خفدالوسم ديدحت لت دق هن ع ىلإ رخ لإا ردجت لكواب د روووصم ةو ووي ر تخ فدال ووو س م ىووو اي سيوووئر 2023 ر اوو وو حلا تخووو ساو تخيوووصوتو ، كيال ا تخ اووصيل اه،ووولا امخوونربلاو ،تارا ووو لا ت خ فدا لوو سمو ،اه،ووولا ،ةوو ي ا ةيه، ولا تخ ي جيتا رلسحا فالبمو .</text>
+<picture><loc_375><loc_119><loc_500><loc_386></picture>
+</doctag>
--- a/tests/data/groundtruth/docling_v2/right_to_left_02.json
+++ b/tests/data/groundtruth/docling_v2/right_to_left_02.json
--- a/tests/data/groundtruth/docling_v2/right_to_left_02.pages.json
+++ b/tests/data/groundtruth/docling_v2/right_to_left_02.pages.json
--- a/tests/data/groundtruth/docling_v2/right_to_left_03.doctags.txt
+++ b/tests/data/groundtruth/docling_v2/right_to_left_03.doctags.txt
@ -1,35 +1,33 @@
-<document>
-<section_header_level_1><location><page_1><loc_12><loc_90><loc_45><loc_93></location>یلخاد یلااک - یلصا رازاب رد شريذپ همانديما</section_header_level_1>
-<figure>
-<location><page_1><loc_65><loc_88><loc_81><loc_96></location>
-</figure>
-<section_header_level_1><location><page_1><loc_63><loc_81><loc_81><loc_84></location>لااک درادناتسا -2-5</section_header_level_1>
-<text><location><page_1><loc_77><loc_79><loc_87><loc_81></location>درادناتسا مان</text>
-<text><location><page_1><loc_11><loc_75><loc_44><loc_81></location>یرگ هتخير شور هب هدش ديلوت لاشمش و هشمش فرصم دروم هتسويپ یا هزاس یاهدلاوف رد - قباطم تسويپ زيلانآ</text>
-<text><location><page_1><loc_71><loc_72><loc_87><loc_74></location>یلم درادناتسا هرامش</text>
-<text><location><page_1><loc_40><loc_73><loc_45><loc_74></location>20300</text>
-<text><location><page_1><loc_68><loc_70><loc_87><loc_72></location>؟تسا یرابجا درادناتسا</text>
-<checkbox_unselected><location><page_1><loc_33><loc_70><loc_44><loc_72></location>ريخ       یلب</checkbox_unselected>
-<text><location><page_1><loc_65><loc_67><loc_87><loc_69></location>درادناتسا هدننکرداص عجرم</text>
-<text><location><page_1><loc_28><loc_67><loc_44><loc_69></location>ناريا درادناتسا یلم نامزاس</text>
-<text><location><page_1><loc_49><loc_62><loc_87><loc_66></location>ذخا ار روکذم درادناتسا ،لوصحم هدننکديلوت ايآ ؟تسا هدومن</text>
-<checkbox_selected><location><page_1><loc_33><loc_65><loc_35><loc_66></location>ريخ</checkbox_selected>
-<checkbox_unselected><location><page_1><loc_40><loc_65><loc_42><loc_66></location>یلب</checkbox_unselected>
-<section_header_level_1><location><page_1><loc_69><loc_56><loc_85><loc_58></location>سروب رد شريذپ -3</section_header_level_1>
-<text><location><page_1><loc_68><loc_54><loc_83><loc_56></location>کرادم هئارا خيرات</text>
-<text><location><page_1><loc_23><loc_54><loc_32><loc_56></location>1403/09/19</text>
-<text><location><page_1><loc_72><loc_51><loc_83><loc_53></location>شريذپ خيرات</text>
-<text><location><page_1><loc_23><loc_51><loc_32><loc_53></location>1403/10/04</text>
-<text><location><page_1><loc_62><loc_48><loc_83><loc_50></location>هضرع هتيمک هسلج هرامش</text>
-<text><location><page_1><loc_26><loc_49><loc_29><loc_50></location>436</text>
-<text><location><page_1><loc_67><loc_45><loc_83><loc_47></location>همانديما جرد خيرات</text>
-<text><location><page_1><loc_23><loc_46><loc_32><loc_48></location>1403/10/05</text>
-<text><location><page_1><loc_71><loc_43><loc_83><loc_45></location>شريذپ رواشم</text>
-<text><location><page_1><loc_21><loc_43><loc_34><loc_45></location>سروب نومرآ یرازگراک</text>
-<text><location><page_1><loc_47><loc_37><loc_83><loc_42></location>رد لااک شريذپ زا سپ هياپ تميق نييعت ةوحن سروب</text>
-<text><location><page_1><loc_18><loc_40><loc_36><loc_42></location>یناهج  یاه تميق ساسا رب</text>
-<text><location><page_1><loc_45><loc_32><loc_83><loc_37></location>شورف /شورف لک /ديلوت زا هضرع دصرد لقادح یلخاد</text>
-<text><location><page_1><loc_14><loc_35><loc_40><loc_37></location>نت 47.500 اي هنايلاس ديلوت زا %50 لقادح</text>
-<text><location><page_1><loc_68><loc_29><loc_83><loc_31></location>ليوحت زاجم یاطخ</text>
-<text><location><page_1><loc_18><loc_30><loc_37><loc_31></location>ليوحت لباق هلومحم نيرخآ 5%</text>
-</document>
+<doctag><section_header_level_1><loc_58><loc_37><loc_225><loc_48>یلخاد یلااک - یلصا رازاب رد شريذپ همانديما</section_header_level_1>
+<picture><loc_326><loc_21><loc_405><loc_61></picture>
+<section_header_level_1><loc_314><loc_82><loc_403><loc_93>لااک درادناتسا -2-5</section_header_level_1>
+<text><loc_385><loc_96><loc_436><loc_106>درادناتسا مان</text>
+<text><loc_56><loc_96><loc_222><loc_125>یرگ هتخير شور هب هدش ديلوت لاشمش و هشمش فرصم دروم هتسويپ یا هزاس یاهدلاوف رد - قباطم تسويپ زيلانآ</text>
+<text><loc_354><loc_128><loc_436><loc_138>یلم درادناتسا هرامش</text>
+<text><loc_199><loc_128><loc_223><loc_136>20300</text>
+<text><loc_342><loc_142><loc_436><loc_152>؟تسا یرابجا درادناتسا</text>
+<checkbox_unselected><loc_166><loc_141><loc_222><loc_149>ريخ       یلب</checkbox_unselected>
+<text><loc_327><loc_155><loc_436><loc_165>درادناتسا هدننکرداص عجرم</text>
+<text><loc_140><loc_154><loc_222><loc_163>ناريا درادناتسا یلم نامزاس</text>
+<text><loc_245><loc_169><loc_436><loc_192>ذخا ار روکذم درادناتسا ،لوصحم هدننکديلوت ايآ ؟تسا هدومن</text>
+<checkbox_selected><loc_166><loc_168><loc_175><loc_176>ريخ</checkbox_selected>
+<checkbox_unselected><loc_199><loc_168><loc_208><loc_176>یلب</checkbox_unselected>
+<section_header_level_1><loc_344><loc_209><loc_425><loc_219>سروب رد شريذپ -3</section_header_level_1>
+<text><loc_340><loc_222><loc_414><loc_232>کرادم هئارا خيرات</text>
+<text><loc_116><loc_221><loc_158><loc_230>1403/09/19</text>
+<text><loc_358><loc_236><loc_414><loc_246>شريذپ خيرات</text>
+<text><loc_116><loc_235><loc_158><loc_243>1403/10/04</text>
+<text><loc_308><loc_249><loc_414><loc_259>هضرع هتيمک هسلج هرامش</text>
+<text><loc_130><loc_248><loc_144><loc_257>436</text>
+<text><loc_335><loc_263><loc_414><loc_273>همانديما جرد خيرات</text>
+<text><loc_116><loc_262><loc_158><loc_270>1403/10/05</text>
+<text><loc_355><loc_276><loc_414><loc_286>شريذپ رواشم</text>
+<text><loc_103><loc_275><loc_171><loc_283>سروب نومرآ یرازگراک</text>
+<text><loc_236><loc_291><loc_414><loc_314>رد لااک شريذپ زا سپ هياپ تميق نييعت ةوحن سروب</text>
+<text><loc_92><loc_290><loc_179><loc_298>یناهج  یاه تميق ساسا رب</text>
+<text><loc_224><loc_317><loc_414><loc_340>شورف /شورف لک /ديلوت زا هضرع دصرد لقادح یلخاد</text>
+<text><loc_72><loc_316><loc_202><loc_325>نت 47.500 اي هنايلاس ديلوت زا %50 لقادح</text>
+<text><loc_340><loc_344><loc_414><loc_354>ليوحت زاجم یاطخ</text>
+<text><loc_90><loc_343><loc_184><loc_351>ليوحت لباق هلومحم نيرخآ 5%</text>
+<page_footer><loc_224><loc_463><loc_247><loc_469>Page 7</page_footer>
+</doctag>
--- a/Show More
+++ b/Show More