feat!: Docling v2 (#117)

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Co-authored-by: Maxim Lysak <mly@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
Christoph Auer
2024-10-16 21:02:03 +02:00
committed by GitHub
parent d504432c1e
commit 7d3be0edeb
144 changed files with 15180 additions and 3828 deletions

View File

@@ -1,68 +1,63 @@
from abc import ABC, abstractmethod
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Any, Iterable, Optional, Union
from typing import TYPE_CHECKING, Set, Union
from PIL import Image
from docling_core.types.doc import DoclingDocument
if TYPE_CHECKING:
from docling.datamodel.base_models import BoundingBox, Cell, PageSize
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
class PdfPageBackend(ABC):
class AbstractDocumentBackend(ABC):
@abstractmethod
def get_text_in_rect(self, bbox: "BoundingBox") -> str:
pass
@abstractmethod
def get_text_cells(self) -> Iterable["Cell"]:
pass
@abstractmethod
def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
pass
@abstractmethod
def get_page_image(
self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
) -> Image.Image:
pass
@abstractmethod
def get_size(self) -> "PageSize":
pass
@abstractmethod
def is_valid(self) -> bool:
pass
@abstractmethod
def unload(self):
pass
class PdfDocumentBackend(ABC):
@abstractmethod
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
self.path_or_stream = path_or_stream
self.document_hash = document_hash
@abstractmethod
def load_page(self, page_no: int) -> PdfPageBackend:
pass
@abstractmethod
def page_count(self) -> int:
pass
self.document_hash = in_doc.document_hash
self.input_format = in_doc.format
@abstractmethod
def is_valid(self) -> bool:
pass
@classmethod
@abstractmethod
def supports_pagination(cls) -> bool:
pass
@abstractmethod
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
self.path_or_stream = None
@classmethod
@abstractmethod
def supported_formats(cls) -> Set["InputFormat"]:
pass
class PaginatedDocumentBackend(AbstractDocumentBackend):
"""DeclarativeDocumentBackend.
A declarative document backend is a backend that can transform to DoclingDocument
straight without a recognition pipeline.
"""
@abstractmethod
def page_count(self) -> int:
pass
class DeclarativeDocumentBackend(AbstractDocumentBackend):
"""DeclarativeDocumentBackend.
A declarative document backend is a backend that can transform to DoclingDocument
straight without a recognition pipeline.
"""
@abstractmethod
def convert(self) -> DoclingDocument:
pass

View File

@@ -5,12 +5,14 @@ from pathlib import Path
from typing import Iterable, List, Optional, Union
import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
from docling_parse.docling_parse import pdf_parser
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
@@ -177,8 +179,8 @@ class DoclingParsePageBackend(PdfPageBackend):
return image
def get_size(self) -> PageSize:
return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
def get_size(self) -> Size:
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
def unload(self):
self._ppage = None
@@ -186,23 +188,25 @@ class DoclingParsePageBackend(PdfPageBackend):
class DoclingParseDocumentBackend(PdfDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream, document_hash)
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
self._pdoc = pdfium.PdfDocument(path_or_stream)
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
self.parser = pdf_parser()
success = False
if isinstance(path_or_stream, BytesIO):
if isinstance(self.path_or_stream, BytesIO):
success = self.parser.load_document_from_bytesio(
document_hash, path_or_stream
self.document_hash, self.path_or_stream
)
elif isinstance(self.path_or_stream, Path):
success = self.parser.load_document(
self.document_hash, str(self.path_or_stream)
)
elif isinstance(path_or_stream, Path):
success = self.parser.load_document(document_hash, str(path_or_stream))
if not success:
raise RuntimeError(
f"docling-parse could not load document {document_hash}."
f"docling-parse could not load document with hash {self.document_hash}."
)
def page_count(self) -> int:

View File

@@ -2,15 +2,19 @@ import logging
import random
from io import BytesIO
from pathlib import Path
from typing import Iterable, List, Optional, Union
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_parse.docling_parse import pdf_parser_v2
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell, Size
if TYPE_CHECKING:
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
@@ -190,8 +194,8 @@ class DoclingParseV2PageBackend(PdfPageBackend):
return image
def get_size(self) -> PageSize:
return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
def get_size(self) -> Size:
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
def unload(self):
self._ppage = None
@@ -199,23 +203,23 @@ class DoclingParseV2PageBackend(PdfPageBackend):
class DoclingParseV2DocumentBackend(PdfDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream, document_hash)
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
self._pdoc = pdfium.PdfDocument(path_or_stream)
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
self.parser = pdf_parser_v2("fatal")
success = False
if isinstance(path_or_stream, BytesIO):
success = self.parser.load_document_from_bytesio(
document_hash, path_or_stream
self.document_hash, path_or_stream
)
elif isinstance(path_or_stream, Path):
success = self.parser.load_document(document_hash, str(path_or_stream))
success = self.parser.load_document(self.document_hash, str(path_or_stream))
if not success:
raise RuntimeError(
f"docling-parse could not load document {document_hash}."
f"docling-parse v2 could not load document {self.document_hash}."
)
def page_count(self) -> int:

View File

@@ -0,0 +1,425 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from bs4 import BeautifulSoup
from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
GroupLabel,
TableCell,
TableData,
)
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class HTMLDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
_log.debug("About to init HTML backend...")
self.soup = None
# HTML file:
self.path_or_stream = path_or_stream
# Initialise the parents for the hierarchy
self.max_levels = 10
self.level = 0
self.parents = {} # type: ignore
for i in range(0, self.max_levels):
self.parents[i] = None
self.labels = {} # type: ignore
try:
if isinstance(self.path_or_stream, BytesIO):
text_stream = self.path_or_stream.getvalue().decode("utf-8")
self.soup = BeautifulSoup(text_stream, "html.parser")
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r", encoding="utf-8") as f:
html_content = f.read()
self.soup = BeautifulSoup(html_content, "html.parser")
except Exception as e:
raise RuntimeError(
f"Could not initialize HTML backend for file with hash {self.document_hash}."
) from e
def is_valid(self) -> bool:
return self.soup is not None
@classmethod
def supports_pagination(cls) -> bool:
return False
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
self.path_or_stream = None
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
return {InputFormat.HTML}
def convert(self) -> DoclingDocument:
# access self.path_or_stream to load stuff
doc = DoclingDocument(name="dummy")
_log.debug("Trying to convert HTML...")
if self.is_valid():
assert self.soup is not None
# Replace <br> tags with newline characters
for br in self.soup.body.find_all("br"):
br.replace_with("\n")
doc = self.walk(self.soup.body, doc)
else:
raise RuntimeError(
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
)
return doc
def walk(self, element, doc):
try:
# Iterate over elements in the body of the document
for idx, element in enumerate(element.children):
try:
self.analyse_element(element, idx, doc)
except Exception as exc_child:
_log.error(" -> error treating child: ", exc_child)
_log.error(" => element: ", element, "\n")
raise exc_child
except Exception as exc:
pass
return doc
def analyse_element(self, element, idx, doc):
"""
if element.name!=None:
_log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
"""
if element.name in self.labels:
self.labels[element.name] += 1
else:
self.labels[element.name] = 1
if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
self.handle_header(element, idx, doc)
elif element.name in ["p"]:
self.handle_paragraph(element, idx, doc)
elif element.name in ["ul", "ol"]:
self.handle_list(element, idx, doc)
elif element.name in ["li"]:
self.handle_listitem(element, idx, doc)
elif element.name == "table":
self.handle_table(element, idx, doc)
elif element.name == "figure":
self.handle_figure(element, idx, doc)
elif element.name == "img":
self.handle_image(element, idx, doc)
else:
self.walk(element, doc)
def get_direct_text(self, item):
"""Get the direct text of the <li> element (ignoring nested lists)."""
text = item.find(string=True, recursive=False)
if isinstance(text, str):
return text.strip()
return ""
# Function to recursively extract text from all child nodes
def extract_text_recursively(self, item):
result = []
if isinstance(item, str):
return [item]
result.append(self.get_direct_text(item))
try:
# Iterate over the children (and their text and tails)
for child in item:
try:
# Recursively get the child's text content
result.extend(self.extract_text_recursively(child))
except:
pass
except:
_log.warn("item has no children")
pass
return " ".join(result)
def handle_header(self, element, idx, doc):
"""Handles header tags (h1, h2, etc.)."""
hlevel = int(element.name.replace("h", ""))
slevel = hlevel - 1
label = DocItemLabel.SECTION_HEADER
text = element.text.strip()
if hlevel == 1:
for key, val in self.parents.items():
self.parents[key] = None
self.level = 1
self.parents[self.level] = doc.add_text(
parent=self.parents[0], label=DocItemLabel.TITLE, text=text
)
elif hlevel == self.level:
self.parents[hlevel] = doc.add_text(
parent=self.parents[hlevel - 1], label=label, text=text
)
elif hlevel > self.level:
# add invisible group
for i in range(self.level + 1, hlevel):
self.parents[i] = doc.add_group(
name=f"header-{i}",
label=GroupLabel.SECTION,
parent=self.parents[i - 1],
)
self.parents[hlevel] = doc.add_text(
parent=self.parents[hlevel - 1], label=label, text=text
)
self.level = hlevel
elif hlevel < self.level:
# remove the tail
for key, val in self.parents.items():
if key > hlevel:
self.parents[key] = None
self.parents[hlevel] = doc.add_text(
parent=self.parents[hlevel - 1], label=label, text=text
)
self.level = hlevel
def handle_paragraph(self, element, idx, doc):
"""Handles paragraph tags (p)."""
if element.text is None:
return
text = element.text.strip()
label = DocItemLabel.PARAGRAPH
if len(text) == 0:
return
doc.add_text(parent=self.parents[self.level], label=label, text=text)
def handle_list(self, element, idx, doc):
"""Handles list tags (ul, ol) and their list items."""
if element.name == "ul":
# create a list group
self.parents[self.level + 1] = doc.add_group(
parent=self.parents[self.level], name="list", label=GroupLabel.LIST
)
elif element.name == "ol":
# create a list group
self.parents[self.level + 1] = doc.add_group(
parent=self.parents[self.level],
name="ordered list",
label=GroupLabel.ORDERED_LIST,
)
self.level += 1
self.walk(element, doc)
self.parents[self.level + 1] = None
self.level -= 1
def handle_listitem(self, element, idx, doc):
"""Handles listitem tags (li)."""
nested_lists = element.find(["ul", "ol"])
parent_list_label = self.parents[self.level].label
index_in_list = len(self.parents[self.level].children) + 1
if nested_lists:
name = element.name
text = self.get_direct_text(element)
marker = ""
enumerated = False
if parent_list_label == GroupLabel.ORDERED_LIST:
marker = str(index_in_list)
enumerated = True
# create a list-item
self.parents[self.level + 1] = doc.add_list_item(
text=text,
enumerated=enumerated,
marker=marker,
parent=self.parents[self.level],
)
self.level += 1
self.walk(element, doc)
self.parents[self.level + 1] = None
self.level -= 1
elif isinstance(element.text, str):
text = element.text.strip()
marker = ""
enumerated = False
if parent_list_label == GroupLabel.ORDERED_LIST:
marker = f"{str(index_in_list)}."
enumerated = True
doc.add_list_item(
text=text,
enumerated=enumerated,
marker=marker,
parent=self.parents[self.level],
)
else:
_log.warn("list-item has no text: ", element)
def handle_table(self, element, idx, doc):
"""Handles table tags."""
nested_tables = element.find("table")
if nested_tables is not None:
_log.warn("detected nested tables: skipping for now")
return
# Count the number of rows (number of <tr> elements)
num_rows = len(element.find_all("tr"))
# Find the number of columns (taking into account colspan)
num_cols = 0
for row in element.find_all("tr"):
col_count = 0
for cell in row.find_all(["td", "th"]):
colspan = int(cell.get("colspan", 1))
col_count += colspan
num_cols = max(num_cols, col_count)
grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
# Iterate over the rows in the table
for row_idx, row in enumerate(element.find_all("tr")):
# For each row, find all the column cells (both <td> and <th>)
cells = row.find_all(["td", "th"])
# Check if each cell in the row is a header -> means it is a column header
col_header = True
for j, html_cell in enumerate(cells):
if html_cell.name == "td":
col_header = False
col_idx = 0
# Extract and print the text content of each cell
for _, html_cell in enumerate(cells):
text = html_cell.text
try:
text = self.extract_table_cell_text(html_cell)
except Exception as exc:
_log.warn("exception: ", exc)
exit(-1)
# label = html_cell.name
col_span = int(html_cell.get("colspan", 1))
row_span = int(html_cell.get("rowspan", 1))
while grid[row_idx][col_idx] is not None:
col_idx += 1
for r in range(row_span):
for c in range(col_span):
grid[row_idx + r][col_idx + c] = text
cell = TableCell(
text=text,
row_span=row_span,
col_span=col_span,
start_row_offset_idx=row_idx,
end_row_offset_idx=row_idx + row_span,
start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + col_span,
col_header=col_header,
row_header=((not col_header) and html_cell.name == "th"),
)
data.table_cells.append(cell)
doc.add_table(data=data, parent=self.parents[self.level])
def get_list_text(self, list_element, level=0):
"""Recursively extract text from <ul> or <ol> with proper indentation."""
result = []
bullet_char = "*" # Default bullet character for unordered lists
if list_element.name == "ol": # For ordered lists, use numbers
for i, li in enumerate(list_element.find_all("li", recursive=False), 1):
# Add numbering for ordered lists
result.append(f"{' ' * level}{i}. {li.get_text(strip=True)}")
# Handle nested lists
nested_list = li.find(["ul", "ol"])
if nested_list:
result.extend(self.get_list_text(nested_list, level + 1))
elif list_element.name == "ul": # For unordered lists, use bullet points
for li in list_element.find_all("li", recursive=False):
# Add bullet points for unordered lists
result.append(
f"{' ' * level}{bullet_char} {li.get_text(strip=True)}"
)
# Handle nested lists
nested_list = li.find(["ul", "ol"])
if nested_list:
result.extend(self.get_list_text(nested_list, level + 1))
return result
def extract_table_cell_text(self, cell):
"""Extract text from a table cell, including lists with indents."""
contains_lists = cell.find(["ul", "ol"])
if contains_lists is None:
return cell.text
else:
_log.debug(
"should extract the content correctly for table-cells with lists ..."
)
return cell.text
def handle_figure(self, element, idx, doc):
"""Handles image tags (img)."""
# Extract the image URI from the <img> tag
# image_uri = root.xpath('//figure//img/@src')[0]
contains_captions = element.find(["figcaption"])
if contains_captions is None:
doc.add_picture(parent=self.parents[self.level], caption=None)
else:
texts = []
for item in contains_captions:
texts.append(item.text)
fig_caption = doc.add_text(
label=DocItemLabel.CAPTION, text=("".join(texts)).strip()
)
doc.add_picture(
parent=self.parents[self.level],
caption=fig_caption,
)
def handle_image(self, element, idx, doc):
"""Handles image tags (img)."""
doc.add_picture(parent=self.parents[self.level], caption=None)

View File

@@ -0,0 +1,375 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from docling_core.types.doc import (
BoundingBox,
CoordOrigin,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
ProvenanceItem,
Size,
TableCell,
TableData,
)
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
from docling.backend.abstract_backend import (
DeclarativeDocumentBackend,
PaginatedDocumentBackend,
)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
self.namespaces = {
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
"c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
"p": "http://schemas.openxmlformats.org/presentationml/2006/main",
}
# Powerpoint file:
self.path_or_stream = path_or_stream
self.pptx_obj = None
self.valid = False
try:
if isinstance(self.path_or_stream, BytesIO):
self.pptx_obj = Presentation(self.path_or_stream)
elif isinstance(self.path_or_stream, Path):
self.pptx_obj = Presentation(str(self.path_or_stream))
self.valid = True
except Exception as e:
raise RuntimeError(
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
) from e
return
def page_count(self) -> int:
if self.is_valid():
assert self.pptx_obj is not None
return len(self.pptx_obj.slides)
else:
return 0
def is_valid(self) -> bool:
return self.valid
@classmethod
def supports_pagination(cls) -> bool:
return True # True? if so, how to handle pages...
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
self.path_or_stream = None
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
return {InputFormat.PPTX}
def convert(self) -> DoclingDocument:
# Parses the PPTX into a structured document model.
# origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)
fname = ""
if isinstance(self.path_or_stream, Path):
fname = self.path_or_stream.name
origin = DocumentOrigin(
filename=fname,
mimetype="application/vnd.ms-powerpoint",
binary_hash=self.document_hash,
)
if len(fname) > 0:
docname = Path(fname).stem
else:
docname = "stream"
doc = DoclingDocument(
name=docname, origin=origin
) # must add origin information
doc = self.walk_linear(self.pptx_obj, doc)
return doc
def generate_prov(self, shape, slide_ind, text=""):
left = shape.left
top = shape.top
width = shape.width
height = shape.height
shape_bbox = [left, top, left + width, top + height]
shape_bbox = BoundingBox.from_tuple(shape_bbox, origin=CoordOrigin.BOTTOMLEFT)
# prov = [{"bbox": shape_bbox, "page": parent_slide, "span": [0, len(text)]}]
prov = ProvenanceItem(
page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox
)
return prov
def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
is_a_list = False
enum_list_item_value = 0
for paragraph in shape.text_frame.paragraphs:
enum_list_item_value += 1
bullet_type = "None"
# Check if paragraph is a bullet point using the `element` XML
p = paragraph._element
if (
p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
is not None
):
bullet_type = "Bullet"
is_a_list = True
elif (
p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
is not None
):
bullet_type = "Numbered"
is_a_list = True
else:
is_a_list = False
if paragraph.level > 0:
# Most likely a sub-list
is_a_list = True
list_text = paragraph.text.strip()
prov = self.generate_prov(shape, slide_ind, shape.text.strip())
if is_a_list:
# Determine if this is an unordered list or an ordered list.
# Set GroupLabel.ORDERED_LIST when it fits.
list_label = GroupLabel.LIST
if bullet_type == "Numbered":
list_label = GroupLabel.ORDERED_LIST
new_list = doc.add_group(
label=list_label, name=f"list", parent=parent_slide
)
else:
new_list = None
if is_a_list:
_log.debug("LIST DETECTED!")
else:
_log.debug("No List")
# for e in p.iter():
for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
if len(e.text.strip()) > 0:
e_is_a_list_item = False
is_numbered = False
if (
p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
is not None
):
bullet_type = "Bullet"
e_is_a_list_item = True
elif (
p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
is not None
):
bullet_type = "Numbered"
is_numbered = True
e_is_a_list_item = True
else:
e_is_a_list_item = False
if e_is_a_list_item:
# Set marker and enumerated arguments if this is an enumeration element.
enum_marker = str(enum_list_item_value) + "."
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_list,
text=list_text,
prov=prov,
)
else:
# Assign proper label to the text, depending if it's a Title or Section Header
# For other types of text, assign - PARAGRAPH
doc_label = DocItemLabel.PARAGRAPH
if shape.is_placeholder:
placeholder_type = shape.placeholder_format.type
if placeholder_type in [
PP_PLACEHOLDER.CENTER_TITLE,
PP_PLACEHOLDER.TITLE,
]:
# It's a title
doc_label = DocItemLabel.TITLE
elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
DocItemLabel.SECTION_HEADER
enum_list_item_value = 0
doc.add_text(
label=doc_label,
parent=parent_slide,
text=list_text,
prov=prov,
)
return
def handle_title(self, shape, parent_slide, slide_ind, doc):
placeholder_type = shape.placeholder_format.type
txt = shape.text.strip()
prov = self.generate_prov(shape, slide_ind, txt)
if len(txt.strip()) > 0:
# title = slide.shapes.title.text if slide.shapes.title else "No title"
if placeholder_type in [PP_PLACEHOLDER.CENTER_TITLE, PP_PLACEHOLDER.TITLE]:
_log.info(f"Title found: {shape.text}")
doc.add_text(
label=DocItemLabel.TITLE, parent=parent_slide, text=txt, prov=prov
)
elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
_log.info(f"Subtitle found: {shape.text}")
# Using DocItemLabel.FOOTNOTE, while SUBTITLE label is not avail.
doc.add_text(
label=DocItemLabel.SECTION_HEADER,
parent=parent_slide,
text=txt,
prov=prov,
)
return
def handle_pictures(self, shape, parent_slide, slide_ind, doc):
# shape has picture
prov = self.generate_prov(shape, slide_ind, "")
doc.add_picture(parent=parent_slide, caption=None, prov=prov)
return
def handle_tables(self, shape, parent_slide, slide_ind, doc):
# Handling tables, images, charts
if shape.has_table:
table = shape.table
table_xml = shape._element
prov = self.generate_prov(shape, slide_ind, "")
num_cols = 0
num_rows = len(table.rows)
tcells = []
# Access the XML element for the shape that contains the table
table_xml = shape._element
for row_idx, row in enumerate(table.rows):
if len(row.cells) > num_cols:
num_cols = len(row.cells)
for col_idx, cell in enumerate(row.cells):
# Access the XML of the cell (this is the 'tc' element in table XML)
cell_xml = table_xml.xpath(
f".//a:tbl/a:tr[{row_idx + 1}]/a:tc[{col_idx + 1}]"
)
if not cell_xml:
continue # If no cell XML is found, skip
cell_xml = cell_xml[0] # Get the first matching XML node
row_span = cell_xml.get("rowSpan") # Vertical span
col_span = cell_xml.get("gridSpan") # Horizontal span
if row_span is None:
row_span = 1
else:
row_span = int(row_span)
if col_span is None:
col_span = 1
else:
col_span = int(col_span)
icell = TableCell(
text=cell.text.strip(),
row_span=row_span,
col_span=col_span,
start_row_offset_idx=row_idx,
end_row_offset_idx=row_idx + row_span,
start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + col_span,
col_header=False,
row_header=False,
)
if len(cell.text.strip()) > 0:
tcells.append(icell)
# Initialize Docling TableData
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
# Populate
for tcell in tcells:
data.table_cells.append(tcell)
if len(tcells) > 0:
# If table is not fully empty...
# Create Docling table
doc.add_table(data=data, prov=prov)
return
def walk_linear(self, pptx_obj, doc) -> DoclingDocument:
# Units of size in PPTX by default are EMU units (English Metric Units)
slide_width = pptx_obj.slide_width
slide_height = pptx_obj.slide_height
text_content = [] # type: ignore
max_levels = 10
parents = {} # type: ignore
for i in range(0, max_levels):
parents[i] = None
# Loop through each slide
for slide_num, slide in enumerate(pptx_obj.slides):
slide_ind = pptx_obj.slides.index(slide)
parent_slide = doc.add_group(
name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=parents[0]
)
size = Size(width=slide_width, height=slide_height)
parent_page = doc.add_page(page_no=slide_ind + 1, size=size)
# parent_page = doc.add_page(page_no=slide_ind, size=size, hash=hash)
# Loop through each shape in the slide
for shape in slide.shapes:
if shape.has_table:
# Handle Tables
self.handle_tables(shape, parent_slide, slide_ind, doc)
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
# Handle Tables
self.handle_pictures(shape, parent_slide, slide_ind, doc)
# If shape doesn't have any text, move on to the next shape
if not hasattr(shape, "text"):
continue
if shape.text is None:
continue
if len(shape.text.strip()) == 0:
continue
if not shape.has_text_frame:
_log.warn("Warning: shape has text but not text_frame")
continue
# if shape.is_placeholder:
# Handle Titles (Headers) and Subtitles
# Check if the shape is a placeholder (titles are placeholders)
# self.handle_title(shape, parent_slide, slide_ind, doc)
# self.handle_text_elements(shape, parent_slide, slide_ind, doc)
# else:
# Handle other text elements, including lists (bullet lists, numbered lists)
self.handle_text_elements(shape, parent_slide, slide_ind, doc)
# figures...
# doc.add_figure(data=BaseFigureData(), parent=self.parents[self.level], caption=None)
return doc

View File

@@ -0,0 +1,509 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Set, Union
import docx
from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
TableCell,
TableData,
)
from lxml import etree
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class MsWordDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
self.XML_KEY = (
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
)
self.xml_namespaces = {
"w": "http://schemas.microsoft.com/office/word/2003/wordml"
}
# self.initialise(path_or_stream)
# Word file:
self.path_or_stream = path_or_stream
self.valid = False
# Initialise the parents for the hierarchy
self.max_levels = 10
self.level_at_new_list = None
self.parents = {} # type: ignore
for i in range(-1, self.max_levels):
self.parents[i] = None
self.level = 0
self.listIter = 0
self.history = {
"names": [None],
"levels": [None],
"numids": [None],
"indents": [None],
}
self.docx_obj = None
try:
if isinstance(self.path_or_stream, BytesIO):
self.docx_obj = docx.Document(self.path_or_stream)
elif isinstance(self.path_or_stream, Path):
self.docx_obj = docx.Document(str(self.path_or_stream))
self.valid = True
except Exception as e:
raise RuntimeError(
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
) from e
def is_valid(self) -> bool:
return self.valid
@classmethod
def supports_pagination(cls) -> bool:
return False
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
self.path_or_stream = None
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
return {InputFormat.DOCX}
def convert(self) -> DoclingDocument:
# Parses the DOCX into a structured document model.
fname = ""
if isinstance(self.path_or_stream, Path):
fname = self.path_or_stream.name
origin = DocumentOrigin(
filename=fname,
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
binary_hash=self.document_hash,
)
if len(fname) > 0:
docname = Path(fname).stem
else:
docname = "stream"
doc = DoclingDocument(name=docname, origin=origin)
if self.is_valid():
assert self.docx_obj is not None
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
return doc
else:
raise RuntimeError(
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
)
def update_history(self, name, level, numid, ilevel):
self.history["names"].append(name)
self.history["levels"].append(level)
self.history["numids"].append(numid)
self.history["indents"].append(ilevel)
def prev_name(self):
return self.history["names"][-1]
def prev_level(self):
return self.history["levels"][-1]
def prev_numid(self):
return self.history["numids"][-1]
def prev_indent(self):
return self.history["indents"][-1]
def get_level(self) -> int:
"""Return the first None index."""
for k, v in self.parents.items():
if k >= 0 and v == None:
return k
return 0
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
for element in body:
tag_name = etree.QName(element).localname
# Check for Inline Images (drawings or blip elements)
found_drawing = etree.ElementBase.xpath(
element, ".//w:drawing", namespaces=self.xml_namespaces
)
found_pict = etree.ElementBase.xpath(
element, ".//w:pict", namespaces=self.xml_namespaces
)
# Check for Tables
if element.tag.endswith("tbl"):
try:
self.handle_tables(element, docx_obj, doc)
except Exception:
_log.debug("could not parse a table, broken docx table")
elif found_drawing or found_pict:
self.handle_pictures(element, docx_obj, doc)
# Check for Text
elif tag_name in ["p"]:
self.handle_text_elements(element, docx_obj, doc)
else:
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
return doc
def str_to_int(self, s, default=0):
if s is None:
return None
try:
return int(s)
except ValueError:
return default
def get_numId_and_ilvl(self, paragraph):
# Access the XML element of the paragraph
numPr = paragraph._element.find(
".//w:numPr", namespaces=paragraph._element.nsmap
)
if numPr is not None:
# Get the numId element and extract the value
numId_elem = numPr.find("w:numId", namespaces=paragraph._element.nsmap)
ilvl_elem = numPr.find("w:ilvl", namespaces=paragraph._element.nsmap)
numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None
ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None
return self.str_to_int(numId, default=None), self.str_to_int(
ilvl, default=None
)
return None, None # If the paragraph is not part of a list
def get_label_and_level(self, paragraph):
if paragraph.style is None:
return "Normal", None
label = paragraph.style.name
if label is None:
return "Normal", None
if ":" in label:
parts = label.split(":")
if len(parts) == 2:
return parts[0], int(parts[1])
parts = label.split(" ")
if "Heading" in label and len(parts) == 2:
parts.sort()
label_str = ""
label_level = 0
if parts[0] == "Heading":
# print("{} - {}".format(parts[0], parts[1]))
label_str = parts[0]
label_level = self.str_to_int(parts[1], default=None)
if parts[1] == "Heading":
label_str = parts[1]
label_level = self.str_to_int(parts[0], default=None)
return label_str, label_level
else:
return label, None
def handle_text_elements(self, element, docx_obj, doc):
paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
if paragraph.text is None:
# _log.warn(f"paragraph has text==None")
return
text = paragraph.text.strip()
# if len(text)==0 # keep empty paragraphs, they seperate adjacent lists!
# Common styles for bullet and numbered lists.
# "List Bullet", "List Number", "List Paragraph"
# TODO: reliably identify wether list is a numbered list or not
# is_numbered = "List Bullet" not in paragraph.style.name
is_numbered = False
p_style_name, p_level = self.get_label_and_level(paragraph)
numid, ilevel = self.get_numId_and_ilvl(paragraph)
# print("numid: {}, ilevel: {}, text: {}".format(numid, ilevel, text))
if numid == 0:
numid = None
# Handle lists
if numid is not None and ilevel is not None:
self.add_listitem(
element,
docx_obj,
doc,
p_style_name,
p_level,
numid,
ilevel,
text,
is_numbered,
)
self.update_history(p_style_name, p_level, numid, ilevel)
return
elif numid is None and self.prev_numid() is not None: # Close list
for key, val in self.parents.items():
if key >= self.level_at_new_list:
self.parents[key] = None
self.level = self.level_at_new_list - 1
self.level_at_new_list = None
if p_style_name in ["Title"]:
for key, val in self.parents.items():
self.parents[key] = None
self.parents[0] = doc.add_text(
parent=None, label=DocItemLabel.TITLE, text=text
)
elif "Heading" in p_style_name:
self.add_header(element, docx_obj, doc, p_style_name, p_level, text)
elif p_style_name in [
"Paragraph",
"Normal",
"Subtitle",
"Author",
"Default Text",
"List Paragraph",
"List Bullet",
"Quote",
]:
level = self.get_level()
doc.add_text(
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
)
else:
# Text style names can, and will have, not only default values but user values too
# hence we treat all other labels as pure text
level = self.get_level()
doc.add_text(
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
)
self.update_history(p_style_name, p_level, numid, ilevel)
return
def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
level = self.get_level()
if isinstance(curr_level, int):
if curr_level == level:
self.parents[level] = doc.add_heading(
parent=self.parents[level - 1], text=text
)
elif curr_level > level:
# add invisible group
for i in range(level, curr_level):
self.parents[i] = doc.add_group(
parent=self.parents[i - 1],
label=GroupLabel.SECTION,
name=f"header-{i}",
)
self.parents[curr_level] = doc.add_heading(
parent=self.parents[curr_level - 1], text=text
)
elif curr_level < level:
# remove the tail
for key, val in self.parents.items():
if key >= curr_level:
self.parents[key] = None
self.parents[curr_level] = doc.add_heading(
parent=self.parents[curr_level - 1], text=text
)
else:
self.parents[self.level] = doc.add_heading(
parent=self.parents[self.level - 1], text=text
)
return
def add_listitem(
self,
element,
docx_obj,
doc,
p_style_name,
p_level,
numid,
ilevel,
text: str,
is_numbered=False,
):
# is_numbered = is_numbered
enum_marker = ""
level = self.get_level()
if self.prev_numid() is None: # Open new list
self.level_at_new_list = level # type: ignore
self.parents[level] = doc.add_group(
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
)
# TODO: Set marker and enumerated arguments if this is an enumeration element.
self.listIter += 1
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=self.parents[level],
text=text,
)
elif (
self.prev_numid() == numid and self.prev_indent() < ilevel
): # Open indented list
for i in range(
self.level_at_new_list + self.prev_indent() + 1,
self.level_at_new_list + ilevel + 1,
):
# TODO: determine if this is an unordered list or an ordered list.
# Set GroupLabel.ORDERED_LIST when it fits.
self.listIter = 0
if is_numbered:
self.parents[i] = doc.add_group(
label=GroupLabel.ORDERED_LIST,
name="list",
parent=self.parents[i - 1],
)
else:
self.parents[i] = doc.add_group(
label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
)
# TODO: Set marker and enumerated arguments if this is an enumeration element.
self.listIter += 1
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=self.parents[self.level_at_new_list + ilevel],
text=text,
)
elif self.prev_numid() == numid and ilevel < self.prev_indent(): # Close list
for k, v in self.parents.items():
if k > self.level_at_new_list + ilevel:
self.parents[k] = None
# TODO: Set marker and enumerated arguments if this is an enumeration element.
self.listIter += 1
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=self.parents[self.level_at_new_list + ilevel],
text=text,
)
self.listIter = 0
elif self.prev_numid() == numid or self.prev_indent() == ilevel:
# TODO: Set marker and enumerated arguments if this is an enumeration element.
self.listIter += 1
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=self.parents[level - 1],
text=text,
)
return
def handle_tables(self, element, docx_obj, doc):
# Function to check if a cell has a colspan (gridSpan)
def get_colspan(cell):
grid_span = cell._element.xpath("@w:gridSpan")
if grid_span:
return int(grid_span[0]) # Return the number of columns spanned
return 1 # Default is 1 (no colspan)
# Function to check if a cell has a rowspan (vMerge)
def get_rowspan(cell):
v_merge = cell._element.xpath("@w:vMerge")
if v_merge:
return v_merge[
0
] # 'restart' indicates the beginning of a rowspan, others are continuation
return 1
table = docx.table.Table(element, docx_obj)
num_rows = len(table.rows)
num_cols = 0
for row in table.rows:
# Calculate the max number of columns
num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells))
# if row.cells:
# num_cols = max(num_cols, len(row.cells))
# Initialize the table grid
table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
for row_idx, row in enumerate(table.rows):
col_idx = 0
for c, cell in enumerate(row.cells):
row_span = get_rowspan(cell)
col_span = get_colspan(cell)
# Find the next available column in the grid
while table_grid[row_idx][col_idx] is not None:
col_idx += 1
# Fill the grid with the cell value, considering rowspan and colspan
for i in range(row_span if row_span == "restart" else 1):
for j in range(col_span):
table_grid[row_idx + i][col_idx + j] = ""
cell = TableCell(
text=cell.text,
row_span=row_span,
col_span=col_span,
start_row_offset_idx=row_idx,
end_row_offset_idx=row_idx + row_span,
start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + col_span,
col_header=False, # col_header,
row_header=False, # ((not col_header) and html_cell.name=='th')
)
data.table_cells.append(cell)
level = self.get_level()
doc.add_table(data=data, parent=self.parents[level - 1])
return
def handle_pictures(self, element, docx_obj, doc):
doc.add_picture(parent=self.parents[self.level], caption=None)
return

View File

@@ -0,0 +1,78 @@
from abc import ABC, abstractmethod
from io import BytesIO
from pathlib import Path
from typing import Iterable, Optional, Set, Union
from docling_core.types.doc import BoundingBox, Size
from PIL import Image
from docling.backend.abstract_backend import PaginatedDocumentBackend
from docling.datamodel.base_models import Cell, InputFormat
from docling.datamodel.document import InputDocument
class PdfPageBackend(ABC):
@abstractmethod
def get_text_in_rect(self, bbox: BoundingBox) -> str:
pass
@abstractmethod
def get_text_cells(self) -> Iterable[Cell]:
pass
@abstractmethod
def get_bitmap_rects(self, float: int = 1) -> Iterable[BoundingBox]:
pass
@abstractmethod
def get_page_image(
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
) -> Image.Image:
pass
@abstractmethod
def get_size(self) -> Size:
pass
@abstractmethod
def is_valid(self) -> bool:
pass
@abstractmethod
def unload(self):
pass
class PdfDocumentBackend(PaginatedDocumentBackend):
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
if self.input_format is not InputFormat.PDF:
if self.input_format is InputFormat.IMAGE:
buf = BytesIO()
img = Image.open(self.path_or_stream)
img.save(buf, "PDF")
buf.seek(0)
self.path_or_stream = buf
else:
raise RuntimeError(
f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend."
)
@abstractmethod
def load_page(self, page_no: int) -> PdfPageBackend:
pass
@abstractmethod
def page_count(self) -> int:
pass
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
return {InputFormat.PDF}
@classmethod
def supports_pagination(cls) -> bool:
return True

View File

@@ -2,16 +2,20 @@ import logging
import random
from io import BytesIO
from pathlib import Path
from typing import Iterable, List, Optional, Union
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage, PdfTextPage
from pypdfium2 import PdfTextPage
from pypdfium2._helpers.misc import PdfiumError
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell
if TYPE_CHECKING:
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
@@ -222,8 +226,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
return image
def get_size(self) -> PageSize:
return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
def get_size(self) -> Size:
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
def unload(self):
self._ppage = None
@@ -231,13 +235,14 @@ class PyPdfiumPageBackend(PdfPageBackend):
class PyPdfiumDocumentBackend(PdfDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream, document_hash)
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
try:
self._pdoc = pdfium.PdfDocument(path_or_stream)
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
except PdfiumError as e:
raise RuntimeError(
f"pypdfium could not load document {document_hash}"
f"pypdfium could not load document with hash {self.document_hash}"
) from e
def page_count(self) -> int:

View File

@@ -5,22 +5,27 @@ import time
import warnings
from enum import Enum
from pathlib import Path
from typing import Annotated, Iterable, List, Optional
from typing import Annotated, Dict, Iterable, List, Optional
import typer
from docling_core.utils.file import resolve_file_source
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.base_models import (
ConversionStatus,
FormatToExtensions,
InputFormat,
OutputFormat,
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
PipelineOptions,
OcrOptions,
PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -87,28 +92,28 @@ def export_documents(
fname = output_dir / f"{doc_filename}.json"
with fname.open("w") as fp:
_log.info(f"writing JSON output to {fname}")
fp.write(json.dumps(conv_res.render_as_dict()))
fp.write(json.dumps(conv_res.document.export_to_dict()))
# Export Text format:
if export_txt:
fname = output_dir / f"{doc_filename}.txt"
with fname.open("w") as fp:
_log.info(f"writing Text output to {fname}")
fp.write(conv_res.render_as_text())
fp.write(conv_res.document.export_to_markdown(strict_text=True))
# Export Markdown format:
if export_md:
fname = output_dir / f"{doc_filename}.md"
with fname.open("w") as fp:
_log.info(f"writing Markdown output to {fname}")
fp.write(conv_res.render_as_markdown())
fp.write(conv_res.document.export_to_markdown())
# Export Document Tags format:
if export_doctags:
fname = output_dir / f"{doc_filename}.doctags"
with fname.open("w") as fp:
_log.info(f"writing Doc Tags output to {fname}")
fp.write(conv_res.render_as_doctags())
fp.write(conv_res.document.export_to_document_tokens())
else:
_log.warning(f"Document {conv_res.input.file} failed to convert.")
@@ -129,44 +134,31 @@ def convert(
help="PDF files to convert. Can be local file / directory paths or URL.",
),
],
export_json: Annotated[
bool,
typer.Option(
..., "--json/--no-json", help="If enabled the document is exported as JSON."
),
] = False,
export_md: Annotated[
bool,
typer.Option(
..., "--md/--no-md", help="If enabled the document is exported as Markdown."
),
] = True,
export_txt: Annotated[
bool,
typer.Option(
..., "--txt/--no-txt", help="If enabled the document is exported as Text."
),
] = False,
export_doctags: Annotated[
bool,
typer.Option(
...,
"--doctags/--no-doctags",
help="If enabled the document is exported as Doc Tags.",
),
] = False,
from_formats: List[InputFormat] = typer.Option(
None,
"--from",
help="Specify input formats to convert from. Defaults to all formats.",
),
to_formats: List[OutputFormat] = typer.Option(
None, "--to", help="Specify output formats. Defaults to Markdown."
),
ocr: Annotated[
bool,
typer.Option(
..., help="If enabled, the bitmap content will be processed using OCR."
),
] = True,
backend: Annotated[
Backend, typer.Option(..., help="The PDF backend to use.")
] = Backend.DOCLING,
ocr_engine: Annotated[
OcrEngine, typer.Option(..., help="The OCR engine to use.")
] = OcrEngine.EASYOCR,
abort_on_error: Annotated[
bool,
typer.Option(
...,
"--abort-on-error/--no-abort-on-error",
help="If enabled, the bitmap content will be processed using OCR.",
),
] = False,
output: Annotated[
Path, typer.Option(..., help="Output directory where results are saved.")
] = Path("."),
@@ -182,6 +174,9 @@ def convert(
):
logging.basicConfig(level=logging.INFO)
if from_formats is None:
from_formats = [e for e in InputFormat]
input_doc_paths: List[Path] = []
for src in input_sources:
source = resolve_file_source(source=src)
@@ -191,48 +186,54 @@ def convert(
)
raise typer.Abort()
elif source.is_dir():
input_doc_paths.extend(list(source.glob("**/*.pdf")))
input_doc_paths.extend(list(source.glob("**/*.PDF")))
for fmt in from_formats:
for ext in FormatToExtensions[fmt]:
input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
else:
input_doc_paths.append(source)
match backend:
case Backend.PYPDFIUM2:
do_cell_matching = ocr # only do cell matching when OCR enabled
pdf_backend = PyPdfiumDocumentBackend
case Backend.DOCLING:
do_cell_matching = True
pdf_backend = DoclingParseDocumentBackend
case _:
raise RuntimeError(f"Unexpected backend type {backend}")
if to_formats is None:
to_formats = [OutputFormat.MARKDOWN]
export_json = OutputFormat.JSON in to_formats
export_md = OutputFormat.MARKDOWN in to_formats
export_txt = OutputFormat.TEXT in to_formats
export_doctags = OutputFormat.DOCTAGS in to_formats
match ocr_engine:
case OcrEngine.EASYOCR:
ocr_options = EasyOcrOptions()
ocr_options: OcrOptions = EasyOcrOptions()
case OcrEngine.TESSERACT_CLI:
ocr_options = TesseractCliOcrOptions()
case OcrEngine.TESSERACT:
ocr_options = TesseractOcrOptions()
case _:
raise RuntimeError(f"Unexpected backend type {backend}")
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
pipeline_options = PipelineOptions(
pipeline_options = PdfPipelineOptions(
do_ocr=ocr,
ocr_options=ocr_options,
do_table_structure=True,
)
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
doc_converter = DocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=pdf_backend,
)
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
# Define input files
input = DocumentConversionInput.from_paths(input_doc_paths)
format_options: Dict[InputFormat, FormatOption] = {
InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options,
backend=DoclingParseDocumentBackend, # pdf_backend
)
}
doc_converter = DocumentConverter(
allowed_formats=from_formats,
format_options=format_options,
)
start_time = time.time()
conv_results = doc_converter.convert(input)
conv_results = doc_converter.convert_all(
input_doc_paths, raises_on_error=abort_on_error
)
output.mkdir(parents=True, exist_ok=True)
export_documents(

View File

@@ -1,18 +1,19 @@
import copy
import warnings
from enum import Enum, auto
from io import BytesIO
from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
from PIL.Image import Image
from pydantic import BaseModel, ConfigDict, Field, model_validator
from typing_extensions import Self
from docling.backend.abstract_backend import PdfPageBackend
from docling.datamodel.pipeline_options import ( # Must be imported here for backward compatibility.
PipelineOptions,
TableStructureOptions,
from docling_core.types.doc import (
BoundingBox,
DocItemLabel,
PictureDataType,
Size,
TableCell,
)
from PIL.Image import Image
from pydantic import BaseModel, ConfigDict
if TYPE_CHECKING:
from docling.backend.pdf_backend import PdfPageBackend
class ConversionStatus(str, Enum):
@@ -23,18 +24,61 @@ class ConversionStatus(str, Enum):
PARTIAL_SUCCESS = auto()
class InputFormat(str, Enum):
DOCX = "docx"
PPTX = "pptx"
HTML = "html"
IMAGE = "image"
PDF = "pdf"
class OutputFormat(str, Enum):
MARKDOWN = "md"
JSON = "json"
TEXT = "text"
DOCTAGS = "doctags"
FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
InputFormat.PDF: ["pdf"],
InputFormat.HTML: ["html", "htm", "xhtml"],
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
}
FormatToMimeType: Dict[InputFormat, Set[str]] = {
InputFormat.DOCX: {
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
},
InputFormat.PPTX: {
"application/vnd.openxmlformats-officedocument.presentationml.template",
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
},
InputFormat.HTML: {"text/html", "application/xhtml+xml"},
InputFormat.IMAGE: {
"image/png",
"image/jpeg",
"image/tiff",
"image/gif",
"image/bmp",
},
InputFormat.PDF: {"application/pdf"},
}
MimeTypeToFormat = {
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
}
class DocInputType(str, Enum):
PATH = auto()
STREAM = auto()
class CoordOrigin(str, Enum):
TOPLEFT = auto()
BOTTOMLEFT = auto()
class DoclingComponentType(str, Enum):
PDF_BACKEND = auto()
DOCUMENT_BACKEND = auto()
MODEL = auto()
DOC_ASSEMBLER = auto()
@@ -45,118 +89,6 @@ class ErrorItem(BaseModel):
error_message: str
class PageSize(BaseModel):
width: float = 0.0
height: float = 0.0
class BoundingBox(BaseModel):
l: float # left
t: float # top
r: float # right
b: float # bottom
coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
@property
def width(self):
return self.r - self.l
@property
def height(self):
return abs(self.t - self.b)
def scaled(self, scale: float) -> "BoundingBox":
out_bbox = copy.deepcopy(self)
out_bbox.l *= scale
out_bbox.r *= scale
out_bbox.t *= scale
out_bbox.b *= scale
return out_bbox
def normalized(self, page_size: PageSize) -> "BoundingBox":
out_bbox = copy.deepcopy(self)
out_bbox.l /= page_size.width
out_bbox.r /= page_size.width
out_bbox.t /= page_size.height
out_bbox.b /= page_size.height
return out_bbox
def as_tuple(self):
if self.coord_origin == CoordOrigin.TOPLEFT:
return (self.l, self.t, self.r, self.b)
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
return (self.l, self.b, self.r, self.t)
@classmethod
def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
if origin == CoordOrigin.TOPLEFT:
l, t, r, b = coord[0], coord[1], coord[2], coord[3]
if r < l:
l, r = r, l
if b < t:
b, t = t, b
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
elif origin == CoordOrigin.BOTTOMLEFT:
l, b, r, t = coord[0], coord[1], coord[2], coord[3]
if r < l:
l, r = r, l
if b > t:
b, t = t, b
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
def area(self) -> float:
area = (self.r - self.l) * (self.b - self.t)
if self.coord_origin == CoordOrigin.BOTTOMLEFT:
area = -area
return area
def intersection_area_with(self, other: "BoundingBox") -> float:
# Calculate intersection coordinates
left = max(self.l, other.l)
top = max(self.t, other.t)
right = min(self.r, other.r)
bottom = min(self.b, other.b)
# Calculate intersection dimensions
width = right - left
height = bottom - top
# If the bounding boxes do not overlap, width or height will be negative
if width <= 0 or height <= 0:
return 0.0
return width * height
def to_bottom_left_origin(self, page_height) -> "BoundingBox":
if self.coord_origin == CoordOrigin.BOTTOMLEFT:
return self
elif self.coord_origin == CoordOrigin.TOPLEFT:
return BoundingBox(
l=self.l,
r=self.r,
t=page_height - self.t,
b=page_height - self.b,
coord_origin=CoordOrigin.BOTTOMLEFT,
)
def to_top_left_origin(self, page_height):
if self.coord_origin == CoordOrigin.TOPLEFT:
return self
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
return BoundingBox(
l=self.l,
r=self.r,
t=page_height - self.t, # self.b
b=page_height - self.b, # self.t
coord_origin=CoordOrigin.TOPLEFT,
)
class Cell(BaseModel):
id: int
text: str
@@ -169,14 +101,14 @@ class OcrCell(Cell):
class Cluster(BaseModel):
id: int
label: str
label: DocItemLabel
bbox: BoundingBox
confidence: float = 1.0
cells: List[Cell] = []
class BasePageElement(BaseModel):
label: str
label: DocItemLabel
id: int
page_no: int
cluster: Cluster
@@ -187,37 +119,7 @@ class LayoutPrediction(BaseModel):
clusters: List[Cluster] = []
class TableCell(BaseModel):
bbox: BoundingBox
row_span: int
col_span: int
start_row_offset_idx: int
end_row_offset_idx: int
start_col_offset_idx: int
end_col_offset_idx: int
text: str
column_header: bool = False
row_header: bool = False
row_section: bool = False
@model_validator(mode="before")
@classmethod
def from_dict_format(cls, data: Any) -> Any:
if isinstance(data, Dict):
text = data["bbox"].get("token", "")
if not len(text):
text_cells = data.pop("text_cell_bboxes", None)
if text_cells:
for el in text_cells:
text += el["token"] + " "
text = text.strip()
data["text"] = text
return data
class TableElement(BasePageElement):
class Table(BasePageElement):
otsl_seq: List[str]
num_rows: int = 0
num_cols: int = 0
@@ -225,18 +127,15 @@ class TableElement(BasePageElement):
class TableStructurePrediction(BaseModel):
table_map: Dict[int, TableElement] = {}
table_map: Dict[int, Table] = {}
class TextElement(BasePageElement): ...
class FigureData(BaseModel):
pass
class TextElement(BasePageElement):
text: str
class FigureElement(BasePageElement):
data: Optional[FigureData] = None
annotations: List[PictureDataType] = []
provenance: Optional[str] = None
predicted_class: Optional[str] = None
confidence: Optional[float] = None
@@ -259,7 +158,7 @@ class PagePredictions(BaseModel):
equations_prediction: Optional[EquationPrediction] = None
PageElement = Union[TextElement, TableElement, FigureElement]
PageElement = Union[TextElement, Table, FigureElement]
class AssembledUnit(BaseModel):
@@ -272,13 +171,13 @@ class Page(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)
page_no: int
page_hash: Optional[str] = None
size: Optional[PageSize] = None
# page_hash: Optional[str] = None
size: Optional[Size] = None
cells: List[Cell] = []
predictions: PagePredictions = PagePredictions()
assembled: Optional[AssembledUnit] = None
_backend: Optional[PdfPageBackend] = (
_backend: Optional["PdfPageBackend"] = (
None # Internal PDF backend. By default it is cleared during assembling.
)
_default_image_scale: float = 1.0 # Default image scale for external usage.
@@ -301,24 +200,5 @@ class Page(BaseModel):
class DocumentStream(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)
filename: str
name: str
stream: BytesIO
class AssembleOptions(BaseModel):
keep_page_images: Annotated[
bool,
Field(
deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead"
),
] = False # False: page images are removed in the assemble step
images_scale: Optional[float] = None # if set, the scale for generated images
@model_validator(mode="after")
def set_page_images_from_deprecated(self) -> Self:
with warnings.catch_warnings():
warnings.simplefilter("ignore", DeprecationWarning)
default_scale = 1.0
if self.keep_page_images and self.images_scale is None:
self.images_scale = default_scale
return self

View File

@@ -1,87 +1,101 @@
import logging
import re
from enum import Enum
from io import BytesIO
from pathlib import Path, PurePath
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
from docling_core.types import BaseCell, BaseText
import filetype
from docling_core.types import BaseText
from docling_core.types import Document as DsDocument
from docling_core.types import DocumentDescription as DsDocumentDescription
from docling_core.types import FileInfoObject as DsFileInfoObject
from docling_core.types import PageDimensions, PageReference, Prov, Ref
from docling_core.types import Table as DsSchemaTable
from docling_core.types import TableCell
from docling_core.types.doc.base import BoundingBox as DsBoundingBox
from docling_core.types.doc.base import Figure
from docling_core.types.doc import (
DocItem,
DocItemLabel,
DoclingDocument,
PictureItem,
SectionHeaderItem,
TableItem,
TextItem,
)
from docling_core.types.doc.document import ListItem
from docling_core.types.legacy_doc.base import Figure, GlmTableCell, TableCell
from docling_core.utils.file import resolve_file_source
from pydantic import BaseModel
from typing_extensions import deprecated
from docling.backend.abstract_backend import PdfDocumentBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.abstract_backend import (
AbstractDocumentBackend,
PaginatedDocumentBackend,
)
from docling.datamodel.base_models import (
AssembledUnit,
ConversionStatus,
DocumentStream,
ErrorItem,
FigureElement,
InputFormat,
MimeTypeToFormat,
Page,
PageElement,
TableElement,
TextElement,
)
from docling.datamodel.settings import DocumentLimits
from docling.utils.utils import create_file_hash
from docling.utils.utils import create_file_hash, create_hash
if TYPE_CHECKING:
from docling.document_converter import FormatOption
_log = logging.getLogger(__name__)
layout_label_to_ds_type = {
"Title": "title",
"Document Index": "table-of-path_or_stream",
"Section-header": "subtitle-level-1",
"Checkbox-Selected": "checkbox-selected",
"Checkbox-Unselected": "checkbox-unselected",
"Caption": "caption",
"Page-header": "page-header",
"Page-footer": "page-footer",
"Footnote": "footnote",
"Table": "table",
"Formula": "equation",
"List-item": "paragraph",
"Code": "paragraph",
"Picture": "figure",
"Text": "paragraph",
DocItemLabel.TITLE: "title",
DocItemLabel.DOCUMENT_INDEX: "table-of-contents",
DocItemLabel.SECTION_HEADER: "subtitle-level-1",
DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
DocItemLabel.CAPTION: "caption",
DocItemLabel.PAGE_HEADER: "page-header",
DocItemLabel.PAGE_FOOTER: "page-footer",
DocItemLabel.FOOTNOTE: "footnote",
DocItemLabel.TABLE: "table",
DocItemLabel.FORMULA: "equation",
DocItemLabel.LIST_ITEM: "paragraph",
DocItemLabel.CODE: "paragraph",
DocItemLabel.PICTURE: "figure",
DocItemLabel.TEXT: "paragraph",
DocItemLabel.PARAGRAPH: "paragraph",
}
_EMPTY_DOC = DsDocument(
_name="",
description=DsDocumentDescription(logs=[]),
file_info=DsFileInfoObject(
filename="",
document_hash="",
),
)
_EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
class InputDocument(BaseModel):
file: PurePath = None
document_hash: Optional[str] = None
valid: bool = False
file: PurePath
document_hash: str # = None
valid: bool = True
limits: DocumentLimits = DocumentLimits()
format: InputFormat # = None
filesize: Optional[int] = None
page_count: Optional[int] = None
page_count: int = 0
_backend: PdfDocumentBackend = None # Internal PDF backend used
_backend: AbstractDocumentBackend # Internal PDF backend used
def __init__(
self,
path_or_stream: Union[BytesIO, Path],
format: InputFormat,
backend: Type[AbstractDocumentBackend],
filename: Optional[str] = None,
limits: Optional[DocumentLimits] = None,
pdf_backend=DoclingParseDocumentBackend,
):
super().__init__()
super().__init__(
file="", document_hash="", format=InputFormat.PDF
) # initialize with dummy values
self.limits = limits or DocumentLimits()
self.format = format
try:
if isinstance(path_or_stream, Path):
@@ -91,11 +105,12 @@ class InputDocument(BaseModel):
self.valid = False
else:
self.document_hash = create_file_hash(path_or_stream)
self._backend = pdf_backend(
path_or_stream=path_or_stream, document_hash=self.document_hash
)
self._init_doc(backend, path_or_stream)
elif isinstance(path_or_stream, BytesIO):
assert (
filename is not None
), "Can't construct InputDocument from stream without providing filename arg."
self.file = PurePath(filename)
self.filesize = path_or_stream.getbuffer().nbytes
@@ -103,15 +118,20 @@ class InputDocument(BaseModel):
self.valid = False
else:
self.document_hash = create_file_hash(path_or_stream)
self._backend = pdf_backend(
path_or_stream=path_or_stream, document_hash=self.document_hash
)
self._init_doc(backend, path_or_stream)
else:
raise RuntimeError(
f"Unexpected type path_or_stream: {type(path_or_stream)}"
)
if self.document_hash and self._backend.page_count() > 0:
self.page_count = self._backend.page_count()
if self.page_count <= self.limits.max_num_pages:
self.valid = True
# For paginated backends, check if the maximum page count is exceeded.
if self.valid and self._backend.is_valid():
if self._backend.supports_pagination() and isinstance(
self._backend, PaginatedDocumentBackend
):
self.page_count = self._backend.page_count()
if not self.page_count <= self.limits.max_num_pages:
self.valid = False
except (FileNotFoundError, OSError) as e:
_log.exception(
@@ -125,9 +145,26 @@ class InputDocument(BaseModel):
)
# raise
def _init_doc(
self,
backend: Type[AbstractDocumentBackend],
path_or_stream: Union[BytesIO, Path],
) -> None:
if backend is None:
raise RuntimeError(
f"No backend configuration provided for file {self.file.name} with format {self.format}. "
f"Please check your format configuration on DocumentConverter."
)
@deprecated("Use `ConversionResult` instead.")
class ConvertedDocument(BaseModel):
self._backend = backend(self, path_or_stream=path_or_stream)
class DocumentFormat(str, Enum):
V2 = "v2"
V1 = "v1"
class ConversionResult(BaseModel):
input: InputDocument
status: ConversionStatus = ConversionStatus.PENDING # failure, success
@@ -136,15 +173,42 @@ class ConvertedDocument(BaseModel):
pages: List[Page] = []
assembled: AssembledUnit = AssembledUnit()
output: DsDocument = _EMPTY_DOC
document: DoclingDocument = _EMPTY_DOCLING_DOC
@property
@deprecated("Use document instead.")
def legacy_document(self):
reverse_label_mapping = {
DocItemLabel.CAPTION.value: "Caption",
DocItemLabel.FOOTNOTE.value: "Footnote",
DocItemLabel.FORMULA.value: "Formula",
DocItemLabel.LIST_ITEM.value: "List-item",
DocItemLabel.PAGE_FOOTER.value: "Page-footer",
DocItemLabel.PAGE_HEADER.value: "Page-header",
DocItemLabel.PICTURE.value: "Picture", # low threshold adjust to capture chemical structures for examples.
DocItemLabel.SECTION_HEADER.value: "Section-header",
DocItemLabel.TABLE.value: "Table",
DocItemLabel.TEXT.value: "Text",
DocItemLabel.TITLE.value: "Title",
DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
DocItemLabel.CODE.value: "Code",
DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
DocItemLabel.FORM.value: "Form",
DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
DocItemLabel.PARAGRAPH.value: "paragraph",
}
def _to_ds_document(self) -> DsDocument:
title = ""
desc = DsDocumentDescription(logs=[])
page_hashes = [
PageReference(hash=p.page_hash, page=p.page_no + 1, model="default")
for p in self.pages
PageReference(
hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
page=p.page_no,
model="default",
)
for p in self.document.pages.values()
]
file_info = DsFileInfoObject(
@@ -157,145 +221,199 @@ class ConvertedDocument(BaseModel):
main_text = []
tables = []
figures = []
equations = []
footnotes = []
page_headers = []
page_footers = []
page_no_to_page = {p.page_no: p for p in self.pages}
embedded_captions = set()
for ix, (item, level) in enumerate(
self.document.iterate_items(self.document.body)
):
for element in self.assembled.elements:
# Convert bboxes to lower-left origin.
target_bbox = DsBoundingBox(
element.cluster.bbox.to_bottom_left_origin(
page_no_to_page[element.page_no].size.height
).as_tuple()
)
if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
caption = item.caption_text(self.document)
if caption:
embedded_captions.add(caption)
if isinstance(element, TextElement):
main_text.append(
BaseText(
text=element.text,
obj_type=layout_label_to_ds_type.get(element.label),
name=element.label,
prov=[
Prov(
bbox=target_bbox,
page=element.page_no + 1,
span=[0, len(element.text)],
)
],
)
)
elif isinstance(element, TableElement):
index = len(tables)
ref_str = f"#/tables/{index}"
main_text.append(
Ref(
name=element.label,
obj_type=layout_label_to_ds_type.get(element.label),
ref=ref_str,
),
)
for item, level in self.document.iterate_items():
if isinstance(item, DocItem):
item_type = item.label
# Initialise empty table data grid (only empty cells)
table_data = [
[
TableCell(
text="",
# bbox=[0,0,0,0],
spans=[[i, j]],
obj_type="body",
if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
if isinstance(item, ListItem) and item.marker:
text = f"{item.marker} {item.text}"
else:
text = item.text
# Can be empty.
prov = [
Prov(
bbox=p.bbox.as_tuple(),
page=p.page_no,
span=[0, len(item.text)],
)
for j in range(element.num_cols)
for p in item.prov
]
for i in range(element.num_rows)
]
main_text.append(
BaseText(
text=text,
obj_type=layout_label_to_ds_type.get(item.label),
name=reverse_label_mapping[item.label],
prov=prov,
)
)
# Overwrite cells in table data for which there is actual cell content.
for cell in element.table_cells:
for i in range(
min(cell.start_row_offset_idx, element.num_rows),
min(cell.end_row_offset_idx, element.num_rows),
):
for j in range(
min(cell.start_col_offset_idx, element.num_cols),
min(cell.end_col_offset_idx, element.num_cols),
# skip captions of they are embedded in the actual
# floating object
if item_type == DocItemLabel.CAPTION and text in embedded_captions:
continue
elif isinstance(item, TableItem) and item.data:
index = len(tables)
ref_str = f"#/tables/{index}"
main_text.append(
Ref(
name=reverse_label_mapping[item.label],
obj_type=layout_label_to_ds_type.get(item.label),
ref=ref_str,
),
)
# Initialise empty table data grid (only empty cells)
table_data = [
[
TableCell(
text="",
# bbox=[0,0,0,0],
spans=[[i, j]],
obj_type="body",
)
for j in range(item.data.num_cols)
]
for i in range(item.data.num_rows)
]
# Overwrite cells in table data for which there is actual cell content.
for cell in item.data.table_cells:
for i in range(
min(cell.start_row_offset_idx, item.data.num_rows),
min(cell.end_row_offset_idx, item.data.num_rows),
):
celltype = "body"
if cell.column_header:
celltype = "col_header"
elif cell.row_header:
celltype = "row_header"
elif cell.row_section:
celltype = "row_section"
for j in range(
min(cell.start_col_offset_idx, item.data.num_cols),
min(cell.end_col_offset_idx, item.data.num_cols),
):
celltype = "body"
if cell.column_header:
celltype = "col_header"
elif cell.row_header:
celltype = "row_header"
elif cell.row_section:
celltype = "row_section"
def make_spans(cell):
for rspan in range(
min(cell.start_row_offset_idx, element.num_rows),
min(cell.end_row_offset_idx, element.num_rows),
):
for cspan in range(
def make_spans(cell):
for rspan in range(
min(
cell.start_col_offset_idx, element.num_cols
cell.start_row_offset_idx,
item.data.num_rows,
),
min(
cell.end_row_offset_idx, item.data.num_rows
),
min(cell.end_col_offset_idx, element.num_cols),
):
yield [rspan, cspan]
for cspan in range(
min(
cell.start_col_offset_idx,
item.data.num_cols,
),
min(
cell.end_col_offset_idx,
item.data.num_cols,
),
):
yield [rspan, cspan]
spans = list(make_spans(cell))
table_data[i][j] = TableCell(
text=cell.text,
bbox=cell.bbox.to_bottom_left_origin(
page_no_to_page[element.page_no].size.height
).as_tuple(),
# col=j,
# row=i,
spans=spans,
obj_type=celltype,
# col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
# row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
)
spans = list(make_spans(cell))
table_data[i][j] = GlmTableCell(
text=cell.text,
bbox=(
cell.bbox.as_tuple()
if cell.bbox is not None
else None
), # check if this is bottom-left
spans=spans,
obj_type=celltype,
col=j,
row=i,
row_header=cell.row_header,
row_section=cell.row_section,
col_header=cell.column_header,
row_span=[
cell.start_row_offset_idx,
cell.end_row_offset_idx,
],
col_span=[
cell.start_col_offset_idx,
cell.end_col_offset_idx,
],
)
tables.append(
DsSchemaTable(
num_cols=element.num_cols,
num_rows=element.num_rows,
obj_type=layout_label_to_ds_type.get(element.label),
data=table_data,
prov=[
Prov(
bbox=target_bbox,
page=element.page_no + 1,
span=[0, 0],
)
],
# Compute the caption
caption = item.caption_text(self.document)
tables.append(
DsSchemaTable(
text=caption,
num_cols=item.data.num_cols,
num_rows=item.data.num_rows,
obj_type=layout_label_to_ds_type.get(item.label),
data=table_data,
prov=[
Prov(
bbox=p.bbox.as_tuple(),
page=p.page_no,
span=[0, 0],
)
for p in item.prov
],
)
)
)
elif isinstance(element, FigureElement):
index = len(figures)
ref_str = f"#/figures/{index}"
main_text.append(
Ref(
name=element.label,
obj_type=layout_label_to_ds_type.get(element.label),
ref=ref_str,
),
)
figures.append(
Figure(
prov=[
Prov(
bbox=target_bbox,
page=element.page_no + 1,
span=[0, 0],
)
],
obj_type=layout_label_to_ds_type.get(element.label),
# data=[[]],
elif isinstance(item, PictureItem):
index = len(figures)
ref_str = f"#/figures/{index}"
main_text.append(
Ref(
name=reverse_label_mapping[item.label],
obj_type=layout_label_to_ds_type.get(item.label),
ref=ref_str,
),
)
# Compute the caption
caption = item.caption_text(self.document)
figures.append(
Figure(
prov=[
Prov(
bbox=p.bbox.as_tuple(),
page=p.page_no,
span=[0, len(caption)],
)
for p in item.prov
],
obj_type=layout_label_to_ds_type.get(item.label),
text=caption,
# data=[[]],
)
)
)
page_dimensions = [
PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
for p in self.pages
PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
for p in self.document.pages.values()
]
ds_doc = DsDocument(
@@ -303,6 +421,10 @@ class ConvertedDocument(BaseModel):
description=desc,
file_info=file_info,
main_text=main_text,
equations=equations,
footnotes=footnotes,
page_headers=page_headers,
page_footers=page_footers,
tables=tables,
figures=figures,
page_dimensions=page_dimensions,
@@ -310,152 +432,76 @@ class ConvertedDocument(BaseModel):
return ds_doc
def render_as_dict(self):
return self.output.model_dump(by_alias=True, exclude_none=True)
def render_as_markdown(
self,
delim: str = "\n\n",
main_text_start: int = 0,
main_text_stop: Optional[int] = None,
main_text_labels: list[str] = [
"title",
"subtitle-level-1",
"paragraph",
"caption",
"table",
"figure",
],
strict_text: bool = False,
image_placeholder: str = "<!-- image -->",
):
return self.output.export_to_markdown(
delim=delim,
main_text_start=main_text_start,
main_text_stop=main_text_stop,
main_text_labels=main_text_labels,
strict_text=strict_text,
image_placeholder=image_placeholder,
)
class _DocumentConversionInput(BaseModel):
def render_as_text(
self,
delim: str = "\n\n",
main_text_start: int = 0,
main_text_stop: Optional[int] = None,
main_text_labels: list[str] = [
"title",
"subtitle-level-1",
"paragraph",
"caption",
],
):
return self.output.export_to_markdown(
delim=delim,
main_text_start=main_text_start,
main_text_stop=main_text_stop,
main_text_labels=main_text_labels,
strict_text=True,
)
def render_as_doctags(
self,
delim: str = "\n\n",
main_text_start: int = 0,
main_text_stop: Optional[int] = None,
main_text_labels: list[str] = [
"title",
"subtitle-level-1",
"paragraph",
"caption",
"table",
"figure",
],
xsize: int = 100,
ysize: int = 100,
add_location: bool = True,
add_content: bool = True,
add_page_index: bool = True,
# table specific flags
add_table_cell_location: bool = False,
add_table_cell_label: bool = True,
add_table_cell_text: bool = True,
) -> str:
return self.output.export_to_document_tokens(
delim=delim,
main_text_start=main_text_start,
main_text_stop=main_text_stop,
main_text_labels=main_text_labels,
xsize=xsize,
ysize=ysize,
add_location=add_location,
add_content=add_content,
add_page_index=add_page_index,
# table specific flags
add_table_cell_location=add_table_cell_location,
add_table_cell_label=add_table_cell_label,
add_table_cell_text=add_table_cell_text,
)
def render_element_images(
self, element_types: Tuple[PageElement] = (FigureElement,)
):
for element in self.assembled.elements:
if isinstance(element, element_types):
page_ix = element.page_no
scale = self.pages[page_ix]._default_image_scale
crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
page_height=self.pages[page_ix].size.height * scale
)
cropped_im = self.pages[page_ix].image.crop(crop_bbox.as_tuple())
yield element, cropped_im
class ConversionResult(ConvertedDocument):
pass
class DocumentConversionInput(BaseModel):
_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
limits: Optional[DocumentLimits] = DocumentLimits()
DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
def docs(
self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
self, format_options: Dict[InputFormat, "FormatOption"]
) -> Iterable[InputDocument]:
for item in self.path_or_stream_iterator:
obj = resolve_file_source(item) if isinstance(item, str) else item
format = self._guess_format(obj)
if format not in format_options.keys():
_log.info(
f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
)
continue
else:
backend = format_options[format].backend
pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND
for obj in self._path_or_stream_iterator:
if isinstance(obj, Path):
yield InputDocument(
path_or_stream=obj, limits=self.limits, pdf_backend=pdf_backend
path_or_stream=obj,
format=format,
filename=obj.name,
limits=self.limits,
backend=backend,
)
elif isinstance(obj, DocumentStream):
yield InputDocument(
path_or_stream=obj.stream,
filename=obj.filename,
format=format,
filename=obj.name,
limits=self.limits,
pdf_backend=pdf_backend,
backend=backend,
)
else:
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
@classmethod
def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
paths = [Path(p) for p in paths]
def _guess_format(self, obj):
content = None
if isinstance(obj, Path):
mime = filetype.guess_mime(str(obj))
if mime is None:
with obj.open("rb") as f:
content = f.read(1024) # Read first 1KB
doc_input = cls(limits=limits)
doc_input._path_or_stream_iterator = paths
elif isinstance(obj, DocumentStream):
obj.stream.seek(0)
content = obj.stream.read(8192)
obj.stream.seek(0)
mime = filetype.guess_mime(content)
return doc_input
if mime is None:
mime = self._detect_html_xhtml(content)
@classmethod
def from_streams(
cls, streams: Iterable[DocumentStream], limits: Optional[DocumentLimits] = None
):
doc_input = cls(limits=limits)
doc_input._path_or_stream_iterator = streams
format = MimeTypeToFormat.get(mime)
return format
return doc_input
def _detect_html_xhtml(self, content):
content_str = content.decode("ascii", errors="ignore").lower()
# Remove XML comments
content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
content_str = content_str.lstrip()
if re.match(r"<\?xml", content_str):
if "xhtml" in content_str[:1000]:
return "application/xhtml+xml"
if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
return "text/html"
return None

View File

@@ -1,4 +1,5 @@
from enum import Enum, auto
from pathlib import Path
from typing import List, Literal, Optional, Union
from pydantic import BaseModel, ConfigDict, Field
@@ -58,6 +59,13 @@ class TesseractOcrOptions(OcrOptions):
class PipelineOptions(BaseModel):
create_legacy_output: bool = (
True # This defautl will be set to False on a future version of docling
)
class PdfPipelineOptions(PipelineOptions):
artifacts_path: Optional[Union[Path, str]] = None
do_table_structure: bool = True # True: perform table structure extraction
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
@@ -65,3 +73,8 @@ class PipelineOptions(BaseModel):
ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
Field(EasyOcrOptions(), discriminator="kind")
)
images_scale: float = 1.0
generate_page_images: bool = False
generate_picture_images: bool = False
generate_table_images: bool = False

View File

@@ -14,6 +14,7 @@ class BatchConcurrencySettings(BaseModel):
doc_batch_concurrency: int = 2
page_batch_size: int = 4
page_batch_concurrency: int = 2
elements_batch_size: int = 16
# doc_batch_size: int = 1
# doc_batch_concurrency: int = 1

View File

@@ -1,84 +1,179 @@
import functools
import logging
import tempfile
import sys
import time
import traceback
from functools import partial
from pathlib import Path
from typing import Iterable, Optional, Type, Union
from typing import Dict, Iterable, Iterator, List, Optional, Type
import requests
from PIL import ImageDraw
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
from docling.backend.abstract_backend import PdfDocumentBackend
from docling.datamodel.base_models import (
AssembledUnit,
AssembleOptions,
ConversionStatus,
DoclingComponentType,
ErrorItem,
Page,
)
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
from docling.datamodel.document import (
ConversionResult,
DocumentConversionInput,
InputDocument,
_DocumentConversionInput,
)
from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import settings
from docling.models.ds_glm_model import GlmModel
from docling.models.page_assemble_model import PageAssembleModel
from docling.pipeline.base_model_pipeline import BaseModelPipeline
from docling.pipeline.standard_model_pipeline import StandardModelPipeline
from docling.utils.utils import chunkify, create_hash
from docling.datamodel.settings import DocumentLimits, settings
from docling.pipeline.base_pipeline import BasePipeline
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.utils.utils import chunkify
_log = logging.getLogger(__name__)
class FormatOption(BaseModel):
pipeline_cls: Type[BasePipeline]
pipeline_options: Optional[PipelineOptions] = None
backend: Type[AbstractDocumentBackend]
model_config = ConfigDict(arbitrary_types_allowed=True)
@model_validator(mode="after")
def set_optional_field_default(self) -> "FormatOption":
if self.pipeline_options is None:
self.pipeline_options = self.pipeline_cls.get_default_options()
return self
class WordFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
class PowerpointFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
class HTMLFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
class PdfFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
class ImageFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
_format_to_default_options = {
InputFormat.DOCX: FormatOption(
pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
),
InputFormat.PPTX: FormatOption(
pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
),
InputFormat.HTML: FormatOption(
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
),
InputFormat.IMAGE: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
),
InputFormat.PDF: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
),
}
class DocumentConverter:
_default_download_filename = "file.pdf"
_default_download_filename = "file"
def __init__(
self,
artifacts_path: Optional[Union[Path, str]] = None,
pipeline_options: PipelineOptions = PipelineOptions(),
pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
assemble_options: AssembleOptions = AssembleOptions(),
allowed_formats: Optional[List[InputFormat]] = None,
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
):
if not artifacts_path:
artifacts_path = self.download_models_hf()
self.allowed_formats = allowed_formats
self.format_to_options = format_options
artifacts_path = Path(artifacts_path)
if self.allowed_formats is None:
# if self.format_to_options is not None:
# self.allowed_formats = self.format_to_options.keys()
# else:
self.allowed_formats = [e for e in InputFormat] # all formats
self.model_pipeline = pipeline_cls(
artifacts_path=artifacts_path, pipeline_options=pipeline_options
if self.format_to_options is None:
self.format_to_options = _format_to_default_options
else:
for f in self.allowed_formats:
if f not in self.format_to_options.keys():
_log.debug(f"Requested format {f} will use default options.")
self.format_to_options[f] = _format_to_default_options[f]
remove_keys = []
for f in self.format_to_options.keys():
if f not in self.allowed_formats:
remove_keys.append(f)
for f in remove_keys:
self.format_to_options.pop(f)
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
@validate_call(config=ConfigDict(strict=True))
def convert(
self,
source: Path | str | DocumentStream, # TODO review naming
raises_on_error: bool = True,
max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize,
) -> ConversionResult:
all_res = self.convert_all(
source=[source],
raises_on_error=raises_on_error,
max_num_pages=max_num_pages,
max_file_size=max_file_size,
)
return next(all_res)
self.page_assemble_model = PageAssembleModel(config={})
self.glm_model = GlmModel(config={})
self.pdf_backend = pdf_backend
self.assemble_options = assemble_options
@staticmethod
def download_models_hf(
local_dir: Optional[Path] = None, force: bool = False
) -> Path:
from huggingface_hub import snapshot_download
download_path = snapshot_download(
repo_id="ds4sd/docling-models",
force_download=force,
local_dir=local_dir,
revision="v2.0.0",
@validate_call(config=ConfigDict(strict=True))
def convert_all(
self,
source: Iterable[Path | str | DocumentStream], # TODO review naming
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize,
) -> Iterator[ConversionResult]:
limits = DocumentLimits(
max_num_pages=max_num_pages,
max_file_size=max_file_size,
)
conv_input = _DocumentConversionInput(
path_or_stream_iterator=source,
limit=limits,
)
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
for conv_res in conv_res_iter:
if raises_on_error and conv_res.status not in {
ConversionStatus.SUCCESS,
ConversionStatus.PARTIAL_SUCCESS,
}:
raise RuntimeError(
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
)
else:
yield conv_res
return Path(download_path)
def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
def _convert(
self, conv_input: _DocumentConversionInput, raises_on_error: bool
) -> Iterator[ConversionResult]:
assert self.format_to_options is not None
for input_batch in chunkify(
input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
conv_input.docs(self.format_to_options),
settings.perf.doc_batch_size, # pass format_options
):
_log.info(f"Going to convert document batch...")
# parallel processing only within input_batch
@@ -87,211 +182,79 @@ class DocumentConverter:
# ) as pool:
# yield from pool.map(self.process_document, input_batch)
# Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
yield from map(self._process_document, input_batch)
# Note: PDF backends are not thread-safe, thread pool usage was disabled.
for item in map(
partial(self._process_document, raises_on_error=raises_on_error),
input_batch,
):
if item is not None:
yield item
def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
"""Convert a single document.
def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
assert self.format_to_options is not None
Args:
source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.
fopt = self.format_to_options.get(doc.format)
Raises:
ValueError: If source is of unexpected type.
RuntimeError: If conversion fails.
if fopt is None:
raise RuntimeError(f"Could not get pipeline for document {doc.file}")
else:
pipeline_class = fopt.pipeline_cls
pipeline_options = fopt.pipeline_options
Returns:
ConversionResult: The conversion result object.
"""
with tempfile.TemporaryDirectory() as temp_dir:
try:
http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
res = requests.get(http_url, stream=True)
res.raise_for_status()
fname = None
# try to get filename from response header
if cont_disp := res.headers.get("Content-Disposition"):
for par in cont_disp.strip().split(";"):
# currently only handling directive "filename" (not "*filename")
if (split := par.split("=")) and split[0].strip() == "filename":
fname = "=".join(split[1:]).strip().strip("'\"") or None
break
# otherwise, use name from URL:
if fname is None:
fname = Path(http_url.path).name or self._default_download_filename
local_path = Path(temp_dir) / fname
with open(local_path, "wb") as f:
for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks
f.write(chunk)
except ValidationError:
try:
local_path = TypeAdapter(Path).validate_python(source)
except ValidationError:
raise ValueError(
f"Unexpected file path type encountered: {type(source)}"
)
conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
conv_res_iter = self.convert(conv_inp)
conv_res: ConversionResult = next(conv_res_iter)
if conv_res.status not in {
ConversionStatus.SUCCESS,
ConversionStatus.PARTIAL_SUCCESS,
}:
raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
return conv_res
def _process_document(self, in_doc: InputDocument) -> ConversionResult:
start_doc_time = time.time()
conv_res = ConversionResult(input=in_doc)
_log.info(f"Processing document {in_doc.file.name}")
if not in_doc.valid:
conv_res.status = ConversionStatus.FAILURE
return conv_res
for i in range(0, in_doc.page_count):
conv_res.pages.append(Page(page_no=i))
all_assembled_pages = []
try:
# Iterate batches of pages (page_batch_size) in the doc
for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
start_pb_time = time.time()
# Pipeline
# 1. Initialise the page resources
init_pages = map(
functools.partial(self._initialize_page, in_doc), page_batch
)
# 2. Populate page image
pages_with_images = map(
functools.partial(self._populate_page_images, in_doc), init_pages
)
# 3. Populate programmatic page cells
pages_with_cells = map(
functools.partial(self._parse_page_cells, in_doc),
pages_with_images,
)
# 4. Run pipeline stages
pipeline_pages = self.model_pipeline.apply(pages_with_cells)
# 5. Assemble page elements (per page)
assembled_pages = self.page_assemble_model(pipeline_pages)
# exhaust assembled_pages
for assembled_page in assembled_pages:
# Free up mem resources before moving on with next batch
# Remove page images (can be disabled)
if self.assemble_options.images_scale is None:
assembled_page._image_cache = {}
# Unload backend
assembled_page._backend.unload()
all_assembled_pages.append(assembled_page)
end_pb_time = time.time() - start_pb_time
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")
conv_res.pages = all_assembled_pages
self._assemble_doc(conv_res)
status = ConversionStatus.SUCCESS
for page in conv_res.pages:
if not page._backend.is_valid():
conv_res.errors.append(
ErrorItem(
component_type=DoclingComponentType.PDF_BACKEND,
module_name=type(page._backend).__name__,
error_message=f"Page {page.page_no} failed to parse.",
)
)
status = ConversionStatus.PARTIAL_SUCCESS
conv_res.status = status
except Exception as e:
conv_res.status = ConversionStatus.FAILURE
trace = "\n".join(traceback.format_exception(e))
_log.info(
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
f"{trace}"
assert pipeline_options is not None
# TODO this will ignore if different options have been defined for the same pipeline class.
if (
pipeline_class not in self.initialized_pipelines
or self.initialized_pipelines[pipeline_class].pipeline_options
!= pipeline_options
):
self.initialized_pipelines[pipeline_class] = pipeline_class(
pipeline_options=pipeline_options
)
return self.initialized_pipelines[pipeline_class]
finally:
# Always unload the PDF backend, even in case of failure
if in_doc._backend:
in_doc._backend.unload()
def _process_document(
self, in_doc: InputDocument, raises_on_error: bool
) -> Optional[ConversionResult]:
assert self.allowed_formats is not None
assert in_doc.format in self.allowed_formats
start_doc_time = time.time()
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
end_doc_time = time.time() - start_doc_time
_log.info(
f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
)
return conv_res
# Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
def _initialize_page(self, doc: InputDocument, page: Page) -> Page:
page._backend = doc._backend.load_page(page.page_no)
page.size = page._backend.get_size()
page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
def _execute_pipeline(
self, in_doc: InputDocument, raises_on_error: bool
) -> ConversionResult:
if in_doc.valid:
pipeline = self._get_pipeline(in_doc)
if pipeline is None: # Can't find a default pipeline. Should this raise?
if raises_on_error:
raise RuntimeError(
f"No pipeline could be initialized for {in_doc.file}."
)
else:
conv_res = ConversionResult(input=in_doc)
conv_res.status = ConversionStatus.FAILURE
return conv_res
return page
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
# Generate the page image and store it in the page object
def _populate_page_images(self, doc: InputDocument, page: Page) -> Page:
# default scale
page.get_image(
scale=1.0
) # puts the page image on the image cache at default scale
else:
if raises_on_error:
raise RuntimeError(f"Input document {in_doc.file} is not valid.")
# user requested scales
if self.assemble_options.images_scale is not None:
page._default_image_scale = self.assemble_options.images_scale
page.get_image(
scale=self.assemble_options.images_scale
) # this will trigger storing the image in the internal cache
else:
# invalid doc or not of desired format
conv_res = ConversionResult(input=in_doc)
conv_res.status = ConversionStatus.FAILURE
# TODO add error log why it failed.
return page
# Extract and populate the page cells and store it in the page object
def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
page.cells = page._backend.get_text_cells()
# DEBUG code:
def draw_text_boxes(image, cells):
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
image.show()
# draw_text_boxes(page.get_image(scale=1.0), cells)
return page
def _assemble_doc(self, conv_res: ConversionResult):
all_elements = []
all_headers = []
all_body = []
for p in conv_res.pages:
for el in p.assembled.body:
all_body.append(el)
for el in p.assembled.headers:
all_headers.append(el)
for el in p.assembled.elements:
all_elements.append(el)
conv_res.assembled = AssembledUnit(
elements=all_elements, headers=all_headers, body=all_body
)
conv_res.output = self.glm_model(conv_res)
return conv_res

View File

@@ -0,0 +1,25 @@
from abc import ABC, abstractmethod
from typing import Any, Iterable
from docling_core.types.doc import DoclingDocument, NodeItem
from docling.datamodel.base_models import Page
class BasePageModel(ABC):
@abstractmethod
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
pass
class BaseEnrichmentModel(ABC):
@abstractmethod
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
pass
@abstractmethod
def __call__(
self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
) -> Iterable[Any]:
pass

View File

@@ -1,14 +1,15 @@
import copy
import logging
from abc import abstractmethod
from typing import Iterable, List, Tuple
from typing import Iterable, List
import numpy as np
from docling_core.types.doc import BoundingBox, CoordOrigin
from PIL import Image, ImageDraw
from rtree import index
from scipy.ndimage import find_objects, label
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.pipeline_options import OcrOptions
_log = logging.getLogger(__name__)
@@ -20,8 +21,9 @@ class BaseOcrModel:
self.options = options
# Computes the optimum amount and coordinates of rectangles to OCR on a given page
def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]:
def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
BITMAP_COVERAGE_TRESHOLD = 0.75
assert page.size is not None
def find_ocr_rects(size, bitmap_rects):
image = Image.new(
@@ -60,7 +62,10 @@ class BaseOcrModel:
return (area_frac, bounding_boxes) # fraction covered # boxes
bitmap_rects = page._backend.get_bitmap_rects()
if page._backend is not None:
bitmap_rects = page._backend.get_bitmap_rects()
else:
bitmap_rects = []
coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
# return full-page rectangle if sufficiently covered with bitmaps
@@ -75,7 +80,7 @@ class BaseOcrModel:
)
]
# return individual rectangles if the bitmap coverage is smaller
elif coverage < BITMAP_COVERAGE_TRESHOLD:
else: # coverage <= BITMAP_COVERAGE_TRESHOLD:
return ocr_rects
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.

View File

@@ -1,39 +1,228 @@
import copy
import random
from typing import List, Union
from deepsearch_glm.nlp_utils import init_nlp_model
from deepsearch_glm.utils.doc_utils import to_legacy_document_format
from deepsearch_glm.utils.doc_utils import to_docling_document
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
from docling_core.types import BaseText
from docling_core.types import Document as DsDocument
from docling_core.types import Ref
from docling_core.types import DocumentDescription as DsDocumentDescription
from docling_core.types import FileInfoObject as DsFileInfoObject
from docling_core.types import PageDimensions, PageReference, Prov, Ref
from docling_core.types import Table as DsSchemaTable
from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
from docling_core.types.legacy_doc.base import Figure, TableCell
from PIL import ImageDraw
from pydantic import BaseModel, ConfigDict
from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
from docling.datamodel.document import ConversionResult
from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
from docling.utils.utils import create_hash
class GlmOptions(BaseModel):
model_config = ConfigDict(protected_namespaces=())
model_names: str = "" # e.g. "language;term;reference"
class GlmModel:
def __init__(self, config):
self.config = config
self.model_names = self.config.get(
"model_names", ""
) # "language;term;reference"
load_pretrained_nlp_models()
# model = init_nlp_model(model_names="language;term;reference")
model = init_nlp_model(model_names=self.model_names)
self.model = model
def __init__(self, options: GlmOptions):
self.options = options
def __call__(self, conv_res: ConversionResult) -> DsDocument:
ds_doc = conv_res._to_ds_document()
load_pretrained_nlp_models()
self.model = init_nlp_model(model_names=self.options.model_names)
def _to_legacy_document(self, conv_res) -> DsDocument:
title = ""
desc: DsDocumentDescription = DsDocumentDescription(logs=[])
page_hashes = [
PageReference(
hash=create_hash(conv_res.input.document_hash + ":" + str(p.page_no)),
page=p.page_no + 1,
model="default",
)
for p in conv_res.pages
]
file_info = DsFileInfoObject(
filename=conv_res.input.file.name,
document_hash=conv_res.input.document_hash,
num_pages=conv_res.input.page_count,
page_hashes=page_hashes,
)
main_text: List[Union[Ref, BaseText]] = []
tables: List[DsSchemaTable] = []
figures: List[Figure] = []
page_no_to_page = {p.page_no: p for p in conv_res.pages}
for element in conv_res.assembled.elements:
# Convert bboxes to lower-left origin.
target_bbox = DsBoundingBox(
element.cluster.bbox.to_bottom_left_origin(
page_no_to_page[element.page_no].size.height
).as_tuple()
)
if isinstance(element, TextElement):
main_text.append(
BaseText(
text=element.text,
obj_type=layout_label_to_ds_type.get(element.label),
name=element.label,
prov=[
Prov(
bbox=target_bbox,
page=element.page_no + 1,
span=[0, len(element.text)],
)
],
)
)
elif isinstance(element, Table):
index = len(tables)
ref_str = f"#/tables/{index}"
main_text.append(
Ref(
name=element.label,
obj_type=layout_label_to_ds_type.get(element.label),
ref=ref_str,
),
)
# Initialise empty table data grid (only empty cells)
table_data = [
[
TableCell(
text="",
# bbox=[0,0,0,0],
spans=[[i, j]],
obj_type="body",
)
for j in range(element.num_cols)
]
for i in range(element.num_rows)
]
# Overwrite cells in table data for which there is actual cell content.
for cell in element.table_cells:
for i in range(
min(cell.start_row_offset_idx, element.num_rows),
min(cell.end_row_offset_idx, element.num_rows),
):
for j in range(
min(cell.start_col_offset_idx, element.num_cols),
min(cell.end_col_offset_idx, element.num_cols),
):
celltype = "body"
if cell.column_header:
celltype = "col_header"
elif cell.row_header:
celltype = "row_header"
elif cell.row_section:
celltype = "row_section"
def make_spans(cell):
for rspan in range(
min(cell.start_row_offset_idx, element.num_rows),
min(cell.end_row_offset_idx, element.num_rows),
):
for cspan in range(
min(
cell.start_col_offset_idx, element.num_cols
),
min(cell.end_col_offset_idx, element.num_cols),
):
yield [rspan, cspan]
spans = list(make_spans(cell))
if cell.bbox is not None:
bbox = cell.bbox.to_bottom_left_origin(
page_no_to_page[element.page_no].size.height
).as_tuple()
else:
bbox = None
table_data[i][j] = TableCell(
text=cell.text,
bbox=bbox,
# col=j,
# row=i,
spans=spans,
obj_type=celltype,
# col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
# row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
)
tables.append(
DsSchemaTable(
num_cols=element.num_cols,
num_rows=element.num_rows,
obj_type=layout_label_to_ds_type.get(element.label),
data=table_data,
prov=[
Prov(
bbox=target_bbox,
page=element.page_no + 1,
span=[0, 0],
)
],
)
)
elif isinstance(element, FigureElement):
index = len(figures)
ref_str = f"#/figures/{index}"
main_text.append(
Ref(
name=element.label,
obj_type=layout_label_to_ds_type.get(element.label),
ref=ref_str,
),
)
figures.append(
Figure(
prov=[
Prov(
bbox=target_bbox,
page=element.page_no + 1,
span=[0, 0],
)
],
obj_type=layout_label_to_ds_type.get(element.label),
# data=[[]],
)
)
page_dimensions = [
PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
for p in conv_res.pages
]
ds_doc: DsDocument = DsDocument(
name=title,
description=desc,
file_info=file_info,
main_text=main_text,
tables=tables,
figures=figures,
page_dimensions=page_dimensions,
)
return ds_doc
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
ds_doc = self._to_legacy_document(conv_res)
ds_doc_dict = ds_doc.model_dump(by_alias=True)
glm_doc = self.model.apply_on_doc(ds_doc_dict)
ds_doc_dict = to_legacy_document_format(
glm_doc, ds_doc_dict, update_name_label=True
)
exported_doc = DsDocument.model_validate(ds_doc_dict)
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
# DEBUG code:
def draw_clusters_and_cells(ds_document, page_no):
@@ -48,7 +237,7 @@ class GlmModel:
if arr == "tables":
prov = ds_document.tables[index].prov[0]
elif arr == "figures":
prov = ds_document.figures[index].prov[0]
prov = ds_document.pictures[index].prov[0]
else:
prov = None
@@ -83,4 +272,4 @@ class GlmModel:
# draw_clusters_and_cells(ds_doc, 0)
# draw_clusters_and_cells(exported_doc, 0)
return exported_doc
return docling_doc

View File

@@ -2,8 +2,9 @@ import logging
from typing import Iterable
import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.pipeline_options import EasyOcrOptions
from docling.models.base_ocr_model import BaseOcrModel
@@ -39,6 +40,8 @@ class EasyOcrModel(BaseOcrModel):
return
for page in page_batch:
assert page._backend is not None
ocr_rects = self.get_ocr_rects(page)
all_ocr_cells = []

View File

@@ -2,8 +2,10 @@ import copy
import logging
import random
import time
from pathlib import Path
from typing import Iterable, List
from docling_core.types.doc import CoordOrigin, DocItemLabel
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
from PIL import ImageDraw
@@ -11,74 +13,73 @@ from docling.datamodel.base_models import (
BoundingBox,
Cell,
Cluster,
CoordOrigin,
LayoutPrediction,
Page,
)
from docling.models.base_model import BasePageModel
from docling.utils import layout_utils as lu
_log = logging.getLogger(__name__)
class LayoutModel:
class LayoutModel(BasePageModel):
TEXT_ELEM_LABELS = [
"Text",
"Footnote",
"Caption",
"Checkbox-Unselected",
"Checkbox-Selected",
"Section-header",
"Page-header",
"Page-footer",
"Code",
"List-item",
# "Title"
DocItemLabel.TEXT,
DocItemLabel.FOOTNOTE,
DocItemLabel.CAPTION,
DocItemLabel.CHECKBOX_UNSELECTED,
DocItemLabel.CHECKBOX_SELECTED,
DocItemLabel.SECTION_HEADER,
DocItemLabel.PAGE_HEADER,
DocItemLabel.PAGE_FOOTER,
DocItemLabel.CODE,
DocItemLabel.LIST_ITEM,
# "Formula",
]
PAGE_HEADER_LABELS = ["Page-header", "Page-footer"]
PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]
TABLE_LABEL = "Table"
FIGURE_LABEL = "Picture"
FORMULA_LABEL = "Formula"
TABLE_LABEL = DocItemLabel.TABLE
FIGURE_LABEL = DocItemLabel.PICTURE
FORMULA_LABEL = DocItemLabel.FORMULA
def __init__(self, config):
self.config = config
self.layout_predictor = LayoutPredictor(
config["artifacts_path"]
) # TODO temporary
def __init__(self, artifacts_path: Path):
self.layout_predictor = LayoutPredictor(artifacts_path) # TODO temporary
def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height):
def postprocess(self, clusters_in: List[Cluster], cells: List[Cell], page_height):
MIN_INTERSECTION = 0.2
CLASS_THRESHOLDS = {
"Caption": 0.35,
"Footnote": 0.35,
"Formula": 0.35,
"List-item": 0.35,
"Page-footer": 0.35,
"Page-header": 0.35,
"Picture": 0.2, # low threshold adjust to capture chemical structures for examples.
"Section-header": 0.45,
"Table": 0.35,
"Text": 0.45,
"Title": 0.45,
"Document Index": 0.45,
"Code": 0.45,
"Checkbox-Selected": 0.45,
"Checkbox-Unselected": 0.45,
"Form": 0.45,
"Key-Value Region": 0.45,
DocItemLabel.CAPTION: 0.35,
DocItemLabel.FOOTNOTE: 0.35,
DocItemLabel.FORMULA: 0.35,
DocItemLabel.LIST_ITEM: 0.35,
DocItemLabel.PAGE_FOOTER: 0.35,
DocItemLabel.PAGE_HEADER: 0.35,
DocItemLabel.PICTURE: 0.2, # low threshold adjust to capture chemical structures for examples.
DocItemLabel.SECTION_HEADER: 0.45,
DocItemLabel.TABLE: 0.35,
DocItemLabel.TEXT: 0.45,
DocItemLabel.TITLE: 0.45,
DocItemLabel.DOCUMENT_INDEX: 0.45,
DocItemLabel.CODE: 0.45,
DocItemLabel.CHECKBOX_SELECTED: 0.45,
DocItemLabel.CHECKBOX_UNSELECTED: 0.45,
DocItemLabel.FORM: 0.45,
DocItemLabel.KEY_VALUE_REGION: 0.45,
}
CLASS_REMAPPINGS = {"Document Index": "Table", "Title": "Section-header"}
CLASS_REMAPPINGS = {
DocItemLabel.DOCUMENT_INDEX: DocItemLabel.TABLE,
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
}
_log.debug("================= Start postprocess function ====================")
start_time = time.time()
# Apply Confidence Threshold to cluster predictions
# confidence = self.conf_threshold
clusters_out = []
clusters_mod = []
for cluster in clusters:
for cluster in clusters_in:
confidence = CLASS_THRESHOLDS[cluster.label]
if cluster.confidence >= confidence:
# annotation["created_by"] = "high_conf_pred"
@@ -86,10 +87,10 @@ class LayoutModel:
# Remap class labels where needed.
if cluster.label in CLASS_REMAPPINGS.keys():
cluster.label = CLASS_REMAPPINGS[cluster.label]
clusters_out.append(cluster)
clusters_mod.append(cluster)
# map to dictionary clusters and cells, with bottom left origin
clusters = [
clusters_orig = [
{
"id": c.id,
"bbox": list(
@@ -99,7 +100,7 @@ class LayoutModel:
"cell_ids": [],
"type": c.label,
}
for c in clusters
for c in clusters_in
]
clusters_out = [
@@ -113,9 +114,11 @@ class LayoutModel:
"cell_ids": [],
"type": c.label,
}
for c in clusters_out
for c in clusters_mod
]
del clusters_mod
raw_cells = [
{
"id": c.id,
@@ -149,7 +152,7 @@ class LayoutModel:
# Assign orphan cells with lower confidence predictions
clusters_out, orphan_cell_indices = lu.assign_orphans_with_low_conf_pred(
clusters_out, clusters, raw_cells, orphan_cell_indices
clusters_out, clusters_orig, raw_cells, orphan_cell_indices
)
# Refresh the cell_ids assignment, after creating new clusters using low conf predictions
@@ -178,7 +181,7 @@ class LayoutModel:
) = lu.cell_id_state_map(clusters_out, cell_count)
clusters_out, orphan_cell_indices = lu.set_orphan_as_text(
clusters_out, clusters, raw_cells, orphan_cell_indices
clusters_out, clusters_orig, raw_cells, orphan_cell_indices
)
_log.debug("---- 5. Merge Cells & and adapt the bounding boxes")
@@ -237,46 +240,55 @@ class LayoutModel:
end_time = time.time() - start_time
_log.debug(f"Finished post processing in seconds={end_time:.3f}")
cells_out = [
cells_out_new = [
Cell(
id=c["id"],
id=c["id"], # type: ignore
bbox=BoundingBox.from_tuple(
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT # type: ignore
).to_top_left_origin(page_height),
text=c["text"],
text=c["text"], # type: ignore
)
for c in cells_out
]
del cells_out
clusters_out_new = []
for c in clusters_out:
cluster_cells = [ccell for ccell in cells_out if ccell.id in c["cell_ids"]]
cluster_cells = [
ccell for ccell in cells_out_new if ccell.id in c["cell_ids"] # type: ignore
]
c_new = Cluster(
id=c["id"],
id=c["id"], # type: ignore
bbox=BoundingBox.from_tuple(
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT # type: ignore
).to_top_left_origin(page_height),
confidence=c["confidence"],
label=c["type"],
confidence=c["confidence"], # type: ignore
label=DocItemLabel(c["type"]),
cells=cluster_cells,
)
clusters_out_new.append(c_new)
return clusters_out_new, cells_out
return clusters_out_new, cells_out_new
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for page in page_batch:
assert page.size is not None
clusters = []
for ix, pred_item in enumerate(
self.layout_predictor.predict(page.get_image(scale=1.0))
):
label = DocItemLabel(
pred_item["label"].lower().replace(" ", "_").replace("-", "_")
) # Temporary, until docling-ibm-model uses docling-core types
cluster = Cluster(
id=ix,
label=pred_item["label"],
label=label,
confidence=pred_item["confidence"],
bbox=BoundingBox.model_validate(pred_item),
cells=[],
)
clusters.append(cluster)
# Map cells to clusters

View File

@@ -2,22 +2,29 @@ import logging
import re
from typing import Iterable, List
from pydantic import BaseModel
from docling.datamodel.base_models import (
AssembledUnit,
FigureElement,
Page,
PageElement,
TableElement,
Table,
TextElement,
)
from docling.models.base_model import BasePageModel
from docling.models.layout_model import LayoutModel
_log = logging.getLogger(__name__)
class PageAssembleModel:
def __init__(self, config):
self.config = config
class PageAssembleOptions(BaseModel):
keep_images: bool = False
class PageAssembleModel(BasePageModel):
def __init__(self, options: PageAssembleOptions):
self.options = options
def sanitize_text(self, lines):
if len(lines) <= 1:
@@ -46,6 +53,8 @@ class PageAssembleModel:
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for page in page_batch:
assert page._backend is not None
assert page.predictions.layout is not None
# assembles some JSON output page by page.
elements: List[PageElement] = []
@@ -84,7 +93,7 @@ class PageAssembleModel:
if (
not tbl
): # fallback: add table without structure, if it isn't present
tbl = TableElement(
tbl = Table(
label=cluster.label,
id=cluster.id,
text="",
@@ -145,4 +154,11 @@ class PageAssembleModel:
elements=elements, headers=headers, body=body
)
# Remove page images (can be disabled)
if not self.options.keep_images:
page._image_cache = {}
# Unload backend
page._backend.unload()
yield page

View File

@@ -0,0 +1,57 @@
from typing import Iterable, Optional
from PIL import ImageDraw
from pydantic import BaseModel
from docling.datamodel.base_models import Page
from docling.models.base_model import BasePageModel
class PagePreprocessingOptions(BaseModel):
images_scale: Optional[float]
class PagePreprocessingModel(BasePageModel):
def __init__(self, options: PagePreprocessingOptions):
self.options = options
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for page in page_batch:
page = self._populate_page_images(page)
page = self._parse_page_cells(page)
yield page
# Generate the page image and store it in the page object
def _populate_page_images(self, page: Page) -> Page:
# default scale
page.get_image(
scale=1.0
) # puts the page image on the image cache at default scale
images_scale = self.options.images_scale
# user requested scales
if images_scale is not None:
page._default_image_scale = images_scale
page.get_image(
scale=images_scale
) # this will trigger storing the image in the internal cache
return page
# Extract and populate the page cells and store it in the page object
def _parse_page_cells(self, page: Page) -> Page:
assert page._backend is not None
page.cells = list(page._backend.get_text_cells())
# DEBUG code:
def draw_text_boxes(image, cells):
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
image.show()
# draw_text_boxes(page.get_image(scale=1.0), cells)
return page

View File

@@ -3,29 +3,25 @@ from pathlib import Path
from typing import Iterable, List
import numpy
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
from PIL import ImageDraw
from docling.datamodel.base_models import (
BoundingBox,
Page,
TableCell,
TableElement,
TableStructurePrediction,
)
from docling.datamodel.pipeline_options import TableFormerMode
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
from docling.models.base_model import BasePageModel
class TableStructureModel:
def __init__(self, config):
self.config = config
self.do_cell_matching = config["do_cell_matching"]
self.mode = config["mode"]
class TableStructureModel(BasePageModel):
def __init__(
self, enabled: bool, artifacts_path: Path, options: TableStructureOptions
):
self.options = options
self.do_cell_matching = self.options.do_cell_matching
self.mode = self.options.mode
self.enabled = config["enabled"]
self.enabled = enabled
if self.enabled:
artifacts_path: Path = config["artifacts_path"]
if self.mode == TableFormerMode.ACCURATE:
artifacts_path = artifacts_path / "fat"
@@ -39,7 +35,9 @@ class TableStructureModel:
self.tf_predictor = TFPredictor(self.tm_config)
self.scale = 2.0 # Scale up table input images to 144 dpi
def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
def draw_table_and_cells(self, page: Page, tbl_list: List[Table]):
assert page._backend is not None
image = (
page._backend.get_page_image()
) # make new image to avoid drawing on the saved ones
@@ -50,17 +48,18 @@ class TableStructureModel:
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
for tc in table_element.table_cells:
x0, y0, x1, y1 = tc.bbox.as_tuple()
if tc.column_header:
width = 3
else:
width = 1
draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
draw.text(
(x0 + 3, y0 + 3),
text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
fill="black",
)
if tc.bbox is not None:
x0, y0, x1, y1 = tc.bbox.as_tuple()
if tc.column_header:
width = 3
else:
width = 1
draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
draw.text(
(x0 + 3, y0 + 3),
text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
fill="black",
)
image.show()
@@ -71,6 +70,9 @@ class TableStructureModel:
return
for page in page_batch:
assert page._backend is not None
assert page.predictions.layout is not None
assert page.size is not None
page.predictions.tablestructure = TableStructurePrediction() # dummy
@@ -85,7 +87,7 @@ class TableStructureModel:
],
)
for cluster in page.predictions.layout.clusters
if cluster.label == "Table"
if cluster.label == DocItemLabel.TABLE
]
if not len(in_tables):
yield page
@@ -132,7 +134,7 @@ class TableStructureModel:
element["bbox"]["token"] = text_piece
tc = TableCell.model_validate(element)
if self.do_cell_matching:
if self.do_cell_matching and tc.bbox is not None:
tc.bbox = tc.bbox.scaled(1 / self.scale)
table_cells.append(tc)
@@ -141,7 +143,7 @@ class TableStructureModel:
num_cols = table_out["predict_details"]["num_cols"]
otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
tbl = TableElement(
tbl = Table(
otsl_seq=otsl_seq,
table_cells=table_cells,
num_rows=num_rows,
@@ -149,7 +151,7 @@ class TableStructureModel:
id=table_cluster.id,
page_no=page.page_no,
cluster=table_cluster,
label="Table",
label=DocItemLabel.TABLE,
)
page.predictions.tablestructure.table_map[table_cluster.id] = tbl

View File

@@ -2,11 +2,12 @@ import io
import logging
import tempfile
from subprocess import DEVNULL, PIPE, Popen
from typing import Iterable, Tuple
from typing import Iterable, Optional, Tuple
import pandas as pd
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
from docling.models.base_ocr_model import BaseOcrModel
@@ -21,8 +22,8 @@ class TesseractOcrCliModel(BaseOcrModel):
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
self._name = None
self._version = None
self._name: Optional[str] = None
self._version: Optional[str] = None
if self.enabled:
try:
@@ -39,7 +40,7 @@ class TesseractOcrCliModel(BaseOcrModel):
def _get_name_and_version(self) -> Tuple[str, str]:
if self._name != None and self._version != None:
return self._name, self._version
return self._name, self._version # type: ignore
cmd = [self.options.tesseract_cmd, "--version"]
@@ -108,6 +109,8 @@ class TesseractOcrCliModel(BaseOcrModel):
return
for page in page_batch:
assert page._backend is not None
ocr_rects = self.get_ocr_rects(page)
all_ocr_cells = []

View File

@@ -1,9 +1,9 @@
import logging
from typing import Iterable
import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.pipeline_options import TesseractOcrOptions
from docling.models.base_ocr_model import BaseOcrModel
@@ -68,6 +68,9 @@ class TesseractOcrModel(BaseOcrModel):
return
for page in page_batch:
assert page._backend is not None
assert self.reader is not None
ocr_rects = self.get_ocr_rects(page)
all_ocr_cells = []

View File

@@ -1,18 +0,0 @@
from pathlib import Path
from typing import Callable, Iterable, List
from docling.datamodel.base_models import Page
from docling.datamodel.pipeline_options import PipelineOptions
class BaseModelPipeline:
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
self.model_pipe: List[Callable] = []
self.artifacts_path = artifacts_path
self.pipeline_options = pipeline_options
def apply(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for model in self.model_pipe:
page_batch = model(page_batch)
yield from page_batch

View File

@@ -0,0 +1,190 @@
import functools
import logging
import time
import traceback
from abc import ABC, abstractmethod
from typing import Callable, Iterable, List
from docling_core.types.doc import DoclingDocument, NodeItem
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import (
ConversionStatus,
DoclingComponentType,
ErrorItem,
Page,
)
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BaseEnrichmentModel
from docling.utils.utils import chunkify
_log = logging.getLogger(__name__)
class BasePipeline(ABC):
def __init__(self, pipeline_options: PipelineOptions):
self.pipeline_options = pipeline_options
self.build_pipe: List[Callable] = []
self.enrichment_pipe: List[BaseEnrichmentModel] = []
def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
conv_res = ConversionResult(input=in_doc)
_log.info(f"Processing document {in_doc.file.name}")
try:
# These steps are building and assembling the structure of the
# output DoclingDocument
conv_res = self._build_document(in_doc, conv_res)
conv_res = self._assemble_document(in_doc, conv_res)
# From this stage, all operations should rely only on conv_res.output
conv_res = self._enrich_document(in_doc, conv_res)
conv_res.status = self._determine_status(in_doc, conv_res)
except Exception as e:
conv_res.status = ConversionStatus.FAILURE
if raises_on_error:
raise e
return conv_res
@abstractmethod
def _build_document(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult:
pass
def _assemble_document(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult:
return conv_res
def _enrich_document(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult:
def _filter_elements(
doc: DoclingDocument, model: BaseEnrichmentModel
) -> Iterable[NodeItem]:
for element, _level in doc.iterate_items():
if model.is_processable(doc=doc, element=element):
yield element
for model in self.enrichment_pipe:
for element_batch in chunkify(
_filter_elements(conv_res.document, model),
settings.perf.elements_batch_size,
):
# TODO: currently we assume the element itself is modified, because
# we don't have an interface to save the element back to the document
for element in model(
doc=conv_res.document, element_batch=element_batch
): # Must exhaust!
pass
return conv_res
@abstractmethod
def _determine_status(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionStatus:
pass
@classmethod
@abstractmethod
def get_default_options(cls) -> PipelineOptions:
pass
@classmethod
@abstractmethod
def is_backend_supported(cls, backend: AbstractDocumentBackend):
pass
# def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]:
# for model in self.build_pipe:
# element_batch = model(element_batch)
#
# yield from element_batch
class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for model in self.build_pipe:
page_batch = model(page_batch)
yield from page_batch
def _build_document(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult:
if not isinstance(in_doc._backend, PdfDocumentBackend):
raise RuntimeError(
f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. "
f"Can not convert this with a PDF pipeline. "
f"Please check your format configuration on DocumentConverter."
)
# conv_res.status = ConversionStatus.FAILURE
# return conv_res
for i in range(0, in_doc.page_count):
conv_res.pages.append(Page(page_no=i))
try:
# Iterate batches of pages (page_batch_size) in the doc
for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
start_pb_time = time.time()
# 1. Initialise the page resources
init_pages = map(
functools.partial(self.initialize_page, in_doc), page_batch
)
# 2. Run pipeline stages
pipeline_pages = self._apply_on_pages(init_pages)
for p in pipeline_pages: # Must exhaust!
pass
end_pb_time = time.time() - start_pb_time
_log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
except Exception as e:
conv_res.status = ConversionStatus.FAILURE
trace = "\n".join(traceback.format_exception(e))
_log.warning(
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
f"{trace}"
)
raise e
finally:
# Always unload the PDF backend, even in case of failure
if in_doc._backend:
in_doc._backend.unload()
return conv_res
def _determine_status(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionStatus:
status = ConversionStatus.SUCCESS
for page in conv_res.pages:
if page._backend is None or not page._backend.is_valid():
conv_res.errors.append(
ErrorItem(
component_type=DoclingComponentType.DOCUMENT_BACKEND,
module_name=type(page._backend).__name__,
error_message=f"Page {page.page_no} failed to parse.",
)
)
status = ConversionStatus.PARTIAL_SUCCESS
return status
# Initialise and load resources for a page
@abstractmethod
def initialize_page(self, doc: InputDocument, page: Page) -> Page:
pass

View File

@@ -0,0 +1,59 @@
import logging
from docling.backend.abstract_backend import (
AbstractDocumentBackend,
DeclarativeDocumentBackend,
)
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PipelineOptions
from docling.pipeline.base_pipeline import BasePipeline
_log = logging.getLogger(__name__)
class SimplePipeline(BasePipeline):
"""SimpleModelPipeline.
This class is used at the moment for formats / backends
which produce straight DoclingDocument output.
"""
def __init__(self, pipeline_options: PipelineOptions):
super().__init__(pipeline_options)
def _build_document(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult:
if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
raise RuntimeError(
f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a declarative backend. "
f"Can not convert this with simple pipeline. "
f"Please check your format configuration on DocumentConverter."
)
# conv_res.status = ConversionStatus.FAILURE
# return conv_res
# Instead of running a page-level pipeline to build up the document structure,
# the backend is expected to be of type DeclarativeDocumentBackend, which can output
# a DoclingDocument straight.
conv_res.document = in_doc._backend.convert()
return conv_res
def _determine_status(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionStatus:
# This is called only if the previous steps didn't raise.
# Since we don't have anything else to evaluate, we can
# safely return SUCCESS.
return ConversionStatus.SUCCESS
@classmethod
def get_default_options(cls) -> PipelineOptions:
return PipelineOptions()
@classmethod
def is_backend_supported(cls, backend: AbstractDocumentBackend):
return isinstance(backend, DeclarativeDocumentBackend)

View File

@@ -1,66 +0,0 @@
from pathlib import Path
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
PipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.models.base_ocr_model import BaseOcrModel
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.table_structure_model import TableStructureModel
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
from docling.models.tesseract_ocr_model import TesseractOcrModel
from docling.pipeline.base_model_pipeline import BaseModelPipeline
class StandardModelPipeline(BaseModelPipeline):
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
_table_model_path = "model_artifacts/tableformer"
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
super().__init__(artifacts_path, pipeline_options)
ocr_model: BaseOcrModel
if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
ocr_model = EasyOcrModel(
enabled=pipeline_options.do_ocr,
options=pipeline_options.ocr_options,
)
elif isinstance(pipeline_options.ocr_options, TesseractCliOcrOptions):
ocr_model = TesseractOcrCliModel(
enabled=pipeline_options.do_ocr,
options=pipeline_options.ocr_options,
)
elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
ocr_model = TesseractOcrModel(
enabled=pipeline_options.do_ocr,
options=pipeline_options.ocr_options,
)
else:
raise RuntimeError(
f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
)
self.model_pipe = [
# OCR
ocr_model,
# Layout
LayoutModel(
config={
"artifacts_path": artifacts_path
/ StandardModelPipeline._layout_model_path
}
),
# Table structure
TableStructureModel(
config={
"artifacts_path": artifacts_path
/ StandardModelPipeline._table_model_path,
"enabled": pipeline_options.do_table_structure,
"mode": pipeline_options.table_structure_options.mode,
"do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
}
),
]

View File

@@ -0,0 +1,198 @@
import logging
from pathlib import Path
from typing import Optional
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import AssembledUnit, Page
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.models.base_ocr_model import BaseOcrModel
from docling.models.ds_glm_model import GlmModel, GlmOptions
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
from docling.models.page_preprocessing_model import (
PagePreprocessingModel,
PagePreprocessingOptions,
)
from docling.models.table_structure_model import TableStructureModel
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
from docling.models.tesseract_ocr_model import TesseractOcrModel
from docling.pipeline.base_pipeline import PaginatedPipeline
_log = logging.getLogger(__name__)
class StandardPdfPipeline(PaginatedPipeline):
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
_table_model_path = "model_artifacts/tableformer"
def __init__(self, pipeline_options: PdfPipelineOptions):
super().__init__(pipeline_options)
self.pipeline_options: PdfPipelineOptions
if pipeline_options.artifacts_path is None:
self.artifacts_path = self.download_models_hf()
else:
self.artifacts_path = Path(pipeline_options.artifacts_path)
keep_images = (
self.pipeline_options.generate_page_images
or self.pipeline_options.generate_picture_images
or self.pipeline_options.generate_table_images
)
self.glm_model = GlmModel(options=GlmOptions())
if (ocr_model := self.get_ocr_model()) is None:
raise RuntimeError(
f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
)
self.build_pipe = [
# Pre-processing
PagePreprocessingModel(
options=PagePreprocessingOptions(
images_scale=pipeline_options.images_scale
)
),
# OCR
ocr_model,
# Layout model
LayoutModel(
artifacts_path=self.artifacts_path
/ StandardPdfPipeline._layout_model_path
),
# Table structure model
TableStructureModel(
enabled=pipeline_options.do_table_structure,
artifacts_path=self.artifacts_path
/ StandardPdfPipeline._table_model_path,
options=pipeline_options.table_structure_options,
),
# Page assemble
PageAssembleModel(options=PageAssembleOptions(keep_images=keep_images)),
]
self.enrichment_pipe = [
# Other models working on `NodeItem` elements in the DoclingDocument
]
@staticmethod
def download_models_hf(
local_dir: Optional[Path] = None, force: bool = False
) -> Path:
from huggingface_hub import snapshot_download
download_path = snapshot_download(
repo_id="ds4sd/docling-models",
force_download=force,
local_dir=local_dir,
revision="v2.0.1",
)
return Path(download_path)
def get_ocr_model(self) -> Optional[BaseOcrModel]:
if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions):
return EasyOcrModel(
enabled=self.pipeline_options.do_ocr,
options=self.pipeline_options.ocr_options,
)
elif isinstance(self.pipeline_options.ocr_options, TesseractCliOcrOptions):
return TesseractOcrCliModel(
enabled=self.pipeline_options.do_ocr,
options=self.pipeline_options.ocr_options,
)
elif isinstance(self.pipeline_options.ocr_options, TesseractOcrOptions):
return TesseractOcrModel(
enabled=self.pipeline_options.do_ocr,
options=self.pipeline_options.ocr_options,
)
return None
def initialize_page(self, doc: InputDocument, page: Page) -> Page:
page._backend = doc._backend.load_page(page.page_no) # type: ignore
if page._backend is not None and page._backend.is_valid():
page.size = page._backend.get_size()
return page
def _assemble_document(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult:
all_elements = []
all_headers = []
all_body = []
for p in conv_res.pages:
assert p.assembled is not None
for el in p.assembled.body:
all_body.append(el)
for el in p.assembled.headers:
all_headers.append(el)
for el in p.assembled.elements:
all_elements.append(el)
conv_res.assembled = AssembledUnit(
elements=all_elements, headers=all_headers, body=all_body
)
conv_res.document = self.glm_model(conv_res)
# Generate page images in the output
if self.pipeline_options.generate_page_images:
for page in conv_res.pages:
assert page.image is not None
page_no = page.page_no + 1
conv_res.document.pages[page_no].image = ImageRef.from_pil(
page.image, dpi=int(72 * self.pipeline_options.images_scale)
)
# Generate images of the requested element types
if (
self.pipeline_options.generate_picture_images
or self.pipeline_options.generate_table_images
):
scale = self.pipeline_options.images_scale
for element, _level in conv_res.document.iterate_items():
if not isinstance(element, DocItem) or len(element.prov) == 0:
continue
if (
isinstance(element, PictureItem)
and self.pipeline_options.generate_picture_images
) or (
isinstance(element, TableItem)
and self.pipeline_options.generate_table_images
):
page_ix = element.prov[0].page_no - 1
page = conv_res.pages[page_ix]
assert page.size is not None
assert page.image is not None
crop_bbox = (
element.prov[0]
.bbox.scaled(scale=scale)
.to_top_left_origin(page_height=page.size.height * scale)
)
cropped_im = page.image.crop(crop_bbox.as_tuple())
element.image = ImageRef.from_pil(cropped_im, dpi=int(72 * scale))
return conv_res
@classmethod
def get_default_options(cls) -> PdfPipelineOptions:
return PdfPipelineOptions()
@classmethod
def is_backend_supported(cls, backend: AbstractDocumentBackend):
return isinstance(backend, PdfDocumentBackend)

View File

@@ -1,9 +1,10 @@
import logging
from typing import Any, Dict, Iterable, List, Tuple, Union
from docling_core.types.doc.base import BaseCell, BaseText, Ref, Table, TableCell
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
from docling.datamodel.base_models import OcrCell
from docling.datamodel.document import ConversionResult, Page
_log = logging.getLogger(__name__)
@@ -40,7 +41,7 @@ def generate_multimodal_pages(
end_ix = 0
doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []
doc = doc_result.output
doc = doc_result.legacy_document
def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):
segments = []

View File

@@ -2,6 +2,7 @@ import copy
import logging
import networkx as nx
from docling_core.types.doc import DocItemLabel
logger = logging.getLogger("layout_utils")
@@ -370,7 +371,7 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
"Treating cluster " + str(ix) + ", type " + str(new_cluster["type"])
)
logger.debug(" with cells: " + str(new_cluster["cell_ids"]))
if len(cluster["cell_ids"]) == 0 and cluster["type"] != "Picture":
if len(cluster["cell_ids"]) == 0 and cluster["type"] != DocItemLabel.PICTURE:
logger.debug(" Empty non-picture, removed")
continue ## Skip this former cluster, now without cells.
new_bbox = adapt_bbox(raw_cells, new_cluster, orphan_cell_indices)
@@ -380,14 +381,14 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
if not (cluster["type"] in ["Table", "Picture"]):
if not (cluster["type"] in [DocItemLabel.TABLE, DocItemLabel.PICTURE]):
## A text-like cluster. The bbox only needs to be around the text cells:
logger.debug(" Initial bbox: " + str(cluster["bbox"]))
new_bbox = surrounding_list(
[raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
)
logger.debug(" New bounding box:" + str(new_bbox))
if cluster["type"] == "Picture":
if cluster["type"] == DocItemLabel.PICTURE:
## We only make the bbox completely comprise included text cells:
logger.debug(" Picture")
if len(cluster["cell_ids"]) != 0:
@@ -587,7 +588,7 @@ def set_orphan_as_text(
max_id = -1
figures = []
for cluster in cluster_predictions:
if cluster["type"] == "Picture":
if cluster["type"] == DocItemLabel.PICTURE:
figures.append(cluster)
if cluster["id"] > max_id:
@@ -638,13 +639,13 @@ def set_orphan_as_text(
# if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans:
if fig_flag == False and lines_detector == False:
# get class from low confidence detections if not set as text:
class_type = "Text"
class_type = DocItemLabel.TEXT
for cluster in cluster_predictions_low:
intersection = compute_intersection(
orph_cell["bbox"], cluster["bbox"]
)
class_type = "Text"
class_type = DocItemLabel.TEXT
if (
cluster["confidence"] > 0.1
and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4
@@ -718,7 +719,9 @@ def merge_cells(cluster_predictions):
if cluster["id"] == node:
lines.append(cluster)
cluster_predictions.remove(cluster)
new_merged_cluster = build_cluster_from_lines(lines, "Text", max_id)
new_merged_cluster = build_cluster_from_lines(
lines, DocItemLabel.TEXT, max_id
)
cluster_predictions.append(new_merged_cluster)
return cluster_predictions
@@ -753,9 +756,9 @@ def clean_up_clusters(
# remove clusters that might appear inside tables, or images (such as pdf cells in graphs)
elif img_table == True:
if (
cluster_1["type"] == "Text"
and cluster_2["type"] == "Picture"
or cluster_2["type"] == "Table"
cluster_1["type"] == DocItemLabel.TEXT
and cluster_2["type"] == DocItemLabel.PICTURE
or cluster_2["type"] == DocItemLabel.TABLE
):
if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > 0.5:
DuplicateDeletedClusterIDs.append(cluster_1["id"])
@@ -771,7 +774,10 @@ def clean_up_clusters(
DuplicateDeletedClusterIDs.append(cluster_1["id"])
# remove tables that have one pdf cell
if one_cell_table == True:
if cluster_1["type"] == "Table" and len(cluster_1["cell_ids"]) < 2:
if (
cluster_1["type"] == DocItemLabel.TABLE
and len(cluster_1["cell_ids"]) < 2
):
DuplicateDeletedClusterIDs.append(cluster_1["id"])
DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs))