mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-09 13:18:24 +00:00
feat!: Docling v2 (#117)
--------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Maxim Lysak <mly@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Co-authored-by: Maxim Lysak <mly@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
@@ -1,68 +1,63 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Iterable, Optional, Union
|
||||
from typing import TYPE_CHECKING, Set, Union
|
||||
|
||||
from PIL import Image
|
||||
from docling_core.types.doc import DoclingDocument
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling.datamodel.base_models import BoundingBox, Cell, PageSize
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
|
||||
class PdfPageBackend(ABC):
|
||||
|
||||
class AbstractDocumentBackend(ABC):
|
||||
@abstractmethod
|
||||
def get_text_in_rect(self, bbox: "BoundingBox") -> str:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_text_cells(self) -> Iterable["Cell"]:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_page_image(
|
||||
self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
|
||||
) -> Image.Image:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_size(self) -> "PageSize":
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def is_valid(self) -> bool:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def unload(self):
|
||||
pass
|
||||
|
||||
|
||||
class PdfDocumentBackend(ABC):
|
||||
@abstractmethod
|
||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
self.path_or_stream = path_or_stream
|
||||
self.document_hash = document_hash
|
||||
|
||||
@abstractmethod
|
||||
def load_page(self, page_no: int) -> PdfPageBackend:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def page_count(self) -> int:
|
||||
pass
|
||||
self.document_hash = in_doc.document_hash
|
||||
self.input_format = in_doc.format
|
||||
|
||||
@abstractmethod
|
||||
def is_valid(self) -> bool:
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def supports_pagination(cls) -> bool:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def unload(self):
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
self.path_or_stream.close()
|
||||
|
||||
self.path_or_stream = None
|
||||
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def supported_formats(cls) -> Set["InputFormat"]:
|
||||
pass
|
||||
|
||||
|
||||
class PaginatedDocumentBackend(AbstractDocumentBackend):
|
||||
"""DeclarativeDocumentBackend.
|
||||
|
||||
A declarative document backend is a backend that can transform to DoclingDocument
|
||||
straight without a recognition pipeline.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def page_count(self) -> int:
|
||||
pass
|
||||
|
||||
|
||||
class DeclarativeDocumentBackend(AbstractDocumentBackend):
|
||||
"""DeclarativeDocumentBackend.
|
||||
|
||||
A declarative document backend is a backend that can transform to DoclingDocument
|
||||
straight without a recognition pipeline.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def convert(self) -> DoclingDocument:
|
||||
pass
|
||||
|
||||
@@ -5,12 +5,14 @@ from pathlib import Path
|
||||
from typing import Iterable, List, Optional, Union
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
||||
from docling_parse.docling_parse import pdf_parser
|
||||
from PIL import Image, ImageDraw
|
||||
from pypdfium2 import PdfPage
|
||||
|
||||
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.datamodel.base_models import Cell
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -177,8 +179,8 @@ class DoclingParsePageBackend(PdfPageBackend):
|
||||
|
||||
return image
|
||||
|
||||
def get_size(self) -> PageSize:
|
||||
return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
|
||||
def get_size(self) -> Size:
|
||||
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
|
||||
|
||||
def unload(self):
|
||||
self._ppage = None
|
||||
@@ -186,23 +188,25 @@ class DoclingParsePageBackend(PdfPageBackend):
|
||||
|
||||
|
||||
class DoclingParseDocumentBackend(PdfDocumentBackend):
|
||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
||||
super().__init__(path_or_stream, document_hash)
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
|
||||
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
||||
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
||||
self.parser = pdf_parser()
|
||||
|
||||
success = False
|
||||
if isinstance(path_or_stream, BytesIO):
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
success = self.parser.load_document_from_bytesio(
|
||||
document_hash, path_or_stream
|
||||
self.document_hash, self.path_or_stream
|
||||
)
|
||||
elif isinstance(self.path_or_stream, Path):
|
||||
success = self.parser.load_document(
|
||||
self.document_hash, str(self.path_or_stream)
|
||||
)
|
||||
elif isinstance(path_or_stream, Path):
|
||||
success = self.parser.load_document(document_hash, str(path_or_stream))
|
||||
|
||||
if not success:
|
||||
raise RuntimeError(
|
||||
f"docling-parse could not load document {document_hash}."
|
||||
f"docling-parse could not load document with hash {self.document_hash}."
|
||||
)
|
||||
|
||||
def page_count(self) -> int:
|
||||
|
||||
@@ -2,15 +2,19 @@ import logging
|
||||
import random
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Optional, Union
|
||||
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_parse.docling_parse import pdf_parser_v2
|
||||
from PIL import Image, ImageDraw
|
||||
from pypdfium2 import PdfPage
|
||||
|
||||
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.datamodel.base_models import Cell, Size
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -190,8 +194,8 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
||||
|
||||
return image
|
||||
|
||||
def get_size(self) -> PageSize:
|
||||
return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
|
||||
def get_size(self) -> Size:
|
||||
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
|
||||
|
||||
def unload(self):
|
||||
self._ppage = None
|
||||
@@ -199,23 +203,23 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
||||
|
||||
|
||||
class DoclingParseV2DocumentBackend(PdfDocumentBackend):
|
||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
||||
super().__init__(path_or_stream, document_hash)
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
|
||||
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
||||
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
||||
self.parser = pdf_parser_v2("fatal")
|
||||
|
||||
success = False
|
||||
if isinstance(path_or_stream, BytesIO):
|
||||
success = self.parser.load_document_from_bytesio(
|
||||
document_hash, path_or_stream
|
||||
self.document_hash, path_or_stream
|
||||
)
|
||||
elif isinstance(path_or_stream, Path):
|
||||
success = self.parser.load_document(document_hash, str(path_or_stream))
|
||||
success = self.parser.load_document(self.document_hash, str(path_or_stream))
|
||||
|
||||
if not success:
|
||||
raise RuntimeError(
|
||||
f"docling-parse could not load document {document_hash}."
|
||||
f"docling-parse v2 could not load document {self.document_hash}."
|
||||
)
|
||||
|
||||
def page_count(self) -> int:
|
||||
|
||||
425
docling/backend/html_backend.py
Normal file
425
docling/backend/html_backend.py
Normal file
@@ -0,0 +1,425 @@
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from docling_core.types.doc import (
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
GroupLabel,
|
||||
TableCell,
|
||||
TableData,
|
||||
)
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HTMLDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend that converts an HTML file or stream to a DoclingDocument.

    The input is parsed with BeautifulSoup ("html.parser") and the DOM is walked
    recursively; headers, paragraphs, (nested) lists, tables, figures and images
    are mapped onto DoclingDocument items. ``self.parents`` holds the current
    hierarchy (one slot per heading level) and ``self.level`` the current depth.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Parse the HTML source eagerly; raises RuntimeError when parsing fails."""
        super().__init__(in_doc, path_or_stream)
        _log.debug("About to init HTML backend...")
        self.soup = None  # parsed DOM, or None when initialization failed
        # HTML file:
        self.path_or_stream = path_or_stream
        # Initialise the parents for the hierarchy
        self.max_levels = 10
        self.level = 0
        self.parents = {}  # type: ignore
        for i in range(0, self.max_levels):
            self.parents[i] = None
        # tag-name -> occurrence count; gathered for statistics only
        self.labels = {}  # type: ignore

        try:
            if isinstance(self.path_or_stream, BytesIO):
                text_stream = self.path_or_stream.getvalue().decode("utf-8")
                self.soup = BeautifulSoup(text_stream, "html.parser")
            if isinstance(self.path_or_stream, Path):
                with open(self.path_or_stream, "r", encoding="utf-8") as f:
                    html_content = f.read()
                self.soup = BeautifulSoup(html_content, "html.parser")
        except Exception as e:
            raise RuntimeError(
                f"Could not initialize HTML backend for file with hash {self.document_hash}."
            ) from e

    def is_valid(self) -> bool:
        """True when the HTML was parsed successfully in __init__."""
        return self.soup is not None

    @classmethod
    def supports_pagination(cls) -> bool:
        """HTML has no page structure."""
        return False

    def unload(self):
        """Close the in-memory stream (if any) and drop the source reference."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()

        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """This backend handles only HTML input."""
        return {InputFormat.HTML}

    def convert(self) -> DoclingDocument:
        """Walk the parsed DOM and return the resulting DoclingDocument.

        Raises:
            RuntimeError: when the backend failed to initialize (no parsed soup).
        """
        # access self.path_or_stream to load stuff
        doc = DoclingDocument(name="dummy")
        _log.debug("Trying to convert HTML...")

        if self.is_valid():
            assert self.soup is not None
            # NOTE(review): assumes the document has a <body>; a body-less
            # fragment would raise AttributeError here — confirm against inputs.
            # Replace <br> tags with newline characters
            for br in self.soup.body.find_all("br"):
                br.replace_with("\n")
            doc = self.walk(self.soup.body, doc)
        else:
            raise RuntimeError(
                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
            )
        return doc

    def walk(self, element, doc):
        """Dispatch every child of *element* into the document (best-effort)."""
        try:
            # Iterate over elements in the body of the document
            for idx, element in enumerate(element.children):
                try:
                    self.analyse_element(element, idx, doc)
                except Exception as exc_child:
                    # was: _log.error(" -> error treating child: ", exc_child) —
                    # logging takes a %-format string, not print-style args
                    _log.error(
                        "error treating child: %s => element: %s", exc_child, element
                    )
                    raise exc_child
        except Exception:
            # Best-effort: nodes without .children (e.g. text nodes) and
            # children that failed above simply produce no output.
            pass

        return doc

    def analyse_element(self, element, idx, doc):
        """Route a single DOM node to its handler, keyed on the tag name."""
        # if element.name != None:
        #     _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")

        # per-tag statistics
        if element.name in self.labels:
            self.labels[element.name] += 1
        else:
            self.labels[element.name] = 1

        if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            self.handle_header(element, idx, doc)
        elif element.name in ["p"]:
            self.handle_paragraph(element, idx, doc)
        elif element.name in ["ul", "ol"]:
            self.handle_list(element, idx, doc)
        elif element.name in ["li"]:
            self.handle_listitem(element, idx, doc)
        elif element.name == "table":
            self.handle_table(element, idx, doc)
        elif element.name == "figure":
            self.handle_figure(element, idx, doc)
        elif element.name == "img":
            self.handle_image(element, idx, doc)
        else:
            # Unknown wrapper tags are transparent: recurse into their children.
            self.walk(element, doc)

    def get_direct_text(self, item):
        """Get the direct text of the <li> element (ignoring nested lists)."""
        text = item.find(string=True, recursive=False)

        if isinstance(text, str):
            return text.strip()

        return ""

    # Function to recursively extract text from all child nodes
    def extract_text_recursively(self, item):
        # NOTE(review): returns a list for plain-string input but a joined
        # string otherwise; callers extend() the result, which iterates a
        # string character-by-character — confirm intended behavior.
        result = []

        if isinstance(item, str):
            return [item]

        result.append(self.get_direct_text(item))

        try:
            # Iterate over the children (and their text and tails)
            for child in item:
                try:
                    # Recursively get the child's text content
                    result.extend(self.extract_text_recursively(child))
                except Exception:  # was a bare except; keep best-effort, be explicit
                    pass
        except Exception:
            # was _log.warn (deprecated alias of warning)
            _log.warning("item has no children")

        return " ".join(result)

    def handle_header(self, element, idx, doc):
        """Handles header tags (h1, h2, etc.)."""
        hlevel = int(element.name.replace("h", ""))
        slevel = hlevel - 1

        label = DocItemLabel.SECTION_HEADER
        text = element.text.strip()

        if hlevel == 1:
            # An h1 starts a new document tree: reset the whole hierarchy.
            for key, val in self.parents.items():
                self.parents[key] = None

            self.level = 1
            self.parents[self.level] = doc.add_text(
                parent=self.parents[0], label=DocItemLabel.TITLE, text=text
            )

        elif hlevel == self.level:
            self.parents[hlevel] = doc.add_text(
                parent=self.parents[hlevel - 1], label=label, text=text
            )

        elif hlevel > self.level:
            # add invisible group for each skipped intermediate level
            for i in range(self.level + 1, hlevel):
                self.parents[i] = doc.add_group(
                    name=f"header-{i}",
                    label=GroupLabel.SECTION,
                    parent=self.parents[i - 1],
                )

            self.parents[hlevel] = doc.add_text(
                parent=self.parents[hlevel - 1], label=label, text=text
            )
            self.level = hlevel

        elif hlevel < self.level:
            # remove the tail (deeper levels are no longer valid parents)
            for key, val in self.parents.items():
                if key > hlevel:
                    self.parents[key] = None

            self.parents[hlevel] = doc.add_text(
                parent=self.parents[hlevel - 1], label=label, text=text
            )
            self.level = hlevel

    def handle_paragraph(self, element, idx, doc):
        """Handles paragraph tags (p)."""
        if element.text is None:
            return
        text = element.text.strip()
        label = DocItemLabel.PARAGRAPH
        if len(text) == 0:
            # skip whitespace-only paragraphs
            return
        doc.add_text(parent=self.parents[self.level], label=label, text=text)

    def handle_list(self, element, idx, doc):
        """Handles list tags (ul, ol) and their list items."""

        if element.name == "ul":
            # create a list group
            self.parents[self.level + 1] = doc.add_group(
                parent=self.parents[self.level], name="list", label=GroupLabel.LIST
            )
        elif element.name == "ol":
            # create a list group
            self.parents[self.level + 1] = doc.add_group(
                parent=self.parents[self.level],
                name="ordered list",
                label=GroupLabel.ORDERED_LIST,
            )
        self.level += 1

        self.walk(element, doc)

        self.parents[self.level + 1] = None
        self.level -= 1

    def handle_listitem(self, element, idx, doc):
        """Handles listitem tags (li)."""
        nested_lists = element.find(["ul", "ol"])

        parent_list_label = self.parents[self.level].label
        index_in_list = len(self.parents[self.level].children) + 1

        if nested_lists:
            # The <li> contains a nested list: emit its direct text as a
            # list-item and recurse for the nested part.
            name = element.name
            text = self.get_direct_text(element)

            marker = ""
            enumerated = False
            if parent_list_label == GroupLabel.ORDERED_LIST:
                marker = str(index_in_list)
                enumerated = True

            # create a list-item
            self.parents[self.level + 1] = doc.add_list_item(
                text=text,
                enumerated=enumerated,
                marker=marker,
                parent=self.parents[self.level],
            )
            self.level += 1

            self.walk(element, doc)

            self.parents[self.level + 1] = None
            self.level -= 1

        elif isinstance(element.text, str):
            text = element.text.strip()

            marker = ""
            enumerated = False
            if parent_list_label == GroupLabel.ORDERED_LIST:
                marker = f"{str(index_in_list)}."
                enumerated = True
            doc.add_list_item(
                text=text,
                enumerated=enumerated,
                marker=marker,
                parent=self.parents[self.level],
            )
        else:
            # was: _log.warn("list-item has no text: ", element) — print-style args
            _log.warning("list-item has no text: %s", element)

    def handle_table(self, element, idx, doc):
        """Handles table tags."""

        nested_tables = element.find("table")
        if nested_tables is not None:
            _log.warning("detected nested tables: skipping for now")
            return

        # Count the number of rows (number of <tr> elements)
        num_rows = len(element.find_all("tr"))

        # Find the number of columns (taking into account colspan)
        num_cols = 0
        for row in element.find_all("tr"):
            col_count = 0
            for cell in row.find_all(["td", "th"]):
                colspan = int(cell.get("colspan", 1))
                col_count += colspan
            num_cols = max(num_cols, col_count)

        # occupancy grid used to resolve row/col spans into cell offsets
        grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]

        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

        # Iterate over the rows in the table
        for row_idx, row in enumerate(element.find_all("tr")):

            # For each row, find all the column cells (both <td> and <th>)
            cells = row.find_all(["td", "th"])

            # Check if each cell in the row is a header -> means it is a column header
            col_header = True
            for j, html_cell in enumerate(cells):
                if html_cell.name == "td":
                    col_header = False

            col_idx = 0
            # Extract and print the text content of each cell
            for _, html_cell in enumerate(cells):

                text = html_cell.text
                try:
                    text = self.extract_table_cell_text(html_cell)
                except Exception as exc:
                    # was: _log.warn + exit(-1) — never kill the whole process
                    # over one cell; fall back to the raw cell text instead.
                    _log.warning("exception extracting table cell text: %s", exc)

                # label = html_cell.name

                col_span = int(html_cell.get("colspan", 1))
                row_span = int(html_cell.get("rowspan", 1))

                # advance past cells already claimed by spans from earlier rows
                while col_idx < num_cols and grid[row_idx][col_idx] is not None:
                    col_idx += 1
                for r in range(row_span):
                    for c in range(col_span):
                        # bounds guard: malformed row/colspan attributes must
                        # not raise an IndexError (which would drop the table)
                        if row_idx + r < num_rows and col_idx + c < num_cols:
                            grid[row_idx + r][col_idx + c] = text

                cell = TableCell(
                    text=text,
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=row_idx,
                    end_row_offset_idx=row_idx + row_span,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + col_span,
                    col_header=col_header,
                    row_header=((not col_header) and html_cell.name == "th"),
                )
                data.table_cells.append(cell)

        doc.add_table(data=data, parent=self.parents[self.level])

    def get_list_text(self, list_element, level=0):
        """Recursively extract text from <ul> or <ol> with proper indentation."""
        result = []
        bullet_char = "*"  # Default bullet character for unordered lists

        if list_element.name == "ol":  # For ordered lists, use numbers
            for i, li in enumerate(list_element.find_all("li", recursive=False), 1):
                # Add numbering for ordered lists
                result.append(f"{' ' * level}{i}. {li.get_text(strip=True)}")
                # Handle nested lists
                nested_list = li.find(["ul", "ol"])
                if nested_list:
                    result.extend(self.get_list_text(nested_list, level + 1))
        elif list_element.name == "ul":  # For unordered lists, use bullet points
            for li in list_element.find_all("li", recursive=False):
                # Add bullet points for unordered lists
                result.append(
                    f"{' ' * level}{bullet_char} {li.get_text(strip=True)}"
                )
                # Handle nested lists
                nested_list = li.find(["ul", "ol"])
                if nested_list:
                    result.extend(self.get_list_text(nested_list, level + 1))

        return result

    def extract_table_cell_text(self, cell):
        """Extract text from a table cell, including lists with indents."""
        contains_lists = cell.find(["ul", "ol"])
        if contains_lists is None:
            return cell.text
        else:
            # TODO: indent nested list content via get_list_text()
            _log.debug(
                "should extract the content correctly for table-cells with lists ..."
            )
            return cell.text

    def handle_figure(self, element, idx, doc):
        """Handles figure tags, attaching a <figcaption> when present."""

        # Extract the image URI from the <img> tag
        # image_uri = root.xpath('//figure//img/@src')[0]

        contains_captions = element.find(["figcaption"])
        if contains_captions is None:
            doc.add_picture(parent=self.parents[self.level], caption=None)

        else:
            texts = []
            for item in contains_captions:
                texts.append(item.text)

            fig_caption = doc.add_text(
                label=DocItemLabel.CAPTION, text=("".join(texts)).strip()
            )
            doc.add_picture(
                parent=self.parents[self.level],
                caption=fig_caption,
            )

    def handle_image(self, element, idx, doc):
        """Handles image tags (img)."""
        doc.add_picture(parent=self.parents[self.level], caption=None)
|
||||
375
docling/backend/mspowerpoint_backend.py
Normal file
375
docling/backend/mspowerpoint_backend.py
Normal file
@@ -0,0 +1,375 @@
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
BoundingBox,
|
||||
CoordOrigin,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
GroupLabel,
|
||||
ProvenanceItem,
|
||||
Size,
|
||||
TableCell,
|
||||
TableData,
|
||||
)
|
||||
from pptx import Presentation
|
||||
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
||||
|
||||
from docling.backend.abstract_backend import (
|
||||
DeclarativeDocumentBackend,
|
||||
PaginatedDocumentBackend,
|
||||
)
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
    """Open the PPTX from a path or in-memory stream and validate it.

    Raises:
        RuntimeError: when python-pptx cannot load the presentation.
    """
    super().__init__(in_doc, path_or_stream)
    # XML namespaces used for direct XPath queries into slide internals.
    self.namespaces = {
        "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
        "c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
        "p": "http://schemas.openxmlformats.org/presentationml/2006/main",
    }
    # Powerpoint file:
    self.path_or_stream = path_or_stream

    self.pptx_obj = None
    self.valid = False
    try:
        source = self.path_or_stream
        if isinstance(source, BytesIO):
            self.pptx_obj = Presentation(source)
        elif isinstance(source, Path):
            self.pptx_obj = Presentation(str(source))
        self.valid = True
    except Exception as e:
        raise RuntimeError(
            f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
        ) from e
|
||||
|
||||
def page_count(self) -> int:
    """Return the number of slides, or 0 when the backend failed to load."""
    if not self.is_valid():
        return 0
    assert self.pptx_obj is not None
    return len(self.pptx_obj.slides)
|
||||
|
||||
def is_valid(self) -> bool:
    """True when the presentation was parsed successfully in __init__."""
    return self.valid
|
||||
|
||||
@classmethod
def supports_pagination(cls) -> bool:
    """PPTX content is organized per slide, i.e. paginated."""
    return True  # True? if so, how to handle pages...
|
||||
|
||||
def unload(self):
    """Close the in-memory stream (if any) and drop the source reference."""
    stream = self.path_or_stream
    if isinstance(stream, BytesIO):
        stream.close()
    self.path_or_stream = None
|
||||
|
||||
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
    """This backend handles only PowerPoint input."""
    return {InputFormat.PPTX}
|
||||
|
||||
def convert(self) -> DoclingDocument:
    """Parse the PPTX into a structured DoclingDocument.

    Builds a DocumentOrigin from the source filename (empty for streams)
    and delegates the slide traversal to walk_linear.
    """
    # origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)
    fname = ""
    if isinstance(self.path_or_stream, Path):
        fname = self.path_or_stream.name

    origin = DocumentOrigin(
        filename=fname,
        mimetype="application/vnd.ms-powerpoint",
        binary_hash=self.document_hash,
    )
    # Name the document after the file stem; in-memory streams have no name.
    docname = Path(fname).stem if len(fname) > 0 else "stream"
    doc = DoclingDocument(
        name=docname, origin=origin
    )  # must add origin information
    return self.walk_linear(self.pptx_obj, doc)
|
||||
|
||||
def generate_prov(self, shape, slide_ind, text=""):
    """Build a ProvenanceItem covering *shape* on slide *slide_ind* (0-based).

    The bounding box is taken from the shape's EMU geometry; the charspan
    covers the whole *text*.
    """
    x0 = shape.left
    y0 = shape.top
    x1 = x0 + shape.width
    y1 = y0 + shape.height
    shape_bbox = BoundingBox.from_tuple(
        [x0, y0, x1, y1], origin=CoordOrigin.BOTTOMLEFT
    )
    # prov = [{"bbox": shape_bbox, "page": parent_slide, "span": [0, len(text)]}]
    return ProvenanceItem(
        page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox
    )
|
||||
|
||||
def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
    """Map a text-frame shape onto DoclingDocument items.

    Each paragraph is inspected via its raw XML (`a:buChar` / `a:buAutoNum`)
    to decide whether it is a (possibly numbered) list item; list items go
    into a list group, plain text becomes TITLE / SECTION_HEADER / PARAGRAPH
    depending on the placeholder type.
    """
    is_a_list = False
    enum_list_item_value = 0
    for paragraph in shape.text_frame.paragraphs:
        enum_list_item_value += 1
        bullet_type = "None"
        # Check if paragraph is a bullet point using the `element` XML
        p = paragraph._element
        if (
            p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
            is not None
        ):
            bullet_type = "Bullet"
            is_a_list = True
        elif (
            p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
            is not None
        ):
            bullet_type = "Numbered"
            is_a_list = True
        else:
            is_a_list = False

        if paragraph.level > 0:
            # Most likely a sub-list
            is_a_list = True
        list_text = paragraph.text.strip()

        # NOTE(review): provenance charspan is based on the whole shape text,
        # not this paragraph's text — confirm this is intended.
        prov = self.generate_prov(shape, slide_ind, shape.text.strip())

        if is_a_list:
            # Determine if this is an unordered list or an ordered list.
            # Set GroupLabel.ORDERED_LIST when it fits.
            list_label = GroupLabel.LIST
            if bullet_type == "Numbered":
                list_label = GroupLabel.ORDERED_LIST

            new_list = doc.add_group(
                label=list_label, name=f"list", parent=parent_slide
            )
        else:
            new_list = None

        if is_a_list:
            _log.debug("LIST DETECTED!")
        else:
            _log.debug("No List")

        # for e in p.iter():
        for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
            if len(e.text.strip()) > 0:
                e_is_a_list_item = False
                is_numbered = False
                # Re-check the paragraph XML for bullet markers per run.
                if (
                    p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
                    is not None
                ):
                    bullet_type = "Bullet"
                    e_is_a_list_item = True
                elif (
                    p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
                    is not None
                ):
                    bullet_type = "Numbered"
                    is_numbered = True
                    e_is_a_list_item = True
                else:
                    e_is_a_list_item = False

                if e_is_a_list_item:
                    # Set marker and enumerated arguments if this is an enumeration element.
                    enum_marker = str(enum_list_item_value) + "."
                    doc.add_list_item(
                        marker=enum_marker,
                        enumerated=is_numbered,
                        parent=new_list,
                        text=list_text,
                        prov=prov,
                    )
                else:
                    # Assign proper label to the text, depending if it's a Title or Section Header
                    # For other types of text, assign - PARAGRAPH
                    doc_label = DocItemLabel.PARAGRAPH
                    if shape.is_placeholder:
                        placeholder_type = shape.placeholder_format.type
                        if placeholder_type in [
                            PP_PLACEHOLDER.CENTER_TITLE,
                            PP_PLACEHOLDER.TITLE,
                        ]:
                            # It's a title
                            doc_label = DocItemLabel.TITLE
                        elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
                            # BUGFIX: was a bare `DocItemLabel.SECTION_HEADER`
                            # expression (a no-op), so subtitles were labeled
                            # PARAGRAPH; assign it, matching handle_title().
                            doc_label = DocItemLabel.SECTION_HEADER

                    # non-list text resets the enumeration counter
                    enum_list_item_value = 0

                    doc.add_text(
                        label=doc_label,
                        parent=parent_slide,
                        text=list_text,
                        prov=prov,
                    )
    return
|
||||
|
||||
def handle_title(self, shape, parent_slide, slide_ind, doc):
    """Map a title/subtitle placeholder shape onto a DoclingDocument text item."""
    placeholder_type = shape.placeholder_format.type
    txt = shape.text.strip()
    prov = self.generate_prov(shape, slide_ind, txt)

    # Empty titles produce nothing.
    if len(txt.strip()) == 0:
        return

    # title = slide.shapes.title.text if slide.shapes.title else "No title"
    if placeholder_type in [PP_PLACEHOLDER.CENTER_TITLE, PP_PLACEHOLDER.TITLE]:
        _log.info(f"Title found: {shape.text}")
        doc.add_text(
            label=DocItemLabel.TITLE, parent=parent_slide, text=txt, prov=prov
        )
    elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
        _log.info(f"Subtitle found: {shape.text}")
        # Using DocItemLabel.SECTION_HEADER, while SUBTITLE label is not avail.
        doc.add_text(
            label=DocItemLabel.SECTION_HEADER,
            parent=parent_slide,
            text=txt,
            prov=prov,
        )
    return
|
||||
|
||||
def handle_pictures(self, shape, parent_slide, slide_ind, doc):
    """Register a picture shape as an (uncaptioned) picture item."""
    # shape has picture
    prov = self.generate_prov(shape, slide_ind, "")
    doc.add_picture(parent=parent_slide, caption=None, prov=prov)
    return
|
||||
|
||||
def handle_tables(self, shape, parent_slide, slide_ind, doc):
|
||||
# Handling tables, images, charts
|
||||
if shape.has_table:
|
||||
table = shape.table
|
||||
table_xml = shape._element
|
||||
|
||||
prov = self.generate_prov(shape, slide_ind, "")
|
||||
|
||||
num_cols = 0
|
||||
num_rows = len(table.rows)
|
||||
tcells = []
|
||||
# Access the XML element for the shape that contains the table
|
||||
table_xml = shape._element
|
||||
|
||||
for row_idx, row in enumerate(table.rows):
|
||||
if len(row.cells) > num_cols:
|
||||
num_cols = len(row.cells)
|
||||
for col_idx, cell in enumerate(row.cells):
|
||||
# Access the XML of the cell (this is the 'tc' element in table XML)
|
||||
cell_xml = table_xml.xpath(
|
||||
f".//a:tbl/a:tr[{row_idx + 1}]/a:tc[{col_idx + 1}]"
|
||||
)
|
||||
|
||||
if not cell_xml:
|
||||
continue # If no cell XML is found, skip
|
||||
|
||||
cell_xml = cell_xml[0] # Get the first matching XML node
|
||||
row_span = cell_xml.get("rowSpan") # Vertical span
|
||||
col_span = cell_xml.get("gridSpan") # Horizontal span
|
||||
|
||||
if row_span is None:
|
||||
row_span = 1
|
||||
else:
|
||||
row_span = int(row_span)
|
||||
|
||||
if col_span is None:
|
||||
col_span = 1
|
||||
else:
|
||||
col_span = int(col_span)
|
||||
|
||||
icell = TableCell(
|
||||
text=cell.text.strip(),
|
||||
row_span=row_span,
|
||||
col_span=col_span,
|
||||
start_row_offset_idx=row_idx,
|
||||
end_row_offset_idx=row_idx + row_span,
|
||||
start_col_offset_idx=col_idx,
|
||||
end_col_offset_idx=col_idx + col_span,
|
||||
col_header=False,
|
||||
row_header=False,
|
||||
)
|
||||
if len(cell.text.strip()) > 0:
|
||||
tcells.append(icell)
|
||||
# Initialize Docling TableData
|
||||
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
||||
# Populate
|
||||
for tcell in tcells:
|
||||
data.table_cells.append(tcell)
|
||||
if len(tcells) > 0:
|
||||
# If table is not fully empty...
|
||||
# Create Docling table
|
||||
doc.add_table(data=data, prov=prov)
|
||||
return
|
||||
|
||||
def walk_linear(self, pptx_obj, doc) -> DoclingDocument:
|
||||
# Units of size in PPTX by default are EMU units (English Metric Units)
|
||||
slide_width = pptx_obj.slide_width
|
||||
slide_height = pptx_obj.slide_height
|
||||
|
||||
text_content = [] # type: ignore
|
||||
|
||||
max_levels = 10
|
||||
parents = {} # type: ignore
|
||||
for i in range(0, max_levels):
|
||||
parents[i] = None
|
||||
|
||||
# Loop through each slide
|
||||
for slide_num, slide in enumerate(pptx_obj.slides):
|
||||
slide_ind = pptx_obj.slides.index(slide)
|
||||
parent_slide = doc.add_group(
|
||||
name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=parents[0]
|
||||
)
|
||||
|
||||
size = Size(width=slide_width, height=slide_height)
|
||||
parent_page = doc.add_page(page_no=slide_ind + 1, size=size)
|
||||
# parent_page = doc.add_page(page_no=slide_ind, size=size, hash=hash)
|
||||
|
||||
# Loop through each shape in the slide
|
||||
for shape in slide.shapes:
|
||||
|
||||
if shape.has_table:
|
||||
# Handle Tables
|
||||
self.handle_tables(shape, parent_slide, slide_ind, doc)
|
||||
|
||||
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
||||
# Handle Tables
|
||||
self.handle_pictures(shape, parent_slide, slide_ind, doc)
|
||||
|
||||
# If shape doesn't have any text, move on to the next shape
|
||||
if not hasattr(shape, "text"):
|
||||
continue
|
||||
if shape.text is None:
|
||||
continue
|
||||
if len(shape.text.strip()) == 0:
|
||||
continue
|
||||
if not shape.has_text_frame:
|
||||
_log.warn("Warning: shape has text but not text_frame")
|
||||
continue
|
||||
|
||||
# if shape.is_placeholder:
|
||||
# Handle Titles (Headers) and Subtitles
|
||||
# Check if the shape is a placeholder (titles are placeholders)
|
||||
# self.handle_title(shape, parent_slide, slide_ind, doc)
|
||||
# self.handle_text_elements(shape, parent_slide, slide_ind, doc)
|
||||
# else:
|
||||
|
||||
# Handle other text elements, including lists (bullet lists, numbered lists)
|
||||
self.handle_text_elements(shape, parent_slide, slide_ind, doc)
|
||||
|
||||
# figures...
|
||||
# doc.add_figure(data=BaseFigureData(), parent=self.parents[self.level], caption=None)
|
||||
|
||||
return doc
|
||||
509
docling/backend/msword_backend.py
Normal file
509
docling/backend/msword_backend.py
Normal file
@@ -0,0 +1,509 @@
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
|
||||
import docx
|
||||
from docling_core.types.doc import (
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
GroupLabel,
|
||||
TableCell,
|
||||
TableData,
|
||||
)
|
||||
from lxml import etree
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
self.XML_KEY = (
|
||||
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
|
||||
)
|
||||
self.xml_namespaces = {
|
||||
"w": "http://schemas.microsoft.com/office/word/2003/wordml"
|
||||
}
|
||||
# self.initialise(path_or_stream)
|
||||
# Word file:
|
||||
self.path_or_stream = path_or_stream
|
||||
self.valid = False
|
||||
# Initialise the parents for the hierarchy
|
||||
self.max_levels = 10
|
||||
self.level_at_new_list = None
|
||||
self.parents = {} # type: ignore
|
||||
for i in range(-1, self.max_levels):
|
||||
self.parents[i] = None
|
||||
|
||||
self.level = 0
|
||||
self.listIter = 0
|
||||
|
||||
self.history = {
|
||||
"names": [None],
|
||||
"levels": [None],
|
||||
"numids": [None],
|
||||
"indents": [None],
|
||||
}
|
||||
|
||||
self.docx_obj = None
|
||||
try:
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
self.docx_obj = docx.Document(self.path_or_stream)
|
||||
elif isinstance(self.path_or_stream, Path):
|
||||
self.docx_obj = docx.Document(str(self.path_or_stream))
|
||||
|
||||
self.valid = True
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
|
||||
) from e
|
||||
|
||||
def is_valid(self) -> bool:
|
||||
return self.valid
|
||||
|
||||
@classmethod
|
||||
def supports_pagination(cls) -> bool:
|
||||
return False
|
||||
|
||||
def unload(self):
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
self.path_or_stream.close()
|
||||
|
||||
self.path_or_stream = None
|
||||
|
||||
@classmethod
|
||||
def supported_formats(cls) -> Set[InputFormat]:
|
||||
return {InputFormat.DOCX}
|
||||
|
||||
def convert(self) -> DoclingDocument:
|
||||
# Parses the DOCX into a structured document model.
|
||||
|
||||
fname = ""
|
||||
if isinstance(self.path_or_stream, Path):
|
||||
fname = self.path_or_stream.name
|
||||
|
||||
origin = DocumentOrigin(
|
||||
filename=fname,
|
||||
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
binary_hash=self.document_hash,
|
||||
)
|
||||
if len(fname) > 0:
|
||||
docname = Path(fname).stem
|
||||
else:
|
||||
docname = "stream"
|
||||
doc = DoclingDocument(name=docname, origin=origin)
|
||||
if self.is_valid():
|
||||
assert self.docx_obj is not None
|
||||
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
||||
return doc
|
||||
else:
|
||||
raise RuntimeError(
|
||||
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
||||
)
|
||||
|
||||
def update_history(self, name, level, numid, ilevel):
|
||||
self.history["names"].append(name)
|
||||
self.history["levels"].append(level)
|
||||
|
||||
self.history["numids"].append(numid)
|
||||
self.history["indents"].append(ilevel)
|
||||
|
||||
def prev_name(self):
|
||||
return self.history["names"][-1]
|
||||
|
||||
def prev_level(self):
|
||||
return self.history["levels"][-1]
|
||||
|
||||
def prev_numid(self):
|
||||
return self.history["numids"][-1]
|
||||
|
||||
def prev_indent(self):
|
||||
return self.history["indents"][-1]
|
||||
|
||||
def get_level(self) -> int:
|
||||
"""Return the first None index."""
|
||||
for k, v in self.parents.items():
|
||||
if k >= 0 and v == None:
|
||||
return k
|
||||
return 0
|
||||
|
||||
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
||||
for element in body:
|
||||
tag_name = etree.QName(element).localname
|
||||
|
||||
# Check for Inline Images (drawings or blip elements)
|
||||
found_drawing = etree.ElementBase.xpath(
|
||||
element, ".//w:drawing", namespaces=self.xml_namespaces
|
||||
)
|
||||
found_pict = etree.ElementBase.xpath(
|
||||
element, ".//w:pict", namespaces=self.xml_namespaces
|
||||
)
|
||||
|
||||
# Check for Tables
|
||||
if element.tag.endswith("tbl"):
|
||||
try:
|
||||
self.handle_tables(element, docx_obj, doc)
|
||||
except Exception:
|
||||
_log.debug("could not parse a table, broken docx table")
|
||||
|
||||
elif found_drawing or found_pict:
|
||||
self.handle_pictures(element, docx_obj, doc)
|
||||
# Check for Text
|
||||
elif tag_name in ["p"]:
|
||||
self.handle_text_elements(element, docx_obj, doc)
|
||||
else:
|
||||
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
||||
return doc
|
||||
|
||||
def str_to_int(self, s, default=0):
|
||||
if s is None:
|
||||
return None
|
||||
try:
|
||||
return int(s)
|
||||
except ValueError:
|
||||
return default
|
||||
|
||||
def get_numId_and_ilvl(self, paragraph):
|
||||
# Access the XML element of the paragraph
|
||||
numPr = paragraph._element.find(
|
||||
".//w:numPr", namespaces=paragraph._element.nsmap
|
||||
)
|
||||
|
||||
if numPr is not None:
|
||||
# Get the numId element and extract the value
|
||||
numId_elem = numPr.find("w:numId", namespaces=paragraph._element.nsmap)
|
||||
ilvl_elem = numPr.find("w:ilvl", namespaces=paragraph._element.nsmap)
|
||||
numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None
|
||||
ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None
|
||||
|
||||
return self.str_to_int(numId, default=None), self.str_to_int(
|
||||
ilvl, default=None
|
||||
)
|
||||
|
||||
return None, None # If the paragraph is not part of a list
|
||||
|
||||
def get_label_and_level(self, paragraph):
|
||||
if paragraph.style is None:
|
||||
return "Normal", None
|
||||
label = paragraph.style.name
|
||||
if label is None:
|
||||
return "Normal", None
|
||||
if ":" in label:
|
||||
parts = label.split(":")
|
||||
|
||||
if len(parts) == 2:
|
||||
return parts[0], int(parts[1])
|
||||
|
||||
parts = label.split(" ")
|
||||
|
||||
if "Heading" in label and len(parts) == 2:
|
||||
parts.sort()
|
||||
label_str = ""
|
||||
label_level = 0
|
||||
if parts[0] == "Heading":
|
||||
# print("{} - {}".format(parts[0], parts[1]))
|
||||
label_str = parts[0]
|
||||
label_level = self.str_to_int(parts[1], default=None)
|
||||
if parts[1] == "Heading":
|
||||
label_str = parts[1]
|
||||
label_level = self.str_to_int(parts[0], default=None)
|
||||
return label_str, label_level
|
||||
else:
|
||||
return label, None
|
||||
|
||||
def handle_text_elements(self, element, docx_obj, doc):
|
||||
paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
|
||||
|
||||
if paragraph.text is None:
|
||||
# _log.warn(f"paragraph has text==None")
|
||||
return
|
||||
|
||||
text = paragraph.text.strip()
|
||||
# if len(text)==0 # keep empty paragraphs, they seperate adjacent lists!
|
||||
|
||||
# Common styles for bullet and numbered lists.
|
||||
# "List Bullet", "List Number", "List Paragraph"
|
||||
# TODO: reliably identify wether list is a numbered list or not
|
||||
# is_numbered = "List Bullet" not in paragraph.style.name
|
||||
is_numbered = False
|
||||
|
||||
p_style_name, p_level = self.get_label_and_level(paragraph)
|
||||
numid, ilevel = self.get_numId_and_ilvl(paragraph)
|
||||
# print("numid: {}, ilevel: {}, text: {}".format(numid, ilevel, text))
|
||||
|
||||
if numid == 0:
|
||||
numid = None
|
||||
|
||||
# Handle lists
|
||||
if numid is not None and ilevel is not None:
|
||||
self.add_listitem(
|
||||
element,
|
||||
docx_obj,
|
||||
doc,
|
||||
p_style_name,
|
||||
p_level,
|
||||
numid,
|
||||
ilevel,
|
||||
text,
|
||||
is_numbered,
|
||||
)
|
||||
self.update_history(p_style_name, p_level, numid, ilevel)
|
||||
return
|
||||
elif numid is None and self.prev_numid() is not None: # Close list
|
||||
for key, val in self.parents.items():
|
||||
if key >= self.level_at_new_list:
|
||||
self.parents[key] = None
|
||||
self.level = self.level_at_new_list - 1
|
||||
self.level_at_new_list = None
|
||||
if p_style_name in ["Title"]:
|
||||
for key, val in self.parents.items():
|
||||
self.parents[key] = None
|
||||
self.parents[0] = doc.add_text(
|
||||
parent=None, label=DocItemLabel.TITLE, text=text
|
||||
)
|
||||
elif "Heading" in p_style_name:
|
||||
self.add_header(element, docx_obj, doc, p_style_name, p_level, text)
|
||||
|
||||
elif p_style_name in [
|
||||
"Paragraph",
|
||||
"Normal",
|
||||
"Subtitle",
|
||||
"Author",
|
||||
"Default Text",
|
||||
"List Paragraph",
|
||||
"List Bullet",
|
||||
"Quote",
|
||||
]:
|
||||
level = self.get_level()
|
||||
doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
|
||||
)
|
||||
|
||||
else:
|
||||
# Text style names can, and will have, not only default values but user values too
|
||||
# hence we treat all other labels as pure text
|
||||
level = self.get_level()
|
||||
doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
|
||||
)
|
||||
|
||||
self.update_history(p_style_name, p_level, numid, ilevel)
|
||||
return
|
||||
|
||||
def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
|
||||
level = self.get_level()
|
||||
if isinstance(curr_level, int):
|
||||
|
||||
if curr_level == level:
|
||||
|
||||
self.parents[level] = doc.add_heading(
|
||||
parent=self.parents[level - 1], text=text
|
||||
)
|
||||
|
||||
elif curr_level > level:
|
||||
|
||||
# add invisible group
|
||||
for i in range(level, curr_level):
|
||||
self.parents[i] = doc.add_group(
|
||||
parent=self.parents[i - 1],
|
||||
label=GroupLabel.SECTION,
|
||||
name=f"header-{i}",
|
||||
)
|
||||
|
||||
self.parents[curr_level] = doc.add_heading(
|
||||
parent=self.parents[curr_level - 1], text=text
|
||||
)
|
||||
|
||||
elif curr_level < level:
|
||||
|
||||
# remove the tail
|
||||
for key, val in self.parents.items():
|
||||
if key >= curr_level:
|
||||
self.parents[key] = None
|
||||
|
||||
self.parents[curr_level] = doc.add_heading(
|
||||
parent=self.parents[curr_level - 1], text=text
|
||||
)
|
||||
|
||||
else:
|
||||
self.parents[self.level] = doc.add_heading(
|
||||
parent=self.parents[self.level - 1], text=text
|
||||
)
|
||||
return
|
||||
|
||||
def add_listitem(
|
||||
self,
|
||||
element,
|
||||
docx_obj,
|
||||
doc,
|
||||
p_style_name,
|
||||
p_level,
|
||||
numid,
|
||||
ilevel,
|
||||
text: str,
|
||||
is_numbered=False,
|
||||
):
|
||||
# is_numbered = is_numbered
|
||||
enum_marker = ""
|
||||
|
||||
level = self.get_level()
|
||||
if self.prev_numid() is None: # Open new list
|
||||
self.level_at_new_list = level # type: ignore
|
||||
|
||||
self.parents[level] = doc.add_group(
|
||||
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
|
||||
)
|
||||
|
||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||
self.listIter += 1
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
is_numbered = True
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=self.parents[level],
|
||||
text=text,
|
||||
)
|
||||
|
||||
elif (
|
||||
self.prev_numid() == numid and self.prev_indent() < ilevel
|
||||
): # Open indented list
|
||||
for i in range(
|
||||
self.level_at_new_list + self.prev_indent() + 1,
|
||||
self.level_at_new_list + ilevel + 1,
|
||||
):
|
||||
# TODO: determine if this is an unordered list or an ordered list.
|
||||
# Set GroupLabel.ORDERED_LIST when it fits.
|
||||
self.listIter = 0
|
||||
if is_numbered:
|
||||
self.parents[i] = doc.add_group(
|
||||
label=GroupLabel.ORDERED_LIST,
|
||||
name="list",
|
||||
parent=self.parents[i - 1],
|
||||
)
|
||||
else:
|
||||
self.parents[i] = doc.add_group(
|
||||
label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
|
||||
)
|
||||
|
||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||
self.listIter += 1
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
is_numbered = True
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=self.parents[self.level_at_new_list + ilevel],
|
||||
text=text,
|
||||
)
|
||||
|
||||
elif self.prev_numid() == numid and ilevel < self.prev_indent(): # Close list
|
||||
for k, v in self.parents.items():
|
||||
if k > self.level_at_new_list + ilevel:
|
||||
self.parents[k] = None
|
||||
|
||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||
self.listIter += 1
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
is_numbered = True
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=self.parents[self.level_at_new_list + ilevel],
|
||||
text=text,
|
||||
)
|
||||
self.listIter = 0
|
||||
|
||||
elif self.prev_numid() == numid or self.prev_indent() == ilevel:
|
||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||
self.listIter += 1
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
is_numbered = True
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=self.parents[level - 1],
|
||||
text=text,
|
||||
)
|
||||
return
|
||||
|
||||
def handle_tables(self, element, docx_obj, doc):
|
||||
|
||||
# Function to check if a cell has a colspan (gridSpan)
|
||||
def get_colspan(cell):
|
||||
grid_span = cell._element.xpath("@w:gridSpan")
|
||||
if grid_span:
|
||||
return int(grid_span[0]) # Return the number of columns spanned
|
||||
return 1 # Default is 1 (no colspan)
|
||||
|
||||
# Function to check if a cell has a rowspan (vMerge)
|
||||
def get_rowspan(cell):
|
||||
v_merge = cell._element.xpath("@w:vMerge")
|
||||
if v_merge:
|
||||
return v_merge[
|
||||
0
|
||||
] # 'restart' indicates the beginning of a rowspan, others are continuation
|
||||
return 1
|
||||
|
||||
table = docx.table.Table(element, docx_obj)
|
||||
|
||||
num_rows = len(table.rows)
|
||||
num_cols = 0
|
||||
for row in table.rows:
|
||||
# Calculate the max number of columns
|
||||
num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells))
|
||||
# if row.cells:
|
||||
# num_cols = max(num_cols, len(row.cells))
|
||||
|
||||
# Initialize the table grid
|
||||
table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
||||
|
||||
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
||||
|
||||
for row_idx, row in enumerate(table.rows):
|
||||
col_idx = 0
|
||||
for c, cell in enumerate(row.cells):
|
||||
row_span = get_rowspan(cell)
|
||||
col_span = get_colspan(cell)
|
||||
|
||||
# Find the next available column in the grid
|
||||
while table_grid[row_idx][col_idx] is not None:
|
||||
col_idx += 1
|
||||
|
||||
# Fill the grid with the cell value, considering rowspan and colspan
|
||||
for i in range(row_span if row_span == "restart" else 1):
|
||||
for j in range(col_span):
|
||||
table_grid[row_idx + i][col_idx + j] = ""
|
||||
|
||||
cell = TableCell(
|
||||
text=cell.text,
|
||||
row_span=row_span,
|
||||
col_span=col_span,
|
||||
start_row_offset_idx=row_idx,
|
||||
end_row_offset_idx=row_idx + row_span,
|
||||
start_col_offset_idx=col_idx,
|
||||
end_col_offset_idx=col_idx + col_span,
|
||||
col_header=False, # col_header,
|
||||
row_header=False, # ((not col_header) and html_cell.name=='th')
|
||||
)
|
||||
|
||||
data.table_cells.append(cell)
|
||||
|
||||
level = self.get_level()
|
||||
doc.add_table(data=data, parent=self.parents[level - 1])
|
||||
return
|
||||
|
||||
def handle_pictures(self, element, docx_obj, doc):
|
||||
doc.add_picture(parent=self.parents[self.level], caption=None)
|
||||
return
|
||||
78
docling/backend/pdf_backend.py
Normal file
78
docling/backend/pdf_backend.py
Normal file
@@ -0,0 +1,78 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Optional, Set, Union
|
||||
|
||||
from docling_core.types.doc import BoundingBox, Size
|
||||
from PIL import Image
|
||||
|
||||
from docling.backend.abstract_backend import PaginatedDocumentBackend
|
||||
from docling.datamodel.base_models import Cell, InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
|
||||
class PdfPageBackend(ABC):
|
||||
|
||||
@abstractmethod
|
||||
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_text_cells(self) -> Iterable[Cell]:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_bitmap_rects(self, float: int = 1) -> Iterable[BoundingBox]:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_page_image(
|
||||
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
||||
) -> Image.Image:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_size(self) -> Size:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def is_valid(self) -> bool:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def unload(self):
|
||||
pass
|
||||
|
||||
|
||||
class PdfDocumentBackend(PaginatedDocumentBackend):
|
||||
|
||||
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
|
||||
if self.input_format is not InputFormat.PDF:
|
||||
if self.input_format is InputFormat.IMAGE:
|
||||
buf = BytesIO()
|
||||
img = Image.open(self.path_or_stream)
|
||||
img.save(buf, "PDF")
|
||||
buf.seek(0)
|
||||
self.path_or_stream = buf
|
||||
else:
|
||||
raise RuntimeError(
|
||||
f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend."
|
||||
)
|
||||
|
||||
@abstractmethod
|
||||
def load_page(self, page_no: int) -> PdfPageBackend:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def page_count(self) -> int:
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
def supported_formats(cls) -> Set[InputFormat]:
|
||||
return {InputFormat.PDF}
|
||||
|
||||
@classmethod
|
||||
def supports_pagination(cls) -> bool:
|
||||
return True
|
||||
@@ -2,16 +2,20 @@ import logging
|
||||
import random
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Optional, Union
|
||||
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
import pypdfium2.raw as pdfium_c
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
||||
from PIL import Image, ImageDraw
|
||||
from pypdfium2 import PdfPage, PdfTextPage
|
||||
from pypdfium2 import PdfTextPage
|
||||
from pypdfium2._helpers.misc import PdfiumError
|
||||
|
||||
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.datamodel.base_models import Cell
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -222,8 +226,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
|
||||
return image
|
||||
|
||||
def get_size(self) -> PageSize:
|
||||
return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
|
||||
def get_size(self) -> Size:
|
||||
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
|
||||
|
||||
def unload(self):
|
||||
self._ppage = None
|
||||
@@ -231,13 +235,14 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
|
||||
|
||||
class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
||||
super().__init__(path_or_stream, document_hash)
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
|
||||
try:
|
||||
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
||||
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
||||
except PdfiumError as e:
|
||||
raise RuntimeError(
|
||||
f"pypdfium could not load document {document_hash}"
|
||||
f"pypdfium could not load document with hash {self.document_hash}"
|
||||
) from e
|
||||
|
||||
def page_count(self) -> int:
|
||||
|
||||
@@ -5,22 +5,27 @@ import time
|
||||
import warnings
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Iterable, List, Optional
|
||||
from typing import Annotated, Dict, Iterable, List, Optional
|
||||
|
||||
import typer
|
||||
from docling_core.utils.file import resolve_file_source
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
FormatToExtensions,
|
||||
InputFormat,
|
||||
OutputFormat,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
PipelineOptions,
|
||||
OcrOptions,
|
||||
PdfPipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||
|
||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
||||
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
||||
@@ -87,28 +92,28 @@ def export_documents(
|
||||
fname = output_dir / f"{doc_filename}.json"
|
||||
with fname.open("w") as fp:
|
||||
_log.info(f"writing JSON output to {fname}")
|
||||
fp.write(json.dumps(conv_res.render_as_dict()))
|
||||
fp.write(json.dumps(conv_res.document.export_to_dict()))
|
||||
|
||||
# Export Text format:
|
||||
if export_txt:
|
||||
fname = output_dir / f"{doc_filename}.txt"
|
||||
with fname.open("w") as fp:
|
||||
_log.info(f"writing Text output to {fname}")
|
||||
fp.write(conv_res.render_as_text())
|
||||
fp.write(conv_res.document.export_to_markdown(strict_text=True))
|
||||
|
||||
# Export Markdown format:
|
||||
if export_md:
|
||||
fname = output_dir / f"{doc_filename}.md"
|
||||
with fname.open("w") as fp:
|
||||
_log.info(f"writing Markdown output to {fname}")
|
||||
fp.write(conv_res.render_as_markdown())
|
||||
fp.write(conv_res.document.export_to_markdown())
|
||||
|
||||
# Export Document Tags format:
|
||||
if export_doctags:
|
||||
fname = output_dir / f"{doc_filename}.doctags"
|
||||
with fname.open("w") as fp:
|
||||
_log.info(f"writing Doc Tags output to {fname}")
|
||||
fp.write(conv_res.render_as_doctags())
|
||||
fp.write(conv_res.document.export_to_document_tokens())
|
||||
|
||||
else:
|
||||
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
||||
@@ -129,44 +134,31 @@ def convert(
|
||||
help="PDF files to convert. Can be local file / directory paths or URL.",
|
||||
),
|
||||
],
|
||||
export_json: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
..., "--json/--no-json", help="If enabled the document is exported as JSON."
|
||||
),
|
||||
] = False,
|
||||
export_md: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
..., "--md/--no-md", help="If enabled the document is exported as Markdown."
|
||||
),
|
||||
] = True,
|
||||
export_txt: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
..., "--txt/--no-txt", help="If enabled the document is exported as Text."
|
||||
),
|
||||
] = False,
|
||||
export_doctags: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
...,
|
||||
"--doctags/--no-doctags",
|
||||
help="If enabled the document is exported as Doc Tags.",
|
||||
),
|
||||
] = False,
|
||||
from_formats: List[InputFormat] = typer.Option(
|
||||
None,
|
||||
"--from",
|
||||
help="Specify input formats to convert from. Defaults to all formats.",
|
||||
),
|
||||
to_formats: List[OutputFormat] = typer.Option(
|
||||
None, "--to", help="Specify output formats. Defaults to Markdown."
|
||||
),
|
||||
ocr: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
..., help="If enabled, the bitmap content will be processed using OCR."
|
||||
),
|
||||
] = True,
|
||||
backend: Annotated[
|
||||
Backend, typer.Option(..., help="The PDF backend to use.")
|
||||
] = Backend.DOCLING,
|
||||
ocr_engine: Annotated[
|
||||
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
||||
] = OcrEngine.EASYOCR,
|
||||
abort_on_error: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
...,
|
||||
"--abort-on-error/--no-abort-on-error",
|
||||
help="If enabled, the bitmap content will be processed using OCR.",
|
||||
),
|
||||
] = False,
|
||||
output: Annotated[
|
||||
Path, typer.Option(..., help="Output directory where results are saved.")
|
||||
] = Path("."),
|
||||
@@ -182,6 +174,9 @@ def convert(
|
||||
):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
if from_formats is None:
|
||||
from_formats = [e for e in InputFormat]
|
||||
|
||||
input_doc_paths: List[Path] = []
|
||||
for src in input_sources:
|
||||
source = resolve_file_source(source=src)
|
||||
@@ -191,48 +186,54 @@ def convert(
|
||||
)
|
||||
raise typer.Abort()
|
||||
elif source.is_dir():
|
||||
input_doc_paths.extend(list(source.glob("**/*.pdf")))
|
||||
input_doc_paths.extend(list(source.glob("**/*.PDF")))
|
||||
for fmt in from_formats:
|
||||
for ext in FormatToExtensions[fmt]:
|
||||
input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
|
||||
input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
|
||||
else:
|
||||
input_doc_paths.append(source)
|
||||
|
||||
match backend:
|
||||
case Backend.PYPDFIUM2:
|
||||
do_cell_matching = ocr # only do cell matching when OCR enabled
|
||||
pdf_backend = PyPdfiumDocumentBackend
|
||||
case Backend.DOCLING:
|
||||
do_cell_matching = True
|
||||
pdf_backend = DoclingParseDocumentBackend
|
||||
case _:
|
||||
raise RuntimeError(f"Unexpected backend type {backend}")
|
||||
if to_formats is None:
|
||||
to_formats = [OutputFormat.MARKDOWN]
|
||||
|
||||
export_json = OutputFormat.JSON in to_formats
|
||||
export_md = OutputFormat.MARKDOWN in to_formats
|
||||
export_txt = OutputFormat.TEXT in to_formats
|
||||
export_doctags = OutputFormat.DOCTAGS in to_formats
|
||||
|
||||
match ocr_engine:
|
||||
case OcrEngine.EASYOCR:
|
||||
ocr_options = EasyOcrOptions()
|
||||
ocr_options: OcrOptions = EasyOcrOptions()
|
||||
case OcrEngine.TESSERACT_CLI:
|
||||
ocr_options = TesseractCliOcrOptions()
|
||||
case OcrEngine.TESSERACT:
|
||||
ocr_options = TesseractOcrOptions()
|
||||
case _:
|
||||
raise RuntimeError(f"Unexpected backend type {backend}")
|
||||
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
||||
|
||||
pipeline_options = PipelineOptions(
|
||||
pipeline_options = PdfPipelineOptions(
|
||||
do_ocr=ocr,
|
||||
ocr_options=ocr_options,
|
||||
do_table_structure=True,
|
||||
)
|
||||
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
|
||||
doc_converter = DocumentConverter(
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=pdf_backend,
|
||||
)
|
||||
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
||||
|
||||
# Define input files
|
||||
input = DocumentConversionInput.from_paths(input_doc_paths)
|
||||
format_options: Dict[InputFormat, FormatOption] = {
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_options=pipeline_options,
|
||||
backend=DoclingParseDocumentBackend, # pdf_backend
|
||||
)
|
||||
}
|
||||
doc_converter = DocumentConverter(
|
||||
allowed_formats=from_formats,
|
||||
format_options=format_options,
|
||||
)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
conv_results = doc_converter.convert(input)
|
||||
conv_results = doc_converter.convert_all(
|
||||
input_doc_paths, raises_on_error=abort_on_error
|
||||
)
|
||||
|
||||
output.mkdir(parents=True, exist_ok=True)
|
||||
export_documents(
|
||||
|
||||
@@ -1,18 +1,19 @@
|
||||
import copy
|
||||
import warnings
|
||||
from enum import Enum, auto
|
||||
from io import BytesIO
|
||||
from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
|
||||
|
||||
from PIL.Image import Image
|
||||
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
||||
from typing_extensions import Self
|
||||
|
||||
from docling.backend.abstract_backend import PdfPageBackend
|
||||
from docling.datamodel.pipeline_options import ( # Must be imported here for backward compatibility.
|
||||
PipelineOptions,
|
||||
TableStructureOptions,
|
||||
from docling_core.types.doc import (
|
||||
BoundingBox,
|
||||
DocItemLabel,
|
||||
PictureDataType,
|
||||
Size,
|
||||
TableCell,
|
||||
)
|
||||
from PIL.Image import Image
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling.backend.pdf_backend import PdfPageBackend
|
||||
|
||||
|
||||
class ConversionStatus(str, Enum):
|
||||
@@ -23,18 +24,61 @@ class ConversionStatus(str, Enum):
|
||||
PARTIAL_SUCCESS = auto()
|
||||
|
||||
|
||||
class InputFormat(str, Enum):
|
||||
DOCX = "docx"
|
||||
PPTX = "pptx"
|
||||
HTML = "html"
|
||||
IMAGE = "image"
|
||||
PDF = "pdf"
|
||||
|
||||
|
||||
class OutputFormat(str, Enum):
|
||||
MARKDOWN = "md"
|
||||
JSON = "json"
|
||||
TEXT = "text"
|
||||
DOCTAGS = "doctags"
|
||||
|
||||
|
||||
FormatToExtensions: Dict[InputFormat, List[str]] = {
|
||||
InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
|
||||
InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
|
||||
InputFormat.PDF: ["pdf"],
|
||||
InputFormat.HTML: ["html", "htm", "xhtml"],
|
||||
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
||||
}
|
||||
|
||||
FormatToMimeType: Dict[InputFormat, Set[str]] = {
|
||||
InputFormat.DOCX: {
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
|
||||
},
|
||||
InputFormat.PPTX: {
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.template",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||
},
|
||||
InputFormat.HTML: {"text/html", "application/xhtml+xml"},
|
||||
InputFormat.IMAGE: {
|
||||
"image/png",
|
||||
"image/jpeg",
|
||||
"image/tiff",
|
||||
"image/gif",
|
||||
"image/bmp",
|
||||
},
|
||||
InputFormat.PDF: {"application/pdf"},
|
||||
}
|
||||
MimeTypeToFormat = {
|
||||
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
|
||||
}
|
||||
|
||||
|
||||
class DocInputType(str, Enum):
|
||||
PATH = auto()
|
||||
STREAM = auto()
|
||||
|
||||
|
||||
class CoordOrigin(str, Enum):
|
||||
TOPLEFT = auto()
|
||||
BOTTOMLEFT = auto()
|
||||
|
||||
|
||||
class DoclingComponentType(str, Enum):
|
||||
PDF_BACKEND = auto()
|
||||
DOCUMENT_BACKEND = auto()
|
||||
MODEL = auto()
|
||||
DOC_ASSEMBLER = auto()
|
||||
|
||||
@@ -45,118 +89,6 @@ class ErrorItem(BaseModel):
|
||||
error_message: str
|
||||
|
||||
|
||||
class PageSize(BaseModel):
|
||||
width: float = 0.0
|
||||
height: float = 0.0
|
||||
|
||||
|
||||
class BoundingBox(BaseModel):
|
||||
l: float # left
|
||||
t: float # top
|
||||
r: float # right
|
||||
b: float # bottom
|
||||
|
||||
coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
|
||||
|
||||
@property
|
||||
def width(self):
|
||||
return self.r - self.l
|
||||
|
||||
@property
|
||||
def height(self):
|
||||
return abs(self.t - self.b)
|
||||
|
||||
def scaled(self, scale: float) -> "BoundingBox":
|
||||
out_bbox = copy.deepcopy(self)
|
||||
out_bbox.l *= scale
|
||||
out_bbox.r *= scale
|
||||
out_bbox.t *= scale
|
||||
out_bbox.b *= scale
|
||||
|
||||
return out_bbox
|
||||
|
||||
def normalized(self, page_size: PageSize) -> "BoundingBox":
|
||||
out_bbox = copy.deepcopy(self)
|
||||
out_bbox.l /= page_size.width
|
||||
out_bbox.r /= page_size.width
|
||||
out_bbox.t /= page_size.height
|
||||
out_bbox.b /= page_size.height
|
||||
|
||||
return out_bbox
|
||||
|
||||
def as_tuple(self):
|
||||
if self.coord_origin == CoordOrigin.TOPLEFT:
|
||||
return (self.l, self.t, self.r, self.b)
|
||||
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
||||
return (self.l, self.b, self.r, self.t)
|
||||
|
||||
@classmethod
|
||||
def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
|
||||
if origin == CoordOrigin.TOPLEFT:
|
||||
l, t, r, b = coord[0], coord[1], coord[2], coord[3]
|
||||
if r < l:
|
||||
l, r = r, l
|
||||
if b < t:
|
||||
b, t = t, b
|
||||
|
||||
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
|
||||
elif origin == CoordOrigin.BOTTOMLEFT:
|
||||
l, b, r, t = coord[0], coord[1], coord[2], coord[3]
|
||||
if r < l:
|
||||
l, r = r, l
|
||||
if b > t:
|
||||
b, t = t, b
|
||||
|
||||
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
|
||||
|
||||
def area(self) -> float:
|
||||
area = (self.r - self.l) * (self.b - self.t)
|
||||
if self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
||||
area = -area
|
||||
return area
|
||||
|
||||
def intersection_area_with(self, other: "BoundingBox") -> float:
|
||||
# Calculate intersection coordinates
|
||||
left = max(self.l, other.l)
|
||||
top = max(self.t, other.t)
|
||||
right = min(self.r, other.r)
|
||||
bottom = min(self.b, other.b)
|
||||
|
||||
# Calculate intersection dimensions
|
||||
width = right - left
|
||||
height = bottom - top
|
||||
|
||||
# If the bounding boxes do not overlap, width or height will be negative
|
||||
if width <= 0 or height <= 0:
|
||||
return 0.0
|
||||
|
||||
return width * height
|
||||
|
||||
def to_bottom_left_origin(self, page_height) -> "BoundingBox":
|
||||
if self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
||||
return self
|
||||
elif self.coord_origin == CoordOrigin.TOPLEFT:
|
||||
return BoundingBox(
|
||||
l=self.l,
|
||||
r=self.r,
|
||||
t=page_height - self.t,
|
||||
b=page_height - self.b,
|
||||
coord_origin=CoordOrigin.BOTTOMLEFT,
|
||||
)
|
||||
|
||||
def to_top_left_origin(self, page_height):
|
||||
if self.coord_origin == CoordOrigin.TOPLEFT:
|
||||
return self
|
||||
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
||||
return BoundingBox(
|
||||
l=self.l,
|
||||
r=self.r,
|
||||
t=page_height - self.t, # self.b
|
||||
b=page_height - self.b, # self.t
|
||||
coord_origin=CoordOrigin.TOPLEFT,
|
||||
)
|
||||
|
||||
|
||||
class Cell(BaseModel):
|
||||
id: int
|
||||
text: str
|
||||
@@ -169,14 +101,14 @@ class OcrCell(Cell):
|
||||
|
||||
class Cluster(BaseModel):
|
||||
id: int
|
||||
label: str
|
||||
label: DocItemLabel
|
||||
bbox: BoundingBox
|
||||
confidence: float = 1.0
|
||||
cells: List[Cell] = []
|
||||
|
||||
|
||||
class BasePageElement(BaseModel):
|
||||
label: str
|
||||
label: DocItemLabel
|
||||
id: int
|
||||
page_no: int
|
||||
cluster: Cluster
|
||||
@@ -187,37 +119,7 @@ class LayoutPrediction(BaseModel):
|
||||
clusters: List[Cluster] = []
|
||||
|
||||
|
||||
class TableCell(BaseModel):
|
||||
bbox: BoundingBox
|
||||
row_span: int
|
||||
col_span: int
|
||||
start_row_offset_idx: int
|
||||
end_row_offset_idx: int
|
||||
start_col_offset_idx: int
|
||||
end_col_offset_idx: int
|
||||
text: str
|
||||
column_header: bool = False
|
||||
row_header: bool = False
|
||||
row_section: bool = False
|
||||
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def from_dict_format(cls, data: Any) -> Any:
|
||||
if isinstance(data, Dict):
|
||||
text = data["bbox"].get("token", "")
|
||||
if not len(text):
|
||||
text_cells = data.pop("text_cell_bboxes", None)
|
||||
if text_cells:
|
||||
for el in text_cells:
|
||||
text += el["token"] + " "
|
||||
|
||||
text = text.strip()
|
||||
data["text"] = text
|
||||
|
||||
return data
|
||||
|
||||
|
||||
class TableElement(BasePageElement):
|
||||
class Table(BasePageElement):
|
||||
otsl_seq: List[str]
|
||||
num_rows: int = 0
|
||||
num_cols: int = 0
|
||||
@@ -225,18 +127,15 @@ class TableElement(BasePageElement):
|
||||
|
||||
|
||||
class TableStructurePrediction(BaseModel):
|
||||
table_map: Dict[int, TableElement] = {}
|
||||
table_map: Dict[int, Table] = {}
|
||||
|
||||
|
||||
class TextElement(BasePageElement): ...
|
||||
|
||||
|
||||
class FigureData(BaseModel):
|
||||
pass
|
||||
class TextElement(BasePageElement):
|
||||
text: str
|
||||
|
||||
|
||||
class FigureElement(BasePageElement):
|
||||
data: Optional[FigureData] = None
|
||||
annotations: List[PictureDataType] = []
|
||||
provenance: Optional[str] = None
|
||||
predicted_class: Optional[str] = None
|
||||
confidence: Optional[float] = None
|
||||
@@ -259,7 +158,7 @@ class PagePredictions(BaseModel):
|
||||
equations_prediction: Optional[EquationPrediction] = None
|
||||
|
||||
|
||||
PageElement = Union[TextElement, TableElement, FigureElement]
|
||||
PageElement = Union[TextElement, Table, FigureElement]
|
||||
|
||||
|
||||
class AssembledUnit(BaseModel):
|
||||
@@ -272,13 +171,13 @@ class Page(BaseModel):
|
||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||
|
||||
page_no: int
|
||||
page_hash: Optional[str] = None
|
||||
size: Optional[PageSize] = None
|
||||
# page_hash: Optional[str] = None
|
||||
size: Optional[Size] = None
|
||||
cells: List[Cell] = []
|
||||
predictions: PagePredictions = PagePredictions()
|
||||
assembled: Optional[AssembledUnit] = None
|
||||
|
||||
_backend: Optional[PdfPageBackend] = (
|
||||
_backend: Optional["PdfPageBackend"] = (
|
||||
None # Internal PDF backend. By default it is cleared during assembling.
|
||||
)
|
||||
_default_image_scale: float = 1.0 # Default image scale for external usage.
|
||||
@@ -301,24 +200,5 @@ class Page(BaseModel):
|
||||
class DocumentStream(BaseModel):
|
||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||
|
||||
filename: str
|
||||
name: str
|
||||
stream: BytesIO
|
||||
|
||||
|
||||
class AssembleOptions(BaseModel):
|
||||
keep_page_images: Annotated[
|
||||
bool,
|
||||
Field(
|
||||
deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead"
|
||||
),
|
||||
] = False # False: page images are removed in the assemble step
|
||||
images_scale: Optional[float] = None # if set, the scale for generated images
|
||||
|
||||
@model_validator(mode="after")
|
||||
def set_page_images_from_deprecated(self) -> Self:
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore", DeprecationWarning)
|
||||
default_scale = 1.0
|
||||
if self.keep_page_images and self.images_scale is None:
|
||||
self.images_scale = default_scale
|
||||
return self
|
||||
|
||||
@@ -1,87 +1,101 @@
|
||||
import logging
|
||||
import re
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
from pathlib import Path, PurePath
|
||||
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
|
||||
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
|
||||
|
||||
from docling_core.types import BaseCell, BaseText
|
||||
import filetype
|
||||
from docling_core.types import BaseText
|
||||
from docling_core.types import Document as DsDocument
|
||||
from docling_core.types import DocumentDescription as DsDocumentDescription
|
||||
from docling_core.types import FileInfoObject as DsFileInfoObject
|
||||
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
||||
from docling_core.types import Table as DsSchemaTable
|
||||
from docling_core.types import TableCell
|
||||
from docling_core.types.doc.base import BoundingBox as DsBoundingBox
|
||||
from docling_core.types.doc.base import Figure
|
||||
from docling_core.types.doc import (
|
||||
DocItem,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
PictureItem,
|
||||
SectionHeaderItem,
|
||||
TableItem,
|
||||
TextItem,
|
||||
)
|
||||
from docling_core.types.doc.document import ListItem
|
||||
from docling_core.types.legacy_doc.base import Figure, GlmTableCell, TableCell
|
||||
from docling_core.utils.file import resolve_file_source
|
||||
from pydantic import BaseModel
|
||||
from typing_extensions import deprecated
|
||||
|
||||
from docling.backend.abstract_backend import PdfDocumentBackend
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.abstract_backend import (
|
||||
AbstractDocumentBackend,
|
||||
PaginatedDocumentBackend,
|
||||
)
|
||||
from docling.datamodel.base_models import (
|
||||
AssembledUnit,
|
||||
ConversionStatus,
|
||||
DocumentStream,
|
||||
ErrorItem,
|
||||
FigureElement,
|
||||
InputFormat,
|
||||
MimeTypeToFormat,
|
||||
Page,
|
||||
PageElement,
|
||||
TableElement,
|
||||
TextElement,
|
||||
)
|
||||
from docling.datamodel.settings import DocumentLimits
|
||||
from docling.utils.utils import create_file_hash
|
||||
from docling.utils.utils import create_file_hash, create_hash
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling.document_converter import FormatOption
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
layout_label_to_ds_type = {
|
||||
"Title": "title",
|
||||
"Document Index": "table-of-path_or_stream",
|
||||
"Section-header": "subtitle-level-1",
|
||||
"Checkbox-Selected": "checkbox-selected",
|
||||
"Checkbox-Unselected": "checkbox-unselected",
|
||||
"Caption": "caption",
|
||||
"Page-header": "page-header",
|
||||
"Page-footer": "page-footer",
|
||||
"Footnote": "footnote",
|
||||
"Table": "table",
|
||||
"Formula": "equation",
|
||||
"List-item": "paragraph",
|
||||
"Code": "paragraph",
|
||||
"Picture": "figure",
|
||||
"Text": "paragraph",
|
||||
DocItemLabel.TITLE: "title",
|
||||
DocItemLabel.DOCUMENT_INDEX: "table-of-contents",
|
||||
DocItemLabel.SECTION_HEADER: "subtitle-level-1",
|
||||
DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
|
||||
DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
|
||||
DocItemLabel.CAPTION: "caption",
|
||||
DocItemLabel.PAGE_HEADER: "page-header",
|
||||
DocItemLabel.PAGE_FOOTER: "page-footer",
|
||||
DocItemLabel.FOOTNOTE: "footnote",
|
||||
DocItemLabel.TABLE: "table",
|
||||
DocItemLabel.FORMULA: "equation",
|
||||
DocItemLabel.LIST_ITEM: "paragraph",
|
||||
DocItemLabel.CODE: "paragraph",
|
||||
DocItemLabel.PICTURE: "figure",
|
||||
DocItemLabel.TEXT: "paragraph",
|
||||
DocItemLabel.PARAGRAPH: "paragraph",
|
||||
}
|
||||
|
||||
_EMPTY_DOC = DsDocument(
|
||||
_name="",
|
||||
description=DsDocumentDescription(logs=[]),
|
||||
file_info=DsFileInfoObject(
|
||||
filename="",
|
||||
document_hash="",
|
||||
),
|
||||
)
|
||||
_EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
|
||||
|
||||
|
||||
class InputDocument(BaseModel):
|
||||
file: PurePath = None
|
||||
document_hash: Optional[str] = None
|
||||
valid: bool = False
|
||||
file: PurePath
|
||||
document_hash: str # = None
|
||||
valid: bool = True
|
||||
limits: DocumentLimits = DocumentLimits()
|
||||
format: InputFormat # = None
|
||||
|
||||
filesize: Optional[int] = None
|
||||
page_count: Optional[int] = None
|
||||
page_count: int = 0
|
||||
|
||||
_backend: PdfDocumentBackend = None # Internal PDF backend used
|
||||
_backend: AbstractDocumentBackend # Internal PDF backend used
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
format: InputFormat,
|
||||
backend: Type[AbstractDocumentBackend],
|
||||
filename: Optional[str] = None,
|
||||
limits: Optional[DocumentLimits] = None,
|
||||
pdf_backend=DoclingParseDocumentBackend,
|
||||
):
|
||||
super().__init__()
|
||||
super().__init__(
|
||||
file="", document_hash="", format=InputFormat.PDF
|
||||
) # initialize with dummy values
|
||||
|
||||
self.limits = limits or DocumentLimits()
|
||||
self.format = format
|
||||
|
||||
try:
|
||||
if isinstance(path_or_stream, Path):
|
||||
@@ -91,11 +105,12 @@ class InputDocument(BaseModel):
|
||||
self.valid = False
|
||||
else:
|
||||
self.document_hash = create_file_hash(path_or_stream)
|
||||
self._backend = pdf_backend(
|
||||
path_or_stream=path_or_stream, document_hash=self.document_hash
|
||||
)
|
||||
self._init_doc(backend, path_or_stream)
|
||||
|
||||
elif isinstance(path_or_stream, BytesIO):
|
||||
assert (
|
||||
filename is not None
|
||||
), "Can't construct InputDocument from stream without providing filename arg."
|
||||
self.file = PurePath(filename)
|
||||
self.filesize = path_or_stream.getbuffer().nbytes
|
||||
|
||||
@@ -103,15 +118,20 @@ class InputDocument(BaseModel):
|
||||
self.valid = False
|
||||
else:
|
||||
self.document_hash = create_file_hash(path_or_stream)
|
||||
self._backend = pdf_backend(
|
||||
path_or_stream=path_or_stream, document_hash=self.document_hash
|
||||
)
|
||||
self._init_doc(backend, path_or_stream)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
f"Unexpected type path_or_stream: {type(path_or_stream)}"
|
||||
)
|
||||
|
||||
if self.document_hash and self._backend.page_count() > 0:
|
||||
self.page_count = self._backend.page_count()
|
||||
|
||||
if self.page_count <= self.limits.max_num_pages:
|
||||
self.valid = True
|
||||
# For paginated backends, check if the maximum page count is exceeded.
|
||||
if self.valid and self._backend.is_valid():
|
||||
if self._backend.supports_pagination() and isinstance(
|
||||
self._backend, PaginatedDocumentBackend
|
||||
):
|
||||
self.page_count = self._backend.page_count()
|
||||
if not self.page_count <= self.limits.max_num_pages:
|
||||
self.valid = False
|
||||
|
||||
except (FileNotFoundError, OSError) as e:
|
||||
_log.exception(
|
||||
@@ -125,9 +145,26 @@ class InputDocument(BaseModel):
|
||||
)
|
||||
# raise
|
||||
|
||||
def _init_doc(
|
||||
self,
|
||||
backend: Type[AbstractDocumentBackend],
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
) -> None:
|
||||
if backend is None:
|
||||
raise RuntimeError(
|
||||
f"No backend configuration provided for file {self.file.name} with format {self.format}. "
|
||||
f"Please check your format configuration on DocumentConverter."
|
||||
)
|
||||
|
||||
@deprecated("Use `ConversionResult` instead.")
|
||||
class ConvertedDocument(BaseModel):
|
||||
self._backend = backend(self, path_or_stream=path_or_stream)
|
||||
|
||||
|
||||
class DocumentFormat(str, Enum):
|
||||
V2 = "v2"
|
||||
V1 = "v1"
|
||||
|
||||
|
||||
class ConversionResult(BaseModel):
|
||||
input: InputDocument
|
||||
|
||||
status: ConversionStatus = ConversionStatus.PENDING # failure, success
|
||||
@@ -136,15 +173,42 @@ class ConvertedDocument(BaseModel):
|
||||
pages: List[Page] = []
|
||||
assembled: AssembledUnit = AssembledUnit()
|
||||
|
||||
output: DsDocument = _EMPTY_DOC
|
||||
document: DoclingDocument = _EMPTY_DOCLING_DOC
|
||||
|
||||
@property
|
||||
@deprecated("Use document instead.")
|
||||
def legacy_document(self):
|
||||
reverse_label_mapping = {
|
||||
DocItemLabel.CAPTION.value: "Caption",
|
||||
DocItemLabel.FOOTNOTE.value: "Footnote",
|
||||
DocItemLabel.FORMULA.value: "Formula",
|
||||
DocItemLabel.LIST_ITEM.value: "List-item",
|
||||
DocItemLabel.PAGE_FOOTER.value: "Page-footer",
|
||||
DocItemLabel.PAGE_HEADER.value: "Page-header",
|
||||
DocItemLabel.PICTURE.value: "Picture", # low threshold adjust to capture chemical structures for examples.
|
||||
DocItemLabel.SECTION_HEADER.value: "Section-header",
|
||||
DocItemLabel.TABLE.value: "Table",
|
||||
DocItemLabel.TEXT.value: "Text",
|
||||
DocItemLabel.TITLE.value: "Title",
|
||||
DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
|
||||
DocItemLabel.CODE.value: "Code",
|
||||
DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
|
||||
DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
|
||||
DocItemLabel.FORM.value: "Form",
|
||||
DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
|
||||
DocItemLabel.PARAGRAPH.value: "paragraph",
|
||||
}
|
||||
|
||||
def _to_ds_document(self) -> DsDocument:
|
||||
title = ""
|
||||
desc = DsDocumentDescription(logs=[])
|
||||
|
||||
page_hashes = [
|
||||
PageReference(hash=p.page_hash, page=p.page_no + 1, model="default")
|
||||
for p in self.pages
|
||||
PageReference(
|
||||
hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
|
||||
page=p.page_no,
|
||||
model="default",
|
||||
)
|
||||
for p in self.document.pages.values()
|
||||
]
|
||||
|
||||
file_info = DsFileInfoObject(
|
||||
@@ -157,145 +221,199 @@ class ConvertedDocument(BaseModel):
|
||||
main_text = []
|
||||
tables = []
|
||||
figures = []
|
||||
equations = []
|
||||
footnotes = []
|
||||
page_headers = []
|
||||
page_footers = []
|
||||
|
||||
page_no_to_page = {p.page_no: p for p in self.pages}
|
||||
embedded_captions = set()
|
||||
for ix, (item, level) in enumerate(
|
||||
self.document.iterate_items(self.document.body)
|
||||
):
|
||||
|
||||
for element in self.assembled.elements:
|
||||
# Convert bboxes to lower-left origin.
|
||||
target_bbox = DsBoundingBox(
|
||||
element.cluster.bbox.to_bottom_left_origin(
|
||||
page_no_to_page[element.page_no].size.height
|
||||
).as_tuple()
|
||||
)
|
||||
if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
|
||||
caption = item.caption_text(self.document)
|
||||
if caption:
|
||||
embedded_captions.add(caption)
|
||||
|
||||
if isinstance(element, TextElement):
|
||||
main_text.append(
|
||||
BaseText(
|
||||
text=element.text,
|
||||
obj_type=layout_label_to_ds_type.get(element.label),
|
||||
name=element.label,
|
||||
prov=[
|
||||
Prov(
|
||||
bbox=target_bbox,
|
||||
page=element.page_no + 1,
|
||||
span=[0, len(element.text)],
|
||||
)
|
||||
],
|
||||
)
|
||||
)
|
||||
elif isinstance(element, TableElement):
|
||||
index = len(tables)
|
||||
ref_str = f"#/tables/{index}"
|
||||
main_text.append(
|
||||
Ref(
|
||||
name=element.label,
|
||||
obj_type=layout_label_to_ds_type.get(element.label),
|
||||
ref=ref_str,
|
||||
),
|
||||
)
|
||||
for item, level in self.document.iterate_items():
|
||||
if isinstance(item, DocItem):
|
||||
item_type = item.label
|
||||
|
||||
# Initialise empty table data grid (only empty cells)
|
||||
table_data = [
|
||||
[
|
||||
TableCell(
|
||||
text="",
|
||||
# bbox=[0,0,0,0],
|
||||
spans=[[i, j]],
|
||||
obj_type="body",
|
||||
if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
|
||||
|
||||
if isinstance(item, ListItem) and item.marker:
|
||||
text = f"{item.marker} {item.text}"
|
||||
else:
|
||||
text = item.text
|
||||
|
||||
# Can be empty.
|
||||
prov = [
|
||||
Prov(
|
||||
bbox=p.bbox.as_tuple(),
|
||||
page=p.page_no,
|
||||
span=[0, len(item.text)],
|
||||
)
|
||||
for j in range(element.num_cols)
|
||||
for p in item.prov
|
||||
]
|
||||
for i in range(element.num_rows)
|
||||
]
|
||||
main_text.append(
|
||||
BaseText(
|
||||
text=text,
|
||||
obj_type=layout_label_to_ds_type.get(item.label),
|
||||
name=reverse_label_mapping[item.label],
|
||||
prov=prov,
|
||||
)
|
||||
)
|
||||
|
||||
# Overwrite cells in table data for which there is actual cell content.
|
||||
for cell in element.table_cells:
|
||||
for i in range(
|
||||
min(cell.start_row_offset_idx, element.num_rows),
|
||||
min(cell.end_row_offset_idx, element.num_rows),
|
||||
):
|
||||
for j in range(
|
||||
min(cell.start_col_offset_idx, element.num_cols),
|
||||
min(cell.end_col_offset_idx, element.num_cols),
|
||||
# skip captions of they are embedded in the actual
|
||||
# floating object
|
||||
if item_type == DocItemLabel.CAPTION and text in embedded_captions:
|
||||
continue
|
||||
|
||||
elif isinstance(item, TableItem) and item.data:
|
||||
index = len(tables)
|
||||
ref_str = f"#/tables/{index}"
|
||||
main_text.append(
|
||||
Ref(
|
||||
name=reverse_label_mapping[item.label],
|
||||
obj_type=layout_label_to_ds_type.get(item.label),
|
||||
ref=ref_str,
|
||||
),
|
||||
)
|
||||
|
||||
# Initialise empty table data grid (only empty cells)
|
||||
table_data = [
|
||||
[
|
||||
TableCell(
|
||||
text="",
|
||||
# bbox=[0,0,0,0],
|
||||
spans=[[i, j]],
|
||||
obj_type="body",
|
||||
)
|
||||
for j in range(item.data.num_cols)
|
||||
]
|
||||
for i in range(item.data.num_rows)
|
||||
]
|
||||
|
||||
# Overwrite cells in table data for which there is actual cell content.
|
||||
for cell in item.data.table_cells:
|
||||
for i in range(
|
||||
min(cell.start_row_offset_idx, item.data.num_rows),
|
||||
min(cell.end_row_offset_idx, item.data.num_rows),
|
||||
):
|
||||
celltype = "body"
|
||||
if cell.column_header:
|
||||
celltype = "col_header"
|
||||
elif cell.row_header:
|
||||
celltype = "row_header"
|
||||
elif cell.row_section:
|
||||
celltype = "row_section"
|
||||
for j in range(
|
||||
min(cell.start_col_offset_idx, item.data.num_cols),
|
||||
min(cell.end_col_offset_idx, item.data.num_cols),
|
||||
):
|
||||
celltype = "body"
|
||||
if cell.column_header:
|
||||
celltype = "col_header"
|
||||
elif cell.row_header:
|
||||
celltype = "row_header"
|
||||
elif cell.row_section:
|
||||
celltype = "row_section"
|
||||
|
||||
def make_spans(cell):
|
||||
for rspan in range(
|
||||
min(cell.start_row_offset_idx, element.num_rows),
|
||||
min(cell.end_row_offset_idx, element.num_rows),
|
||||
):
|
||||
for cspan in range(
|
||||
def make_spans(cell):
|
||||
for rspan in range(
|
||||
min(
|
||||
cell.start_col_offset_idx, element.num_cols
|
||||
cell.start_row_offset_idx,
|
||||
item.data.num_rows,
|
||||
),
|
||||
min(
|
||||
cell.end_row_offset_idx, item.data.num_rows
|
||||
),
|
||||
min(cell.end_col_offset_idx, element.num_cols),
|
||||
):
|
||||
yield [rspan, cspan]
|
||||
for cspan in range(
|
||||
min(
|
||||
cell.start_col_offset_idx,
|
||||
item.data.num_cols,
|
||||
),
|
||||
min(
|
||||
cell.end_col_offset_idx,
|
||||
item.data.num_cols,
|
||||
),
|
||||
):
|
||||
yield [rspan, cspan]
|
||||
|
||||
spans = list(make_spans(cell))
|
||||
table_data[i][j] = TableCell(
|
||||
text=cell.text,
|
||||
bbox=cell.bbox.to_bottom_left_origin(
|
||||
page_no_to_page[element.page_no].size.height
|
||||
).as_tuple(),
|
||||
# col=j,
|
||||
# row=i,
|
||||
spans=spans,
|
||||
obj_type=celltype,
|
||||
# col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
|
||||
# row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
|
||||
)
|
||||
spans = list(make_spans(cell))
|
||||
table_data[i][j] = GlmTableCell(
|
||||
text=cell.text,
|
||||
bbox=(
|
||||
cell.bbox.as_tuple()
|
||||
if cell.bbox is not None
|
||||
else None
|
||||
), # check if this is bottom-left
|
||||
spans=spans,
|
||||
obj_type=celltype,
|
||||
col=j,
|
||||
row=i,
|
||||
row_header=cell.row_header,
|
||||
row_section=cell.row_section,
|
||||
col_header=cell.column_header,
|
||||
row_span=[
|
||||
cell.start_row_offset_idx,
|
||||
cell.end_row_offset_idx,
|
||||
],
|
||||
col_span=[
|
||||
cell.start_col_offset_idx,
|
||||
cell.end_col_offset_idx,
|
||||
],
|
||||
)
|
||||
|
||||
tables.append(
|
||||
DsSchemaTable(
|
||||
num_cols=element.num_cols,
|
||||
num_rows=element.num_rows,
|
||||
obj_type=layout_label_to_ds_type.get(element.label),
|
||||
data=table_data,
|
||||
prov=[
|
||||
Prov(
|
||||
bbox=target_bbox,
|
||||
page=element.page_no + 1,
|
||||
span=[0, 0],
|
||||
)
|
||||
],
|
||||
# Compute the caption
|
||||
caption = item.caption_text(self.document)
|
||||
|
||||
tables.append(
|
||||
DsSchemaTable(
|
||||
text=caption,
|
||||
num_cols=item.data.num_cols,
|
||||
num_rows=item.data.num_rows,
|
||||
obj_type=layout_label_to_ds_type.get(item.label),
|
||||
data=table_data,
|
||||
prov=[
|
||||
Prov(
|
||||
bbox=p.bbox.as_tuple(),
|
||||
page=p.page_no,
|
||||
span=[0, 0],
|
||||
)
|
||||
for p in item.prov
|
||||
],
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
elif isinstance(element, FigureElement):
|
||||
index = len(figures)
|
||||
ref_str = f"#/figures/{index}"
|
||||
main_text.append(
|
||||
Ref(
|
||||
name=element.label,
|
||||
obj_type=layout_label_to_ds_type.get(element.label),
|
||||
ref=ref_str,
|
||||
),
|
||||
)
|
||||
figures.append(
|
||||
Figure(
|
||||
prov=[
|
||||
Prov(
|
||||
bbox=target_bbox,
|
||||
page=element.page_no + 1,
|
||||
span=[0, 0],
|
||||
)
|
||||
],
|
||||
obj_type=layout_label_to_ds_type.get(element.label),
|
||||
# data=[[]],
|
||||
elif isinstance(item, PictureItem):
|
||||
index = len(figures)
|
||||
ref_str = f"#/figures/{index}"
|
||||
main_text.append(
|
||||
Ref(
|
||||
name=reverse_label_mapping[item.label],
|
||||
obj_type=layout_label_to_ds_type.get(item.label),
|
||||
ref=ref_str,
|
||||
),
|
||||
)
|
||||
|
||||
# Compute the caption
|
||||
caption = item.caption_text(self.document)
|
||||
|
||||
figures.append(
|
||||
Figure(
|
||||
prov=[
|
||||
Prov(
|
||||
bbox=p.bbox.as_tuple(),
|
||||
page=p.page_no,
|
||||
span=[0, len(caption)],
|
||||
)
|
||||
for p in item.prov
|
||||
],
|
||||
obj_type=layout_label_to_ds_type.get(item.label),
|
||||
text=caption,
|
||||
# data=[[]],
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
page_dimensions = [
|
||||
PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
|
||||
for p in self.pages
|
||||
PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
|
||||
for p in self.document.pages.values()
|
||||
]
|
||||
|
||||
ds_doc = DsDocument(
|
||||
@@ -303,6 +421,10 @@ class ConvertedDocument(BaseModel):
|
||||
description=desc,
|
||||
file_info=file_info,
|
||||
main_text=main_text,
|
||||
equations=equations,
|
||||
footnotes=footnotes,
|
||||
page_headers=page_headers,
|
||||
page_footers=page_footers,
|
||||
tables=tables,
|
||||
figures=figures,
|
||||
page_dimensions=page_dimensions,
|
||||
@@ -310,152 +432,76 @@ class ConvertedDocument(BaseModel):
|
||||
|
||||
return ds_doc
|
||||
|
||||
def render_as_dict(self):
|
||||
return self.output.model_dump(by_alias=True, exclude_none=True)
|
||||
|
||||
def render_as_markdown(
|
||||
self,
|
||||
delim: str = "\n\n",
|
||||
main_text_start: int = 0,
|
||||
main_text_stop: Optional[int] = None,
|
||||
main_text_labels: list[str] = [
|
||||
"title",
|
||||
"subtitle-level-1",
|
||||
"paragraph",
|
||||
"caption",
|
||||
"table",
|
||||
"figure",
|
||||
],
|
||||
strict_text: bool = False,
|
||||
image_placeholder: str = "<!-- image -->",
|
||||
):
|
||||
return self.output.export_to_markdown(
|
||||
delim=delim,
|
||||
main_text_start=main_text_start,
|
||||
main_text_stop=main_text_stop,
|
||||
main_text_labels=main_text_labels,
|
||||
strict_text=strict_text,
|
||||
image_placeholder=image_placeholder,
|
||||
)
|
||||
class _DocumentConversionInput(BaseModel):
|
||||
|
||||
def render_as_text(
|
||||
self,
|
||||
delim: str = "\n\n",
|
||||
main_text_start: int = 0,
|
||||
main_text_stop: Optional[int] = None,
|
||||
main_text_labels: list[str] = [
|
||||
"title",
|
||||
"subtitle-level-1",
|
||||
"paragraph",
|
||||
"caption",
|
||||
],
|
||||
):
|
||||
return self.output.export_to_markdown(
|
||||
delim=delim,
|
||||
main_text_start=main_text_start,
|
||||
main_text_stop=main_text_stop,
|
||||
main_text_labels=main_text_labels,
|
||||
strict_text=True,
|
||||
)
|
||||
|
||||
def render_as_doctags(
|
||||
self,
|
||||
delim: str = "\n\n",
|
||||
main_text_start: int = 0,
|
||||
main_text_stop: Optional[int] = None,
|
||||
main_text_labels: list[str] = [
|
||||
"title",
|
||||
"subtitle-level-1",
|
||||
"paragraph",
|
||||
"caption",
|
||||
"table",
|
||||
"figure",
|
||||
],
|
||||
xsize: int = 100,
|
||||
ysize: int = 100,
|
||||
add_location: bool = True,
|
||||
add_content: bool = True,
|
||||
add_page_index: bool = True,
|
||||
# table specific flags
|
||||
add_table_cell_location: bool = False,
|
||||
add_table_cell_label: bool = True,
|
||||
add_table_cell_text: bool = True,
|
||||
) -> str:
|
||||
return self.output.export_to_document_tokens(
|
||||
delim=delim,
|
||||
main_text_start=main_text_start,
|
||||
main_text_stop=main_text_stop,
|
||||
main_text_labels=main_text_labels,
|
||||
xsize=xsize,
|
||||
ysize=ysize,
|
||||
add_location=add_location,
|
||||
add_content=add_content,
|
||||
add_page_index=add_page_index,
|
||||
# table specific flags
|
||||
add_table_cell_location=add_table_cell_location,
|
||||
add_table_cell_label=add_table_cell_label,
|
||||
add_table_cell_text=add_table_cell_text,
|
||||
)
|
||||
|
||||
def render_element_images(
|
||||
self, element_types: Tuple[PageElement] = (FigureElement,)
|
||||
):
|
||||
for element in self.assembled.elements:
|
||||
if isinstance(element, element_types):
|
||||
page_ix = element.page_no
|
||||
scale = self.pages[page_ix]._default_image_scale
|
||||
crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
|
||||
page_height=self.pages[page_ix].size.height * scale
|
||||
)
|
||||
|
||||
cropped_im = self.pages[page_ix].image.crop(crop_bbox.as_tuple())
|
||||
yield element, cropped_im
|
||||
|
||||
|
||||
class ConversionResult(ConvertedDocument):
|
||||
pass
|
||||
|
||||
|
||||
class DocumentConversionInput(BaseModel):
|
||||
|
||||
_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
|
||||
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
||||
limits: Optional[DocumentLimits] = DocumentLimits()
|
||||
|
||||
DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
|
||||
|
||||
def docs(
|
||||
self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
|
||||
self, format_options: Dict[InputFormat, "FormatOption"]
|
||||
) -> Iterable[InputDocument]:
|
||||
for item in self.path_or_stream_iterator:
|
||||
obj = resolve_file_source(item) if isinstance(item, str) else item
|
||||
format = self._guess_format(obj)
|
||||
if format not in format_options.keys():
|
||||
_log.info(
|
||||
f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
|
||||
)
|
||||
continue
|
||||
else:
|
||||
backend = format_options[format].backend
|
||||
|
||||
pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND
|
||||
|
||||
for obj in self._path_or_stream_iterator:
|
||||
if isinstance(obj, Path):
|
||||
yield InputDocument(
|
||||
path_or_stream=obj, limits=self.limits, pdf_backend=pdf_backend
|
||||
path_or_stream=obj,
|
||||
format=format,
|
||||
filename=obj.name,
|
||||
limits=self.limits,
|
||||
backend=backend,
|
||||
)
|
||||
elif isinstance(obj, DocumentStream):
|
||||
yield InputDocument(
|
||||
path_or_stream=obj.stream,
|
||||
filename=obj.filename,
|
||||
format=format,
|
||||
filename=obj.name,
|
||||
limits=self.limits,
|
||||
pdf_backend=pdf_backend,
|
||||
backend=backend,
|
||||
)
|
||||
else:
|
||||
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
|
||||
|
||||
@classmethod
|
||||
def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
|
||||
paths = [Path(p) for p in paths]
|
||||
def _guess_format(self, obj):
|
||||
content = None
|
||||
if isinstance(obj, Path):
|
||||
mime = filetype.guess_mime(str(obj))
|
||||
if mime is None:
|
||||
with obj.open("rb") as f:
|
||||
content = f.read(1024) # Read first 1KB
|
||||
|
||||
doc_input = cls(limits=limits)
|
||||
doc_input._path_or_stream_iterator = paths
|
||||
elif isinstance(obj, DocumentStream):
|
||||
obj.stream.seek(0)
|
||||
content = obj.stream.read(8192)
|
||||
obj.stream.seek(0)
|
||||
mime = filetype.guess_mime(content)
|
||||
|
||||
return doc_input
|
||||
if mime is None:
|
||||
mime = self._detect_html_xhtml(content)
|
||||
|
||||
@classmethod
|
||||
def from_streams(
|
||||
cls, streams: Iterable[DocumentStream], limits: Optional[DocumentLimits] = None
|
||||
):
|
||||
doc_input = cls(limits=limits)
|
||||
doc_input._path_or_stream_iterator = streams
|
||||
format = MimeTypeToFormat.get(mime)
|
||||
return format
|
||||
|
||||
return doc_input
|
||||
def _detect_html_xhtml(self, content):
|
||||
content_str = content.decode("ascii", errors="ignore").lower()
|
||||
# Remove XML comments
|
||||
content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
|
||||
content_str = content_str.lstrip()
|
||||
|
||||
if re.match(r"<\?xml", content_str):
|
||||
if "xhtml" in content_str[:1000]:
|
||||
return "application/xhtml+xml"
|
||||
|
||||
if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
|
||||
return "text/html"
|
||||
|
||||
return None
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
from enum import Enum, auto
|
||||
from pathlib import Path
|
||||
from typing import List, Literal, Optional, Union
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
@@ -58,6 +59,13 @@ class TesseractOcrOptions(OcrOptions):
|
||||
|
||||
|
||||
class PipelineOptions(BaseModel):
|
||||
create_legacy_output: bool = (
|
||||
True # This defautl will be set to False on a future version of docling
|
||||
)
|
||||
|
||||
|
||||
class PdfPipelineOptions(PipelineOptions):
|
||||
artifacts_path: Optional[Union[Path, str]] = None
|
||||
do_table_structure: bool = True # True: perform table structure extraction
|
||||
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
||||
|
||||
@@ -65,3 +73,8 @@ class PipelineOptions(BaseModel):
|
||||
ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
|
||||
Field(EasyOcrOptions(), discriminator="kind")
|
||||
)
|
||||
|
||||
images_scale: float = 1.0
|
||||
generate_page_images: bool = False
|
||||
generate_picture_images: bool = False
|
||||
generate_table_images: bool = False
|
||||
|
||||
@@ -14,6 +14,7 @@ class BatchConcurrencySettings(BaseModel):
|
||||
doc_batch_concurrency: int = 2
|
||||
page_batch_size: int = 4
|
||||
page_batch_concurrency: int = 2
|
||||
elements_batch_size: int = 16
|
||||
|
||||
# doc_batch_size: int = 1
|
||||
# doc_batch_concurrency: int = 1
|
||||
|
||||
@@ -1,84 +1,179 @@
|
||||
import functools
|
||||
import logging
|
||||
import tempfile
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
from functools import partial
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Optional, Type, Union
|
||||
from typing import Dict, Iterable, Iterator, List, Optional, Type
|
||||
|
||||
import requests
|
||||
from PIL import ImageDraw
|
||||
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
|
||||
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
||||
|
||||
from docling.backend.abstract_backend import PdfDocumentBackend
|
||||
from docling.datamodel.base_models import (
|
||||
AssembledUnit,
|
||||
AssembleOptions,
|
||||
ConversionStatus,
|
||||
DoclingComponentType,
|
||||
ErrorItem,
|
||||
Page,
|
||||
)
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
|
||||
from docling.datamodel.document import (
|
||||
ConversionResult,
|
||||
DocumentConversionInput,
|
||||
InputDocument,
|
||||
_DocumentConversionInput,
|
||||
)
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.ds_glm_model import GlmModel
|
||||
from docling.models.page_assemble_model import PageAssembleModel
|
||||
from docling.pipeline.base_model_pipeline import BaseModelPipeline
|
||||
from docling.pipeline.standard_model_pipeline import StandardModelPipeline
|
||||
from docling.utils.utils import chunkify, create_hash
|
||||
from docling.datamodel.settings import DocumentLimits, settings
|
||||
from docling.pipeline.base_pipeline import BasePipeline
|
||||
from docling.pipeline.simple_pipeline import SimplePipeline
|
||||
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||
from docling.utils.utils import chunkify
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FormatOption(BaseModel):
|
||||
pipeline_cls: Type[BasePipeline]
|
||||
pipeline_options: Optional[PipelineOptions] = None
|
||||
backend: Type[AbstractDocumentBackend]
|
||||
|
||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||
|
||||
@model_validator(mode="after")
|
||||
def set_optional_field_default(self) -> "FormatOption":
|
||||
if self.pipeline_options is None:
|
||||
self.pipeline_options = self.pipeline_cls.get_default_options()
|
||||
return self
|
||||
|
||||
|
||||
class WordFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
|
||||
|
||||
|
||||
class PowerpointFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
|
||||
|
||||
|
||||
class HTMLFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
||||
|
||||
|
||||
class PdfFormatOption(FormatOption):
|
||||
pipeline_cls: Type = StandardPdfPipeline
|
||||
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
|
||||
|
||||
|
||||
class ImageFormatOption(FormatOption):
|
||||
pipeline_cls: Type = StandardPdfPipeline
|
||||
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
|
||||
|
||||
|
||||
_format_to_default_options = {
|
||||
InputFormat.DOCX: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
|
||||
),
|
||||
InputFormat.PPTX: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
|
||||
),
|
||||
InputFormat.HTML: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
||||
),
|
||||
InputFormat.IMAGE: FormatOption(
|
||||
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
|
||||
),
|
||||
InputFormat.PDF: FormatOption(
|
||||
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
class DocumentConverter:
|
||||
_default_download_filename = "file.pdf"
|
||||
_default_download_filename = "file"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
artifacts_path: Optional[Union[Path, str]] = None,
|
||||
pipeline_options: PipelineOptions = PipelineOptions(),
|
||||
pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
|
||||
pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
|
||||
assemble_options: AssembleOptions = AssembleOptions(),
|
||||
allowed_formats: Optional[List[InputFormat]] = None,
|
||||
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
|
||||
):
|
||||
if not artifacts_path:
|
||||
artifacts_path = self.download_models_hf()
|
||||
self.allowed_formats = allowed_formats
|
||||
self.format_to_options = format_options
|
||||
|
||||
artifacts_path = Path(artifacts_path)
|
||||
if self.allowed_formats is None:
|
||||
# if self.format_to_options is not None:
|
||||
# self.allowed_formats = self.format_to_options.keys()
|
||||
# else:
|
||||
self.allowed_formats = [e for e in InputFormat] # all formats
|
||||
|
||||
self.model_pipeline = pipeline_cls(
|
||||
artifacts_path=artifacts_path, pipeline_options=pipeline_options
|
||||
if self.format_to_options is None:
|
||||
self.format_to_options = _format_to_default_options
|
||||
else:
|
||||
for f in self.allowed_formats:
|
||||
if f not in self.format_to_options.keys():
|
||||
_log.debug(f"Requested format {f} will use default options.")
|
||||
self.format_to_options[f] = _format_to_default_options[f]
|
||||
|
||||
remove_keys = []
|
||||
for f in self.format_to_options.keys():
|
||||
if f not in self.allowed_formats:
|
||||
remove_keys.append(f)
|
||||
|
||||
for f in remove_keys:
|
||||
self.format_to_options.pop(f)
|
||||
|
||||
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
|
||||
|
||||
@validate_call(config=ConfigDict(strict=True))
|
||||
def convert(
|
||||
self,
|
||||
source: Path | str | DocumentStream, # TODO review naming
|
||||
raises_on_error: bool = True,
|
||||
max_num_pages: int = sys.maxsize,
|
||||
max_file_size: int = sys.maxsize,
|
||||
) -> ConversionResult:
|
||||
|
||||
all_res = self.convert_all(
|
||||
source=[source],
|
||||
raises_on_error=raises_on_error,
|
||||
max_num_pages=max_num_pages,
|
||||
max_file_size=max_file_size,
|
||||
)
|
||||
return next(all_res)
|
||||
|
||||
self.page_assemble_model = PageAssembleModel(config={})
|
||||
self.glm_model = GlmModel(config={})
|
||||
self.pdf_backend = pdf_backend
|
||||
self.assemble_options = assemble_options
|
||||
|
||||
@staticmethod
|
||||
def download_models_hf(
|
||||
local_dir: Optional[Path] = None, force: bool = False
|
||||
) -> Path:
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
download_path = snapshot_download(
|
||||
repo_id="ds4sd/docling-models",
|
||||
force_download=force,
|
||||
local_dir=local_dir,
|
||||
revision="v2.0.0",
|
||||
@validate_call(config=ConfigDict(strict=True))
|
||||
def convert_all(
|
||||
self,
|
||||
source: Iterable[Path | str | DocumentStream], # TODO review naming
|
||||
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
|
||||
max_num_pages: int = sys.maxsize,
|
||||
max_file_size: int = sys.maxsize,
|
||||
) -> Iterator[ConversionResult]:
|
||||
limits = DocumentLimits(
|
||||
max_num_pages=max_num_pages,
|
||||
max_file_size=max_file_size,
|
||||
)
|
||||
conv_input = _DocumentConversionInput(
|
||||
path_or_stream_iterator=source,
|
||||
limit=limits,
|
||||
)
|
||||
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
|
||||
for conv_res in conv_res_iter:
|
||||
if raises_on_error and conv_res.status not in {
|
||||
ConversionStatus.SUCCESS,
|
||||
ConversionStatus.PARTIAL_SUCCESS,
|
||||
}:
|
||||
raise RuntimeError(
|
||||
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
|
||||
)
|
||||
else:
|
||||
yield conv_res
|
||||
|
||||
return Path(download_path)
|
||||
|
||||
def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
|
||||
def _convert(
|
||||
self, conv_input: _DocumentConversionInput, raises_on_error: bool
|
||||
) -> Iterator[ConversionResult]:
|
||||
assert self.format_to_options is not None
|
||||
|
||||
for input_batch in chunkify(
|
||||
input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
|
||||
conv_input.docs(self.format_to_options),
|
||||
settings.perf.doc_batch_size, # pass format_options
|
||||
):
|
||||
_log.info(f"Going to convert document batch...")
|
||||
# parallel processing only within input_batch
|
||||
@@ -87,211 +182,79 @@ class DocumentConverter:
|
||||
# ) as pool:
|
||||
# yield from pool.map(self.process_document, input_batch)
|
||||
|
||||
# Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
|
||||
yield from map(self._process_document, input_batch)
|
||||
# Note: PDF backends are not thread-safe, thread pool usage was disabled.
|
||||
for item in map(
|
||||
partial(self._process_document, raises_on_error=raises_on_error),
|
||||
input_batch,
|
||||
):
|
||||
if item is not None:
|
||||
yield item
|
||||
|
||||
def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
|
||||
"""Convert a single document.
|
||||
def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
|
||||
assert self.format_to_options is not None
|
||||
|
||||
Args:
|
||||
source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.
|
||||
fopt = self.format_to_options.get(doc.format)
|
||||
|
||||
Raises:
|
||||
ValueError: If source is of unexpected type.
|
||||
RuntimeError: If conversion fails.
|
||||
if fopt is None:
|
||||
raise RuntimeError(f"Could not get pipeline for document {doc.file}")
|
||||
else:
|
||||
pipeline_class = fopt.pipeline_cls
|
||||
pipeline_options = fopt.pipeline_options
|
||||
|
||||
Returns:
|
||||
ConversionResult: The conversion result object.
|
||||
"""
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
try:
|
||||
http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
|
||||
res = requests.get(http_url, stream=True)
|
||||
res.raise_for_status()
|
||||
fname = None
|
||||
# try to get filename from response header
|
||||
if cont_disp := res.headers.get("Content-Disposition"):
|
||||
for par in cont_disp.strip().split(";"):
|
||||
# currently only handling directive "filename" (not "*filename")
|
||||
if (split := par.split("=")) and split[0].strip() == "filename":
|
||||
fname = "=".join(split[1:]).strip().strip("'\"") or None
|
||||
break
|
||||
# otherwise, use name from URL:
|
||||
if fname is None:
|
||||
fname = Path(http_url.path).name or self._default_download_filename
|
||||
local_path = Path(temp_dir) / fname
|
||||
with open(local_path, "wb") as f:
|
||||
for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks
|
||||
f.write(chunk)
|
||||
except ValidationError:
|
||||
try:
|
||||
local_path = TypeAdapter(Path).validate_python(source)
|
||||
except ValidationError:
|
||||
raise ValueError(
|
||||
f"Unexpected file path type encountered: {type(source)}"
|
||||
)
|
||||
conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
|
||||
conv_res_iter = self.convert(conv_inp)
|
||||
conv_res: ConversionResult = next(conv_res_iter)
|
||||
if conv_res.status not in {
|
||||
ConversionStatus.SUCCESS,
|
||||
ConversionStatus.PARTIAL_SUCCESS,
|
||||
}:
|
||||
raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
|
||||
return conv_res
|
||||
|
||||
def _process_document(self, in_doc: InputDocument) -> ConversionResult:
|
||||
start_doc_time = time.time()
|
||||
conv_res = ConversionResult(input=in_doc)
|
||||
|
||||
_log.info(f"Processing document {in_doc.file.name}")
|
||||
|
||||
if not in_doc.valid:
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
return conv_res
|
||||
|
||||
for i in range(0, in_doc.page_count):
|
||||
conv_res.pages.append(Page(page_no=i))
|
||||
|
||||
all_assembled_pages = []
|
||||
|
||||
try:
|
||||
# Iterate batches of pages (page_batch_size) in the doc
|
||||
for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
|
||||
start_pb_time = time.time()
|
||||
# Pipeline
|
||||
|
||||
# 1. Initialise the page resources
|
||||
init_pages = map(
|
||||
functools.partial(self._initialize_page, in_doc), page_batch
|
||||
)
|
||||
|
||||
# 2. Populate page image
|
||||
pages_with_images = map(
|
||||
functools.partial(self._populate_page_images, in_doc), init_pages
|
||||
)
|
||||
|
||||
# 3. Populate programmatic page cells
|
||||
pages_with_cells = map(
|
||||
functools.partial(self._parse_page_cells, in_doc),
|
||||
pages_with_images,
|
||||
)
|
||||
|
||||
# 4. Run pipeline stages
|
||||
pipeline_pages = self.model_pipeline.apply(pages_with_cells)
|
||||
|
||||
# 5. Assemble page elements (per page)
|
||||
assembled_pages = self.page_assemble_model(pipeline_pages)
|
||||
|
||||
# exhaust assembled_pages
|
||||
for assembled_page in assembled_pages:
|
||||
# Free up mem resources before moving on with next batch
|
||||
|
||||
# Remove page images (can be disabled)
|
||||
if self.assemble_options.images_scale is None:
|
||||
assembled_page._image_cache = {}
|
||||
|
||||
# Unload backend
|
||||
assembled_page._backend.unload()
|
||||
|
||||
all_assembled_pages.append(assembled_page)
|
||||
|
||||
end_pb_time = time.time() - start_pb_time
|
||||
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")
|
||||
|
||||
conv_res.pages = all_assembled_pages
|
||||
self._assemble_doc(conv_res)
|
||||
|
||||
status = ConversionStatus.SUCCESS
|
||||
for page in conv_res.pages:
|
||||
if not page._backend.is_valid():
|
||||
conv_res.errors.append(
|
||||
ErrorItem(
|
||||
component_type=DoclingComponentType.PDF_BACKEND,
|
||||
module_name=type(page._backend).__name__,
|
||||
error_message=f"Page {page.page_no} failed to parse.",
|
||||
)
|
||||
)
|
||||
status = ConversionStatus.PARTIAL_SUCCESS
|
||||
|
||||
conv_res.status = status
|
||||
|
||||
except Exception as e:
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
trace = "\n".join(traceback.format_exception(e))
|
||||
_log.info(
|
||||
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
|
||||
f"{trace}"
|
||||
assert pipeline_options is not None
|
||||
# TODO this will ignore if different options have been defined for the same pipeline class.
|
||||
if (
|
||||
pipeline_class not in self.initialized_pipelines
|
||||
or self.initialized_pipelines[pipeline_class].pipeline_options
|
||||
!= pipeline_options
|
||||
):
|
||||
self.initialized_pipelines[pipeline_class] = pipeline_class(
|
||||
pipeline_options=pipeline_options
|
||||
)
|
||||
return self.initialized_pipelines[pipeline_class]
|
||||
|
||||
finally:
|
||||
# Always unload the PDF backend, even in case of failure
|
||||
if in_doc._backend:
|
||||
in_doc._backend.unload()
|
||||
def _process_document(
|
||||
self, in_doc: InputDocument, raises_on_error: bool
|
||||
) -> Optional[ConversionResult]:
|
||||
assert self.allowed_formats is not None
|
||||
assert in_doc.format in self.allowed_formats
|
||||
|
||||
start_doc_time = time.time()
|
||||
|
||||
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
||||
|
||||
end_doc_time = time.time() - start_doc_time
|
||||
_log.info(
|
||||
f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
|
||||
f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
|
||||
)
|
||||
|
||||
return conv_res
|
||||
|
||||
# Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
|
||||
def _initialize_page(self, doc: InputDocument, page: Page) -> Page:
|
||||
page._backend = doc._backend.load_page(page.page_no)
|
||||
page.size = page._backend.get_size()
|
||||
page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
|
||||
def _execute_pipeline(
|
||||
self, in_doc: InputDocument, raises_on_error: bool
|
||||
) -> ConversionResult:
|
||||
if in_doc.valid:
|
||||
pipeline = self._get_pipeline(in_doc)
|
||||
if pipeline is None: # Can't find a default pipeline. Should this raise?
|
||||
if raises_on_error:
|
||||
raise RuntimeError(
|
||||
f"No pipeline could be initialized for {in_doc.file}."
|
||||
)
|
||||
else:
|
||||
conv_res = ConversionResult(input=in_doc)
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
return conv_res
|
||||
|
||||
return page
|
||||
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
|
||||
|
||||
# Generate the page image and store it in the page object
|
||||
def _populate_page_images(self, doc: InputDocument, page: Page) -> Page:
|
||||
# default scale
|
||||
page.get_image(
|
||||
scale=1.0
|
||||
) # puts the page image on the image cache at default scale
|
||||
else:
|
||||
if raises_on_error:
|
||||
raise RuntimeError(f"Input document {in_doc.file} is not valid.")
|
||||
|
||||
# user requested scales
|
||||
if self.assemble_options.images_scale is not None:
|
||||
page._default_image_scale = self.assemble_options.images_scale
|
||||
page.get_image(
|
||||
scale=self.assemble_options.images_scale
|
||||
) # this will trigger storing the image in the internal cache
|
||||
else:
|
||||
# invalid doc or not of desired format
|
||||
conv_res = ConversionResult(input=in_doc)
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
# TODO add error log why it failed.
|
||||
|
||||
return page
|
||||
|
||||
# Extract and populate the page cells and store it in the page object
|
||||
def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
|
||||
page.cells = page._backend.get_text_cells()
|
||||
|
||||
# DEBUG code:
|
||||
def draw_text_boxes(image, cells):
|
||||
draw = ImageDraw.Draw(image)
|
||||
for c in cells:
|
||||
x0, y0, x1, y1 = c.bbox.as_tuple()
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
||||
image.show()
|
||||
|
||||
# draw_text_boxes(page.get_image(scale=1.0), cells)
|
||||
|
||||
return page
|
||||
|
||||
def _assemble_doc(self, conv_res: ConversionResult):
|
||||
all_elements = []
|
||||
all_headers = []
|
||||
all_body = []
|
||||
|
||||
for p in conv_res.pages:
|
||||
|
||||
for el in p.assembled.body:
|
||||
all_body.append(el)
|
||||
for el in p.assembled.headers:
|
||||
all_headers.append(el)
|
||||
for el in p.assembled.elements:
|
||||
all_elements.append(el)
|
||||
|
||||
conv_res.assembled = AssembledUnit(
|
||||
elements=all_elements, headers=all_headers, body=all_body
|
||||
)
|
||||
|
||||
conv_res.output = self.glm_model(conv_res)
|
||||
return conv_res
|
||||
|
||||
25
docling/models/base_model.py
Normal file
25
docling/models/base_model.py
Normal file
@@ -0,0 +1,25 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Iterable
|
||||
|
||||
from docling_core.types.doc import DoclingDocument, NodeItem
|
||||
|
||||
from docling.datamodel.base_models import Page
|
||||
|
||||
|
||||
class BasePageModel(ABC):
|
||||
@abstractmethod
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
pass
|
||||
|
||||
|
||||
class BaseEnrichmentModel(ABC):
|
||||
|
||||
@abstractmethod
|
||||
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def __call__(
|
||||
self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
|
||||
) -> Iterable[Any]:
|
||||
pass
|
||||
@@ -1,14 +1,15 @@
|
||||
import copy
|
||||
import logging
|
||||
from abc import abstractmethod
|
||||
from typing import Iterable, List, Tuple
|
||||
from typing import Iterable, List
|
||||
|
||||
import numpy as np
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from PIL import Image, ImageDraw
|
||||
from rtree import index
|
||||
from scipy.ndimage import find_objects, label
|
||||
|
||||
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.pipeline_options import OcrOptions
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@@ -20,8 +21,9 @@ class BaseOcrModel:
|
||||
self.options = options
|
||||
|
||||
# Computes the optimum amount and coordinates of rectangles to OCR on a given page
|
||||
def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]:
|
||||
def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
|
||||
BITMAP_COVERAGE_TRESHOLD = 0.75
|
||||
assert page.size is not None
|
||||
|
||||
def find_ocr_rects(size, bitmap_rects):
|
||||
image = Image.new(
|
||||
@@ -60,7 +62,10 @@ class BaseOcrModel:
|
||||
|
||||
return (area_frac, bounding_boxes) # fraction covered # boxes
|
||||
|
||||
bitmap_rects = page._backend.get_bitmap_rects()
|
||||
if page._backend is not None:
|
||||
bitmap_rects = page._backend.get_bitmap_rects()
|
||||
else:
|
||||
bitmap_rects = []
|
||||
coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
|
||||
|
||||
# return full-page rectangle if sufficiently covered with bitmaps
|
||||
@@ -75,7 +80,7 @@ class BaseOcrModel:
|
||||
)
|
||||
]
|
||||
# return individual rectangles if the bitmap coverage is smaller
|
||||
elif coverage < BITMAP_COVERAGE_TRESHOLD:
|
||||
else: # coverage <= BITMAP_COVERAGE_TRESHOLD:
|
||||
return ocr_rects
|
||||
|
||||
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
|
||||
|
||||
@@ -1,39 +1,228 @@
|
||||
import copy
|
||||
import random
|
||||
from typing import List, Union
|
||||
|
||||
from deepsearch_glm.nlp_utils import init_nlp_model
|
||||
from deepsearch_glm.utils.doc_utils import to_legacy_document_format
|
||||
from deepsearch_glm.utils.doc_utils import to_docling_document
|
||||
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
|
||||
from docling_core.types import BaseText
|
||||
from docling_core.types import Document as DsDocument
|
||||
from docling_core.types import Ref
|
||||
from docling_core.types import DocumentDescription as DsDocumentDescription
|
||||
from docling_core.types import FileInfoObject as DsFileInfoObject
|
||||
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
||||
from docling_core.types import Table as DsSchemaTable
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
|
||||
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
|
||||
from docling_core.types.legacy_doc.base import Figure, TableCell
|
||||
from PIL import ImageDraw
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
|
||||
from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
|
||||
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
|
||||
from docling.utils.utils import create_hash
|
||||
|
||||
|
||||
class GlmOptions(BaseModel):
|
||||
model_config = ConfigDict(protected_namespaces=())
|
||||
|
||||
model_names: str = "" # e.g. "language;term;reference"
|
||||
|
||||
|
||||
class GlmModel:
|
||||
def __init__(self, config):
|
||||
self.config = config
|
||||
self.model_names = self.config.get(
|
||||
"model_names", ""
|
||||
) # "language;term;reference"
|
||||
load_pretrained_nlp_models()
|
||||
# model = init_nlp_model(model_names="language;term;reference")
|
||||
model = init_nlp_model(model_names=self.model_names)
|
||||
self.model = model
|
||||
def __init__(self, options: GlmOptions):
|
||||
self.options = options
|
||||
|
||||
def __call__(self, conv_res: ConversionResult) -> DsDocument:
|
||||
ds_doc = conv_res._to_ds_document()
|
||||
load_pretrained_nlp_models()
|
||||
self.model = init_nlp_model(model_names=self.options.model_names)
|
||||
|
||||
def _to_legacy_document(self, conv_res) -> DsDocument:
|
||||
title = ""
|
||||
desc: DsDocumentDescription = DsDocumentDescription(logs=[])
|
||||
|
||||
page_hashes = [
|
||||
PageReference(
|
||||
hash=create_hash(conv_res.input.document_hash + ":" + str(p.page_no)),
|
||||
page=p.page_no + 1,
|
||||
model="default",
|
||||
)
|
||||
for p in conv_res.pages
|
||||
]
|
||||
|
||||
file_info = DsFileInfoObject(
|
||||
filename=conv_res.input.file.name,
|
||||
document_hash=conv_res.input.document_hash,
|
||||
num_pages=conv_res.input.page_count,
|
||||
page_hashes=page_hashes,
|
||||
)
|
||||
|
||||
main_text: List[Union[Ref, BaseText]] = []
|
||||
tables: List[DsSchemaTable] = []
|
||||
figures: List[Figure] = []
|
||||
|
||||
page_no_to_page = {p.page_no: p for p in conv_res.pages}
|
||||
|
||||
for element in conv_res.assembled.elements:
|
||||
# Convert bboxes to lower-left origin.
|
||||
target_bbox = DsBoundingBox(
|
||||
element.cluster.bbox.to_bottom_left_origin(
|
||||
page_no_to_page[element.page_no].size.height
|
||||
).as_tuple()
|
||||
)
|
||||
|
||||
if isinstance(element, TextElement):
|
||||
main_text.append(
|
||||
BaseText(
|
||||
text=element.text,
|
||||
obj_type=layout_label_to_ds_type.get(element.label),
|
||||
name=element.label,
|
||||
prov=[
|
||||
Prov(
|
||||
bbox=target_bbox,
|
||||
page=element.page_no + 1,
|
||||
span=[0, len(element.text)],
|
||||
)
|
||||
],
|
||||
)
|
||||
)
|
||||
elif isinstance(element, Table):
|
||||
index = len(tables)
|
||||
ref_str = f"#/tables/{index}"
|
||||
main_text.append(
|
||||
Ref(
|
||||
name=element.label,
|
||||
obj_type=layout_label_to_ds_type.get(element.label),
|
||||
ref=ref_str,
|
||||
),
|
||||
)
|
||||
|
||||
# Initialise empty table data grid (only empty cells)
|
||||
table_data = [
|
||||
[
|
||||
TableCell(
|
||||
text="",
|
||||
# bbox=[0,0,0,0],
|
||||
spans=[[i, j]],
|
||||
obj_type="body",
|
||||
)
|
||||
for j in range(element.num_cols)
|
||||
]
|
||||
for i in range(element.num_rows)
|
||||
]
|
||||
|
||||
# Overwrite cells in table data for which there is actual cell content.
|
||||
for cell in element.table_cells:
|
||||
for i in range(
|
||||
min(cell.start_row_offset_idx, element.num_rows),
|
||||
min(cell.end_row_offset_idx, element.num_rows),
|
||||
):
|
||||
for j in range(
|
||||
min(cell.start_col_offset_idx, element.num_cols),
|
||||
min(cell.end_col_offset_idx, element.num_cols),
|
||||
):
|
||||
celltype = "body"
|
||||
if cell.column_header:
|
||||
celltype = "col_header"
|
||||
elif cell.row_header:
|
||||
celltype = "row_header"
|
||||
elif cell.row_section:
|
||||
celltype = "row_section"
|
||||
|
||||
def make_spans(cell):
|
||||
for rspan in range(
|
||||
min(cell.start_row_offset_idx, element.num_rows),
|
||||
min(cell.end_row_offset_idx, element.num_rows),
|
||||
):
|
||||
for cspan in range(
|
||||
min(
|
||||
cell.start_col_offset_idx, element.num_cols
|
||||
),
|
||||
min(cell.end_col_offset_idx, element.num_cols),
|
||||
):
|
||||
yield [rspan, cspan]
|
||||
|
||||
spans = list(make_spans(cell))
|
||||
if cell.bbox is not None:
|
||||
bbox = cell.bbox.to_bottom_left_origin(
|
||||
page_no_to_page[element.page_no].size.height
|
||||
).as_tuple()
|
||||
else:
|
||||
bbox = None
|
||||
|
||||
table_data[i][j] = TableCell(
|
||||
text=cell.text,
|
||||
bbox=bbox,
|
||||
# col=j,
|
||||
# row=i,
|
||||
spans=spans,
|
||||
obj_type=celltype,
|
||||
# col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
|
||||
# row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
|
||||
)
|
||||
|
||||
tables.append(
|
||||
DsSchemaTable(
|
||||
num_cols=element.num_cols,
|
||||
num_rows=element.num_rows,
|
||||
obj_type=layout_label_to_ds_type.get(element.label),
|
||||
data=table_data,
|
||||
prov=[
|
||||
Prov(
|
||||
bbox=target_bbox,
|
||||
page=element.page_no + 1,
|
||||
span=[0, 0],
|
||||
)
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
elif isinstance(element, FigureElement):
|
||||
index = len(figures)
|
||||
ref_str = f"#/figures/{index}"
|
||||
main_text.append(
|
||||
Ref(
|
||||
name=element.label,
|
||||
obj_type=layout_label_to_ds_type.get(element.label),
|
||||
ref=ref_str,
|
||||
),
|
||||
)
|
||||
figures.append(
|
||||
Figure(
|
||||
prov=[
|
||||
Prov(
|
||||
bbox=target_bbox,
|
||||
page=element.page_no + 1,
|
||||
span=[0, 0],
|
||||
)
|
||||
],
|
||||
obj_type=layout_label_to_ds_type.get(element.label),
|
||||
# data=[[]],
|
||||
)
|
||||
)
|
||||
|
||||
page_dimensions = [
|
||||
PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
|
||||
for p in conv_res.pages
|
||||
]
|
||||
|
||||
ds_doc: DsDocument = DsDocument(
|
||||
name=title,
|
||||
description=desc,
|
||||
file_info=file_info,
|
||||
main_text=main_text,
|
||||
tables=tables,
|
||||
figures=figures,
|
||||
page_dimensions=page_dimensions,
|
||||
)
|
||||
|
||||
return ds_doc
|
||||
|
||||
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
|
||||
ds_doc = self._to_legacy_document(conv_res)
|
||||
ds_doc_dict = ds_doc.model_dump(by_alias=True)
|
||||
|
||||
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
||||
ds_doc_dict = to_legacy_document_format(
|
||||
glm_doc, ds_doc_dict, update_name_label=True
|
||||
)
|
||||
|
||||
exported_doc = DsDocument.model_validate(ds_doc_dict)
|
||||
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
||||
|
||||
# DEBUG code:
|
||||
def draw_clusters_and_cells(ds_document, page_no):
|
||||
@@ -48,7 +237,7 @@ class GlmModel:
|
||||
if arr == "tables":
|
||||
prov = ds_document.tables[index].prov[0]
|
||||
elif arr == "figures":
|
||||
prov = ds_document.figures[index].prov[0]
|
||||
prov = ds_document.pictures[index].prov[0]
|
||||
else:
|
||||
prov = None
|
||||
|
||||
@@ -83,4 +272,4 @@ class GlmModel:
|
||||
# draw_clusters_and_cells(ds_doc, 0)
|
||||
# draw_clusters_and_cells(exported_doc, 0)
|
||||
|
||||
return exported_doc
|
||||
return docling_doc
|
||||
|
||||
@@ -2,8 +2,9 @@ import logging
|
||||
from typing import Iterable
|
||||
|
||||
import numpy
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.pipeline_options import EasyOcrOptions
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
|
||||
@@ -39,6 +40,8 @@ class EasyOcrModel(BaseOcrModel):
|
||||
return
|
||||
|
||||
for page in page_batch:
|
||||
assert page._backend is not None
|
||||
|
||||
ocr_rects = self.get_ocr_rects(page)
|
||||
|
||||
all_ocr_cells = []
|
||||
|
||||
@@ -2,8 +2,10 @@ import copy
|
||||
import logging
|
||||
import random
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List
|
||||
|
||||
from docling_core.types.doc import CoordOrigin, DocItemLabel
|
||||
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
||||
from PIL import ImageDraw
|
||||
|
||||
@@ -11,74 +13,73 @@ from docling.datamodel.base_models import (
|
||||
BoundingBox,
|
||||
Cell,
|
||||
Cluster,
|
||||
CoordOrigin,
|
||||
LayoutPrediction,
|
||||
Page,
|
||||
)
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.utils import layout_utils as lu
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LayoutModel:
|
||||
class LayoutModel(BasePageModel):
|
||||
|
||||
TEXT_ELEM_LABELS = [
|
||||
"Text",
|
||||
"Footnote",
|
||||
"Caption",
|
||||
"Checkbox-Unselected",
|
||||
"Checkbox-Selected",
|
||||
"Section-header",
|
||||
"Page-header",
|
||||
"Page-footer",
|
||||
"Code",
|
||||
"List-item",
|
||||
# "Title"
|
||||
DocItemLabel.TEXT,
|
||||
DocItemLabel.FOOTNOTE,
|
||||
DocItemLabel.CAPTION,
|
||||
DocItemLabel.CHECKBOX_UNSELECTED,
|
||||
DocItemLabel.CHECKBOX_SELECTED,
|
||||
DocItemLabel.SECTION_HEADER,
|
||||
DocItemLabel.PAGE_HEADER,
|
||||
DocItemLabel.PAGE_FOOTER,
|
||||
DocItemLabel.CODE,
|
||||
DocItemLabel.LIST_ITEM,
|
||||
# "Formula",
|
||||
]
|
||||
PAGE_HEADER_LABELS = ["Page-header", "Page-footer"]
|
||||
PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]
|
||||
|
||||
TABLE_LABEL = "Table"
|
||||
FIGURE_LABEL = "Picture"
|
||||
FORMULA_LABEL = "Formula"
|
||||
TABLE_LABEL = DocItemLabel.TABLE
|
||||
FIGURE_LABEL = DocItemLabel.PICTURE
|
||||
FORMULA_LABEL = DocItemLabel.FORMULA
|
||||
|
||||
def __init__(self, config):
|
||||
self.config = config
|
||||
self.layout_predictor = LayoutPredictor(
|
||||
config["artifacts_path"]
|
||||
) # TODO temporary
|
||||
def __init__(self, artifacts_path: Path):
|
||||
self.layout_predictor = LayoutPredictor(artifacts_path) # TODO temporary
|
||||
|
||||
def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height):
|
||||
def postprocess(self, clusters_in: List[Cluster], cells: List[Cell], page_height):
|
||||
MIN_INTERSECTION = 0.2
|
||||
CLASS_THRESHOLDS = {
|
||||
"Caption": 0.35,
|
||||
"Footnote": 0.35,
|
||||
"Formula": 0.35,
|
||||
"List-item": 0.35,
|
||||
"Page-footer": 0.35,
|
||||
"Page-header": 0.35,
|
||||
"Picture": 0.2, # low threshold adjust to capture chemical structures for examples.
|
||||
"Section-header": 0.45,
|
||||
"Table": 0.35,
|
||||
"Text": 0.45,
|
||||
"Title": 0.45,
|
||||
"Document Index": 0.45,
|
||||
"Code": 0.45,
|
||||
"Checkbox-Selected": 0.45,
|
||||
"Checkbox-Unselected": 0.45,
|
||||
"Form": 0.45,
|
||||
"Key-Value Region": 0.45,
|
||||
DocItemLabel.CAPTION: 0.35,
|
||||
DocItemLabel.FOOTNOTE: 0.35,
|
||||
DocItemLabel.FORMULA: 0.35,
|
||||
DocItemLabel.LIST_ITEM: 0.35,
|
||||
DocItemLabel.PAGE_FOOTER: 0.35,
|
||||
DocItemLabel.PAGE_HEADER: 0.35,
|
||||
DocItemLabel.PICTURE: 0.2, # low threshold adjust to capture chemical structures for examples.
|
||||
DocItemLabel.SECTION_HEADER: 0.45,
|
||||
DocItemLabel.TABLE: 0.35,
|
||||
DocItemLabel.TEXT: 0.45,
|
||||
DocItemLabel.TITLE: 0.45,
|
||||
DocItemLabel.DOCUMENT_INDEX: 0.45,
|
||||
DocItemLabel.CODE: 0.45,
|
||||
DocItemLabel.CHECKBOX_SELECTED: 0.45,
|
||||
DocItemLabel.CHECKBOX_UNSELECTED: 0.45,
|
||||
DocItemLabel.FORM: 0.45,
|
||||
DocItemLabel.KEY_VALUE_REGION: 0.45,
|
||||
}
|
||||
|
||||
CLASS_REMAPPINGS = {"Document Index": "Table", "Title": "Section-header"}
|
||||
CLASS_REMAPPINGS = {
|
||||
DocItemLabel.DOCUMENT_INDEX: DocItemLabel.TABLE,
|
||||
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
|
||||
}
|
||||
|
||||
_log.debug("================= Start postprocess function ====================")
|
||||
start_time = time.time()
|
||||
# Apply Confidence Threshold to cluster predictions
|
||||
# confidence = self.conf_threshold
|
||||
clusters_out = []
|
||||
clusters_mod = []
|
||||
|
||||
for cluster in clusters:
|
||||
for cluster in clusters_in:
|
||||
confidence = CLASS_THRESHOLDS[cluster.label]
|
||||
if cluster.confidence >= confidence:
|
||||
# annotation["created_by"] = "high_conf_pred"
|
||||
@@ -86,10 +87,10 @@ class LayoutModel:
|
||||
# Remap class labels where needed.
|
||||
if cluster.label in CLASS_REMAPPINGS.keys():
|
||||
cluster.label = CLASS_REMAPPINGS[cluster.label]
|
||||
clusters_out.append(cluster)
|
||||
clusters_mod.append(cluster)
|
||||
|
||||
# map to dictionary clusters and cells, with bottom left origin
|
||||
clusters = [
|
||||
clusters_orig = [
|
||||
{
|
||||
"id": c.id,
|
||||
"bbox": list(
|
||||
@@ -99,7 +100,7 @@ class LayoutModel:
|
||||
"cell_ids": [],
|
||||
"type": c.label,
|
||||
}
|
||||
for c in clusters
|
||||
for c in clusters_in
|
||||
]
|
||||
|
||||
clusters_out = [
|
||||
@@ -113,9 +114,11 @@ class LayoutModel:
|
||||
"cell_ids": [],
|
||||
"type": c.label,
|
||||
}
|
||||
for c in clusters_out
|
||||
for c in clusters_mod
|
||||
]
|
||||
|
||||
del clusters_mod
|
||||
|
||||
raw_cells = [
|
||||
{
|
||||
"id": c.id,
|
||||
@@ -149,7 +152,7 @@ class LayoutModel:
|
||||
|
||||
# Assign orphan cells with lower confidence predictions
|
||||
clusters_out, orphan_cell_indices = lu.assign_orphans_with_low_conf_pred(
|
||||
clusters_out, clusters, raw_cells, orphan_cell_indices
|
||||
clusters_out, clusters_orig, raw_cells, orphan_cell_indices
|
||||
)
|
||||
|
||||
# Refresh the cell_ids assignment, after creating new clusters using low conf predictions
|
||||
@@ -178,7 +181,7 @@ class LayoutModel:
|
||||
) = lu.cell_id_state_map(clusters_out, cell_count)
|
||||
|
||||
clusters_out, orphan_cell_indices = lu.set_orphan_as_text(
|
||||
clusters_out, clusters, raw_cells, orphan_cell_indices
|
||||
clusters_out, clusters_orig, raw_cells, orphan_cell_indices
|
||||
)
|
||||
|
||||
_log.debug("---- 5. Merge Cells & and adapt the bounding boxes")
|
||||
@@ -237,46 +240,55 @@ class LayoutModel:
|
||||
end_time = time.time() - start_time
|
||||
_log.debug(f"Finished post processing in seconds={end_time:.3f}")
|
||||
|
||||
cells_out = [
|
||||
cells_out_new = [
|
||||
Cell(
|
||||
id=c["id"],
|
||||
id=c["id"], # type: ignore
|
||||
bbox=BoundingBox.from_tuple(
|
||||
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
|
||||
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT # type: ignore
|
||||
).to_top_left_origin(page_height),
|
||||
text=c["text"],
|
||||
text=c["text"], # type: ignore
|
||||
)
|
||||
for c in cells_out
|
||||
]
|
||||
|
||||
del cells_out
|
||||
|
||||
clusters_out_new = []
|
||||
for c in clusters_out:
|
||||
cluster_cells = [ccell for ccell in cells_out if ccell.id in c["cell_ids"]]
|
||||
cluster_cells = [
|
||||
ccell for ccell in cells_out_new if ccell.id in c["cell_ids"] # type: ignore
|
||||
]
|
||||
c_new = Cluster(
|
||||
id=c["id"],
|
||||
id=c["id"], # type: ignore
|
||||
bbox=BoundingBox.from_tuple(
|
||||
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
|
||||
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT # type: ignore
|
||||
).to_top_left_origin(page_height),
|
||||
confidence=c["confidence"],
|
||||
label=c["type"],
|
||||
confidence=c["confidence"], # type: ignore
|
||||
label=DocItemLabel(c["type"]),
|
||||
cells=cluster_cells,
|
||||
)
|
||||
clusters_out_new.append(c_new)
|
||||
|
||||
return clusters_out_new, cells_out
|
||||
return clusters_out_new, cells_out_new
|
||||
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
for page in page_batch:
|
||||
assert page.size is not None
|
||||
|
||||
clusters = []
|
||||
for ix, pred_item in enumerate(
|
||||
self.layout_predictor.predict(page.get_image(scale=1.0))
|
||||
):
|
||||
label = DocItemLabel(
|
||||
pred_item["label"].lower().replace(" ", "_").replace("-", "_")
|
||||
) # Temporary, until docling-ibm-model uses docling-core types
|
||||
cluster = Cluster(
|
||||
id=ix,
|
||||
label=pred_item["label"],
|
||||
label=label,
|
||||
confidence=pred_item["confidence"],
|
||||
bbox=BoundingBox.model_validate(pred_item),
|
||||
cells=[],
|
||||
)
|
||||
|
||||
clusters.append(cluster)
|
||||
|
||||
# Map cells to clusters
|
||||
|
||||
@@ -2,22 +2,29 @@ import logging
|
||||
import re
|
||||
from typing import Iterable, List
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from docling.datamodel.base_models import (
|
||||
AssembledUnit,
|
||||
FigureElement,
|
||||
Page,
|
||||
PageElement,
|
||||
TableElement,
|
||||
Table,
|
||||
TextElement,
|
||||
)
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.models.layout_model import LayoutModel
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PageAssembleModel:
|
||||
def __init__(self, config):
|
||||
self.config = config
|
||||
class PageAssembleOptions(BaseModel):
|
||||
keep_images: bool = False
|
||||
|
||||
|
||||
class PageAssembleModel(BasePageModel):
|
||||
def __init__(self, options: PageAssembleOptions):
|
||||
self.options = options
|
||||
|
||||
def sanitize_text(self, lines):
|
||||
if len(lines) <= 1:
|
||||
@@ -46,6 +53,8 @@ class PageAssembleModel:
|
||||
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
for page in page_batch:
|
||||
assert page._backend is not None
|
||||
assert page.predictions.layout is not None
|
||||
# assembles some JSON output page by page.
|
||||
|
||||
elements: List[PageElement] = []
|
||||
@@ -84,7 +93,7 @@ class PageAssembleModel:
|
||||
if (
|
||||
not tbl
|
||||
): # fallback: add table without structure, if it isn't present
|
||||
tbl = TableElement(
|
||||
tbl = Table(
|
||||
label=cluster.label,
|
||||
id=cluster.id,
|
||||
text="",
|
||||
@@ -145,4 +154,11 @@ class PageAssembleModel:
|
||||
elements=elements, headers=headers, body=body
|
||||
)
|
||||
|
||||
# Remove page images (can be disabled)
|
||||
if not self.options.keep_images:
|
||||
page._image_cache = {}
|
||||
|
||||
# Unload backend
|
||||
page._backend.unload()
|
||||
|
||||
yield page
|
||||
|
||||
57
docling/models/page_preprocessing_model.py
Normal file
57
docling/models/page_preprocessing_model.py
Normal file
@@ -0,0 +1,57 @@
|
||||
from typing import Iterable, Optional
|
||||
|
||||
from PIL import ImageDraw
|
||||
from pydantic import BaseModel
|
||||
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.models.base_model import BasePageModel
|
||||
|
||||
|
||||
class PagePreprocessingOptions(BaseModel):
|
||||
images_scale: Optional[float]
|
||||
|
||||
|
||||
class PagePreprocessingModel(BasePageModel):
|
||||
def __init__(self, options: PagePreprocessingOptions):
|
||||
self.options = options
|
||||
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
for page in page_batch:
|
||||
page = self._populate_page_images(page)
|
||||
page = self._parse_page_cells(page)
|
||||
yield page
|
||||
|
||||
# Generate the page image and store it in the page object
|
||||
def _populate_page_images(self, page: Page) -> Page:
|
||||
# default scale
|
||||
page.get_image(
|
||||
scale=1.0
|
||||
) # puts the page image on the image cache at default scale
|
||||
|
||||
images_scale = self.options.images_scale
|
||||
# user requested scales
|
||||
if images_scale is not None:
|
||||
page._default_image_scale = images_scale
|
||||
page.get_image(
|
||||
scale=images_scale
|
||||
) # this will trigger storing the image in the internal cache
|
||||
|
||||
return page
|
||||
|
||||
# Extract and populate the page cells and store it in the page object
|
||||
def _parse_page_cells(self, page: Page) -> Page:
|
||||
assert page._backend is not None
|
||||
|
||||
page.cells = list(page._backend.get_text_cells())
|
||||
|
||||
# DEBUG code:
|
||||
def draw_text_boxes(image, cells):
|
||||
draw = ImageDraw.Draw(image)
|
||||
for c in cells:
|
||||
x0, y0, x1, y1 = c.bbox.as_tuple()
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
||||
image.show()
|
||||
|
||||
# draw_text_boxes(page.get_image(scale=1.0), cells)
|
||||
|
||||
return page
|
||||
@@ -3,29 +3,25 @@ from pathlib import Path
|
||||
from typing import Iterable, List
|
||||
|
||||
import numpy
|
||||
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
||||
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
||||
from PIL import ImageDraw
|
||||
|
||||
from docling.datamodel.base_models import (
|
||||
BoundingBox,
|
||||
Page,
|
||||
TableCell,
|
||||
TableElement,
|
||||
TableStructurePrediction,
|
||||
)
|
||||
from docling.datamodel.pipeline_options import TableFormerMode
|
||||
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
|
||||
from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
|
||||
from docling.models.base_model import BasePageModel
|
||||
|
||||
|
||||
class TableStructureModel:
|
||||
def __init__(self, config):
|
||||
self.config = config
|
||||
self.do_cell_matching = config["do_cell_matching"]
|
||||
self.mode = config["mode"]
|
||||
class TableStructureModel(BasePageModel):
|
||||
def __init__(
|
||||
self, enabled: bool, artifacts_path: Path, options: TableStructureOptions
|
||||
):
|
||||
self.options = options
|
||||
self.do_cell_matching = self.options.do_cell_matching
|
||||
self.mode = self.options.mode
|
||||
|
||||
self.enabled = config["enabled"]
|
||||
self.enabled = enabled
|
||||
if self.enabled:
|
||||
artifacts_path: Path = config["artifacts_path"]
|
||||
|
||||
if self.mode == TableFormerMode.ACCURATE:
|
||||
artifacts_path = artifacts_path / "fat"
|
||||
|
||||
@@ -39,7 +35,9 @@ class TableStructureModel:
|
||||
self.tf_predictor = TFPredictor(self.tm_config)
|
||||
self.scale = 2.0 # Scale up table input images to 144 dpi
|
||||
|
||||
def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
|
||||
def draw_table_and_cells(self, page: Page, tbl_list: List[Table]):
|
||||
assert page._backend is not None
|
||||
|
||||
image = (
|
||||
page._backend.get_page_image()
|
||||
) # make new image to avoid drawing on the saved ones
|
||||
@@ -50,17 +48,18 @@ class TableStructureModel:
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
||||
|
||||
for tc in table_element.table_cells:
|
||||
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
||||
if tc.column_header:
|
||||
width = 3
|
||||
else:
|
||||
width = 1
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
|
||||
draw.text(
|
||||
(x0 + 3, y0 + 3),
|
||||
text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
|
||||
fill="black",
|
||||
)
|
||||
if tc.bbox is not None:
|
||||
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
||||
if tc.column_header:
|
||||
width = 3
|
||||
else:
|
||||
width = 1
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
|
||||
draw.text(
|
||||
(x0 + 3, y0 + 3),
|
||||
text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
|
||||
fill="black",
|
||||
)
|
||||
|
||||
image.show()
|
||||
|
||||
@@ -71,6 +70,9 @@ class TableStructureModel:
|
||||
return
|
||||
|
||||
for page in page_batch:
|
||||
assert page._backend is not None
|
||||
assert page.predictions.layout is not None
|
||||
assert page.size is not None
|
||||
|
||||
page.predictions.tablestructure = TableStructurePrediction() # dummy
|
||||
|
||||
@@ -85,7 +87,7 @@ class TableStructureModel:
|
||||
],
|
||||
)
|
||||
for cluster in page.predictions.layout.clusters
|
||||
if cluster.label == "Table"
|
||||
if cluster.label == DocItemLabel.TABLE
|
||||
]
|
||||
if not len(in_tables):
|
||||
yield page
|
||||
@@ -132,7 +134,7 @@ class TableStructureModel:
|
||||
element["bbox"]["token"] = text_piece
|
||||
|
||||
tc = TableCell.model_validate(element)
|
||||
if self.do_cell_matching:
|
||||
if self.do_cell_matching and tc.bbox is not None:
|
||||
tc.bbox = tc.bbox.scaled(1 / self.scale)
|
||||
table_cells.append(tc)
|
||||
|
||||
@@ -141,7 +143,7 @@ class TableStructureModel:
|
||||
num_cols = table_out["predict_details"]["num_cols"]
|
||||
otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
|
||||
|
||||
tbl = TableElement(
|
||||
tbl = Table(
|
||||
otsl_seq=otsl_seq,
|
||||
table_cells=table_cells,
|
||||
num_rows=num_rows,
|
||||
@@ -149,7 +151,7 @@ class TableStructureModel:
|
||||
id=table_cluster.id,
|
||||
page_no=page.page_no,
|
||||
cluster=table_cluster,
|
||||
label="Table",
|
||||
label=DocItemLabel.TABLE,
|
||||
)
|
||||
|
||||
page.predictions.tablestructure.table_map[table_cluster.id] = tbl
|
||||
|
||||
@@ -2,11 +2,12 @@ import io
|
||||
import logging
|
||||
import tempfile
|
||||
from subprocess import DEVNULL, PIPE, Popen
|
||||
from typing import Iterable, Tuple
|
||||
from typing import Iterable, Optional, Tuple
|
||||
|
||||
import pandas as pd
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
|
||||
@@ -21,8 +22,8 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
|
||||
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
||||
|
||||
self._name = None
|
||||
self._version = None
|
||||
self._name: Optional[str] = None
|
||||
self._version: Optional[str] = None
|
||||
|
||||
if self.enabled:
|
||||
try:
|
||||
@@ -39,7 +40,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
def _get_name_and_version(self) -> Tuple[str, str]:
|
||||
|
||||
if self._name != None and self._version != None:
|
||||
return self._name, self._version
|
||||
return self._name, self._version # type: ignore
|
||||
|
||||
cmd = [self.options.tesseract_cmd, "--version"]
|
||||
|
||||
@@ -108,6 +109,8 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
return
|
||||
|
||||
for page in page_batch:
|
||||
assert page._backend is not None
|
||||
|
||||
ocr_rects = self.get_ocr_rects(page)
|
||||
|
||||
all_ocr_cells = []
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
import logging
|
||||
from typing import Iterable
|
||||
|
||||
import numpy
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
|
||||
@@ -68,6 +68,9 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
return
|
||||
|
||||
for page in page_batch:
|
||||
assert page._backend is not None
|
||||
assert self.reader is not None
|
||||
|
||||
ocr_rects = self.get_ocr_rects(page)
|
||||
|
||||
all_ocr_cells = []
|
||||
|
||||
@@ -1,18 +0,0 @@
|
||||
from pathlib import Path
|
||||
from typing import Callable, Iterable, List
|
||||
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
|
||||
|
||||
class BaseModelPipeline:
|
||||
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
|
||||
self.model_pipe: List[Callable] = []
|
||||
self.artifacts_path = artifacts_path
|
||||
self.pipeline_options = pipeline_options
|
||||
|
||||
def apply(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
for model in self.model_pipe:
|
||||
page_batch = model(page_batch)
|
||||
|
||||
yield from page_batch
|
||||
190
docling/pipeline/base_pipeline.py
Normal file
190
docling/pipeline/base_pipeline.py
Normal file
@@ -0,0 +1,190 @@
|
||||
import functools
|
||||
import logging
|
||||
import time
|
||||
import traceback
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Callable, Iterable, List
|
||||
|
||||
from docling_core.types.doc import DoclingDocument, NodeItem
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
DoclingComponentType,
|
||||
ErrorItem,
|
||||
Page,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult, InputDocument
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BaseEnrichmentModel
|
||||
from docling.utils.utils import chunkify
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BasePipeline(ABC):
|
||||
def __init__(self, pipeline_options: PipelineOptions):
|
||||
self.pipeline_options = pipeline_options
|
||||
self.build_pipe: List[Callable] = []
|
||||
self.enrichment_pipe: List[BaseEnrichmentModel] = []
|
||||
|
||||
def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
|
||||
conv_res = ConversionResult(input=in_doc)
|
||||
|
||||
_log.info(f"Processing document {in_doc.file.name}")
|
||||
try:
|
||||
# These steps are building and assembling the structure of the
|
||||
# output DoclingDocument
|
||||
conv_res = self._build_document(in_doc, conv_res)
|
||||
conv_res = self._assemble_document(in_doc, conv_res)
|
||||
# From this stage, all operations should rely only on conv_res.output
|
||||
conv_res = self._enrich_document(in_doc, conv_res)
|
||||
conv_res.status = self._determine_status(in_doc, conv_res)
|
||||
except Exception as e:
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
if raises_on_error:
|
||||
raise e
|
||||
|
||||
return conv_res
|
||||
|
||||
@abstractmethod
|
||||
def _build_document(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionResult:
|
||||
pass
|
||||
|
||||
def _assemble_document(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionResult:
|
||||
return conv_res
|
||||
|
||||
def _enrich_document(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionResult:
|
||||
|
||||
def _filter_elements(
|
||||
doc: DoclingDocument, model: BaseEnrichmentModel
|
||||
) -> Iterable[NodeItem]:
|
||||
for element, _level in doc.iterate_items():
|
||||
if model.is_processable(doc=doc, element=element):
|
||||
yield element
|
||||
|
||||
for model in self.enrichment_pipe:
|
||||
for element_batch in chunkify(
|
||||
_filter_elements(conv_res.document, model),
|
||||
settings.perf.elements_batch_size,
|
||||
):
|
||||
# TODO: currently we assume the element itself is modified, because
|
||||
# we don't have an interface to save the element back to the document
|
||||
for element in model(
|
||||
doc=conv_res.document, element_batch=element_batch
|
||||
): # Must exhaust!
|
||||
pass
|
||||
|
||||
return conv_res
|
||||
|
||||
@abstractmethod
|
||||
def _determine_status(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionStatus:
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def get_default_options(cls) -> PipelineOptions:
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def is_backend_supported(cls, backend: AbstractDocumentBackend):
|
||||
pass
|
||||
|
||||
# def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]:
|
||||
# for model in self.build_pipe:
|
||||
# element_batch = model(element_batch)
|
||||
#
|
||||
# yield from element_batch
|
||||
|
||||
|
||||
class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
||||
|
||||
def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
for model in self.build_pipe:
|
||||
page_batch = model(page_batch)
|
||||
|
||||
yield from page_batch
|
||||
|
||||
def _build_document(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionResult:
|
||||
|
||||
if not isinstance(in_doc._backend, PdfDocumentBackend):
|
||||
raise RuntimeError(
|
||||
f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. "
|
||||
f"Can not convert this with a PDF pipeline. "
|
||||
f"Please check your format configuration on DocumentConverter."
|
||||
)
|
||||
# conv_res.status = ConversionStatus.FAILURE
|
||||
# return conv_res
|
||||
|
||||
for i in range(0, in_doc.page_count):
|
||||
conv_res.pages.append(Page(page_no=i))
|
||||
|
||||
try:
|
||||
# Iterate batches of pages (page_batch_size) in the doc
|
||||
for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
|
||||
start_pb_time = time.time()
|
||||
|
||||
# 1. Initialise the page resources
|
||||
init_pages = map(
|
||||
functools.partial(self.initialize_page, in_doc), page_batch
|
||||
)
|
||||
|
||||
# 2. Run pipeline stages
|
||||
pipeline_pages = self._apply_on_pages(init_pages)
|
||||
|
||||
for p in pipeline_pages: # Must exhaust!
|
||||
pass
|
||||
|
||||
end_pb_time = time.time() - start_pb_time
|
||||
_log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
|
||||
|
||||
except Exception as e:
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
trace = "\n".join(traceback.format_exception(e))
|
||||
_log.warning(
|
||||
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
|
||||
f"{trace}"
|
||||
)
|
||||
raise e
|
||||
|
||||
finally:
|
||||
# Always unload the PDF backend, even in case of failure
|
||||
if in_doc._backend:
|
||||
in_doc._backend.unload()
|
||||
|
||||
return conv_res
|
||||
|
||||
def _determine_status(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionStatus:
|
||||
status = ConversionStatus.SUCCESS
|
||||
for page in conv_res.pages:
|
||||
if page._backend is None or not page._backend.is_valid():
|
||||
conv_res.errors.append(
|
||||
ErrorItem(
|
||||
component_type=DoclingComponentType.DOCUMENT_BACKEND,
|
||||
module_name=type(page._backend).__name__,
|
||||
error_message=f"Page {page.page_no} failed to parse.",
|
||||
)
|
||||
)
|
||||
status = ConversionStatus.PARTIAL_SUCCESS
|
||||
|
||||
return status
|
||||
|
||||
# Initialise and load resources for a page
|
||||
@abstractmethod
|
||||
def initialize_page(self, doc: InputDocument, page: Page) -> Page:
|
||||
pass
|
||||
59
docling/pipeline/simple_pipeline.py
Normal file
59
docling/pipeline/simple_pipeline.py
Normal file
@@ -0,0 +1,59 @@
|
||||
import logging
|
||||
|
||||
from docling.backend.abstract_backend import (
|
||||
AbstractDocumentBackend,
|
||||
DeclarativeDocumentBackend,
|
||||
)
|
||||
from docling.datamodel.base_models import ConversionStatus
|
||||
from docling.datamodel.document import ConversionResult, InputDocument
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.pipeline.base_pipeline import BasePipeline
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SimplePipeline(BasePipeline):
|
||||
"""SimpleModelPipeline.
|
||||
|
||||
This class is used at the moment for formats / backends
|
||||
which produce straight DoclingDocument output.
|
||||
"""
|
||||
|
||||
def __init__(self, pipeline_options: PipelineOptions):
|
||||
super().__init__(pipeline_options)
|
||||
|
||||
def _build_document(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionResult:
|
||||
|
||||
if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
|
||||
raise RuntimeError(
|
||||
f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a declarative backend. "
|
||||
f"Can not convert this with simple pipeline. "
|
||||
f"Please check your format configuration on DocumentConverter."
|
||||
)
|
||||
# conv_res.status = ConversionStatus.FAILURE
|
||||
# return conv_res
|
||||
|
||||
# Instead of running a page-level pipeline to build up the document structure,
|
||||
# the backend is expected to be of type DeclarativeDocumentBackend, which can output
|
||||
# a DoclingDocument straight.
|
||||
|
||||
conv_res.document = in_doc._backend.convert()
|
||||
return conv_res
|
||||
|
||||
def _determine_status(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionStatus:
|
||||
# This is called only if the previous steps didn't raise.
|
||||
# Since we don't have anything else to evaluate, we can
|
||||
# safely return SUCCESS.
|
||||
return ConversionStatus.SUCCESS
|
||||
|
||||
@classmethod
|
||||
def get_default_options(cls) -> PipelineOptions:
|
||||
return PipelineOptions()
|
||||
|
||||
@classmethod
|
||||
def is_backend_supported(cls, backend: AbstractDocumentBackend):
|
||||
return isinstance(backend, DeclarativeDocumentBackend)
|
||||
@@ -1,66 +0,0 @@
|
||||
from pathlib import Path
|
||||
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
PipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.models.easyocr_model import EasyOcrModel
|
||||
from docling.models.layout_model import LayoutModel
|
||||
from docling.models.table_structure_model import TableStructureModel
|
||||
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
||||
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
||||
from docling.pipeline.base_model_pipeline import BaseModelPipeline
|
||||
|
||||
|
||||
class StandardModelPipeline(BaseModelPipeline):
|
||||
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
|
||||
_table_model_path = "model_artifacts/tableformer"
|
||||
|
||||
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
|
||||
super().__init__(artifacts_path, pipeline_options)
|
||||
|
||||
ocr_model: BaseOcrModel
|
||||
if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
|
||||
ocr_model = EasyOcrModel(
|
||||
enabled=pipeline_options.do_ocr,
|
||||
options=pipeline_options.ocr_options,
|
||||
)
|
||||
elif isinstance(pipeline_options.ocr_options, TesseractCliOcrOptions):
|
||||
ocr_model = TesseractOcrCliModel(
|
||||
enabled=pipeline_options.do_ocr,
|
||||
options=pipeline_options.ocr_options,
|
||||
)
|
||||
elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
|
||||
ocr_model = TesseractOcrModel(
|
||||
enabled=pipeline_options.do_ocr,
|
||||
options=pipeline_options.ocr_options,
|
||||
)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
|
||||
)
|
||||
|
||||
self.model_pipe = [
|
||||
# OCR
|
||||
ocr_model,
|
||||
# Layout
|
||||
LayoutModel(
|
||||
config={
|
||||
"artifacts_path": artifacts_path
|
||||
/ StandardModelPipeline._layout_model_path
|
||||
}
|
||||
),
|
||||
# Table structure
|
||||
TableStructureModel(
|
||||
config={
|
||||
"artifacts_path": artifacts_path
|
||||
/ StandardModelPipeline._table_model_path,
|
||||
"enabled": pipeline_options.do_table_structure,
|
||||
"mode": pipeline_options.table_structure_options.mode,
|
||||
"do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
|
||||
}
|
||||
),
|
||||
]
|
||||
198
docling/pipeline/standard_pdf_pipeline.py
Normal file
198
docling/pipeline/standard_pdf_pipeline.py
Normal file
@@ -0,0 +1,198 @@
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.datamodel.base_models import AssembledUnit, Page
|
||||
from docling.datamodel.document import ConversionResult, InputDocument
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
PdfPipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
||||
from docling.models.easyocr_model import EasyOcrModel
|
||||
from docling.models.layout_model import LayoutModel
|
||||
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
||||
from docling.models.page_preprocessing_model import (
|
||||
PagePreprocessingModel,
|
||||
PagePreprocessingOptions,
|
||||
)
|
||||
from docling.models.table_structure_model import TableStructureModel
|
||||
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
||||
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
||||
from docling.pipeline.base_pipeline import PaginatedPipeline
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class StandardPdfPipeline(PaginatedPipeline):
|
||||
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
|
||||
_table_model_path = "model_artifacts/tableformer"
|
||||
|
||||
def __init__(self, pipeline_options: PdfPipelineOptions):
|
||||
super().__init__(pipeline_options)
|
||||
self.pipeline_options: PdfPipelineOptions
|
||||
|
||||
if pipeline_options.artifacts_path is None:
|
||||
self.artifacts_path = self.download_models_hf()
|
||||
else:
|
||||
self.artifacts_path = Path(pipeline_options.artifacts_path)
|
||||
|
||||
keep_images = (
|
||||
self.pipeline_options.generate_page_images
|
||||
or self.pipeline_options.generate_picture_images
|
||||
or self.pipeline_options.generate_table_images
|
||||
)
|
||||
|
||||
self.glm_model = GlmModel(options=GlmOptions())
|
||||
|
||||
if (ocr_model := self.get_ocr_model()) is None:
|
||||
raise RuntimeError(
|
||||
f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
|
||||
)
|
||||
|
||||
self.build_pipe = [
|
||||
# Pre-processing
|
||||
PagePreprocessingModel(
|
||||
options=PagePreprocessingOptions(
|
||||
images_scale=pipeline_options.images_scale
|
||||
)
|
||||
),
|
||||
# OCR
|
||||
ocr_model,
|
||||
# Layout model
|
||||
LayoutModel(
|
||||
artifacts_path=self.artifacts_path
|
||||
/ StandardPdfPipeline._layout_model_path
|
||||
),
|
||||
# Table structure model
|
||||
TableStructureModel(
|
||||
enabled=pipeline_options.do_table_structure,
|
||||
artifacts_path=self.artifacts_path
|
||||
/ StandardPdfPipeline._table_model_path,
|
||||
options=pipeline_options.table_structure_options,
|
||||
),
|
||||
# Page assemble
|
||||
PageAssembleModel(options=PageAssembleOptions(keep_images=keep_images)),
|
||||
]
|
||||
|
||||
self.enrichment_pipe = [
|
||||
# Other models working on `NodeItem` elements in the DoclingDocument
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
def download_models_hf(
|
||||
local_dir: Optional[Path] = None, force: bool = False
|
||||
) -> Path:
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
download_path = snapshot_download(
|
||||
repo_id="ds4sd/docling-models",
|
||||
force_download=force,
|
||||
local_dir=local_dir,
|
||||
revision="v2.0.1",
|
||||
)
|
||||
|
||||
return Path(download_path)
|
||||
|
||||
def get_ocr_model(self) -> Optional[BaseOcrModel]:
|
||||
if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions):
|
||||
return EasyOcrModel(
|
||||
enabled=self.pipeline_options.do_ocr,
|
||||
options=self.pipeline_options.ocr_options,
|
||||
)
|
||||
elif isinstance(self.pipeline_options.ocr_options, TesseractCliOcrOptions):
|
||||
return TesseractOcrCliModel(
|
||||
enabled=self.pipeline_options.do_ocr,
|
||||
options=self.pipeline_options.ocr_options,
|
||||
)
|
||||
elif isinstance(self.pipeline_options.ocr_options, TesseractOcrOptions):
|
||||
return TesseractOcrModel(
|
||||
enabled=self.pipeline_options.do_ocr,
|
||||
options=self.pipeline_options.ocr_options,
|
||||
)
|
||||
return None
|
||||
|
||||
def initialize_page(self, doc: InputDocument, page: Page) -> Page:
|
||||
page._backend = doc._backend.load_page(page.page_no) # type: ignore
|
||||
if page._backend is not None and page._backend.is_valid():
|
||||
page.size = page._backend.get_size()
|
||||
|
||||
return page
|
||||
|
||||
def _assemble_document(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionResult:
|
||||
all_elements = []
|
||||
all_headers = []
|
||||
all_body = []
|
||||
|
||||
for p in conv_res.pages:
|
||||
assert p.assembled is not None
|
||||
for el in p.assembled.body:
|
||||
all_body.append(el)
|
||||
for el in p.assembled.headers:
|
||||
all_headers.append(el)
|
||||
for el in p.assembled.elements:
|
||||
all_elements.append(el)
|
||||
|
||||
conv_res.assembled = AssembledUnit(
|
||||
elements=all_elements, headers=all_headers, body=all_body
|
||||
)
|
||||
|
||||
conv_res.document = self.glm_model(conv_res)
|
||||
|
||||
# Generate page images in the output
|
||||
if self.pipeline_options.generate_page_images:
|
||||
for page in conv_res.pages:
|
||||
assert page.image is not None
|
||||
page_no = page.page_no + 1
|
||||
conv_res.document.pages[page_no].image = ImageRef.from_pil(
|
||||
page.image, dpi=int(72 * self.pipeline_options.images_scale)
|
||||
)
|
||||
|
||||
# Generate images of the requested element types
|
||||
if (
|
||||
self.pipeline_options.generate_picture_images
|
||||
or self.pipeline_options.generate_table_images
|
||||
):
|
||||
scale = self.pipeline_options.images_scale
|
||||
for element, _level in conv_res.document.iterate_items():
|
||||
if not isinstance(element, DocItem) or len(element.prov) == 0:
|
||||
continue
|
||||
if (
|
||||
isinstance(element, PictureItem)
|
||||
and self.pipeline_options.generate_picture_images
|
||||
) or (
|
||||
isinstance(element, TableItem)
|
||||
and self.pipeline_options.generate_table_images
|
||||
):
|
||||
page_ix = element.prov[0].page_no - 1
|
||||
page = conv_res.pages[page_ix]
|
||||
assert page.size is not None
|
||||
assert page.image is not None
|
||||
|
||||
crop_bbox = (
|
||||
element.prov[0]
|
||||
.bbox.scaled(scale=scale)
|
||||
.to_top_left_origin(page_height=page.size.height * scale)
|
||||
)
|
||||
|
||||
cropped_im = page.image.crop(crop_bbox.as_tuple())
|
||||
element.image = ImageRef.from_pil(cropped_im, dpi=int(72 * scale))
|
||||
|
||||
return conv_res
|
||||
|
||||
@classmethod
|
||||
def get_default_options(cls) -> PdfPipelineOptions:
|
||||
return PdfPipelineOptions()
|
||||
|
||||
@classmethod
|
||||
def is_backend_supported(cls, backend: AbstractDocumentBackend):
|
||||
return isinstance(backend, PdfDocumentBackend)
|
||||
@@ -1,9 +1,10 @@
|
||||
import logging
|
||||
from typing import Any, Dict, Iterable, List, Tuple, Union
|
||||
|
||||
from docling_core.types.doc.base import BaseCell, BaseText, Ref, Table, TableCell
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
|
||||
|
||||
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
|
||||
from docling.datamodel.base_models import OcrCell
|
||||
from docling.datamodel.document import ConversionResult, Page
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@@ -40,7 +41,7 @@ def generate_multimodal_pages(
|
||||
end_ix = 0
|
||||
doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []
|
||||
|
||||
doc = doc_result.output
|
||||
doc = doc_result.legacy_document
|
||||
|
||||
def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):
|
||||
segments = []
|
||||
|
||||
@@ -2,6 +2,7 @@ import copy
|
||||
import logging
|
||||
|
||||
import networkx as nx
|
||||
from docling_core.types.doc import DocItemLabel
|
||||
|
||||
logger = logging.getLogger("layout_utils")
|
||||
|
||||
@@ -370,7 +371,7 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
|
||||
"Treating cluster " + str(ix) + ", type " + str(new_cluster["type"])
|
||||
)
|
||||
logger.debug(" with cells: " + str(new_cluster["cell_ids"]))
|
||||
if len(cluster["cell_ids"]) == 0 and cluster["type"] != "Picture":
|
||||
if len(cluster["cell_ids"]) == 0 and cluster["type"] != DocItemLabel.PICTURE:
|
||||
logger.debug(" Empty non-picture, removed")
|
||||
continue ## Skip this former cluster, now without cells.
|
||||
new_bbox = adapt_bbox(raw_cells, new_cluster, orphan_cell_indices)
|
||||
@@ -380,14 +381,14 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
|
||||
|
||||
|
||||
def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
|
||||
if not (cluster["type"] in ["Table", "Picture"]):
|
||||
if not (cluster["type"] in [DocItemLabel.TABLE, DocItemLabel.PICTURE]):
|
||||
## A text-like cluster. The bbox only needs to be around the text cells:
|
||||
logger.debug(" Initial bbox: " + str(cluster["bbox"]))
|
||||
new_bbox = surrounding_list(
|
||||
[raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
|
||||
)
|
||||
logger.debug(" New bounding box:" + str(new_bbox))
|
||||
if cluster["type"] == "Picture":
|
||||
if cluster["type"] == DocItemLabel.PICTURE:
|
||||
## We only make the bbox completely comprise included text cells:
|
||||
logger.debug(" Picture")
|
||||
if len(cluster["cell_ids"]) != 0:
|
||||
@@ -587,7 +588,7 @@ def set_orphan_as_text(
|
||||
max_id = -1
|
||||
figures = []
|
||||
for cluster in cluster_predictions:
|
||||
if cluster["type"] == "Picture":
|
||||
if cluster["type"] == DocItemLabel.PICTURE:
|
||||
figures.append(cluster)
|
||||
|
||||
if cluster["id"] > max_id:
|
||||
@@ -638,13 +639,13 @@ def set_orphan_as_text(
|
||||
# if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans:
|
||||
if fig_flag == False and lines_detector == False:
|
||||
# get class from low confidence detections if not set as text:
|
||||
class_type = "Text"
|
||||
class_type = DocItemLabel.TEXT
|
||||
|
||||
for cluster in cluster_predictions_low:
|
||||
intersection = compute_intersection(
|
||||
orph_cell["bbox"], cluster["bbox"]
|
||||
)
|
||||
class_type = "Text"
|
||||
class_type = DocItemLabel.TEXT
|
||||
if (
|
||||
cluster["confidence"] > 0.1
|
||||
and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4
|
||||
@@ -718,7 +719,9 @@ def merge_cells(cluster_predictions):
|
||||
if cluster["id"] == node:
|
||||
lines.append(cluster)
|
||||
cluster_predictions.remove(cluster)
|
||||
new_merged_cluster = build_cluster_from_lines(lines, "Text", max_id)
|
||||
new_merged_cluster = build_cluster_from_lines(
|
||||
lines, DocItemLabel.TEXT, max_id
|
||||
)
|
||||
cluster_predictions.append(new_merged_cluster)
|
||||
return cluster_predictions
|
||||
|
||||
@@ -753,9 +756,9 @@ def clean_up_clusters(
|
||||
# remove clusters that might appear inside tables, or images (such as pdf cells in graphs)
|
||||
elif img_table == True:
|
||||
if (
|
||||
cluster_1["type"] == "Text"
|
||||
and cluster_2["type"] == "Picture"
|
||||
or cluster_2["type"] == "Table"
|
||||
cluster_1["type"] == DocItemLabel.TEXT
|
||||
and cluster_2["type"] == DocItemLabel.PICTURE
|
||||
or cluster_2["type"] == DocItemLabel.TABLE
|
||||
):
|
||||
if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > 0.5:
|
||||
DuplicateDeletedClusterIDs.append(cluster_1["id"])
|
||||
@@ -771,7 +774,10 @@ def clean_up_clusters(
|
||||
DuplicateDeletedClusterIDs.append(cluster_1["id"])
|
||||
# remove tables that have one pdf cell
|
||||
if one_cell_table == True:
|
||||
if cluster_1["type"] == "Table" and len(cluster_1["cell_ids"]) < 2:
|
||||
if (
|
||||
cluster_1["type"] == DocItemLabel.TABLE
|
||||
and len(cluster_1["cell_ids"]) < 2
|
||||
):
|
||||
DuplicateDeletedClusterIDs.append(cluster_1["id"])
|
||||
|
||||
DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs))
|
||||
|
||||
Reference in New Issue
Block a user