Mirror of https://github.com/DS4SD/docling.git

feat!: Docling v2 (#117)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Co-authored-by: Maxim Lysak <mly@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>

@@ -1,18 +1,19 @@
import copy
import warnings
from enum import Enum, auto
from io import BytesIO
from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union

from PIL.Image import Image
from pydantic import BaseModel, ConfigDict, Field, model_validator
from typing_extensions import Self

from docling.backend.abstract_backend import PdfPageBackend
from docling.datamodel.pipeline_options import ( # Must be imported here for backward compatibility.
PipelineOptions,
TableStructureOptions,
from docling_core.types.doc import (
BoundingBox,
DocItemLabel,
PictureDataType,
Size,
TableCell,
)
from PIL.Image import Image
from pydantic import BaseModel, ConfigDict

if TYPE_CHECKING:
from docling.backend.pdf_backend import PdfPageBackend


class ConversionStatus(str, Enum):
@@ -23,18 +24,61 @@ class ConversionStatus(str, Enum):
PARTIAL_SUCCESS = auto()


class InputFormat(str, Enum):
DOCX = "docx"
PPTX = "pptx"
HTML = "html"
IMAGE = "image"
PDF = "pdf"


class OutputFormat(str, Enum):
MARKDOWN = "md"
JSON = "json"
TEXT = "text"
DOCTAGS = "doctags"


FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
InputFormat.PDF: ["pdf"],
InputFormat.HTML: ["html", "htm", "xhtml"],
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
}

FormatToMimeType: Dict[InputFormat, Set[str]] = {
InputFormat.DOCX: {
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
},
InputFormat.PPTX: {
"application/vnd.openxmlformats-officedocument.presentationml.template",
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
},
InputFormat.HTML: {"text/html", "application/xhtml+xml"},
InputFormat.IMAGE: {
"image/png",
"image/jpeg",
"image/tiff",
"image/gif",
"image/bmp",
},
InputFormat.PDF: {"application/pdf"},
}
MimeTypeToFormat = {
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
}
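The three mappings above make format handling a dictionary lookup: a format knows its extensions and MIME types, and MimeTypeToFormat inverts FormatToMimeType so a sniffed MIME type resolves back to an InputFormat. A minimal sketch of how they compose, assuming these names are importable from docling.datamodel.base_models (the module path used by the import block later in this diff):

from docling.datamodel.base_models import (
    FormatToExtensions,
    FormatToMimeType,
    InputFormat,
    MimeTypeToFormat,
)

# Forward direction: each format lists its extensions and MIME types.
assert "docx" in FormatToExtensions[InputFormat.DOCX]
assert "application/pdf" in FormatToMimeType[InputFormat.PDF]

# Reverse direction: a detected MIME type maps to a format, or None if unsupported.
assert MimeTypeToFormat.get("text/html") is InputFormat.HTML
assert MimeTypeToFormat.get("application/unknown") is None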


class DocInputType(str, Enum):
PATH = auto()
STREAM = auto()


class CoordOrigin(str, Enum):
TOPLEFT = auto()
BOTTOMLEFT = auto()


class DoclingComponentType(str, Enum):
PDF_BACKEND = auto()
DOCUMENT_BACKEND = auto()
MODEL = auto()
DOC_ASSEMBLER = auto()

@@ -45,118 +89,6 @@ class ErrorItem(BaseModel):
error_message: str


class PageSize(BaseModel):
width: float = 0.0
height: float = 0.0


class BoundingBox(BaseModel):
l: float # left
t: float # top
r: float # right
b: float # bottom

coord_origin: CoordOrigin = CoordOrigin.TOPLEFT

@property
def width(self):
return self.r - self.l

@property
def height(self):
return abs(self.t - self.b)

def scaled(self, scale: float) -> "BoundingBox":
out_bbox = copy.deepcopy(self)
out_bbox.l *= scale
out_bbox.r *= scale
out_bbox.t *= scale
out_bbox.b *= scale

return out_bbox

def normalized(self, page_size: PageSize) -> "BoundingBox":
out_bbox = copy.deepcopy(self)
out_bbox.l /= page_size.width
out_bbox.r /= page_size.width
out_bbox.t /= page_size.height
out_bbox.b /= page_size.height

return out_bbox

def as_tuple(self):
if self.coord_origin == CoordOrigin.TOPLEFT:
return (self.l, self.t, self.r, self.b)
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
return (self.l, self.b, self.r, self.t)

@classmethod
def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
if origin == CoordOrigin.TOPLEFT:
l, t, r, b = coord[0], coord[1], coord[2], coord[3]
if r < l:
l, r = r, l
if b < t:
b, t = t, b

return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
elif origin == CoordOrigin.BOTTOMLEFT:
l, b, r, t = coord[0], coord[1], coord[2], coord[3]
if r < l:
l, r = r, l
if b > t:
b, t = t, b

return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)

def area(self) -> float:
area = (self.r - self.l) * (self.b - self.t)
if self.coord_origin == CoordOrigin.BOTTOMLEFT:
area = -area
return area

def intersection_area_with(self, other: "BoundingBox") -> float:
# Calculate intersection coordinates
left = max(self.l, other.l)
top = max(self.t, other.t)
right = min(self.r, other.r)
bottom = min(self.b, other.b)

# Calculate intersection dimensions
width = right - left
height = bottom - top

# If the bounding boxes do not overlap, width or height will be negative
if width <= 0 or height <= 0:
return 0.0

return width * height

def to_bottom_left_origin(self, page_height) -> "BoundingBox":
if self.coord_origin == CoordOrigin.BOTTOMLEFT:
return self
elif self.coord_origin == CoordOrigin.TOPLEFT:
return BoundingBox(
l=self.l,
r=self.r,
t=page_height - self.t,
b=page_height - self.b,
coord_origin=CoordOrigin.BOTTOMLEFT,
)

def to_top_left_origin(self, page_height):
if self.coord_origin == CoordOrigin.TOPLEFT:
return self
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
return BoundingBox(
l=self.l,
r=self.r,
t=page_height - self.t, # self.b
b=page_height - self.b, # self.t
coord_origin=CoordOrigin.TOPLEFT,
)
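The two origin-conversion helpers above are pure arithmetic: switching between a top-left and a bottom-left origin maps t to page_height - t and b to page_height - b, and applying the map twice returns the original box. A small standalone sketch of that transform (hypothetical names, independent of the classes in this diff):

def flip_vertical_origin(l: float, t: float, r: float, b: float, page_height: float):
    """Convert a box between top-left and bottom-left page origins.

    The transform is its own inverse, which mirrors the behaviour of
    to_bottom_left_origin / to_top_left_origin above.
    """
    return l, page_height - t, r, page_height - b


# A box near the top of a 100-unit-tall page, measured from the top edge...
box = (5.0, 20.0, 50.0, 30.0)
flipped = flip_vertical_origin(*box, page_height=100.0)
assert flipped == (5.0, 80.0, 50.0, 70.0)  # ...now measured from the bottom edge
assert flip_vertical_origin(*flipped, page_height=100.0) == box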


class Cell(BaseModel):
id: int
text: str
@@ -169,14 +101,14 @@ class OcrCell(Cell):

class Cluster(BaseModel):
id: int
label: str
label: DocItemLabel
bbox: BoundingBox
confidence: float = 1.0
cells: List[Cell] = []


class BasePageElement(BaseModel):
label: str
label: DocItemLabel
id: int
page_no: int
cluster: Cluster
@@ -187,37 +119,7 @@ class LayoutPrediction(BaseModel):
clusters: List[Cluster] = []


class TableCell(BaseModel):
bbox: BoundingBox
row_span: int
col_span: int
start_row_offset_idx: int
end_row_offset_idx: int
start_col_offset_idx: int
end_col_offset_idx: int
text: str
column_header: bool = False
row_header: bool = False
row_section: bool = False

@model_validator(mode="before")
@classmethod
def from_dict_format(cls, data: Any) -> Any:
if isinstance(data, Dict):
text = data["bbox"].get("token", "")
if not len(text):
text_cells = data.pop("text_cell_bboxes", None)
if text_cells:
for el in text_cells:
text += el["token"] + " "

text = text.strip()
data["text"] = text

return data
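The from_dict_format validator above backfills the cell text from per-word tokens when the top-level token is empty. A hedged sketch of the kind of payload it handles; the payload contents and the coordinate keys inside "bbox" are made up for illustration, only the "token" and "text_cell_bboxes" keys mirror the validator:

# Hypothetical serialized cell in the "before" shape accepted by from_dict_format.
payload = {
    "bbox": {"l": 0.0, "t": 0.0, "r": 10.0, "b": 5.0, "token": ""},
    "text_cell_bboxes": [{"token": "Net"}, {"token": "revenue"}],
}

# What the validator computes when the top-level token is empty:
text = payload["bbox"].get("token", "")
if not len(text):
    for el in payload.pop("text_cell_bboxes", []):
        text += el["token"] + " "
payload["text"] = text.strip()

assert payload["text"] == "Net revenue"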


class TableElement(BasePageElement):
class Table(BasePageElement):
otsl_seq: List[str]
num_rows: int = 0
num_cols: int = 0
@@ -225,18 +127,15 @@ class TableElement(BasePageElement):


class TableStructurePrediction(BaseModel):
table_map: Dict[int, TableElement] = {}
table_map: Dict[int, Table] = {}


class TextElement(BasePageElement): ...


class FigureData(BaseModel):
pass
class TextElement(BasePageElement):
text: str


class FigureElement(BasePageElement):
data: Optional[FigureData] = None
annotations: List[PictureDataType] = []
provenance: Optional[str] = None
predicted_class: Optional[str] = None
confidence: Optional[float] = None
@@ -259,7 +158,7 @@ class PagePredictions(BaseModel):
equations_prediction: Optional[EquationPrediction] = None


PageElement = Union[TextElement, TableElement, FigureElement]
PageElement = Union[TextElement, Table, FigureElement]


class AssembledUnit(BaseModel):
@@ -272,13 +171,13 @@ class Page(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)

page_no: int
page_hash: Optional[str] = None
size: Optional[PageSize] = None
# page_hash: Optional[str] = None
size: Optional[Size] = None
cells: List[Cell] = []
predictions: PagePredictions = PagePredictions()
assembled: Optional[AssembledUnit] = None

_backend: Optional[PdfPageBackend] = (
_backend: Optional["PdfPageBackend"] = (
None # Internal PDF backend. By default it is cleared during assembling.
)
_default_image_scale: float = 1.0 # Default image scale for external usage.
@@ -301,24 +200,5 @@ class Page(BaseModel):
class DocumentStream(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)

filename: str
name: str
stream: BytesIO


class AssembleOptions(BaseModel):
keep_page_images: Annotated[
bool,
Field(
deprecated="`keep_page_images` is deprecated, set the value of `images_scale` instead"
),
] = False # False: page images are removed in the assemble step
images_scale: Optional[float] = None # if set, the scale for generated images

@model_validator(mode="after")
def set_page_images_from_deprecated(self) -> Self:
with warnings.catch_warnings():
warnings.simplefilter("ignore", DeprecationWarning)
default_scale = 1.0
if self.keep_page_images and self.images_scale is None:
self.images_scale = default_scale
return self
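The AssembleOptions hunk above shows the deprecation pattern used by this change: the old boolean switch stays accepted but only feeds the new field via an "after" validator. A small self-contained sketch of the same pattern (a hypothetical model, not docling's own class):

import warnings
from typing import Annotated, Optional

from pydantic import BaseModel, Field, model_validator
from typing_extensions import Self


class LegacyImageOptions(BaseModel):
    # Old switch, kept only for backward compatibility.
    keep_page_images: Annotated[
        bool, Field(deprecated="set `images_scale` instead")
    ] = False
    # New knob that replaces it.
    images_scale: Optional[float] = None

    @model_validator(mode="after")
    def _migrate_deprecated_flag(self) -> Self:
        # Reading the deprecated field emits a DeprecationWarning; silence it here.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            if self.keep_page_images and self.images_scale is None:
                self.images_scale = 1.0
        return self


assert LegacyImageOptions(keep_page_images=True).images_scale == 1.0
assert LegacyImageOptions().images_scale is None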

@@ -1,87 +1,101 @@
import logging
import re
from enum import Enum
from io import BytesIO
from pathlib import Path, PurePath
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union

from docling_core.types import BaseCell, BaseText
import filetype
from docling_core.types import BaseText
from docling_core.types import Document as DsDocument
from docling_core.types import DocumentDescription as DsDocumentDescription
from docling_core.types import FileInfoObject as DsFileInfoObject
from docling_core.types import PageDimensions, PageReference, Prov, Ref
from docling_core.types import Table as DsSchemaTable
from docling_core.types import TableCell
from docling_core.types.doc.base import BoundingBox as DsBoundingBox
from docling_core.types.doc.base import Figure
from docling_core.types.doc import (
DocItem,
DocItemLabel,
DoclingDocument,
PictureItem,
SectionHeaderItem,
TableItem,
TextItem,
)
from docling_core.types.doc.document import ListItem
from docling_core.types.legacy_doc.base import Figure, GlmTableCell, TableCell
from docling_core.utils.file import resolve_file_source
from pydantic import BaseModel
from typing_extensions import deprecated

from docling.backend.abstract_backend import PdfDocumentBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.abstract_backend import (
AbstractDocumentBackend,
PaginatedDocumentBackend,
)
from docling.datamodel.base_models import (
AssembledUnit,
ConversionStatus,
DocumentStream,
ErrorItem,
FigureElement,
InputFormat,
MimeTypeToFormat,
Page,
PageElement,
TableElement,
TextElement,
)
from docling.datamodel.settings import DocumentLimits
from docling.utils.utils import create_file_hash
from docling.utils.utils import create_file_hash, create_hash

if TYPE_CHECKING:
from docling.document_converter import FormatOption

_log = logging.getLogger(__name__)

layout_label_to_ds_type = {
"Title": "title",
"Document Index": "table-of-path_or_stream",
"Section-header": "subtitle-level-1",
"Checkbox-Selected": "checkbox-selected",
"Checkbox-Unselected": "checkbox-unselected",
"Caption": "caption",
"Page-header": "page-header",
"Page-footer": "page-footer",
"Footnote": "footnote",
"Table": "table",
"Formula": "equation",
"List-item": "paragraph",
"Code": "paragraph",
"Picture": "figure",
"Text": "paragraph",
DocItemLabel.TITLE: "title",
DocItemLabel.DOCUMENT_INDEX: "table-of-contents",
DocItemLabel.SECTION_HEADER: "subtitle-level-1",
DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
DocItemLabel.CAPTION: "caption",
DocItemLabel.PAGE_HEADER: "page-header",
DocItemLabel.PAGE_FOOTER: "page-footer",
DocItemLabel.FOOTNOTE: "footnote",
DocItemLabel.TABLE: "table",
DocItemLabel.FORMULA: "equation",
DocItemLabel.LIST_ITEM: "paragraph",
DocItemLabel.CODE: "paragraph",
DocItemLabel.PICTURE: "figure",
DocItemLabel.TEXT: "paragraph",
DocItemLabel.PARAGRAPH: "paragraph",
}

_EMPTY_DOC = DsDocument(
_name="",
description=DsDocumentDescription(logs=[]),
file_info=DsFileInfoObject(
filename="",
document_hash="",
),
)
_EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")


class InputDocument(BaseModel):
file: PurePath = None
document_hash: Optional[str] = None
valid: bool = False
file: PurePath
document_hash: str # = None
valid: bool = True
limits: DocumentLimits = DocumentLimits()
format: InputFormat # = None

filesize: Optional[int] = None
page_count: Optional[int] = None
page_count: int = 0

_backend: PdfDocumentBackend = None # Internal PDF backend used
_backend: AbstractDocumentBackend # Internal PDF backend used

def __init__(
self,
path_or_stream: Union[BytesIO, Path],
format: InputFormat,
backend: Type[AbstractDocumentBackend],
filename: Optional[str] = None,
limits: Optional[DocumentLimits] = None,
pdf_backend=DoclingParseDocumentBackend,
):
super().__init__()
super().__init__(
file="", document_hash="", format=InputFormat.PDF
) # initialize with dummy values

self.limits = limits or DocumentLimits()
self.format = format

try:
if isinstance(path_or_stream, Path):
@@ -91,11 +105,12 @@ class InputDocument(BaseModel):
self.valid = False
else:
self.document_hash = create_file_hash(path_or_stream)
self._backend = pdf_backend(
path_or_stream=path_or_stream, document_hash=self.document_hash
)
self._init_doc(backend, path_or_stream)

elif isinstance(path_or_stream, BytesIO):
assert (
filename is not None
), "Can't construct InputDocument from stream without providing filename arg."
self.file = PurePath(filename)
self.filesize = path_or_stream.getbuffer().nbytes

@@ -103,15 +118,20 @@ class InputDocument(BaseModel):
self.valid = False
else:
self.document_hash = create_file_hash(path_or_stream)
self._backend = pdf_backend(
path_or_stream=path_or_stream, document_hash=self.document_hash
)
self._init_doc(backend, path_or_stream)
else:
raise RuntimeError(
f"Unexpected type path_or_stream: {type(path_or_stream)}"
)

if self.document_hash and self._backend.page_count() > 0:
self.page_count = self._backend.page_count()

if self.page_count <= self.limits.max_num_pages:
self.valid = True
# For paginated backends, check if the maximum page count is exceeded.
if self.valid and self._backend.is_valid():
if self._backend.supports_pagination() and isinstance(
self._backend, PaginatedDocumentBackend
):
self.page_count = self._backend.page_count()
if not self.page_count <= self.limits.max_num_pages:
self.valid = False

except (FileNotFoundError, OSError) as e:
_log.exception(
@@ -125,9 +145,26 @@ class InputDocument(BaseModel):
)
# raise

def _init_doc(
self,
backend: Type[AbstractDocumentBackend],
path_or_stream: Union[BytesIO, Path],
) -> None:
if backend is None:
raise RuntimeError(
f"No backend configuration provided for file {self.file.name} with format {self.format}. "
f"Please check your format configuration on DocumentConverter."
)

@deprecated("Use `ConversionResult` instead.")
class ConvertedDocument(BaseModel):
self._backend = backend(self, path_or_stream=path_or_stream)


class DocumentFormat(str, Enum):
V2 = "v2"
V1 = "v1"


class ConversionResult(BaseModel):
input: InputDocument

status: ConversionStatus = ConversionStatus.PENDING # failure, success
@@ -136,15 +173,42 @@ class ConvertedDocument(BaseModel):
pages: List[Page] = []
assembled: AssembledUnit = AssembledUnit()

output: DsDocument = _EMPTY_DOC
document: DoclingDocument = _EMPTY_DOCLING_DOC

@property
@deprecated("Use document instead.")
def legacy_document(self):
reverse_label_mapping = {
DocItemLabel.CAPTION.value: "Caption",
DocItemLabel.FOOTNOTE.value: "Footnote",
DocItemLabel.FORMULA.value: "Formula",
DocItemLabel.LIST_ITEM.value: "List-item",
DocItemLabel.PAGE_FOOTER.value: "Page-footer",
DocItemLabel.PAGE_HEADER.value: "Page-header",
DocItemLabel.PICTURE.value: "Picture", # low threshold adjust to capture chemical structures for examples.
DocItemLabel.SECTION_HEADER.value: "Section-header",
DocItemLabel.TABLE.value: "Table",
DocItemLabel.TEXT.value: "Text",
DocItemLabel.TITLE.value: "Title",
DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
DocItemLabel.CODE.value: "Code",
DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
DocItemLabel.FORM.value: "Form",
DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
DocItemLabel.PARAGRAPH.value: "paragraph",
}

def _to_ds_document(self) -> DsDocument:
title = ""
desc = DsDocumentDescription(logs=[])

page_hashes = [
PageReference(hash=p.page_hash, page=p.page_no + 1, model="default")
for p in self.pages
PageReference(
hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
page=p.page_no,
model="default",
)
for p in self.document.pages.values()
]

file_info = DsFileInfoObject(
@@ -157,145 +221,199 @@ class ConvertedDocument(BaseModel):
main_text = []
tables = []
figures = []
equations = []
footnotes = []
page_headers = []
page_footers = []

page_no_to_page = {p.page_no: p for p in self.pages}
embedded_captions = set()
for ix, (item, level) in enumerate(
self.document.iterate_items(self.document.body)
):

for element in self.assembled.elements:
# Convert bboxes to lower-left origin.
target_bbox = DsBoundingBox(
element.cluster.bbox.to_bottom_left_origin(
page_no_to_page[element.page_no].size.height
).as_tuple()
)
if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
caption = item.caption_text(self.document)
if caption:
embedded_captions.add(caption)

if isinstance(element, TextElement):
main_text.append(
BaseText(
text=element.text,
obj_type=layout_label_to_ds_type.get(element.label),
name=element.label,
prov=[
Prov(
bbox=target_bbox,
page=element.page_no + 1,
span=[0, len(element.text)],
)
],
)
)
elif isinstance(element, TableElement):
index = len(tables)
ref_str = f"#/tables/{index}"
main_text.append(
Ref(
name=element.label,
obj_type=layout_label_to_ds_type.get(element.label),
ref=ref_str,
),
)
for item, level in self.document.iterate_items():
if isinstance(item, DocItem):
item_type = item.label

# Initialise empty table data grid (only empty cells)
table_data = [
[
TableCell(
text="",
# bbox=[0,0,0,0],
spans=[[i, j]],
obj_type="body",
if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):

if isinstance(item, ListItem) and item.marker:
text = f"{item.marker} {item.text}"
else:
text = item.text

# Can be empty.
prov = [
Prov(
bbox=p.bbox.as_tuple(),
page=p.page_no,
span=[0, len(item.text)],
)
for j in range(element.num_cols)
for p in item.prov
]
for i in range(element.num_rows)
]
main_text.append(
BaseText(
text=text,
obj_type=layout_label_to_ds_type.get(item.label),
name=reverse_label_mapping[item.label],
prov=prov,
)
)

# Overwrite cells in table data for which there is actual cell content.
for cell in element.table_cells:
for i in range(
min(cell.start_row_offset_idx, element.num_rows),
min(cell.end_row_offset_idx, element.num_rows),
):
for j in range(
min(cell.start_col_offset_idx, element.num_cols),
min(cell.end_col_offset_idx, element.num_cols),
# skip captions if they are embedded in the actual
# floating object
if item_type == DocItemLabel.CAPTION and text in embedded_captions:
continue

elif isinstance(item, TableItem) and item.data:
index = len(tables)
ref_str = f"#/tables/{index}"
main_text.append(
Ref(
name=reverse_label_mapping[item.label],
obj_type=layout_label_to_ds_type.get(item.label),
ref=ref_str,
),
)

# Initialise empty table data grid (only empty cells)
table_data = [
[
TableCell(
text="",
# bbox=[0,0,0,0],
spans=[[i, j]],
obj_type="body",
)
for j in range(item.data.num_cols)
]
for i in range(item.data.num_rows)
]

# Overwrite cells in table data for which there is actual cell content.
for cell in item.data.table_cells:
for i in range(
min(cell.start_row_offset_idx, item.data.num_rows),
min(cell.end_row_offset_idx, item.data.num_rows),
):
celltype = "body"
if cell.column_header:
celltype = "col_header"
elif cell.row_header:
celltype = "row_header"
elif cell.row_section:
celltype = "row_section"
for j in range(
min(cell.start_col_offset_idx, item.data.num_cols),
min(cell.end_col_offset_idx, item.data.num_cols),
):
celltype = "body"
if cell.column_header:
celltype = "col_header"
elif cell.row_header:
celltype = "row_header"
elif cell.row_section:
celltype = "row_section"

def make_spans(cell):
for rspan in range(
min(cell.start_row_offset_idx, element.num_rows),
min(cell.end_row_offset_idx, element.num_rows),
):
for cspan in range(
def make_spans(cell):
for rspan in range(
min(
cell.start_col_offset_idx, element.num_cols
cell.start_row_offset_idx,
item.data.num_rows,
),
min(
cell.end_row_offset_idx, item.data.num_rows
),
min(cell.end_col_offset_idx, element.num_cols),
):
yield [rspan, cspan]
for cspan in range(
min(
cell.start_col_offset_idx,
item.data.num_cols,
),
min(
cell.end_col_offset_idx,
item.data.num_cols,
),
):
yield [rspan, cspan]

spans = list(make_spans(cell))
table_data[i][j] = TableCell(
text=cell.text,
bbox=cell.bbox.to_bottom_left_origin(
page_no_to_page[element.page_no].size.height
).as_tuple(),
# col=j,
# row=i,
spans=spans,
obj_type=celltype,
# col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
# row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
)
spans = list(make_spans(cell))
table_data[i][j] = GlmTableCell(
text=cell.text,
bbox=(
cell.bbox.as_tuple()
if cell.bbox is not None
else None
), # check if this is bottom-left
spans=spans,
obj_type=celltype,
col=j,
row=i,
row_header=cell.row_header,
row_section=cell.row_section,
col_header=cell.column_header,
row_span=[
cell.start_row_offset_idx,
cell.end_row_offset_idx,
],
col_span=[
cell.start_col_offset_idx,
cell.end_col_offset_idx,
],
)

tables.append(
DsSchemaTable(
num_cols=element.num_cols,
num_rows=element.num_rows,
obj_type=layout_label_to_ds_type.get(element.label),
data=table_data,
prov=[
Prov(
bbox=target_bbox,
page=element.page_no + 1,
span=[0, 0],
)
],
# Compute the caption
caption = item.caption_text(self.document)

tables.append(
DsSchemaTable(
text=caption,
num_cols=item.data.num_cols,
num_rows=item.data.num_rows,
obj_type=layout_label_to_ds_type.get(item.label),
data=table_data,
prov=[
Prov(
bbox=p.bbox.as_tuple(),
page=p.page_no,
span=[0, 0],
)
for p in item.prov
],
)
)
)

elif isinstance(element, FigureElement):
index = len(figures)
ref_str = f"#/figures/{index}"
main_text.append(
Ref(
name=element.label,
obj_type=layout_label_to_ds_type.get(element.label),
ref=ref_str,
),
)
figures.append(
Figure(
prov=[
Prov(
bbox=target_bbox,
page=element.page_no + 1,
span=[0, 0],
)
],
obj_type=layout_label_to_ds_type.get(element.label),
# data=[[]],
elif isinstance(item, PictureItem):
index = len(figures)
ref_str = f"#/figures/{index}"
main_text.append(
Ref(
name=reverse_label_mapping[item.label],
obj_type=layout_label_to_ds_type.get(item.label),
ref=ref_str,
),
)

# Compute the caption
caption = item.caption_text(self.document)

figures.append(
Figure(
prov=[
Prov(
bbox=p.bbox.as_tuple(),
page=p.page_no,
span=[0, len(caption)],
)
for p in item.prov
],
obj_type=layout_label_to_ds_type.get(item.label),
text=caption,
# data=[[]],
)
)
)

page_dimensions = [
PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
for p in self.pages
PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
for p in self.document.pages.values()
]

ds_doc = DsDocument(
@@ -303,6 +421,10 @@ class ConvertedDocument(BaseModel):
description=desc,
file_info=file_info,
main_text=main_text,
equations=equations,
footnotes=footnotes,
page_headers=page_headers,
page_footers=page_footers,
tables=tables,
figures=figures,
page_dimensions=page_dimensions,
@@ -310,152 +432,76 @@ class ConvertedDocument(BaseModel):

return ds_doc

def render_as_dict(self):
return self.output.model_dump(by_alias=True, exclude_none=True)

def render_as_markdown(
self,
delim: str = "\n\n",
main_text_start: int = 0,
main_text_stop: Optional[int] = None,
main_text_labels: list[str] = [
"title",
"subtitle-level-1",
"paragraph",
"caption",
"table",
"figure",
],
strict_text: bool = False,
image_placeholder: str = "<!-- image -->",
):
return self.output.export_to_markdown(
delim=delim,
main_text_start=main_text_start,
main_text_stop=main_text_stop,
main_text_labels=main_text_labels,
strict_text=strict_text,
image_placeholder=image_placeholder,
)
class _DocumentConversionInput(BaseModel):

def render_as_text(
self,
delim: str = "\n\n",
main_text_start: int = 0,
main_text_stop: Optional[int] = None,
main_text_labels: list[str] = [
"title",
"subtitle-level-1",
"paragraph",
"caption",
],
):
return self.output.export_to_markdown(
delim=delim,
main_text_start=main_text_start,
main_text_stop=main_text_stop,
main_text_labels=main_text_labels,
strict_text=True,
)

def render_as_doctags(
self,
delim: str = "\n\n",
main_text_start: int = 0,
main_text_stop: Optional[int] = None,
main_text_labels: list[str] = [
"title",
"subtitle-level-1",
"paragraph",
"caption",
"table",
"figure",
],
xsize: int = 100,
ysize: int = 100,
add_location: bool = True,
add_content: bool = True,
add_page_index: bool = True,
# table specific flags
add_table_cell_location: bool = False,
add_table_cell_label: bool = True,
add_table_cell_text: bool = True,
) -> str:
return self.output.export_to_document_tokens(
delim=delim,
main_text_start=main_text_start,
main_text_stop=main_text_stop,
main_text_labels=main_text_labels,
xsize=xsize,
ysize=ysize,
add_location=add_location,
add_content=add_content,
add_page_index=add_page_index,
# table specific flags
add_table_cell_location=add_table_cell_location,
add_table_cell_label=add_table_cell_label,
add_table_cell_text=add_table_cell_text,
)

def render_element_images(
self, element_types: Tuple[PageElement] = (FigureElement,)
):
for element in self.assembled.elements:
if isinstance(element, element_types):
page_ix = element.page_no
scale = self.pages[page_ix]._default_image_scale
crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
page_height=self.pages[page_ix].size.height * scale
)

cropped_im = self.pages[page_ix].image.crop(crop_bbox.as_tuple())
yield element, cropped_im


class ConversionResult(ConvertedDocument):
pass


class DocumentConversionInput(BaseModel):

_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
limits: Optional[DocumentLimits] = DocumentLimits()

DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend

def docs(
self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
self, format_options: Dict[InputFormat, "FormatOption"]
) -> Iterable[InputDocument]:
for item in self.path_or_stream_iterator:
obj = resolve_file_source(item) if isinstance(item, str) else item
format = self._guess_format(obj)
if format not in format_options.keys():
_log.info(
f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
)
continue
else:
backend = format_options[format].backend

pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND

for obj in self._path_or_stream_iterator:
if isinstance(obj, Path):
yield InputDocument(
path_or_stream=obj, limits=self.limits, pdf_backend=pdf_backend
path_or_stream=obj,
format=format,
filename=obj.name,
limits=self.limits,
backend=backend,
)
elif isinstance(obj, DocumentStream):
yield InputDocument(
path_or_stream=obj.stream,
filename=obj.filename,
format=format,
filename=obj.name,
limits=self.limits,
pdf_backend=pdf_backend,
backend=backend,
)
else:
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")

@classmethod
def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
paths = [Path(p) for p in paths]
def _guess_format(self, obj):
content = None
if isinstance(obj, Path):
mime = filetype.guess_mime(str(obj))
if mime is None:
with obj.open("rb") as f:
content = f.read(1024) # Read first 1KB

doc_input = cls(limits=limits)
doc_input._path_or_stream_iterator = paths
elif isinstance(obj, DocumentStream):
obj.stream.seek(0)
content = obj.stream.read(8192)
obj.stream.seek(0)
mime = filetype.guess_mime(content)

return doc_input
if mime is None:
mime = self._detect_html_xhtml(content)

@classmethod
def from_streams(
cls, streams: Iterable[DocumentStream], limits: Optional[DocumentLimits] = None
):
doc_input = cls(limits=limits)
doc_input._path_or_stream_iterator = streams
format = MimeTypeToFormat.get(mime)
return format

return doc_input
def _detect_html_xhtml(self, content):
content_str = content.decode("ascii", errors="ignore").lower()
# Remove XML comments
content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
content_str = content_str.lstrip()

if re.match(r"<\?xml", content_str):
if "xhtml" in content_str[:1000]:
return "application/xhtml+xml"

if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
return "text/html"

return None
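_guess_format above sniffs a MIME type with the filetype package and falls back to the lightweight HTML/XHTML detector when magic-byte sniffing fails (HTML has no magic bytes), then resolves the MIME type through MimeTypeToFormat. A condensed standalone sketch of that fallback chain for raw bytes, with the regexes copied from _detect_html_xhtml (the helper name here is hypothetical):

import re

import filetype


def guess_mime_from_bytes(content: bytes):
    # First try magic-byte sniffing (covers PDF, Office formats, images, ...).
    mime = filetype.guess_mime(content)
    if mime is not None:
        return mime

    # Fall back to the HTML/XHTML heuristic used by _detect_html_xhtml.
    text = content.decode("ascii", errors="ignore").lower()
    text = re.sub(r"<!--(.*?)-->", "", text, flags=re.DOTALL).lstrip()
    if re.match(r"<\?xml", text) and "xhtml" in text[:1000]:
        return "application/xhtml+xml"
    if re.match(r"<!doctype\s+html|<html|<head|<body", text):
        return "text/html"
    return None


assert guess_mime_from_bytes(b"<!DOCTYPE html><html></html>") == "text/html"
assert guess_mime_from_bytes(b"%PDF-1.7 minimal header") == "application/pdf"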

@@ -1,4 +1,5 @@
from enum import Enum, auto
from pathlib import Path
from typing import List, Literal, Optional, Union

from pydantic import BaseModel, ConfigDict, Field
@@ -58,6 +59,13 @@ class TesseractOcrOptions(OcrOptions):


class PipelineOptions(BaseModel):
create_legacy_output: bool = (
True # This default will be set to False in a future version of docling
)


class PdfPipelineOptions(PipelineOptions):
artifacts_path: Optional[Union[Path, str]] = None
do_table_structure: bool = True # True: perform table structure extraction
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text

@@ -65,3 +73,8 @@ class PipelineOptions(BaseModel):
ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
Field(EasyOcrOptions(), discriminator="kind")
)

images_scale: float = 1.0
generate_page_images: bool = False
generate_picture_images: bool = False
generate_table_images: bool = False
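The new image-related fields on PdfPipelineOptions replace the old AssembleOptions knobs: one global images_scale plus per-artifact switches. A hedged usage sketch, assuming PdfPipelineOptions is exposed from docling.datamodel.pipeline_options alongside PipelineOptions as this hunk suggests:

from docling.datamodel.pipeline_options import PdfPipelineOptions

# Keep table structure and OCR at their defaults, but ask the pipeline to
# render page and picture images at 2x scale for later cropping/export.
pipeline_options = PdfPipelineOptions(
    images_scale=2.0,
    generate_page_images=True,
    generate_picture_images=True,
    generate_table_images=False,
)

assert pipeline_options.do_table_structure and pipeline_options.do_ocr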

@@ -14,6 +14,7 @@ class BatchConcurrencySettings(BaseModel):
doc_batch_concurrency: int = 2
page_batch_size: int = 4
page_batch_concurrency: int = 2
elements_batch_size: int = 16

# doc_batch_size: int = 1
# doc_batch_concurrency: int = 1