Merge pull request #504 from DS4SD/cau/layout-postprocessing

feat: Support hierarchical layout components, expose and group content in pictures, forms and key-value regions
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-12-06 12:26:34 +01:00 committed by GitHub
commit b0da1a2127
16 changed files with 1292 additions and 1297 deletions

View File

@ -6,7 +6,7 @@ from typing import Iterable, List, Optional, Union
import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
from docling_parse.docling_parse import pdf_parser_v1
from docling_parse.pdf_parsers import pdf_parser_v1
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage

View File

@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_parse.docling_parse import pdf_parser_v2
from docling_parse.pdf_parsers import pdf_parser_v2
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage

View File

@ -121,6 +121,7 @@ class Cluster(BaseModel):
bbox: BoundingBox
confidence: float = 1.0
cells: List[Cell] = []
children: List["Cluster"] = [] # Add child cluster support
class BasePageElement(BaseModel):
@ -135,6 +136,12 @@ class LayoutPrediction(BaseModel):
clusters: List[Cluster] = []
# Page element type used for Form and Key-Value-Region clusters.
# It adds no fields or behaviour on top of BasePageElement; it exists only so
# that such elements can be told apart by type (e.g. via isinstance checks).
class ContainerElement(
    BasePageElement
):  # Used for Form and Key-Value-Regions, only for typing.
    pass
class Table(BasePageElement):
otsl_seq: List[str]
num_rows: int = 0
@ -174,7 +181,7 @@ class PagePredictions(BaseModel):
equations_prediction: Optional[EquationPrediction] = None
PageElement = Union[TextElement, Table, FigureElement]
PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
class AssembledUnit(BaseModel):

View File

@ -77,6 +77,8 @@ layout_label_to_ds_type = {
DocItemLabel.PICTURE: "figure",
DocItemLabel.TEXT: "paragraph",
DocItemLabel.PARAGRAPH: "paragraph",
DocItemLabel.FORM: DocItemLabel.FORM.value,
DocItemLabel.KEY_VALUE_REGION: DocItemLabel.KEY_VALUE_REGION.value,
}
_EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")

View File

@ -31,6 +31,7 @@ class DebugSettings(BaseModel):
visualize_cells: bool = False
visualize_ocr: bool = False
visualize_layout: bool = False
visualize_raw_layout: bool = False
visualize_tables: bool = False
profile_pipeline_timings: bool = False

View File

@ -10,6 +10,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.asciidoc_backend import AsciiDocBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend
@ -84,7 +85,7 @@ class HTMLFormatOption(FormatOption):
class PdfFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
class ImageFormatOption(FormatOption):

View File

@ -4,7 +4,6 @@ from pathlib import Path
from typing import List, Union
from deepsearch_glm.nlp_utils import init_nlp_model
from deepsearch_glm.utils.doc_utils import to_docling_document
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
@ -24,11 +23,18 @@ from docling_core.types.legacy_doc.document import (
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
from PIL import ImageDraw
from pydantic import BaseModel, ConfigDict
from pydantic import BaseModel, ConfigDict, TypeAdapter
from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
from docling.datamodel.base_models import (
Cluster,
ContainerElement,
FigureElement,
Table,
TextElement,
)
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
from docling.datamodel.settings import settings
from docling.utils.glm_utils import to_docling_document
from docling.utils.profiling import ProfilingScope, TimeRecorder
from docling.utils.utils import create_hash
@ -45,7 +51,9 @@ class GlmModel:
if self.options.model_names != "":
load_pretrained_nlp_models()
self.model = init_nlp_model(model_names=self.options.model_names)
self.model = init_nlp_model(
model_names=self.options.model_names, loglevel="ERROR"
)
def _to_legacy_document(self, conv_res) -> DsDocument:
title = ""
@ -207,7 +215,31 @@ class GlmModel:
)
],
obj_type=layout_label_to_ds_type.get(element.label),
# data=[[]],
payload={
"children": TypeAdapter(List[Cluster]).dump_python(
element.cluster.children
)
}, # hack to channel child clusters through GLM
)
)
elif isinstance(element, ContainerElement):
main_text.append(
BaseText(
text="",
payload={
"children": TypeAdapter(List[Cluster]).dump_python(
element.cluster.children
)
}, # hack to channel child clusters through GLM
obj_type=layout_label_to_ds_type.get(element.label),
name=element.label,
prov=[
Prov(
bbox=target_bbox,
page=element.page_no + 1,
span=[0, 0],
)
],
)
)
@ -232,7 +264,7 @@ class GlmModel:
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
ds_doc = self._to_legacy_document(conv_res)
ds_doc_dict = ds_doc.model_dump(by_alias=True)
ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True)
glm_doc = self.model.apply_on_doc(ds_doc_dict)

View File

@ -7,7 +7,7 @@ from typing import Iterable, List
from docling_core.types.doc import CoordOrigin, DocItemLabel
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
from PIL import ImageDraw
from PIL import Image, ImageDraw
from docling.datamodel.base_models import (
BoundingBox,
@ -19,7 +19,7 @@ from docling.datamodel.base_models import (
from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils import layout_utils as lu
from docling.utils.layout_postprocessor import LayoutPostprocessor
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
@ -46,233 +46,108 @@ class LayoutModel(BasePageModel):
FIGURE_LABEL = DocItemLabel.PICTURE
FORMULA_LABEL = DocItemLabel.FORMULA
CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]
def __init__(self, artifacts_path: Path):
    """Instantiate the ``LayoutPredictor`` with ``artifacts_path`` and keep it on the model."""
    self.layout_predictor = LayoutPredictor(artifacts_path)  # TODO temporary
def postprocess(self, clusters_in: List[Cluster], cells: List[Cell], page_height):
MIN_INTERSECTION = 0.2
CLASS_THRESHOLDS = {
DocItemLabel.CAPTION: 0.35,
DocItemLabel.FOOTNOTE: 0.35,
DocItemLabel.FORMULA: 0.35,
DocItemLabel.LIST_ITEM: 0.35,
DocItemLabel.PAGE_FOOTER: 0.35,
DocItemLabel.PAGE_HEADER: 0.35,
DocItemLabel.PICTURE: 0.2, # low threshold adjust to capture chemical structures for examples.
DocItemLabel.SECTION_HEADER: 0.45,
DocItemLabel.TABLE: 0.35,
DocItemLabel.TEXT: 0.45,
DocItemLabel.TITLE: 0.45,
DocItemLabel.DOCUMENT_INDEX: 0.45,
DocItemLabel.CODE: 0.45,
DocItemLabel.CHECKBOX_SELECTED: 0.45,
DocItemLabel.CHECKBOX_UNSELECTED: 0.45,
DocItemLabel.FORM: 0.45,
DocItemLabel.KEY_VALUE_REGION: 0.45,
def draw_clusters_and_cells_side_by_side(
self, conv_res, page, clusters, mode_prefix: str, show: bool = False
):
"""
Draws a page image side by side with clusters filtered into two categories:
- Left: Clusters excluding FORM, KEY_VALUE_REGION, and PICTURE.
- Right: Clusters including FORM, KEY_VALUE_REGION, and PICTURE.
"""
label_to_color = {
DocItemLabel.TEXT: (255, 255, 153), # Light Yellow
DocItemLabel.CAPTION: (255, 204, 153), # Light Orange
DocItemLabel.LIST_ITEM: (153, 153, 255), # Light Purple
DocItemLabel.FORMULA: (192, 192, 192), # Gray
DocItemLabel.TABLE: (255, 204, 204), # Light Pink
DocItemLabel.PICTURE: (255, 204, 164), # Light Beige
DocItemLabel.SECTION_HEADER: (255, 153, 153), # Light Red
DocItemLabel.PAGE_HEADER: (204, 255, 204), # Light Green
DocItemLabel.PAGE_FOOTER: (
204,
255,
204,
), # Light Green (same as Page-Header)
DocItemLabel.TITLE: (255, 153, 153), # Light Red (same as Section-Header)
DocItemLabel.FOOTNOTE: (200, 200, 255), # Light Blue
DocItemLabel.DOCUMENT_INDEX: (220, 220, 220), # Light Gray
DocItemLabel.CODE: (255, 223, 186), # Peach
DocItemLabel.CHECKBOX_SELECTED: (255, 182, 193), # Pale Green
DocItemLabel.CHECKBOX_UNSELECTED: (255, 182, 193), # Light Pink
DocItemLabel.FORM: (200, 255, 255), # Light Cyan
DocItemLabel.KEY_VALUE_REGION: (183, 65, 14), # Rusty orange
}
CLASS_REMAPPINGS = {
DocItemLabel.DOCUMENT_INDEX: DocItemLabel.TABLE,
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
# Filter clusters for left and right images
exclude_labels = {
DocItemLabel.FORM,
DocItemLabel.KEY_VALUE_REGION,
DocItemLabel.PICTURE,
}
left_clusters = [c for c in clusters if c.label not in exclude_labels]
right_clusters = [c for c in clusters if c.label in exclude_labels]
_log.debug("================= Start postprocess function ====================")
start_time = time.time()
# Apply Confidence Threshold to cluster predictions
# confidence = self.conf_threshold
clusters_mod = []
# Create a deep copy of the original image for both sides
left_image = copy.deepcopy(page.image)
right_image = copy.deepcopy(page.image)
for cluster in clusters_in:
confidence = CLASS_THRESHOLDS[cluster.label]
if cluster.confidence >= confidence:
# annotation["created_by"] = "high_conf_pred"
# Function to draw clusters on an image
def draw_clusters(image, clusters):
draw = ImageDraw.Draw(image, "RGBA")
for c_tl in clusters:
all_clusters = [c_tl, *c_tl.children]
for c in all_clusters:
cell_color = (0, 0, 0, 40) # Transparent black for cells
for tc in c.cells:
cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
draw.rectangle(
[(cx0, cy0), (cx1, cy1)],
outline=None,
fill=cell_color,
)
# Remap class labels where needed.
if cluster.label in CLASS_REMAPPINGS.keys():
cluster.label = CLASS_REMAPPINGS[cluster.label]
clusters_mod.append(cluster)
x0, y0, x1, y1 = c.bbox.as_tuple()
cluster_fill_color = (
*list(label_to_color.get(c.label)), # type: ignore
70,
)
cluster_outline_color = (
*list(label_to_color.get(c.label)), # type: ignore
255,
)
draw.rectangle(
[(x0, y0), (x1, y1)],
outline=cluster_outline_color,
fill=cluster_fill_color,
)
# map to dictionary clusters and cells, with bottom left origin
clusters_orig = [
{
"id": c.id,
"bbox": list(
c.bbox.to_bottom_left_origin(page_height).as_tuple()
), # TODO
"confidence": c.confidence,
"cell_ids": [],
"type": c.label,
}
for c in clusters_in
]
# Draw clusters on both images
draw_clusters(left_image, left_clusters)
draw_clusters(right_image, right_clusters)
clusters_out = [
{
"id": c.id,
"bbox": list(
c.bbox.to_bottom_left_origin(page_height).as_tuple()
), # TODO
"confidence": c.confidence,
"created_by": "high_conf_pred",
"cell_ids": [],
"type": c.label,
}
for c in clusters_mod
]
# Combine the images side by side
combined_width = left_image.width * 2
combined_height = left_image.height
combined_image = Image.new("RGB", (combined_width, combined_height))
combined_image.paste(left_image, (0, 0))
combined_image.paste(right_image, (left_image.width, 0))
del clusters_mod
raw_cells = [
{
"id": c.id,
"bbox": list(
c.bbox.to_bottom_left_origin(page_height).as_tuple()
), # TODO
"text": c.text,
}
for c in cells
]
cell_count = len(raw_cells)
_log.debug("---- 0. Treat cluster overlaps ------")
clusters_out = lu.remove_cluster_duplicates_by_conf(clusters_out, 0.8)
_log.debug(
"---- 1. Initially assign cells to clusters based on minimum intersection ------"
)
## Check for cells included in or touched by clusters:
clusters_out = lu.assigning_cell_ids_to_clusters(
clusters_out, raw_cells, MIN_INTERSECTION
)
_log.debug("---- 2. Assign Orphans with Low Confidence Detections")
# Creates a map of cell_id->cluster_id
(
clusters_around_cells,
orphan_cell_indices,
ambiguous_cell_indices,
) = lu.cell_id_state_map(clusters_out, cell_count)
# Assign orphan cells with lower confidence predictions
clusters_out, orphan_cell_indices = lu.assign_orphans_with_low_conf_pred(
clusters_out, clusters_orig, raw_cells, orphan_cell_indices
)
# Refresh the cell_ids assignment, after creating new clusters using low conf predictions
clusters_out = lu.assigning_cell_ids_to_clusters(
clusters_out, raw_cells, MIN_INTERSECTION
)
_log.debug("---- 3. Settle Ambigous Cells")
# Creates an update map after assignment of cell_id->cluster_id
(
clusters_around_cells,
orphan_cell_indices,
ambiguous_cell_indices,
) = lu.cell_id_state_map(clusters_out, cell_count)
# Settle pdf cells that belong to multiple clusters
clusters_out, ambiguous_cell_indices = lu.remove_ambigous_pdf_cell_by_conf(
clusters_out, raw_cells, ambiguous_cell_indices
)
_log.debug("---- 4. Set Orphans as Text")
(
clusters_around_cells,
orphan_cell_indices,
ambiguous_cell_indices,
) = lu.cell_id_state_map(clusters_out, cell_count)
clusters_out, orphan_cell_indices = lu.set_orphan_as_text(
clusters_out, clusters_orig, raw_cells, orphan_cell_indices
)
_log.debug("---- 5. Merge Cells & and adapt the bounding boxes")
# Merge cells orphan cells
clusters_out = lu.merge_cells(clusters_out)
# Clean up clusters that remain from merged and unreasonable clusters
clusters_out = lu.clean_up_clusters(
clusters_out,
raw_cells,
merge_cells=True,
img_table=True,
one_cell_table=True,
)
new_clusters = lu.adapt_bboxes(raw_cells, clusters_out, orphan_cell_indices)
clusters_out = new_clusters
## We first rebuild where every cell is now:
## Now we write into a prediction cells list, not into the raw cells list.
## As we don't need previous labels, we best overwrite any old list, because that might
## have been sorted differently.
(
clusters_around_cells,
orphan_cell_indices,
ambiguous_cell_indices,
) = lu.cell_id_state_map(clusters_out, cell_count)
target_cells = []
for ix, cell in enumerate(raw_cells):
new_cell = {
"id": ix,
"rawcell_id": ix,
"label": "None",
"bbox": cell["bbox"],
"text": cell["text"],
}
for cluster_index in clusters_around_cells[
ix
]: # By previous analysis, this is always 1 cluster.
new_cell["label"] = clusters_out[cluster_index]["type"]
target_cells.append(new_cell)
# _log.debug("New label of cell " + str(ix) + " is " + str(new_cell["label"]))
cells_out = target_cells
## -------------------------------
## Sort clusters into reasonable reading order, and sort the cells inside each cluster
_log.debug("---- 5. Sort clusters in reading order ------")
sorted_clusters = lu.produce_reading_order(
clusters_out, "raw_cell_ids", "raw_cell_ids", True
)
clusters_out = sorted_clusters
# end_time = timer()
_log.debug("---- End of postprocessing function ------")
end_time = time.time() - start_time
_log.debug(f"Finished post processing in seconds={end_time:.3f}")
cells_out_new = [
Cell(
id=c["id"], # type: ignore
bbox=BoundingBox.from_tuple(
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT # type: ignore
).to_top_left_origin(page_height),
text=c["text"], # type: ignore
if show:
combined_image.show()
else:
out_path: Path = (
Path(settings.debug.debug_output_path)
/ f"debug_{conv_res.input.file.stem}"
)
for c in cells_out
]
out_path.mkdir(parents=True, exist_ok=True)
del cells_out
clusters_out_new = []
for c in clusters_out:
cluster_cells = [
ccell for ccell in cells_out_new if ccell.id in c["cell_ids"] # type: ignore
]
c_new = Cluster(
id=c["id"], # type: ignore
bbox=BoundingBox.from_tuple(
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT # type: ignore
).to_top_left_origin(page_height),
confidence=c["confidence"], # type: ignore
label=DocItemLabel(c["type"]),
cells=cluster_cells,
)
clusters_out_new.append(c_new)
return clusters_out_new, cells_out_new
out_file = out_path / f"{mode_prefix}_layout_page_{page.page_no:05}.png"
combined_image.save(str(out_file), format="png")
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
@ -305,43 +180,78 @@ class LayoutModel(BasePageModel):
)
clusters.append(cluster)
# Map cells to clusters
# TODO: Remove, postprocess should take care of it anyway.
for cell in page.cells:
for cluster in clusters:
if not cell.bbox.area() > 0:
overlap_frac = 0.0
else:
overlap_frac = (
cell.bbox.intersection_area_with(cluster.bbox)
/ cell.bbox.area()
)
if overlap_frac > 0.5:
cluster.cells.append(cell)
# Pre-sort clusters
# clusters = self.sort_clusters_by_cell_order(clusters)
# DEBUG code:
def draw_clusters_and_cells(show: bool = False):
def draw_clusters_and_cells(
clusters, mode_prefix: str, show: bool = False
):
label_to_color = {
DocItemLabel.TEXT: (255, 255, 153), # Light Yellow
DocItemLabel.CAPTION: (255, 204, 153), # Light Orange
DocItemLabel.LIST_ITEM: (153, 153, 255), # Light Purple
DocItemLabel.FORMULA: (192, 192, 192), # Gray
DocItemLabel.TABLE: (255, 204, 204), # Light Pink
DocItemLabel.PICTURE: (255, 255, 204), # Light Beige
DocItemLabel.SECTION_HEADER: (255, 153, 153), # Light Red
DocItemLabel.PAGE_HEADER: (204, 255, 204), # Light Green
DocItemLabel.PAGE_FOOTER: (
204,
255,
204,
), # Light Green (same as Page-Header)
DocItemLabel.TITLE: (
255,
153,
153,
), # Light Red (same as Section-Header)
DocItemLabel.FOOTNOTE: (200, 200, 255), # Light Blue
DocItemLabel.DOCUMENT_INDEX: (220, 220, 220), # Light Gray
DocItemLabel.CODE: (255, 223, 186), # Peach
DocItemLabel.CHECKBOX_SELECTED: (
255,
182,
193,
), # Pale Green
DocItemLabel.CHECKBOX_UNSELECTED: (
255,
182,
193,
), # Light Pink
DocItemLabel.FORM: (200, 255, 255), # Light Cyan
DocItemLabel.KEY_VALUE_REGION: (
183,
65,
14,
), # Rusty orange
}
image = copy.deepcopy(page.image)
if image is not None:
draw = ImageDraw.Draw(image)
draw = ImageDraw.Draw(image, "RGBA")
for c in clusters:
x0, y0, x1, y1 = c.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
)
cell_color = (0, 0, 0, 40)
for tc in c.cells: # [:1]:
x0, y0, x1, y1 = tc.bbox.as_tuple()
cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
draw.rectangle(
[(x0, y0), (x1, y1)], outline=cell_color
[(cx0, cy0), (cx1, cy1)],
outline=None,
fill=cell_color,
)
x0, y0, x1, y1 = c.bbox.as_tuple()
cluster_fill_color = (
*list(label_to_color.get(c.label)), # type: ignore
70,
)
cluster_outline_color = (
*list(label_to_color.get(c.label)), # type: ignore
255,
)
draw.rectangle(
[(x0, y0), (x1, y1)],
outline=cluster_outline_color,
fill=cluster_fill_color,
)
if show:
image.show()
else:
@ -352,19 +262,30 @@ class LayoutModel(BasePageModel):
out_path.mkdir(parents=True, exist_ok=True)
out_file = (
out_path / f"layout_page_{page.page_no:05}.png"
out_path
/ f"{mode_prefix}_layout_page_{page.page_no:05}.png"
)
image.save(str(out_file), format="png")
# draw_clusters_and_cells()
if settings.debug.visualize_raw_layout:
self.draw_clusters_and_cells_side_by_side(
conv_res, page, clusters, mode_prefix="raw"
)
clusters, page.cells = self.postprocess(
clusters, page.cells, page.size.height
# Apply postprocessing
processed_clusters, processed_cells = LayoutPostprocessor(
page.cells, clusters
).postprocess()
# processed_clusters, processed_cells = clusters, page.cells
page.cells = processed_cells
page.predictions.layout = LayoutPrediction(
clusters=processed_clusters
)
page.predictions.layout = LayoutPrediction(clusters=clusters)
if settings.debug.visualize_layout:
draw_clusters_and_cells()
self.draw_clusters_and_cells_side_by_side(
conv_res, page, processed_clusters, mode_prefix="postprocessed"
)
yield page

View File

@ -6,6 +6,7 @@ from pydantic import BaseModel
from docling.datamodel.base_models import (
AssembledUnit,
ContainerElement,
FigureElement,
Page,
PageElement,
@ -159,6 +160,15 @@ class PageAssembleModel(BasePageModel):
)
elements.append(equation)
body.append(equation)
elif cluster.label in LayoutModel.CONTAINER_LABELS:
container_el = ContainerElement(
label=cluster.label,
id=cluster.id,
page_no=page.page_no,
cluster=cluster,
)
elements.append(container_el)
body.append(container_el)
page.assembled = AssembledUnit(
elements=elements, headers=headers, body=body

View File

@ -38,7 +38,7 @@ _log = logging.getLogger(__name__)
class StandardPdfPipeline(PaginatedPipeline):
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
_layout_model_path = "model_artifacts/layout"
_table_model_path = "model_artifacts/tableformer"
def __init__(self, pipeline_options: PdfPipelineOptions):
@ -102,7 +102,7 @@ class StandardPdfPipeline(PaginatedPipeline):
repo_id="ds4sd/docling-models",
force_download=force,
local_dir=local_dir,
revision="v2.0.1",
revision="refs/pr/2",
)
return Path(download_path)

336
docling/utils/glm_utils.py Normal file
View File

@ -0,0 +1,336 @@
import re
from pathlib import Path
from typing import List
import pandas as pd
from docling_core.types.doc import (
BoundingBox,
CoordOrigin,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
ProvenanceItem,
Size,
TableCell,
TableData,
)
def resolve_item(paths, obj):
    """Find item in document from a reference path.

    Args:
        paths: Sequence of path segments, e.g. ``"#/tables/0".split("/")``.
            A leading ``"#"`` segment is skipped. Segments that parse as an
            integer are used as positional indices, all others as keys.
        obj: The (nested) container to walk.

    Returns:
        The referenced item, ``obj`` itself for an empty path, or ``None``
        when a segment is missing or out of range.
    """
    if len(paths) == 0:
        return obj

    if paths[0] == "#":
        return resolve_item(paths[1:], obj)

    # Only a failed integer conversion means "treat the segment as a key";
    # anything else (e.g. KeyboardInterrupt) must not be swallowed here.
    try:
        key = int(paths[0])
    except (TypeError, ValueError):
        key = paths[0]

    if isinstance(key, str):
        present = key in obj
    elif isinstance(key, int):
        present = key < len(obj)
    else:
        present = False

    if not present:
        return None

    # With no segments left the recursion returns obj[key] unchanged, so the
    # last-segment case needs no separate branch.
    return resolve_item(paths[1:], obj[key])
def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
unique_objects = []
seen_spans = set()
for sublist in grid:
for obj in sublist:
# Convert the spans list to a tuple of tuples for hashing
spans_tuple = tuple(tuple(span) for span in obj["spans"])
if spans_tuple not in seen_spans:
seen_spans.add(spans_tuple)
unique_objects.append(obj)
return unique_objects
def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
    """Build a ``DoclingDocument`` from a GLM document dictionary.

    Args:
        doc_glm: Mapping read through the keys ``"file-info"``,
            ``"page-dimensions"``, ``"page-elements"`` and, optionally,
            ``"properties"``. Each page element is resolved through its
            ``"iref"`` path into this same mapping.
        update_name_label: When true, the name label of a ``"paragraph"``
            element is replaced by the label of its single matching
            ``"semantic"`` property row, provided that row's confidence is
            above 0.85.

    Returns:
        The populated ``DoclingDocument``.
    """
    # The origin mimetype is hard-coded; filename and hash come from "file-info".
    origin = DocumentOrigin(
        mimetype="application/pdf",
        filename=doc_glm["file-info"]["filename"],
        binary_hash=doc_glm["file-info"]["document-hash"],
    )
    doc_name = Path(origin.filename).stem

    doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)

    # Register every page with its dimensions.
    for page_dim in doc_glm["page-dimensions"]:
        page_no = int(page_dim["page"])
        size = Size(width=page_dim["width"], height=page_dim["height"])

        doc.add_page(page_no=page_no, size=size)

    # Properties are loaded into a DataFrame; they are only consulted further
    # down when update_name_label is set.
    if "properties" in doc_glm:
        props = pd.DataFrame(
            doc_glm["properties"]["data"], columns=doc_glm["properties"]["headers"]
        )
    else:
        props = pd.DataFrame()

    # Group that consecutive list items are attached to; None means the next
    # list item opens a new group.
    current_list = None

    for ix, pelem in enumerate(doc_glm["page-elements"]):
        ptype = pelem["type"]
        span_i = pelem["span"][0]
        span_j = pelem["span"][1]

        # Elements without a reference into the document cannot be resolved.
        if "iref" not in pelem:
            # print(json.dumps(pelem, indent=2))
            continue

        iref = pelem["iref"]

        # Caption elements are not emitted on their own; they are added while
        # handling the figure/table element that lists them (see below).
        if re.match("#/figures/(\\d+)/captions/(.+)", iref):
            # print(f"skip {iref}")
            continue

        if re.match("#/tables/(\\d+)/captions/(.+)", iref):
            # print(f"skip {iref}")
            continue

        path = iref.split("/")
        obj = resolve_item(path, doc_glm)

        if obj is None:
            current_list = None
            # NOTE(review): warning goes to stdout via print, not a logger.
            print(f"warning: undefined {path}")
            continue

        if ptype == "figure":
            current_list = None
            text = ""
            caption_refs = []
            for caption in obj["captions"]:
                text += caption["text"]

                # Each caption provenance entry references a page element;
                # one caption text item is added per resolvable reference.
                for nprov in caption["prov"]:
                    npaths = nprov["$ref"].split("/")
                    nelem = resolve_item(npaths, doc_glm)

                    if nelem is None:
                        # print(f"warning: undefined caption {npaths}")
                        continue

                    # Reuses span_i/span_j for the caption's own span; the
                    # outer values are not read again in this branch.
                    span_i = nelem["span"][0]
                    span_j = nelem["span"][1]

                    cap_text = caption["text"][span_i:span_j]

                    # doc_glm["page-elements"].remove(nelem)

                    prov = ProvenanceItem(
                        page_no=nelem["page"],
                        charspan=tuple(nelem["span"]),
                        bbox=BoundingBox.from_tuple(
                            nelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                        ),
                    )

                    caption_obj = doc.add_text(
                        label=DocItemLabel.CAPTION, text=cap_text, prov=prov
                    )
                    caption_refs.append(caption_obj.get_ref())

            # Provenance of the picture itself; the charspan length is that of
            # the concatenated caption texts.
            prov = ProvenanceItem(
                page_no=pelem["page"],
                charspan=(0, len(text)),
                bbox=BoundingBox.from_tuple(
                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                ),
            )

            pic = doc.add_picture(prov=prov)
            pic.captions.extend(caption_refs)
            # Attach child clusters carried in the object's payload, if any.
            _add_child_elements(pic, doc, obj, pelem)

        elif ptype == "table":
            current_list = None
            text = ""
            caption_refs = []
            # Same caption handling as in the figure branch above.
            for caption in obj["captions"]:
                text += caption["text"]

                for nprov in caption["prov"]:
                    npaths = nprov["$ref"].split("/")
                    nelem = resolve_item(npaths, doc_glm)

                    if nelem is None:
                        # print(f"warning: undefined caption {npaths}")
                        continue

                    span_i = nelem["span"][0]
                    span_j = nelem["span"][1]

                    cap_text = caption["text"][span_i:span_j]

                    # doc_glm["page-elements"].remove(nelem)

                    prov = ProvenanceItem(
                        page_no=nelem["page"],
                        charspan=tuple(nelem["span"]),
                        bbox=BoundingBox.from_tuple(
                            nelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                        ),
                    )

                    caption_obj = doc.add_text(
                        label=DocItemLabel.CAPTION, text=cap_text, prov=prov
                    )
                    caption_refs.append(caption_obj.get_ref())

            # The grid repeats a cell object for every position it spans;
            # flatten it to one entry per distinct "spans" value.
            table_cells_glm = _flatten_table_grid(obj["data"])

            table_cells = []
            for tbl_cell_glm in table_cells_glm:
                if tbl_cell_glm["bbox"] is not None:
                    bbox = BoundingBox.from_tuple(
                        tbl_cell_glm["bbox"], origin=CoordOrigin.BOTTOMLEFT
                    )
                else:
                    bbox = None

                # Translate the GLM cell type string into header/section flags.
                is_col_header = False
                is_row_header = False
                is_row_section = False

                if tbl_cell_glm["type"] == "col_header":
                    is_col_header = True
                elif tbl_cell_glm["type"] == "row_header":
                    is_row_header = True
                elif tbl_cell_glm["type"] == "row_section":
                    is_row_section = True

                # Span sizes are derived as end minus start of the
                # "row-span"/"col-span" pairs.
                table_cells.append(
                    TableCell(
                        row_span=tbl_cell_glm["row-span"][1]
                        - tbl_cell_glm["row-span"][0],
                        col_span=tbl_cell_glm["col-span"][1]
                        - tbl_cell_glm["col-span"][0],
                        start_row_offset_idx=tbl_cell_glm["row-span"][0],
                        end_row_offset_idx=tbl_cell_glm["row-span"][1],
                        start_col_offset_idx=tbl_cell_glm["col-span"][0],
                        end_col_offset_idx=tbl_cell_glm["col-span"][1],
                        text=tbl_cell_glm["text"],
                        bbox=bbox,
                        column_header=is_col_header,
                        row_header=is_row_header,
                        row_section=is_row_section,
                    )
                )

            tbl_data = TableData(
                num_rows=obj.get("#-rows", 0),
                num_cols=obj.get("#-cols", 0),
                table_cells=table_cells,
            )

            prov = ProvenanceItem(
                page_no=pelem["page"],
                charspan=(0, 0),
                bbox=BoundingBox.from_tuple(
                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                ),
            )

            tbl = doc.add_table(data=tbl_data, prov=prov)
            tbl.captions.extend(caption_refs)

        elif ptype in ["form", "key_value_region"]:
            # Containers become an unspecified group named after their label;
            # their content comes from the payload's child clusters.
            # NOTE(review): unlike the other branches this one does not reset
            # current_list - confirm whether that is intended.
            label = DocItemLabel(ptype)
            container_el = doc.add_group(label=GroupLabel.UNSPECIFIED, name=label)

            _add_child_elements(container_el, doc, obj, pelem)

        elif "text" in obj:
            # Slice this element's text out of the referenced object using the
            # page element's span.
            text = obj["text"][span_i:span_j]

            type_label = pelem["type"]
            name_label = pelem["name"]
            if update_name_label and len(props) > 0 and type_label == "paragraph":
                prop = props[
                    (props["type"] == "semantic") & (props["subj_path"] == iref)
                ]
                # Override only on an unambiguous, high-confidence match.
                if len(prop) == 1 and prop.iloc[0]["confidence"] > 0.85:
                    name_label = prop.iloc[0]["label"]

            prov = ProvenanceItem(
                page_no=pelem["page"],
                charspan=(0, len(text)),
                bbox=BoundingBox.from_tuple(
                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                ),
            )
            label = DocItemLabel(name_label)

            if label == DocItemLabel.LIST_ITEM:
                # Consecutive list items share one list group.
                if current_list is None:
                    current_list = doc.add_group(label=GroupLabel.LIST, name="list")

                # TODO: Infer if this is a numbered or a bullet list item
                doc.add_list_item(
                    text=text, enumerated=False, prov=prov, parent=current_list
                )
            elif label == DocItemLabel.SECTION_HEADER:
                current_list = None

                doc.add_heading(text=text, prov=prov)
            else:
                current_list = None

                doc.add_text(label=DocItemLabel(name_label), text=text, prov=prov)

    return doc
def _add_child_elements(container_el, doc, obj, pelem):
payload = obj.get("payload")
if payload is not None:
children = payload.get("children", [])
for child in children:
c_label = DocItemLabel(child["label"])
c_bbox = BoundingBox.model_validate(child["bbox"]).to_bottom_left_origin(
doc.pages[pelem["page"]].size.height
)
c_text = " ".join(
[
cell["text"].replace("\x02", "-").strip()
for cell in child["cells"]
if len(cell["text"].strip()) > 0
]
)
c_prov = ProvenanceItem(
page_no=pelem["page"], charspan=(0, len(c_text)), bbox=c_bbox
)
if c_label == DocItemLabel.LIST_ITEM:
# TODO: Infer if this is a numbered or a bullet list item
doc.add_list_item(parent=container_el, text=c_text, prov=c_prov)
elif c_label == DocItemLabel.SECTION_HEADER:
doc.add_heading(parent=container_el, text=c_text, prov=c_prov)
else:
doc.add_text(
parent=container_el, label=c_label, text=c_text, prov=c_prov
)

View File

@ -0,0 +1,496 @@
import bisect
import logging
import sys
from collections import defaultdict
from typing import Dict, List, Set, Tuple
from docling_core.types.doc import DocItemLabel
from rtree import index
from docling.datamodel.base_models import BoundingBox, Cell, Cluster
_log = logging.getLogger(__name__)
class UnionFind:
    """Disjoint-set forest with path compression and union by rank."""

    def __init__(self, elements):
        # Every element starts out as the root of its own singleton set.
        self.parent = dict((item, item) for item in elements)
        self.rank = dict.fromkeys(elements, 0)

    def find(self, x):
        """Return the representative of ``x``'s set, compressing the path to it."""
        # First pass: walk up to the root.
        root = x
        while self.parent[root] != root:
            root = self.parent[root]

        # Second pass: point every node on the walked path directly at the root.
        node = x
        while node != root:
            above = self.parent[node]
            self.parent[node] = root
            node = above

        return root

    def union(self, x, y):
        """Merge the sets containing ``x`` and ``y``; no-op if already joined."""
        keep, other = self.find(x), self.find(y)
        if keep == other:
            return

        if self.rank[keep] < self.rank[other]:
            # The shallower tree is hung below the deeper one.
            self.parent[keep] = other
            return

        if self.rank[keep] == self.rank[other]:
            # Equal ranks: x's root stays on top and grows by one.
            self.rank[keep] += 1
        self.parent[other] = keep

    def get_groups(self) -> Dict[int, List[int]]:
        """Returns groups as {root: [elements]}."""
        members_by_root = defaultdict(list)
        for member in self.parent:
            members_by_root[self.find(member)].append(member)
        return members_by_root
class SpatialClusterIndex:
    """Spatial lookup of clusters backed by an R-tree plus one interval list per axis.

    Clusters are registered by their ``id`` and ``bbox``. ``find_candidates``
    returns IDs worth testing for overlap; ``check_overlap`` performs the
    actual pairwise test.
    """

    def __init__(self, clusters: List[Cluster]):
        """Build the indexes and register every cluster in ``clusters``."""
        p = index.Property()
        p.dimension = 2
        self.spatial_index = index.Index(properties=p)
        self.x_intervals = IntervalTree()
        self.y_intervals = IntervalTree()
        self.clusters_by_id: Dict[int, Cluster] = {}

        for cluster in clusters:
            self.add_cluster(cluster)

    def add_cluster(self, cluster: Cluster):
        """Register ``cluster`` in the R-tree, both interval lists and the ID map."""
        bbox = cluster.bbox
        self.spatial_index.insert(cluster.id, bbox.as_tuple())
        self.x_intervals.insert(bbox.l, bbox.r, cluster.id)
        self.y_intervals.insert(bbox.t, bbox.b, cluster.id)
        self.clusters_by_id[cluster.id] = cluster

    def remove_cluster(self, cluster: Cluster):
        """Remove ``cluster`` from the R-tree and the ID map.

        The interval lists have no delete operation, so the cluster's
        intervals stay behind; ``find_candidates`` filters them out.

        Raises:
            KeyError: If the cluster's ID is not registered.
        """
        self.spatial_index.delete(cluster.id, cluster.bbox.as_tuple())
        del self.clusters_by_id[cluster.id]

    def find_candidates(self, bbox: BoundingBox) -> Set[int]:
        """Find potential overlapping cluster IDs using all indexes.

        Only IDs of currently registered clusters are returned.
        """
        spatial = set(self.spatial_index.intersection(bbox.as_tuple()))
        x_candidates = self.x_intervals.find_containing(
            bbox.l
        ) | self.x_intervals.find_containing(bbox.r)
        y_candidates = self.y_intervals.find_containing(
            bbox.t
        ) | self.y_intervals.find_containing(bbox.b)
        candidates = spatial | x_candidates | y_candidates
        # Drop stale IDs left in the interval lists by remove_cluster, so
        # callers can safely look every result up in clusters_by_id.
        return {cid for cid in candidates if cid in self.clusters_by_id}

    def check_overlap(
        self,
        bbox1: BoundingBox,
        bbox2: BoundingBox,
        overlap_threshold: float,
        containment_threshold: float,
    ) -> bool:
        """Check if two bboxes overlap sufficiently.

        Returns True when the intersection-over-union exceeds
        ``overlap_threshold`` or when the intersection covers more than
        ``containment_threshold`` of either box. Boxes with non-positive area,
        or with no intersection, never overlap.
        """
        area1, area2 = bbox1.area(), bbox2.area()
        if area1 <= 0 or area2 <= 0:
            return False

        overlap_area = bbox1.intersection_area_with(bbox2)
        if overlap_area <= 0:
            return False

        iou = overlap_area / (area1 + area2 - overlap_area)
        containment1 = overlap_area / area1
        containment2 = overlap_area / area2

        return (
            iou > overlap_threshold
            or containment1 > containment_threshold
            or containment2 > containment_threshold
        )
class IntervalTree:
    """Memory-efficient container for 1D point-containment queries.

    Intervals are stored in a flat list of ``(min, max, id)`` tuples kept
    sorted by ``min``.
    """

    def __init__(self):
        self.intervals: List[Tuple[float, float, int]] = (
            []
        )  # (min, max, id) sorted by min

    def insert(self, min_val: float, max_val: float, id: int):
        """Insert the interval ``[min_val, max_val]`` tagged with ``id``.

        The new entry goes after any existing entries with the same
        ``min_val``.
        """
        # Bisect on the start values only: this avoids the ``key=`` argument
        # (Python 3.10+) and never compares max values or ids.
        starts = [interval[0] for interval in self.intervals]
        position = bisect.bisect_right(starts, min_val)
        self.intervals.insert(position, (min_val, max_val, id))

    def find_containing(self, point: float) -> Set[int]:
        """Find all intervals containing the point (bounds inclusive)."""
        result = set()
        for min_val, max_val, interval_id in self.intervals:
            if min_val > point:
                # Sorted by start: nothing further on can contain the point.
                break
            # Every interval starting at or before the point must be checked.
            # A narrow interval that ends early says nothing about wider ones
            # that started before it, so there is no early exit here.
            if point <= max_val:
                result.add(interval_id)
        return result
class LayoutPostprocessor:
    """Postprocesses layout predictions by cleaning up clusters and mapping cells.

    Clusters are split into "regular" clusters and "special" clusters
    (pictures plus the wrapper types form / key-value region). Regular
    clusters get text cells assigned, leftover cells become their own text
    clusters, and overlapping clusters are merged. Special clusters collect
    the regular clusters they contain as ``children``.
    """

    # Cluster type-specific parameters for overlap resolution
    OVERLAP_PARAMS = {
        "regular": {"area_threshold": 1.3, "conf_threshold": 0.05},
        "picture": {"area_threshold": 2.0, "conf_threshold": 0.3},
        "wrapper": {"area_threshold": 2.0, "conf_threshold": 0.2},
    }

    WRAPPER_TYPES = {DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION}
    SPECIAL_TYPES = WRAPPER_TYPES | {DocItemLabel.PICTURE}

    # Minimum confidence a predicted cluster needs, per label, to be kept.
    CONFIDENCE_THRESHOLDS = {
        DocItemLabel.CAPTION: 0.35,
        DocItemLabel.FOOTNOTE: 0.35,
        DocItemLabel.FORMULA: 0.35,
        DocItemLabel.LIST_ITEM: 0.35,
        DocItemLabel.PAGE_FOOTER: 0.35,
        DocItemLabel.PAGE_HEADER: 0.35,
        DocItemLabel.PICTURE: 0.1,
        DocItemLabel.SECTION_HEADER: 0.45,
        DocItemLabel.TABLE: 0.35,
        DocItemLabel.TEXT: 0.45,
        DocItemLabel.TITLE: 0.45,
        DocItemLabel.CODE: 0.45,
        DocItemLabel.CHECKBOX_SELECTED: 0.45,
        DocItemLabel.CHECKBOX_UNSELECTED: 0.45,
        DocItemLabel.FORM: 0.45,
        DocItemLabel.KEY_VALUE_REGION: 0.45,
        DocItemLabel.DOCUMENT_INDEX: 0.45,
    }

    # Labels that are rewritten to another label after confidence filtering.
    LABEL_REMAPPING = {
        DocItemLabel.DOCUMENT_INDEX: DocItemLabel.TABLE,
        DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
    }

    def __init__(self, cells: List[Cell], clusters: List[Cluster]):
        """Initialize processor with cells, clusters and spatial indices."""
        self.cells = cells
        self.regular_clusters = [
            c for c in clusters if c.label not in self.SPECIAL_TYPES
        ]
        self.special_clusters = [c for c in clusters if c.label in self.SPECIAL_TYPES]

        # Build spatial indices once
        self.regular_index = SpatialClusterIndex(self.regular_clusters)
        self.picture_index = SpatialClusterIndex(
            [c for c in self.special_clusters if c.label == DocItemLabel.PICTURE]
        )
        self.wrapper_index = SpatialClusterIndex(
            [c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
        )

    def postprocess(self) -> Tuple[List[Cluster], List[Cell]]:
        """Main processing pipeline.

        Returns:
            The final clusters in reading order and the (unchanged) cells.
        """
        self.regular_clusters = self._process_regular_clusters()
        self.special_clusters = self._process_special_clusters()

        # Remove regular clusters that are included in special clusters.
        # Every entry of ``special_clusters`` carries a special label by
        # construction, so no extra label filter is needed here.
        contained_ids = {
            child.id for wrapper in self.special_clusters for child in wrapper.children
        }
        self.regular_clusters = [
            c for c in self.regular_clusters if c.id not in contained_ids
        ]

        # Combine and sort final clusters
        final_clusters = self._sort_clusters(
            self.regular_clusters + self.special_clusters
        )
        return final_clusters, self.cells

    def _process_regular_clusters(self) -> List[Cluster]:
        """Process regular clusters with iterative refinement."""
        clusters = [
            c
            for c in self.regular_clusters
            if c.confidence >= self.CONFIDENCE_THRESHOLDS[c.label]
        ]

        # Apply label remapping
        for cluster in clusters:
            if cluster.label in self.LABEL_REMAPPING:
                cluster.label = self.LABEL_REMAPPING[cluster.label]

        # Initial cell assignment
        clusters = self._assign_cells_to_clusters(clusters)

        # Handle orphaned cells
        unassigned = self._find_unassigned_cells(clusters)
        if unassigned:
            # New ids must not collide with any known cluster: neither with
            # special clusters (they end up in the same final list) nor with
            # low-confidence regular clusters that are still registered in
            # the spatial index built in __init__.
            known_clusters = clusters + self.regular_clusters + self.special_clusters
            next_id = max((c.id for c in known_clusters), default=0) + 1
            orphan_clusters = [
                Cluster(
                    id=next_id + i,
                    label=DocItemLabel.TEXT,
                    bbox=cell.bbox,
                    confidence=0.0,
                    cells=[cell],
                )
                for i, cell in enumerate(unassigned)
            ]
            clusters.extend(orphan_clusters)

        # Iterative refinement: stop early once a pass removes no cluster.
        prev_count = len(clusters) + 1
        for _ in range(3):  # Maximum 3 iterations
            if prev_count == len(clusters):
                break
            prev_count = len(clusters)
            clusters = self._adjust_cluster_bboxes(clusters)
            clusters = self._remove_overlapping_clusters(clusters, "regular")

        return clusters

    def _process_special_clusters(self) -> List[Cluster]:
        """Attach contained regular clusters to special clusters and dedupe them."""
        special_clusters = [
            c
            for c in self.special_clusters
            if c.confidence >= self.CONFIDENCE_THRESHOLDS[c.label]
        ]

        for special in special_clusters:
            # A regular cluster counts as contained when more than 80% of its
            # own area lies inside the special cluster.
            contained = []
            for cluster in self.regular_clusters:
                overlap = cluster.bbox.intersection_area_with(special.bbox)
                if overlap > 0:
                    containment = overlap / cluster.bbox.area()
                    if containment > 0.8:
                        contained.append(cluster)

            if contained:
                # Sort contained clusters by minimum cell ID
                contained.sort(
                    key=lambda cluster: (
                        min(cell.id for cell in cluster.cells)
                        if cluster.cells
                        else sys.maxsize
                    )
                )
                special.children = contained

                # Adjust bbox only for wrapper types
                if special.label in self.WRAPPER_TYPES:
                    special.bbox = BoundingBox(
                        l=min(c.bbox.l for c in contained),
                        t=min(c.bbox.t for c in contained),
                        r=max(c.bbox.r for c in contained),
                        b=max(c.bbox.b for c in contained),
                    )

        picture_clusters = [
            c for c in special_clusters if c.label == DocItemLabel.PICTURE
        ]
        picture_clusters = self._remove_overlapping_clusters(
            picture_clusters, "picture"
        )

        wrapper_clusters = [
            c for c in special_clusters if c.label in self.WRAPPER_TYPES
        ]
        wrapper_clusters = self._remove_overlapping_clusters(
            wrapper_clusters, "wrapper"
        )

        return picture_clusters + wrapper_clusters

    def _remove_overlapping_clusters(
        self,
        clusters: List[Cluster],
        cluster_type: str,
        overlap_threshold: float = 0.8,
        containment_threshold: float = 0.8,
    ) -> List[Cluster]:
        """Merge groups of overlapping clusters into one cluster per group.

        Overlapping clusters are grouped with a union-find structure. From
        each group the best cluster is kept (see ``_select_best_cluster``)
        and receives the cells of all other group members.

        Args:
            clusters: Clusters of a single kind.
            cluster_type: Key into ``OVERLAP_PARAMS``; "regular" and
                "picture" select the matching spatial index, any other value
                selects the wrapper index.
            overlap_threshold: IoU above which two clusters overlap.
            containment_threshold: Containment ratio above which two
                clusters overlap.
        """
        if not clusters:
            return []

        spatial_index = (
            self.regular_index
            if cluster_type == "regular"
            else self.picture_index if cluster_type == "picture" else self.wrapper_index
        )

        # Map of currently valid clusters
        valid_clusters = {c.id: c for c in clusters}
        uf = UnionFind(valid_clusters.keys())
        params = self.OVERLAP_PARAMS[cluster_type]

        for cluster in clusters:
            candidates = spatial_index.find_candidates(cluster.bbox)
            candidates &= valid_clusters.keys()  # Only keep existing candidates
            candidates.discard(cluster.id)

            for other_id in candidates:
                if spatial_index.check_overlap(
                    cluster.bbox,
                    valid_clusters[other_id].bbox,
                    overlap_threshold,
                    containment_threshold,
                ):
                    uf.union(cluster.id, other_id)

        result = []
        for group in uf.get_groups().values():
            if len(group) == 1:
                result.append(valid_clusters[group[0]])
                continue

            group_clusters = [valid_clusters[cid] for cid in group]
            best = self._select_best_cluster(
                group_clusters, params["area_threshold"], params["conf_threshold"]
            )

            # The surviving cluster inherits the cells of the merged ones.
            for cluster in group_clusters:
                if cluster != best:
                    best.cells.extend(cluster.cells)

            result.append(best)

        return result

    def _select_best_cluster(
        self,
        clusters: List[Cluster],
        area_threshold: float,
        conf_threshold: float,
    ) -> Cluster:
        """Iteratively select best cluster based on area and confidence thresholds.

        A candidate is rejected when some other cluster is clearly more
        confident (by more than ``conf_threshold``) while the candidate is at
        most ``area_threshold`` times as large. Among the remaining
        candidates a larger one replaces the current best unless it is less
        confident by more than ``conf_threshold``. Falls back to the first
        cluster when every candidate is rejected.
        """
        current_best = None

        for candidate in clusters:
            should_select = True
            for other in clusters:
                if other == candidate:
                    continue

                area_ratio = candidate.bbox.area() / other.bbox.area()
                conf_diff = other.confidence - candidate.confidence

                if area_ratio <= area_threshold and conf_diff > conf_threshold:
                    should_select = False
                    break

            if should_select:
                if current_best is None or (
                    candidate.bbox.area() > current_best.bbox.area()
                    and current_best.confidence - candidate.confidence <= conf_threshold
                ):
                    current_best = candidate

        return current_best if current_best else clusters[0]

    def _assign_cells_to_clusters(
        self, clusters: List[Cluster], min_overlap: float = 0.2
    ) -> List[Cluster]:
        """Assign cells to best overlapping cluster.

        Existing cell assignments are discarded first. Cells with blank text
        or without positive area are skipped. A cell goes to the cluster that
        covers the largest fraction of the cell, provided that fraction
        exceeds ``min_overlap``.
        """
        for cluster in clusters:
            cluster.cells = []

        for cell in self.cells:
            if not cell.text.strip():
                continue

            # The cell area does not depend on the cluster; compute it once.
            cell_area = cell.bbox.area()
            if cell_area <= 0:
                continue

            best_overlap = min_overlap
            best_cluster = None

            for cluster in clusters:
                overlap = cell.bbox.intersection_area_with(cluster.bbox)
                overlap_ratio = overlap / cell_area

                if overlap_ratio > best_overlap:
                    best_overlap = overlap_ratio
                    best_cluster = cluster

            if best_cluster is not None:
                best_cluster.cells.append(cell)

        return clusters

    def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[Cell]:
        """Find non-blank cells not assigned to any cluster."""
        assigned = {cell.id for cluster in clusters for cell in cluster.cells}
        return [
            cell for cell in self.cells if cell.id not in assigned and cell.text.strip()
        ]

    def _adjust_cluster_bboxes(self, clusters: List[Cluster]) -> List[Cluster]:
        """Adjust cluster bounding boxes to contain their cells.

        Clusters without cells keep their bbox. Tables are only ever grown
        (union of the current bbox and the cells' bbox); every other cluster
        is set to the tight bbox around its cells.
        """
        for cluster in clusters:
            if not cluster.cells:
                continue

            cells_bbox = BoundingBox(
                l=min(cell.bbox.l for cell in cluster.cells),
                t=min(cell.bbox.t for cell in cluster.cells),
                r=max(cell.bbox.r for cell in cluster.cells),
                b=max(cell.bbox.b for cell in cluster.cells),
            )

            if cluster.label == DocItemLabel.TABLE:
                # For tables, take union of current bbox and cells bbox
                cluster.bbox = BoundingBox(
                    l=min(cluster.bbox.l, cells_bbox.l),
                    t=min(cluster.bbox.t, cells_bbox.t),
                    r=max(cluster.bbox.r, cells_bbox.r),
                    b=max(cluster.bbox.b, cells_bbox.b),
                )
            else:
                cluster.bbox = cells_bbox

        return clusters

    def _sort_clusters(self, clusters: List[Cluster]) -> List[Cluster]:
        """Sort clusters in reading order (top-to-bottom, left-to-right)."""

        def reading_order_key(cluster: Cluster) -> Tuple[float, float]:
            # Non-picture clusters with cells are ordered by their first
            # cell; everything else by the cluster bbox itself.
            if cluster.cells and cluster.label != DocItemLabel.PICTURE:
                first_cell = min(cluster.cells, key=lambda c: (c.bbox.t, c.bbox.l))
                return (first_cell.bbox.t, first_cell.bbox.l)
            return (cluster.bbox.t, cluster.bbox.l)

        return sorted(clusters, key=reading_order_key)

View File

@ -1,812 +0,0 @@
import copy
import logging
import networkx as nx
from docling_core.types.doc import DocItemLabel
logger = logging.getLogger("layout_utils")
## -------------------------------
## Geometric helper functions
## The coordinates grow left to right, and bottom to top.
## The bounding box list elements 0 to 3 are x_left, y_bottom, x_right, y_top.
def area(bbox):
    ## Returns the area of a box given as [x_left, y_bottom, x_right, y_top].
    width = bbox[2] - bbox[0]
    height = bbox[3] - bbox[1]
    return width * height
def contains(bbox_i, bbox_j):
    ## Returns True if bbox_i contains bbox_j (shared edges count), else False
    if bbox_i[0] > bbox_j[0]:
        return False
    if bbox_i[1] > bbox_j[1]:
        return False
    if bbox_i[2] < bbox_j[2]:
        return False
    return bbox_i[3] >= bbox_j[3]
def is_intersecting(bbox_i, bbox_j):
    ## Returns True unless the boxes are strictly separated along x or y.
    ## Boxes that merely touch are reported as intersecting.
    if bbox_i[2] < bbox_j[0] or bbox_i[0] > bbox_j[2]:
        return False  ## separated horizontally
    if bbox_i[3] < bbox_j[1] or bbox_i[1] > bbox_j[3]:
        return False  ## separated vertically
    return True
def bb_iou(boxA, boxB):
    ## Intersection over union of two boxes. Widths and heights are computed
    ## with a "+ 1", so a box whose corners coincide still has area 1.

    def _extent(low, high):
        return high - low + 1

    ## Corners of the intersection rectangle
    left = max(boxA[0], boxB[0])
    bottom = max(boxA[1], boxB[1])
    right = min(boxA[2], boxB[2])
    top = min(boxA[3], boxB[3])

    ## Intersection area, clamped at 0 per axis for disjoint boxes
    shared = max(0, _extent(left, right)) * max(0, _extent(bottom, top))

    area_a = _extent(boxA[0], boxA[2]) * _extent(boxA[1], boxA[3])
    area_b = _extent(boxB[0], boxB[2]) * _extent(boxB[1], boxB[3])

    ## Intersection divided by the union (sum of areas minus intersection)
    return shared / float(area_a + area_b - shared)
def compute_intersection(bbox_i, bbox_j):
    ## Returns the size of the intersection area of the two boxes,
    ## or 0 when they do not intersect.
    if not is_intersecting(bbox_i, bbox_j):
        return 0

    ## Extent of the intersection rectangle along each axis:
    width = min(bbox_i[2], bbox_j[2]) - max(bbox_i[0], bbox_j[0])
    height = min(bbox_i[3], bbox_j[3]) - max(bbox_i[1], bbox_j[1])

    overlap = width * height
    if overlap < 0:
        logger.debug("Warning: Negative intersection detected!")
        return 0
    return overlap
def surrounding(bbox_i, bbox_j):
    ## Computes minimal box that contains both input boxes
    return [
        min(bbox_i[0], bbox_j[0]),
        min(bbox_i[1], bbox_j[1]),
        max(bbox_i[2], bbox_j[2]),
        max(bbox_i[3], bbox_j[3]),
    ]
def surrounding_list(bbox_list):
    ## Computes minimal box that contains all boxes in the input list.
    ## An empty list yields the degenerate box [0, 0, 0, 0].
    if len(bbox_list) == 0:
        return [0, 0, 0, 0]
    return [
        min(box[0] for box in bbox_list),
        min(box[1] for box in bbox_list),
        max(box[2] for box in bbox_list),
        max(box[3] for box in bbox_list),
    ]
def vertical_overlap(bboxA, bboxB):
    ## bbox[1] is the lower bound, bbox[3] the upper bound (larger number).
    ## The boxes overlap vertically unless one lies strictly below the other.
    b_below_a = bboxB[3] < bboxA[1]
    if b_below_a:
        return False
    a_below_b = bboxA[3] < bboxB[1]
    return not a_below_b
def vertical_overlap_fraction(bboxA, bboxB):
    ## Returns the vertical overlap as fraction of the lower bbox height.
    ## bbox[1] is the lower bound, bbox[3] the upper bound (larger number)
    ## Height 0 is permitted in the input.
    shorter_height = min(bboxA[3] - bboxA[1], bboxB[3] - bboxB[1])

    ## "upper" is the box whose top edge is higher; on a tie it is A.
    if bboxA[3] >= bboxB[3]:
        upper, lower = bboxA, bboxB
    else:
        upper, lower = bboxB, bboxA

    ## The lower box lies completely inside the upper one;
    ## this can include a lower box of height 0:
    if upper[1] <= lower[1]:
        return 1

    overlap = max(lower[3] - upper[1], 0)
    ## The divisor is floored at 0.001 so a zero height cannot divide by 0.
    return overlap / max(shorter_height, 0.001)
## -------------------------------
## Cluster-and-cell relations
def compute_enclosed_cells(
    cluster_bbox, raw_cells, min_cell_intersection_with_cluster=0.2
):
    ## Returns two lists: the indices of the raw cells that belong to the
    ## cluster bbox, and the intersection areas of those cells that were
    ## accepted by the area criterion.
    ## Note: cells accepted only by the containment fallback add no entry to
    ## the second list, so the two lists can differ in length.
    enclosed_indices = []
    enclosed_overlaps = []
    for cell_index, raw_cell in enumerate(raw_cells):
        box = raw_cell["bbox"]
        overlap = compute_intersection(box, cluster_bbox)
        required = area(box) * min_cell_intersection_with_cluster

        ## Area criterion: intersection exceeds the required fraction of the cell
        if overlap > required and required > 0:
            enclosed_indices.append(cell_index)
            enclosed_overlaps.append(overlap)
            continue

        ## Fallback: the cell, shrunk by 3 on every side, lies inside the cluster
        shrunk = [box[0] + 3, box[1] + 3, box[2] - 3, box[3] - 3]
        if contains(cluster_bbox, shrunk):
            enclosed_indices.append(cell_index)
    return enclosed_indices, enclosed_overlaps
def find_clusters_around_cells(cell_count, clusters):
    ## Per raw cell, find to which clusters it belongs.
    ## Returns one list of cluster indices per raw cell, in raw-cell order.
    membership = []
    for _ in range(cell_count):
        membership.append([])
    for cluster_index, cluster in enumerate(clusters):
        for cell_id in cluster["cell_ids"]:
            membership[cell_id].append(cluster_index)
    return membership
def find_cell_index(raw_ix, cell_array):
    ## "raw_ix" is a rawcell_id.
    ## "cell_array" has the structure of an (annotation) cells array.
    ## Returns index of the first cell in cell_array that has this rawcell_id,
    ## or None when there is no such cell.
    matches = (
        position
        for position, cell in enumerate(cell_array)
        if cell["rawcell_id"] == raw_ix
    )
    return next(matches, None)
def find_cell_indices(cluster, cell_array):
    ## "cluster" must have the structure as in a clusters array in a prediction,
    ## "cell_array" that of a cells array.
    ## Returns list of indices of cells in cell_array that have the rawcell_ids
    ## as in the cluster, in the order of the (sorted) rawcell_ids.
    return [
        position
        for raw_ix in sorted(cluster["cell_ids"])
        for position, cell in enumerate(cell_array)
        if cell["rawcell_id"] == raw_ix
    ]
def find_first_cell_index(cluster, cell_array):
    ## "cluster" must be a dict with key "cell_ids"; it can also be a line.
    ## "cell_array" has the structure of a cells array in an annotation.
    ## Returns a list holding the index of the cell in cell_array that has the
    ## lowest rawcell_id from the cluster. The list is empty when the cluster
    ## has no cells (picture without text cells) or the cell is not found.
    cell_ids = cluster["cell_ids"]
    if len(cell_ids) == 0:
        return []

    raw_ix = min(cell_ids)
    for position, cell in enumerate(cell_array):
        if cell["rawcell_id"] == raw_ix:
            ## One is enough; should be only one anyway.
            return [position]

    logger.debug(
        " Warning: Raw cell " + str(raw_ix) + " not found in annotation cells"
    )
    return []
## -------------------------------
## Cluster labels and text
def relabel_cluster(cluster, cl_ix, new_label, target_pred):
    ## "cluster" must have the structure as in a clusters array in a prediction,
    ## "cl_ix" is its index in target_pred,
    ## "new_label" is the intended new label,
    ## "target_pred" is the entire current target prediction.
    ## Sets the label on the cluster entry of target_pred and on the cells of
    ## target_pred that belong to the cluster.
    ## Returns new_label so that the caller can assign it in the same statement.
    target_cells = target_pred["cells"]
    target_pred["clusters"][cl_ix]["type"] = new_label
    for cell_position in find_cell_indices(cluster, target_cells):
        target_cells[cell_position]["label"] = new_label
    return new_label
def find_cluster_text(cluster, raw_cells):
    ## "cluster" must be a dict with "cell_ids"; it can also be a line.
    ## "raw_cells" must have the format of item["raw"]["cells"]
    ## Returns the text of the cluster in order of ascending cell id, with one
    ## blank between the cell contents and trailing whitespace removed.
    ## Note that in formulas, this may give a lot more blanks than originally
    pieces = [raw_cells[raw_ix]["text"] for raw_ix in sorted(cluster["cell_ids"])]
    return " ".join(pieces).rstrip()
def find_cluster_text_without_blanks(cluster, raw_cells):
    ## "cluster" must be a dict with "cell_ids"; it can also be a line.
    ## "raw_cells" must have the format of item["raw"]["cells"]
    ## Returns the cell contents concatenated in order of ascending cell id,
    ## without blanks in between and with trailing whitespace removed.
    ## Interesting in formula analysis.
    ordered_ids = sorted(cluster["cell_ids"])
    return "".join(raw_cells[raw_ix]["text"] for raw_ix in ordered_ids).rstrip()
## -------------------------------
## Clusters and lines
## (Most line-oriented functions are only needed in TextAnalysisGivenClusters,
## but this one also in FormulaAnalysis)
def build_cluster_from_lines(lines, label, id):
    ## Lines must be a non-empty list of dicts (lines) with elements "cell_ids",
    ## "bbox" and "confidence".
    ## (There is no condition that they are really geometrically lines)
    ## A cluster in standard format is returned with given label and id.
    ## Its confidence is the mean confidence of all lines after the first one;
    ## with a single line there is nothing to average and it stays 0.
    ## Raises IndexError when "lines" is empty.
    local_lines = copy.deepcopy(
        lines
    )  ## without this, it changes "lines" also outside this function
    first_line = local_lines.pop(0)
    cluster = {
        "id": id,
        "type": label,
        "cell_ids": first_line["cell_ids"],
        "bbox": first_line["bbox"],
        "confidence": 0,
        "created_by": "merged_cells",
    }
    confidence = 0
    for line in local_lines:
        cluster["cell_ids"] = cluster["cell_ids"] + line["cell_ids"]
        cluster["bbox"] = surrounding(cluster["bbox"], line["bbox"])
        confidence += line["confidence"]
    ## Guard against dividing by zero when only one line was given.
    if local_lines:
        cluster["confidence"] = confidence / len(local_lines)
    return cluster
## -------------------------------
## Reading order
def produce_reading_order(clusters, cluster_sort_type, cell_sort_type, sort_ids):
    ## In:
    ##   Clusters: list as in predictions.
    ##   cluster_sort_type: string, currently only "raw_cell_ids".
    ##   cell_sort_type: string, currently only "raw_cell_ids".
    ##   sort_ids: Boolean, whether the cluster ids should be adapted to their new position
    ## Out: Another clusters list, sorted according to the type. With an
    ##   unknown cluster_sort_type the clusters keep their input order.
    ## The cluster dicts themselves are modified in place (cell order, ids).
    logger.debug("---- Start cluster sorting ------")

    if cell_sort_type == "raw_cell_ids":
        for cl in clusters:
            cl["cell_ids"] = sorted(cl["cell_ids"])
    else:
        logger.debug(
            "Unknown cell_sort_type `"
            + cell_sort_type
            + "`, no cell sorting will happen."
        )

    if cluster_sort_type == "raw_cell_ids":
        clusters_with_cells = [cl for cl in clusters if cl["cell_ids"] != []]
        clusters_without_cells = [cl for cl in clusters if cl["cell_ids"] == []]
        logger.debug(
            "Clusters with cells: " + str([cl["id"] for cl in clusters_with_cells])
        )
        logger.debug(
            " Their first cell ids: "
            + str([cl["cell_ids"][0] for cl in clusters_with_cells])
        )
        logger.debug(
            "Clusters without cells: "
            + str([cl["id"] for cl in clusters_without_cells])
        )
        ## Clusters are ordered by their first cell id; clusters without cells go last.
        clusters_with_cells_sorted = sorted(
            clusters_with_cells, key=lambda cluster: cluster["cell_ids"][0]
        )
        logger.debug(
            " First cell ids after sorting: "
            + str([cl["cell_ids"][0] for cl in clusters_with_cells_sorted])
        )
        sorted_clusters = clusters_with_cells_sorted + clusters_without_cells
    else:
        logger.debug(
            "Unknown cluster_sort_type: `"
            + cluster_sort_type
            + "`, no cluster sorting will happen."
        )
        ## Keep the input order; without this the name below would be unbound.
        sorted_clusters = list(clusters)

    if sort_ids:
        for i, cl in enumerate(sorted_clusters):
            cl["id"] = i
    return sorted_clusters
## -------------------------------
## Line Splitting
def sort_cells_horizontal(line_cell_ids, raw_cells):
    ## "line_cell_ids" should be a non-empty list of (raw) cell_ids
    ## "raw_cells" has the structure of item["raw"]["cells"].
    ## Returns a new list with the cell ids sorted by x0 (left start).

    def left_edge(cell_id):
        return raw_cells[cell_id]["bbox"][0]

    return sorted(line_cell_ids, key=left_edge)
def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
    ## Returns deep copies of the clusters with their bbox recomputed by
    ## adapt_bbox. Clusters without cells are dropped unless they are pictures.
    ## The input clusters are left untouched.
    adapted = []
    for position, original in enumerate(clusters):
        candidate = copy.deepcopy(original)
        logger.debug(
            "Treating cluster " + str(position) + ", type " + str(candidate["type"])
        )
        logger.debug(" with cells: " + str(candidate["cell_ids"]))

        has_no_cells = len(original["cell_ids"]) == 0
        if has_no_cells and original["type"] != DocItemLabel.PICTURE:
            logger.debug(" Empty non-picture, removed")
            continue  ## Skip this former cluster, now without cells.

        candidate["bbox"] = adapt_bbox(raw_cells, candidate, orphan_cell_indices)
        adapted.append(candidate)
    return adapted
def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
## Computes and returns a new bbox [x_left, y_bottom, x_right, y_top] for "cluster".
## "raw_cells" is indexed by the ids in cluster["cell_ids"]; each entry has a "bbox".
## "orphan_cell_indices" lists raw cell ids that are not treated as spurious
## when they fall inside a table bbox.
## NOTE(review): the nesting of the branches below cannot be read from this
## view (indentation lost); the comments describe the statements only.
if not (cluster["type"] in [DocItemLabel.TABLE, DocItemLabel.PICTURE]):
## A text-like cluster. The bbox only needs to be around the text cells:
logger.debug(" Initial bbox: " + str(cluster["bbox"]))
new_bbox = surrounding_list(
[raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
)
logger.debug(" New bounding box:" + str(new_bbox))
## NOTE(review): this is a plain "if", not "elif" -- confirm whether text-like
## clusters are meant to continue into the picture/table handling below.
if cluster["type"] == DocItemLabel.PICTURE:
## We only make the bbox completely comprise included text cells:
logger.debug(" Picture")
if len(cluster["cell_ids"]) != 0:
min_bbox = surrounding_list(
[raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
)
logger.debug(" Minimum bbox: " + str(min_bbox))
logger.debug(" Initial bbox: " + str(cluster["bbox"]))
new_bbox = surrounding(min_bbox, cluster["bbox"])
logger.debug(" New bbox (initial and text cells): " + str(new_bbox))
else:
logger.debug(" without text cells, no change.")
new_bbox = cluster["bbox"]
else: ## A table
## At least we have to keep the included text cells, and we make the bbox completely comprise them
min_bbox = surrounding_list(
[raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
)
logger.debug(" Minimum bbox: " + str(min_bbox))
logger.debug(" Initial bbox: " + str(cluster["bbox"]))
new_bbox = surrounding(min_bbox, cluster["bbox"])
logger.debug(" Possibly increased bbox: " + str(new_bbox))
## Now we look which non-belonging cells are covered.
## (To decrease dependencies, we don't make use of which cells we actually removed.)
## We don't worry about orphan cells, those could still be added to the table.
enclosed_cells = compute_enclosed_cells(
new_bbox, raw_cells, min_cell_intersection_with_cluster=0.3
)[0]
additional_cells = set(enclosed_cells) - set(cluster["cell_ids"])
logger.debug(
" Additional cells enclosed by Table bbox: " + str(additional_cells)
)
spurious_cells = additional_cells - set(orphan_cell_indices)
logger.debug(
" Spurious cells enclosed by Table bbox (additional minus orphans): "
+ str(spurious_cells)
)
if len(spurious_cells) == 0:
return new_bbox
## Else we want to keep as much as possible, e.g., grid lines, but not the spurious cells if we can.
## We initialize possible cuts with the current bbox.
left_cut = new_bbox[0]
right_cut = new_bbox[2]
upper_cut = new_bbox[3]
lower_cut = new_bbox[1]
for cell_ix in spurious_cells:
cell = raw_cells[cell_ix]
# logger.debug(" Spurious cell bbox: " + str(cell["bbox"]))
## Position of the spurious cell relative to min_bbox (the box around the
## cluster's own cells); a cell can satisfy more than one of these.
is_left = cell["bbox"][2] < min_bbox[0]
is_right = cell["bbox"][0] > min_bbox[2]
is_above = cell["bbox"][1] > min_bbox[3]
is_below = cell["bbox"][3] < min_bbox[1]
# logger.debug(" Left, right, above, below? " + str([is_left, is_right, is_above, is_below]))
if is_left:
if cell["bbox"][2] > left_cut:
## We move the left cut to exclude this cell:
left_cut = cell["bbox"][2]
if is_right:
if cell["bbox"][0] < right_cut:
## We move the right cut to exclude this cell:
right_cut = cell["bbox"][0]
if is_above:
if cell["bbox"][1] < upper_cut:
## We move the upper cut to exclude this cell:
upper_cut = cell["bbox"][1]
if is_below:
if cell["bbox"][3] > lower_cut:
## We move the lower cut to exclude this cell:
lower_cut = cell["bbox"][3]
# logger.debug(" Current bbox: " + str([left_cut, lower_cut, right_cut, upper_cut]))
new_bbox = [left_cut, lower_cut, right_cut, upper_cut]
logger.debug(" Final bbox: " + str(new_bbox))
return new_bbox
def remove_cluster_duplicates_by_conf(cluster_predictions, threshold=0.5):
    ## Removes clusters that duplicate a more confident cluster.
    ## A cluster counts as a duplicate of a strictly more confident cluster when
    ## their IoU (bb_iou) exceeds "threshold", or when it lies inside the more
    ## confident cluster after being shrunk by 3 on every side.
    ## The input list is modified in place and also returned.
    duplicate_ids = set()
    for cluster_1 in cluster_predictions:
        for cluster_2 in cluster_predictions:
            if cluster_1["id"] == cluster_2["id"]:
                continue
            ## Only the less confident cluster of a pair can be removed.
            if not cluster_1["confidence"] > cluster_2["confidence"]:
                continue
            shrunk = [
                cluster_2["bbox"][0] + 3,
                cluster_2["bbox"][1] + 3,
                cluster_2["bbox"][2] - 3,
                cluster_2["bbox"][3] - 3,
            ]
            if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > threshold or contains(
                cluster_1["bbox"], shrunk
            ):
                duplicate_ids.add(cluster_2["id"])

    ## Filter in one pass instead of removing items from the list while it is
    ## being iterated (which can skip elements).
    cluster_predictions[:] = [
        cluster for cluster in cluster_predictions if cluster["id"] not in duplicate_ids
    ]
    return cluster_predictions
# Assign orphan cells by a low confidence prediction that is below the assigned confidence
def assign_orphans_with_low_conf_pred(
    cluster_predictions, cluster_predictions_low, raw_cells, orphan_cell_indices
):
    ## Tries to attach each orphan cell to a low-confidence cluster.
    ## For every orphan the low-confidence cluster with the highest IoU and
    ## confidence is chosen, provided that IoU and confidence both exceed 0.05
    ## and the cluster is smaller than three times the cell.
    ## A chosen cluster gets the cell id appended, is tagged "orph_low_conf",
    ## is appended to cluster_predictions, and the cell leaves the orphan list.
    ## Both lists are modified in place and returned.
    ## NOTE(review): a cluster chosen for several orphans is appended to
    ## cluster_predictions once per orphan -- confirm that this is intended.

    ## Iterate over a snapshot: removing from the list that is being iterated
    ## would make the loop skip the orphan following each assigned one.
    for orph_id in list(orphan_cell_indices):
        cell_bbox = raw_cells[orph_id]["bbox"]
        cell_area = (cell_bbox[3] - cell_bbox[1]) * (cell_bbox[2] - cell_bbox[0])

        cluster_chosen = None
        iou_thresh = 0.05
        confidence = 0.05

        # Loop over all predictions, and find the one with the highest IOU, and confidence
        for cluster in cluster_predictions_low:
            calc_iou = bb_iou(cluster["bbox"], cell_bbox)
            cluster_area = (cluster["bbox"][3] - cluster["bbox"][1]) * (
                cluster["bbox"][2] - cluster["bbox"][0]
            )
            if (
                (iou_thresh < calc_iou)
                and (cluster["confidence"] > confidence)
                and (cell_area * 3 > cluster_area)
            ):
                cluster_chosen = cluster
                iou_thresh = calc_iou
                confidence = cluster["confidence"]

        # If a candidate is found, assign to it the PDF cell ids, and tag that it was created by this function for tracking
        if cluster_chosen is not None:
            cluster_chosen["cell_ids"].append(orph_id)
            cluster_chosen["created_by"] = "orph_low_conf"
            cluster_predictions.append(cluster_chosen)
            orphan_cell_indices.remove(orph_id)
    return cluster_predictions, orphan_cell_indices
def remove_ambigous_pdf_cell_by_conf(cluster_predictions, raw_cells, amb_cell_idxs):
# For each ambiguous cell id (a cell listed in the "cell_ids" of more than one
# cluster, presumably -- verify against the caller), looks at the clusters
# that contain it and strips cell ids from cluster["cell_ids"].
# Returns the (modified) cluster_predictions and amb_cell_idxs.
# NOTE(review): the nesting below cannot be read from this view (indentation
# lost); the comments describe the statements only.
for amb_cell_id in amb_cell_idxs:
highest_conf = 0
highest_bbox_iou = 0
cluster_chosen = None
problamatic_clusters = []
# Find clusters in question
for cluster in cluster_predictions:
if amb_cell_id in cluster["cell_ids"]:
# NOTE(review): the *cell* id is appended here, while the list is compared
# against cluster ids further down -- confirm which one is intended.
problamatic_clusters.append(amb_cell_id)
# If the cell_id is in a cluster of high conf, and highest iou score, and smaller in area
bbox_iou_val = bb_iou(cluster["bbox"], raw_cells[amb_cell_id]["bbox"])
if (
cluster["confidence"] > highest_conf
and bbox_iou_val > highest_bbox_iou
):
# cluster_chosen is assigned but not read again in this function.
cluster_chosen = cluster
highest_conf = cluster["confidence"]
highest_bbox_iou = bbox_iou_val
if cluster["id"] in problamatic_clusters:
problamatic_clusters.remove(cluster["id"])
# now remove the assigning of cell id from lower confidence, and threshold
for cluster in cluster_predictions:
for prob_amb_id in problamatic_clusters:
if prob_amb_id in cluster["cell_ids"]:
cluster["cell_ids"].remove(prob_amb_id)
# NOTE(review): amb_cell_idxs is the list being iterated by the outer loop;
# removing from it here can make the loop skip elements.
amb_cell_idxs.remove(amb_cell_id)
return cluster_predictions, amb_cell_idxs
def ranges(nums):
    # Collapse the distinct numbers into runs of consecutive values and
    # return them as a list of (first, last) tuples.
    # Used to remove line numbers for review manuscripts
    ordered = sorted(set(nums))
    boundaries = ordered[:1]
    for previous, following in zip(ordered, ordered[1:]):
        # A gap closes the current run and opens the next one.
        if previous + 1 < following:
            boundaries.extend([previous, following])
    boundaries += ordered[-1:]
    # Boundaries come in (start, end) pairs.
    return list(zip(boundaries[0::2], boundaries[1::2]))
def set_orphan_as_text(
cluster_predictions, cluster_predictions_low, raw_cells, orphan_cell_indices
):
# Creates a new single-cell cluster (created_by "orphan_default",
# confidence -1) for orphan cells and appends it to cluster_predictions.
# Orphans with empty or whitespace-only text, orphans inside a picture
# cluster, and all orphans on a page where line numbers were detected are
# not turned into clusters.
# Returns cluster_predictions and the (unchanged) orphan_cell_indices.
# NOTE(review): the nesting below cannot be read from this view (indentation
# lost); the comments describe the statements only.
max_id = -1
figures = []
# Collect picture clusters and the highest cluster id in use.
for cluster in cluster_predictions:
if cluster["type"] == DocItemLabel.PICTURE:
figures.append(cluster)
if cluster["id"] > max_id:
max_id = cluster["id"]
max_id += 1
lines_detector = False
content_of_orphans = []
for orph_id in orphan_cell_indices:
orph_cell = raw_cells[orph_id]
content_of_orphans.append(raw_cells[orph_id]["text"])
# Keep only the orphans whose text is a number.
fil_content_of_orphans = []
for cell_content in content_of_orphans:
if cell_content.isnumeric():
try:
num = int(cell_content)
fil_content_of_orphans.append(num)
except ValueError: # ignore the cell
pass
# line_orphans = []
# Check if there are more than 2 pdf orphan cells, if there are more than 2,
# then check between the orphan cells if they are numeric
# and if they are a consecutive series of numbers (using ranges function) to decide
if len(fil_content_of_orphans) > 2:
out_ranges = ranges(fil_content_of_orphans)
if len(out_ranges) > 1:
# cnt_range is assigned but not read again in this function.
cnt_range = 0
for ranges_ in out_ranges:
if ranges_[0] != ranges_[1]:
# If there are more than 75 (half the total line number of a review manuscript page)
# decide that there are line numbers on page to be ignored.
if len(list(range(ranges_[0], ranges_[1]))) > 75:
lines_detector = True
# line_orphans = line_orphans + list(range(ranges_[0], ranges_[1]))
for orph_id in orphan_cell_indices:
orph_cell = raw_cells[orph_id]
if bool(orph_cell["text"] and not orph_cell["text"].isspace()):
fig_flag = False
# Do not assign orphan cells if they are inside a figure
for fig in figures:
if contains(fig["bbox"], orph_cell["bbox"]):
fig_flag = True
# if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans:
if fig_flag == False and lines_detector == False:
# get class from low confidence detections if not set as text:
class_type = DocItemLabel.TEXT
for cluster in cluster_predictions_low:
intersection = compute_intersection(
orph_cell["bbox"], cluster["bbox"]
)
# NOTE(review): class_type is reset for every low-confidence cluster, so
# only the last cluster of the list can determine the final class -- confirm.
class_type = DocItemLabel.TEXT
if (
cluster["confidence"] > 0.1
and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4
):
class_type = cluster["type"]
elif contains(
cluster["bbox"],
[
orph_cell["bbox"][0] + 3,
orph_cell["bbox"][1] + 3,
orph_cell["bbox"][2] - 3,
orph_cell["bbox"][3] - 3,
],
):
class_type = cluster["type"]
elif intersection > area(orph_cell["bbox"]) * 0.2:
class_type = cluster["type"]
# The new cluster takes the bbox of the orphan cell and the next free id.
new_cluster = {
"id": max_id,
"bbox": orph_cell["bbox"],
"type": class_type,
"cell_ids": [orph_id],
"confidence": -1,
"created_by": "orphan_default",
}
max_id += 1
cluster_predictions.append(new_cluster)
return cluster_predictions, orphan_cell_indices
def merge_cells(cluster_predictions):
    """Merge orphan clusters that touch or lie very close to each other.

    Every cluster whose ``"created_by"`` is ``"orphan_default"`` becomes a
    node in a graph. Two such clusters are linked when their bounding boxes,
    each padded by 2 units on every side, pass ``is_intersecting``. Each
    connected group with more than one member is replaced by a single cluster
    produced by ``build_cluster_from_lines`` with label ``DocItemLabel.TEXT``
    and a fresh id (one above the highest id seen so far).

    Args:
        cluster_predictions: List of cluster dicts. The keys ``"id"``,
            ``"bbox"`` and ``"created_by"`` are read. The list is modified
            in place.

    Returns:
        The same list object, with merged groups removed and the newly built
        clusters appended at the end.
    """
    padding = 2
    orphans = [
        cluster
        for cluster in cluster_predictions
        if cluster["created_by"] == "orphan_default"
    ]

    # Using graph components creates clusters if orphan cells are touching or
    # too close.
    G = nx.Graph()
    for cluster in orphans:
        G.add_node(cluster["id"])

    # Pad each orphan box once instead of deep-copying both boxes for every
    # pair. Entries 0 and 1 are decreased, entries 2 and 3 are increased
    # (presumably [x0, y0, x1, y1] -- confirm against the bbox producer).
    padded_boxes = []
    for cluster in orphans:
        box = list(cluster["bbox"])
        box[0] -= padding
        box[1] -= padding
        box[2] += padding
        box[3] += padding
        padded_boxes.append(box)

    # Ordered pairs are kept on purpose so that is_intersecting is evaluated
    # with the same argument orders as before.
    for cluster_1, box_1 in zip(orphans, padded_boxes):
        for cluster_2, box_2 in zip(orphans, padded_boxes):
            if cluster_1["id"] != cluster_2["id"] and is_intersecting(box_1, box_2):
                G.add_edge(cluster_1["id"], cluster_2["id"])

    component = sorted(map(sorted, nx.k_edge_components(G, k=1)))

    # New ids continue after the largest existing id (never below 0).
    max_id = max([-1] + [cluster["id"] for cluster in cluster_predictions])

    for nodes in component:
        if len(nodes) < 2:
            continue
        max_id += 1
        members = set(nodes)
        # Collect the group members in sorted-id order without mutating the
        # list that is being scanned (removing during iteration skips items).
        lines = [
            cluster
            for node in nodes
            for cluster in cluster_predictions
            if cluster["id"] == node
        ]
        # Slice assignment keeps the caller's list object alive.
        cluster_predictions[:] = [
            cluster for cluster in cluster_predictions if cluster["id"] not in members
        ]
        cluster_predictions.append(
            build_cluster_from_lines(lines, DocItemLabel.TEXT, max_id)
        )
    return cluster_predictions
def clean_up_clusters(
    cluster_predictions,
    raw_cells,
    merge_cells=False,
    img_table=False,
    one_cell_table=False,
):
    """Remove redundant clusters by comparing every ordered pair of clusters.

    For each pair of clusters with different ids, the enabled rules are
    applied and the ids of clusters to drop are collected; the drops happen
    only after all pairs were inspected.

    Args:
        cluster_predictions: List of cluster dicts. The keys ``"id"``,
            ``"bbox"``, ``"type"`` and ``"cell_ids"`` are read depending on
            the enabled rules. The list is modified in place.
        raw_cells: Not used by this function; kept for interface
            compatibility.
        merge_cells: When truthy, a cluster whose bbox contains another
            cluster's bbox (shrunk by 3 on every side) takes over that
            cluster's ``"cell_ids"`` and the contained cluster is dropped.
        img_table: Only evaluated when ``merge_cells`` is falsy. Drops
            ``cluster_1`` when it overlaps ``cluster_2`` (IoU above 0.5, or
            fully inside ``cluster_2``'s bbox grown by 3) and the type
            condition below holds.
        one_cell_table: When truthy, drops clusters of type
            ``DocItemLabel.TABLE`` that have fewer than two cell ids.

    Returns:
        The same list object with the collected clusters removed.
    """
    # A set avoids piling up the same id once per matching pair.
    ids_to_delete = set()

    for cluster_1 in cluster_predictions:
        for cluster_2 in cluster_predictions:
            if cluster_1["id"] == cluster_2["id"]:
                continue

            # remove any artifacts created by merging clusters
            if merge_cells:
                shrunk_bbox_2 = [
                    cluster_2["bbox"][0] + 3,
                    cluster_2["bbox"][1] + 3,
                    cluster_2["bbox"][2] - 3,
                    cluster_2["bbox"][3] - 3,
                ]
                if contains(cluster_1["bbox"], shrunk_bbox_2):
                    cluster_1["cell_ids"] = (
                        cluster_1["cell_ids"] + cluster_2["cell_ids"]
                    )
                    ids_to_delete.add(cluster_2["id"])
            # remove clusters that might appear inside tables, or images
            # (such as pdf cells in graphs)
            elif img_table:
                # NOTE(review): `and` binds tighter than `or`, so this reads
                # "(text over picture) or (anything over table)". The
                # parentheses only make the existing behaviour explicit;
                # confirm whether "text over (picture or table)" was meant.
                if (
                    cluster_1["type"] == DocItemLabel.TEXT
                    and cluster_2["type"] == DocItemLabel.PICTURE
                ) or cluster_2["type"] == DocItemLabel.TABLE:
                    if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > 0.5:
                        ids_to_delete.add(cluster_1["id"])
                    elif contains(
                        [
                            cluster_2["bbox"][0] - 3,
                            cluster_2["bbox"][1] - 3,
                            cluster_2["bbox"][2] + 3,
                            cluster_2["bbox"][3] + 3,
                        ],
                        cluster_1["bbox"],
                    ):
                        ids_to_delete.add(cluster_1["id"])

            # remove tables that have one pdf cell
            # NOTE: this check lives inside the pair loop, so it sees the
            # cell_ids as they are at this point of the merge pass and is
            # never reached when no cluster with a different id exists.
            if one_cell_table:
                if (
                    cluster_1["type"] == DocItemLabel.TABLE
                    and len(cluster_1["cell_ids"]) < 2
                ):
                    ids_to_delete.add(cluster_1["id"])

    # Filter in one pass instead of calling remove() while iterating, which
    # skips the element following each removal. Slice assignment keeps the
    # caller's list object.
    cluster_predictions[:] = [
        cluster for cluster in cluster_predictions if cluster["id"] not in ids_to_delete
    ]
    return cluster_predictions
def assigning_cell_ids_to_clusters(clusters, raw_cells, threshold):
    """Store on each cluster the ids of the raw cells enclosed by its bbox.

    For every cluster, ``compute_enclosed_cells`` is called with the cluster's
    ``"bbox"``, the raw cells and ``threshold`` (passed as
    ``min_cell_intersection_with_cluster``). The first element of its result
    is written to the cluster's ``"cell_ids"``; the second is discarded.

    Note: these cell ids are ids of the raw cells. They are often, but not
    always, the same as the "id" or the index of the "cells" list in a
    prediction.

    Args:
        clusters: List of cluster dicts; each is updated in place.
        raw_cells: Raw cells handed through to ``compute_enclosed_cells``.
        threshold: Minimum cell/cluster intersection forwarded to the helper.

    Returns:
        The same ``clusters`` list.
    """
    for entry in clusters:
        enclosed_ids, _ignored = compute_enclosed_cells(
            entry["bbox"],
            raw_cells,
            min_cell_intersection_with_cluster=threshold,
        )
        entry["cell_ids"] = enclosed_ids
    return clusters
def cell_id_state_map(clusters, cell_count):
    """Create a map of cell_id -> clusters and classify cells by coverage.

    ``find_clusters_around_cells`` supplies, for each cell index, the
    collection of clusters around that cell. Cells with an empty collection
    are reported as orphans; cells with more than one entry are reported as
    ambiguous.

    Args:
        clusters: Clusters forwarded to ``find_clusters_around_cells``.
        cell_count: Number of cells; indices ``0 .. cell_count - 1`` are
            inspected.

    Returns:
        A tuple ``(clusters_around_cells, orphan_cell_indices,
        ambiguous_cell_indices)``.
    """
    clusters_around_cells = find_clusters_around_cells(cell_count, clusters)

    orphan_cell_indices = []  # cells assigned to no cluster
    ambiguous_cell_indices = []  # cells assigned to more than one cluster
    for cell_ix in range(cell_count):
        n_clusters = len(clusters_around_cells[cell_ix])
        if n_clusters == 0:
            orphan_cell_indices.append(cell_ix)
        elif n_clusters > 1:
            ambiguous_cell_indices.append(cell_ix)

    return clusters_around_cells, orphan_cell_indices, ambiguous_cell_indices

View File

@ -74,6 +74,10 @@ def main():
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.ocr_options.lang = "es"
pipeline_options.accelerator_options = AcceleratorOptions(
num_threads=4, device=Device.AUTO
)
doc_converter = DocumentConverter(
format_options={

431
poetry.lock generated
View File

@ -1,5 +1,36 @@
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
[[package]]
name = "accelerate"
version = "1.1.1"
description = "Accelerate"
optional = false
python-versions = ">=3.9.0"
files = [
{file = "accelerate-1.1.1-py3-none-any.whl", hash = "sha256:61edd81762131b8d4bede008643fa1e1f3bf59bec710ebda9771443e24feae02"},
{file = "accelerate-1.1.1.tar.gz", hash = "sha256:0d39dfac557052bc735eb2703a0e87742879e1e40b88af8a2f9a93233d4cd7db"},
]
[package.dependencies]
huggingface-hub = ">=0.21.0"
numpy = ">=1.17,<3.0.0"
packaging = ">=20.0"
psutil = "*"
pyyaml = "*"
safetensors = ">=0.4.3"
torch = ">=1.10.0"
[package.extras]
deepspeed = ["deepspeed"]
dev = ["bitsandbytes", "black (>=23.1,<24.0)", "datasets", "diffusers", "evaluate", "hf-doc-builder (>=0.3.0)", "parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist", "rich", "ruff (>=0.6.4,<0.7.0)", "scikit-learn", "scipy", "timm", "torchdata (>=0.8.0)", "torchpippy (>=0.2.0)", "tqdm", "transformers"]
quality = ["black (>=23.1,<24.0)", "hf-doc-builder (>=0.3.0)", "ruff (>=0.6.4,<0.7.0)"]
rich = ["rich"]
sagemaker = ["sagemaker"]
test-dev = ["bitsandbytes", "datasets", "diffusers", "evaluate", "scikit-learn", "scipy", "timm", "torchdata (>=0.8.0)", "torchpippy (>=0.2.0)", "tqdm", "transformers"]
test-prod = ["parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist"]
test-trackers = ["comet-ml", "dvclive", "tensorboard", "wandb"]
testing = ["bitsandbytes", "datasets", "diffusers", "evaluate", "parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist", "scikit-learn", "scipy", "timm", "torchdata (>=0.8.0)", "torchpippy (>=0.2.0)", "tqdm", "transformers"]
[[package]]
name = "aiohappyeyeballs"
version = "2.4.4"
@ -231,6 +262,21 @@ docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphi
tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"]
[[package]]
name = "autoflake"
version = "2.3.1"
description = "Removes unused imports and unused variables"
optional = false
python-versions = ">=3.8"
files = [
{file = "autoflake-2.3.1-py3-none-any.whl", hash = "sha256:3ae7495db9084b7b32818b4140e6dc4fc280b712fb414f5b8fe57b0a8e85a840"},
{file = "autoflake-2.3.1.tar.gz", hash = "sha256:c98b75dc5b0a86459c4f01a1d32ac7eb4338ec4317a4469515ff1e687ecd909e"},
]
[package.dependencies]
pyflakes = ">=3.0.0"
tomli = {version = ">=2.0.1", markers = "python_version < \"3.11\""}
[[package]]
name = "autopep8"
version = "2.2.0"
@ -793,64 +839,32 @@ name = "deepsearch-glm"
version = "0.26.2"
description = "Graph Language Models"
optional = false
python-versions = "<4.0,>=3.9"
files = [
{file = "deepsearch_glm-0.26.2-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:00453a02bc8df959da576bc598ba528b394a9c016d6a428efc948c867be98938"},
{file = "deepsearch_glm-0.26.2-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:9e6f654ab4d9dc3e6e2033c9c45294c36e5e62650cac0e4a650af576364eb370"},
{file = "deepsearch_glm-0.26.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:1fdf2fce9d642bbc5222600a1b280a7413aa640ed01acee13d43401ec27d6ad5"},
{file = "deepsearch_glm-0.26.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:218cab085a58b88c55dbeb80cc5f5f7b3c5a96c8537eb2ada8e5cab70cd8e439"},
{file = "deepsearch_glm-0.26.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75be007e62d11780f2433b213dad14d14a270c3607e909fd1fc95efdf02446c6"},
{file = "deepsearch_glm-0.26.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a9b34c6cfb8b873ccf6e0072f5434c0c65a1d90652a6b901becc5b3b1695106"},
{file = "deepsearch_glm-0.26.2-cp310-cp310-win_amd64.whl", hash = "sha256:f4b63c6e1d4a7be597efbe96052286bca805784cd7283a037919c349971051c5"},
{file = "deepsearch_glm-0.26.2-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:eaabedca45fdd87dc455dc08b1785db15ba5ea6b706820330447f2cf7f03a67a"},
{file = "deepsearch_glm-0.26.2-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:101bc2a79027df555050d08112717249916c4d82ad5815be2a1ac0581d9ab2b5"},
{file = "deepsearch_glm-0.26.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:000d4a4895c4ff89c465b746bb7db3bb054a1fb5c3fabe2772d5431700c15d33"},
{file = "deepsearch_glm-0.26.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:2d97f9ebdff1a9086cc32ddd0abb14b42c4b4b2ae666986078fd77db3aa4487d"},
{file = "deepsearch_glm-0.26.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:666a3b53b0949735cff77a8209f2833866e34b635ca0c7f444807963d8379d93"},
{file = "deepsearch_glm-0.26.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89aae1ec83222ef39e045f0186023473e5ce2ed30846c13f2943192d34d57c0f"},
{file = "deepsearch_glm-0.26.2-cp311-cp311-win_amd64.whl", hash = "sha256:9bb173dcd0caef1d8a0d440e1ac3e9959c6b849e06b95b1d9b436661504c98f7"},
{file = "deepsearch_glm-0.26.2-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:bb286be157a7b163b46a4d1f7e48a30d5cc365d4926c18e8b3c72994a8f296f7"},
{file = "deepsearch_glm-0.26.2-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:defca9ecf1451ce3422b7783ea188571ffad7c941dbf52acc2638c5a4ffa7743"},
{file = "deepsearch_glm-0.26.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:226f8862c616a4def202a6d0f71eb5d8e9f6ddbded2cf431c146150303888cf8"},
{file = "deepsearch_glm-0.26.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:6ff0fe662254835763ad7d3edc2db320de8d233f645064e0356187d8e1fabe3b"},
{file = "deepsearch_glm-0.26.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91c1b84ec5b1308de37c660f49570ee1e72bd7f0f607566344446b9293f1183c"},
{file = "deepsearch_glm-0.26.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d634eeaae8943e1912c0dfbf3193e09bea8c1aac38db8a6fa1f03fe6a49cb84"},
{file = "deepsearch_glm-0.26.2-cp312-cp312-win_amd64.whl", hash = "sha256:9294087d26037574817e8e1710e387fd9ef9ba4328705de86dd40d819f32909a"},
{file = "deepsearch_glm-0.26.2-cp313-cp313-macosx_13_0_arm64.whl", hash = "sha256:df7181143c62a1f0e166bc9ffb25deab617b53ba7c468284e3072b861c17405a"},
{file = "deepsearch_glm-0.26.2-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:2c3fef2c8394d6dc22d1bcdab12d0f46df9b411c5431dfb585a2c7bb128e1744"},
{file = "deepsearch_glm-0.26.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:f641a88421aa806ccef8f8e657fbb65135f59732110d21b5103c09138a659315"},
{file = "deepsearch_glm-0.26.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:cf78499892caffb4bdc020b8c50ab7d623f568478375dcc2e3ec107d40972adc"},
{file = "deepsearch_glm-0.26.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a72f2b432b81b0bc7c87e33c41a97c7a8da2536dd2b337eb1b7d054fba12d556"},
{file = "deepsearch_glm-0.26.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4703cae0d329b77e1d97892910313035204daa026d6e67ce6eb1b3e74e41f93e"},
{file = "deepsearch_glm-0.26.2-cp313-cp313-win_amd64.whl", hash = "sha256:c906c75d080414490727de416fd1782bc6a10301378f72a741aa227b183832cf"},
{file = "deepsearch_glm-0.26.2-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:10a366512540eff9f76645eb521df3469a160e8460ff6c3c1bfe172342c6c670"},
{file = "deepsearch_glm-0.26.2-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:304988f1e08bd86a8a7b7cc0495e38faf586231f33f05c1023597c6177758572"},
{file = "deepsearch_glm-0.26.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:c8f69b877846031648811ff80070b90b834bf9e4cdd74e5c2d93c7e18f408cd1"},
{file = "deepsearch_glm-0.26.2-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:1ba12361d1e4b8b02a72f515028f22686d98526a703a1091f89e9487fa3aa3c7"},
{file = "deepsearch_glm-0.26.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c03bb8b3cdb2952c9c269849830f7830fa7e0384b76809e25f4c2d5d091f746c"},
{file = "deepsearch_glm-0.26.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fe719b26d7cfcf5632a56be1f1420920fcdbea4418c014dd6e7e218dd2aca11"},
{file = "deepsearch_glm-0.26.2-cp39-cp39-win_amd64.whl", hash = "sha256:2b31fa419287af3429efc2d5610cbf2428bafc762e45b610a48ad30dffedaa9e"},
{file = "deepsearch_glm-0.26.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:6df2504998e60c1aac3655820ad25e5eccca137da2e9f78fb53dc0fd0d1cdbf4"},
{file = "deepsearch_glm-0.26.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:e1b4a789ec9555ec9f4ff6730d68081be37eaa43cb51c9463962967c9f672684"},
{file = "deepsearch_glm-0.26.2.tar.gz", hash = "sha256:7a607e78903b66d28beac3408156c11ab7b34ee70e8ccd0d292b28433e5a9c1d"},
]
python-versions = "^3.9"
files = []
develop = false
[package.dependencies]
docling-core = ">=2.0,<3.0"
docling-core = "^2.0"
docutils = "!=0.21"
numpy = ">=1.24.4,<3.0.0"
pandas = ">=1.5.1,<3.0.0"
python-dotenv = ">=1.0.0,<2.0.0"
pywin32 = {version = ">=307,<308", markers = "sys_platform == \"win32\""}
requests = ">=2.32.3,<3.0.0"
rich = ">=13.7.0,<14.0.0"
python-dotenv = "^1.0.0"
pywin32 = {version = "^307", markers = "sys_platform == \"win32\""}
requests = "^2.32.3"
rich = "^13.7.0"
tabulate = ">=0.8.9"
tqdm = ">=4.64.0,<5.0.0"
tqdm = "^4.64.0"
[package.extras]
pyplot = ["matplotlib (>=3.7.1,<4.0.0)"]
toolkit = ["deepsearch-toolkit (>=1.1.0,<2.0.0)"]
[package.source]
type = "git"
url = "ssh://git@github.com/DS4SD/deepsearch-glm.git"
reference = "cau/layout-processing-children-payloads"
resolved_reference = "8fac776c07fb7541d17ebc9db48c9900074f25b1"
[[package]]
name = "defusedxml"
version = "0.7.1"
@ -893,94 +907,74 @@ name = "docling-core"
version = "2.6.1"
description = "A python library to define and validate data types in Docling."
optional = false
python-versions = "<4.0,>=3.9"
files = [
{file = "docling_core-2.6.1-py3-none-any.whl", hash = "sha256:8e7a5bc0ce13289567738481949fed3ab580f2d8cea7525b246159233d81b26b"},
{file = "docling_core-2.6.1.tar.gz", hash = "sha256:c8af45e0873611120cc24757d567d37e053a54e2ce060b7b5b44efd0d73f75e5"},
]
python-versions = "^3.9"
files = []
develop = false
[package.dependencies]
jsonref = ">=1.1.0,<2.0.0"
jsonschema = ">=4.16.0,<5.0.0"
pandas = ">=2.1.4,<3.0.0"
pillow = ">=10.3.0,<11.0.0"
jsonref = "^1.1.0"
jsonschema = "^4.16.0"
pandas = "^2.1.4"
pillow = "^10.3.0"
pydantic = ">=2.6.0,<2.10"
pyyaml = ">=5.1,<7.0.0"
tabulate = ">=0.9.0,<0.10.0"
typing-extensions = ">=4.12.2,<5.0.0"
tabulate = "^0.9.0"
typing-extensions = "^4.12.2"
[package.source]
type = "git"
url = "ssh://git@github.com/DS4SD/docling-core.git"
reference = "feat-add-legacy-convert"
resolved_reference = "4434b1073dc15fefb75f28c37299abd32d9c532f"
[[package]]
name = "docling-ibm-models"
version = "2.0.7"
description = "This package contains the AI models used by the Docling PDF conversion package"
optional = false
python-versions = "<4.0,>=3.9"
files = [
{file = "docling_ibm_models-2.0.7-py3-none-any.whl", hash = "sha256:bf362add22e9c526ac56c04bce412d7bb1c331b44a73204abba0b1d90a500c78"},
{file = "docling_ibm_models-2.0.7.tar.gz", hash = "sha256:e1372c4f2517d522125fb02a820558f01914926f532bcd0534f1028a25d63667"},
]
python-versions = "^3.9"
files = []
develop = false
[package.dependencies]
accelerate = "^1.1.1"
huggingface_hub = ">=0.23,<1"
jsonlines = ">=3.1.0,<4.0.0"
jsonlines = "^3.1.0"
numpy = ">=1.24.4,<3.0.0"
opencv-python-headless = ">=4.6.0.66,<5.0.0.0"
Pillow = ">=10.0.0,<11.0.0"
torch = ">=2.2.2,<3.0.0"
torchvision = ">=0,<1"
tqdm = ">=4.64.0,<5.0.0"
opencv-python-headless = "^4.6.0.66"
Pillow = "^10.0.0"
torch = "^2.2.2"
torchvision = "^0"
tqdm = "^4.64.0"
transformers = "^4.46.2"
[package.source]
type = "git"
url = "ssh://git@github.com/DS4SD/docling-ibm-models.git"
reference = "nli/performance"
resolved_reference = "c1bed7d5451ee16b7fb5b0bc5e847f599ed93aa7"
[[package]]
name = "docling-parse"
version = "2.1.2"
description = "Simple package to extract text with coordinates from programmatic PDFs"
optional = false
python-versions = "<4.0,>=3.9"
files = [
{file = "docling_parse-2.1.2-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:140319e3eac73f9768d35313739891ae637af57fda03eade17d90e2d28ad80eb"},
{file = "docling_parse-2.1.2-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:cec968a436ad14e8a45a72fc0e0074750eee28548a14f3c3df5157a68ac958e7"},
{file = "docling_parse-2.1.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:c84eba992fee49d190cf4834fd44ef4e6549c3f1fcd41b91622114703a7e4a87"},
{file = "docling_parse-2.1.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:ae02af07f3dd335f56383a83efdc1f6450b7d38e21e1131005dbd341eb38e47d"},
{file = "docling_parse-2.1.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6fa0731e97d2644ff8a3257ae53208b88be3ddc6a4bc54fbe39e21f8395530f0"},
{file = "docling_parse-2.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d26d60136aab5f4a3a773922a8dcc530334165331660d074cd88dcd5d91206cd"},
{file = "docling_parse-2.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:76eef41d50017c2fc531face44c1a35bef66095951622617d0f281e35d18e9e0"},
{file = "docling_parse-2.1.2-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:7f1ad037d3ac0d80252c493e73b12688ded3ece9bae7954ba62765506c139d21"},
{file = "docling_parse-2.1.2-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:9f1360c0558c84f4b6633b0882256f6d621fd9e52179acae39c727a43b48d937"},
{file = "docling_parse-2.1.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:5d505c2d3e9eff4f3064b4d1f017a3c6577b5d8ba55540d558f4899561862956"},
{file = "docling_parse-2.1.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:58f552f61ac35c02890b03fe59b06552353314c3c1ee2a050c68a8a206ab1b4b"},
{file = "docling_parse-2.1.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22069dadcfdcebc02e36e27f80d452f1265a5a97d894f2391490bf099bc5432c"},
{file = "docling_parse-2.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f68942b31684a021e27b9b07d27ed139911444b33963f7e0b5d2dbda8aaa5cb1"},
{file = "docling_parse-2.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:d87e3fbf1549cd8bc171240c18584ba8c32f83963b5af66b2a70a2bc3af56d2e"},
{file = "docling_parse-2.1.2-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:5b00b81fa8eb0b34621f1ef9d07623d7dbcc354a33295a5b0c4209c39b1ff8eb"},
{file = "docling_parse-2.1.2-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:1b99b122f941d0f19e92a215e589b94f49db899c5eec0147e83824652b18ce74"},
{file = "docling_parse-2.1.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:744fe368a8fa49778e881c1052427c38a7d0e367273fcdef493e047513783108"},
{file = "docling_parse-2.1.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:b8a3e558a96f7d593269be75ba4147ebe221f5edad3d41244cef3533e8a51b74"},
{file = "docling_parse-2.1.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afcf53bce8c91886c1360e625e51d15ebfb36d37cd53b6e019e86ce1118c1d0c"},
{file = "docling_parse-2.1.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89d25fc4fb8f16a8ed5bc8c4f00a77739d2536732c0ddae16340b1859adf68fd"},
{file = "docling_parse-2.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:28a7f49a865a0cd71033a7899aac00c7d2e3b6c3a76488f8676ba0fc353d9f3a"},
{file = "docling_parse-2.1.2-cp313-cp313-macosx_13_0_arm64.whl", hash = "sha256:ad1560532cdf15dcb4a6005c8b7fe19def0e910e6125863f14978d6d07a1ba47"},
{file = "docling_parse-2.1.2-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:19003b1bb64cd5a40999a3c5ffcb9a9d9608a073949b76acc58d58fb5054ea03"},
{file = "docling_parse-2.1.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:041bf1c72a23d62e2dd30dcc3508222f6674e85b0f1d19a3196fd6d7b5f56015"},
{file = "docling_parse-2.1.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:12403c26e833d8fdf0f406d2895f5108fd07b64a4d929c9105ca60f09b882c34"},
{file = "docling_parse-2.1.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1082e227af3e31085eff3e96103b09becdf95324304e17ce0b1b61c43b93fbb7"},
{file = "docling_parse-2.1.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:77b36e36d1e07a06a1616ee281079d6b972c3059f2fa02dafcfc225a41e5bd1a"},
{file = "docling_parse-2.1.2-cp313-cp313-win_amd64.whl", hash = "sha256:4300df86657935b0109c44702857ebf3d0713f1bbe376982f369504a762e2fef"},
{file = "docling_parse-2.1.2-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:912fe44507f209d997e1183f38a71d4e14c31d53a164fb862631822624dad892"},
{file = "docling_parse-2.1.2-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:10ff1928b12099f446fcd0b043182173e6b02ce74008ea6ce921d56cdee8964e"},
{file = "docling_parse-2.1.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:391ad31a4086fabbc290851432f4cf0bdc366e07a454adf49e42029898d6b477"},
{file = "docling_parse-2.1.2-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:ebf478e99c0c16d7dad30c0fdb1f5e236ae94d48da8dec48dbe5f0841eead4ed"},
{file = "docling_parse-2.1.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b1c904017330d096981b7db6b225b66aff1cebdc422843103a782121d6e8be8"},
{file = "docling_parse-2.1.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8bc8ec6ad1bec6168991b895d749b222bef14b568d1d9f6c06efaeb1645dfe12"},
{file = "docling_parse-2.1.2-cp39-cp39-win_amd64.whl", hash = "sha256:e6eb130aa367247e1f32225bb1608cee901d711b475527404bbc4330c9199b99"},
{file = "docling_parse-2.1.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ef88d565c761b48f8a175fd474e068c0da9d4401e22d3e38de73e2f00f3df2d1"},
{file = "docling_parse-2.1.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:bdc8ccbdc4ab91b829b8c421ad89da276442a2c891eda1f6507f248d0bd8dff9"},
{file = "docling_parse-2.1.2.tar.gz", hash = "sha256:3c249f50e6351eb6126331a179fe86b64dc2073e9f881d52f8c8fb391633b89e"},
]
python-versions = "^3.9"
files = []
develop = false
[package.dependencies]
autoflake = "^2.3.1"
pillow = "^10.4.0"
pywin32 = {version = ">=305", markers = "sys_platform == \"win32\""}
tabulate = ">=0.9.0,<1.0.0"
[package.source]
type = "git"
url = "ssh://git@github.com/DS4SD/docling-parse.git"
reference = "dev/expose-cell-sanitisation-via-python"
resolved_reference = "8ea65ae3080db88f54f8a3f7b622e7b002c9b7f0"
[[package]]
name = "docutils"
version = "0.21.2"
@ -3192,6 +3186,7 @@ files = [
{file = "nh3-0.2.19-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:00810cd5275f5c3f44b9eb0e521d1a841ee2f8023622de39ffc7d88bd533d8e0"},
{file = "nh3-0.2.19-cp38-abi3-win32.whl", hash = "sha256:7e98621856b0a911c21faa5eef8f8ea3e691526c2433f9afc2be713cb6fbdb48"},
{file = "nh3-0.2.19-cp38-abi3-win_amd64.whl", hash = "sha256:75c7cafb840f24430b009f7368945cb5ca88b2b54bb384ebfba495f16bc9c121"},
{file = "nh3-0.2.19.tar.gz", hash = "sha256:790056b54c068ff8dceb443eaefb696b84beff58cca6c07afd754d17692a4804"},
]
[[package]]
@ -3766,9 +3761,9 @@ numpy = [
{version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
{version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
]
[[package]]
@ -3792,9 +3787,9 @@ numpy = [
{version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
{version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
]
[[package]]
@ -5474,12 +5469,12 @@ cffi = {version = "*", markers = "implementation_name == \"pypy\""}
[[package]]
name = "rapidocr-onnxruntime"
version = "1.4.0"
version = "1.4.1"
description = "A cross platform OCR Library based on OnnxRuntime."
optional = true
python-versions = "<3.13,>=3.6"
files = [
{file = "rapidocr_onnxruntime-1.4.0-py3-none-any.whl", hash = "sha256:d21c4ba2ef80b7a8ecf8178632f273398a92ab44a1ffb9e171139ef2a589d690"},
{file = "rapidocr_onnxruntime-1.4.1-py3-none-any.whl", hash = "sha256:5ecdb8f4f3beec56630197f87c3e67ab744fce0cc66394b7b1da08c8c96a727f"},
]
[package.dependencies]
@ -5700,112 +5695,114 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"]
[[package]]
name = "rpds-py"
version = "0.22.0"
version = "0.22.1"
description = "Python bindings to Rust's persistent data structures (rpds)"
optional = false
python-versions = ">=3.9"
files = [
{file = "rpds_py-0.22.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:a4366f264fa60d3c109f0b27af0cd9eb8d46746bd70bd3d9d425f035b6c7e286"},
{file = "rpds_py-0.22.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e34a3e665d38d0749072e6565400c8ce9abae976e338919a0dfbfb0e1ba43068"},
{file = "rpds_py-0.22.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38cacf1f378571450576f2c8ce87da6f3fddc59d744de5c12b37acc23285b1e1"},
{file = "rpds_py-0.22.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8cbb040fec8eddd5a6a75e737fd73c9ce37e51f94bacdd0b178d0174a4758395"},
{file = "rpds_py-0.22.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d80fd710b3307a3c63809048b72c536689b9b0b31a2518339c3f1a4d29c73d7a"},
{file = "rpds_py-0.22.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4b5d17d8f5b885ce50e0cda85f99c0719e365e98b587338535fa566a48375afb"},
{file = "rpds_py-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f7a048ec1ebc991331d709be4884dc318c9eaafa66dcde8be0933ac0e702149"},
{file = "rpds_py-0.22.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:306da3dfa174b489a3fc63b0872e2226a5ddf94c59875a770d72aff945d5ed96"},
{file = "rpds_py-0.22.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:c7b4450093c0c909299770226fb0285be47b0a57545bae25b5c4e51566b0e587"},
{file = "rpds_py-0.22.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0903ffdb5b9007e503203b6285e4ff0faf96d875c19f1d103b475acf7d9f7311"},
{file = "rpds_py-0.22.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d1522025cda9e57329aade769f56e5793b2a5da7759a21914ee10e67e17e601e"},
{file = "rpds_py-0.22.0-cp310-cp310-win32.whl", hash = "sha256:49e084d47a66027ac72844f9f52f13d347a9a1f05d4f84381b420e47f836a7fd"},
{file = "rpds_py-0.22.0-cp310-cp310-win_amd64.whl", hash = "sha256:d9ceca96df54cb1675a0b7f52f1c6d5d1df62c5b40741ba211780f1b05a282a2"},
{file = "rpds_py-0.22.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:771c9a3851beaa617d8c8115d65f834a2b52490f42ee2b88b13f1fc5529e9e0c"},
{file = "rpds_py-0.22.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:341a07a4b55126bfae68c9bf24220a73d456111e5eb3dcbdab9fd16de2341224"},
{file = "rpds_py-0.22.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f7649c8b8e4bd1ccc5fcbd51a855d57a617deeba19c66e3d04b1abecc61036b2"},
{file = "rpds_py-0.22.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2f513758e7cda8bc262e80299a8e3395d7ef7f4ae705be62632f229bc6c33208"},
{file = "rpds_py-0.22.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ba1fc34d0b2f6fd53377a4c954116251eba6d076bf64f903311f4a7d27d10acd"},
{file = "rpds_py-0.22.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:632d2fdddd9fbe3ac8896a119fd18a71fc95ca9c4cbe5223096c142d8c4a2b1d"},
{file = "rpds_py-0.22.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:326e42f2b49462e05f8527a1311ce98f9f97c484b3e443ec0ea4638bed3aebcf"},
{file = "rpds_py-0.22.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e9bbdba9e75b1a9ee1dd1335034dad998ef1acc08492226c6fd50aa773bdfa7d"},
{file = "rpds_py-0.22.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:41f65a97bf2c4b161c9f8f89bc37058346bec9b36e373c8ad00a16c957bff625"},
{file = "rpds_py-0.22.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:0686f2c16eafdc2c6b4ce6e86e5b3092e87db09ae64be2787616444eb35b9756"},
{file = "rpds_py-0.22.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4e7c9aa2353eb0b0d845323857197daa036c2ff8624df990b0d886d22a8f665e"},
{file = "rpds_py-0.22.0-cp311-cp311-win32.whl", hash = "sha256:2d2fc3ab021be3e0b5aec6d4164f2689d231b8bfc5185cc454314746aa4aee72"},
{file = "rpds_py-0.22.0-cp311-cp311-win_amd64.whl", hash = "sha256:87453d491369cd8018016d2714a13e8461975161703c18ee31eecf087a8ae5d4"},
{file = "rpds_py-0.22.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:e9d4293b21c69ee4f9e1a99ac4f772951d345611c614a0cfae2ec6b565279bc9"},
{file = "rpds_py-0.22.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:67e013a17a3db4d98cc228fd5aeb36a51b0f5cf7330b9102a552060f1fe4e560"},
{file = "rpds_py-0.22.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b639a19e1791b646d27f15d17530a51722cc728d43b2dff3aeb904f92d91bac"},
{file = "rpds_py-0.22.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1357c3092702078b7782b6ebd5ba9b22c1a291c34fbf9d8f1a48237466ac7758"},
{file = "rpds_py-0.22.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:842855bbb113a19c393c6de5aa6ed9a26c6b13c2fead5e49114d39f0d08b94d8"},
{file = "rpds_py-0.22.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ae7927cd2b869ca4dc645169d8af5494a29c99afd0ea0f24dd00c811ab1d8b8"},
{file = "rpds_py-0.22.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b91bfef5daa2a5a4fe62f8d317fc91a626073639f951f851bd2cb252d01bc6c5"},
{file = "rpds_py-0.22.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4fc4824e38c1e91a73bc820e7caacaf19d0acd557465aceef0420ca59489b390"},
{file = "rpds_py-0.22.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:92d28a608127b357da47c99e0d0e0655ca2060286540fe9f2a25a2e8ac666e05"},
{file = "rpds_py-0.22.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c637188b930175c256f13adbfc427b83ec7e64476d1ec9d6608f312bb84e06c3"},
{file = "rpds_py-0.22.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:93bbd66f46dddc41e8c656130c97c0fb515e0fa44e1eebb2592769dbbd41b2f5"},
{file = "rpds_py-0.22.0-cp312-cp312-win32.whl", hash = "sha256:54d8f94dec5765a9edc19610fecf0fdf9cab36cbb9def1213188215f735a6f98"},
{file = "rpds_py-0.22.0-cp312-cp312-win_amd64.whl", hash = "sha256:931bf3d0705b2834fed29354f35170fa022fe22a95542b61b7c66aca5f8a224f"},
{file = "rpds_py-0.22.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:2a57300cc8b034c5707085249efd09f19116bb80278d0ec925d7f3710165c510"},
{file = "rpds_py-0.22.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c398a5a8e258dfdc5ea2aa4e5aa2ca3207f654a8eb268693dd1a76939074a588"},
{file = "rpds_py-0.22.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a6cc4eb1e86364331928acafb2bb41d8ab735ca3caf2d6019b9f6dac3f4f65d"},
{file = "rpds_py-0.22.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:574c5c94213bc9990805bfd7e4ba3826d3c098516cbc19f0d0ef0433ad93fa06"},
{file = "rpds_py-0.22.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4c0321bc03a1c513eca1837e3bba948b975bcf3a172aebc197ab3573207f137a"},
{file = "rpds_py-0.22.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d276280649305c1da6cdd84585d48ae1f0efa67434d8b10d2df95228e59a05bb"},
{file = "rpds_py-0.22.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c17b43fe9c6da16885e3fe28922bcd1a029e61631fb771c7d501019b40bcc904"},
{file = "rpds_py-0.22.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:48c95997af9314f4034fe5ba2d837399e786586e220835a578d28fe8161e6ae5"},
{file = "rpds_py-0.22.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e9aa4af6b879bb75a3c7766fbf49d77f4097dd12b548ecbbd8b3f85caa833281"},
{file = "rpds_py-0.22.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:8426f97117b914b9bfb2a7bd46edc148e8defda728a55a5df3a564abe70cd7a4"},
{file = "rpds_py-0.22.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:034964ea0ea09645bdde13038b38abb14be0aa747f20fcfab6181207dd9e0483"},
{file = "rpds_py-0.22.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:3dc7c64b56b82428894f056e9ff6e8ee917ff74fc26b65211a33602c2372e928"},
{file = "rpds_py-0.22.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:1212cb231f2002934cd8d71a0d718fdd9d9a2dd671e0feef8501038df3508026"},
{file = "rpds_py-0.22.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f21e1278c9456cd601832375c778ca44614d3433996488221a56572c223f04a"},
{file = "rpds_py-0.22.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:875fe8dffb43c20f68379ee098b035a7038d7903c795d46715f66575a7050b19"},
{file = "rpds_py-0.22.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e23dcdd4b2ff9c6b3317ea7921b210d39592f8ca1cdea58ada25b202c65c0a69"},
{file = "rpds_py-0.22.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f0fb8efc9e579acf1e556fd86277fecec320c21ca9b5d39db96433ad8c45bc4a"},
{file = "rpds_py-0.22.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe23687924b25a2dee52fab15976fd6577ed8518072bcda9ff2e2b88ab1f168b"},
{file = "rpds_py-0.22.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d5469b347445d1c31105f33e7bfc9a8ba213d48e42641a610dda65bf9e3c83f5"},
{file = "rpds_py-0.22.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a810a57ce5e8ecf8eac6ec4dab534ff80c34e5a2c31db60e992009cd20f58e0f"},
{file = "rpds_py-0.22.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:d9bb9242b38a664f307b3b897f093896f7ed51ef4fe25a0502e5a368de9151ea"},
{file = "rpds_py-0.22.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:b4660943030406aaa40ec9f51960dd88049903d9536bc3c8ebb5cc4e1f119bbe"},
{file = "rpds_py-0.22.0-cp313-cp313t-win32.whl", hash = "sha256:208ce1d8e3af138d1d9b21d7206356b7f29b96675e0113aea652cf024e4ddfdc"},
{file = "rpds_py-0.22.0-cp313-cp313t-win_amd64.whl", hash = "sha256:e6da2e0500742e0f157f005924a0589f2e2dcbfdd6cd0cc0abce367433e989be"},
{file = "rpds_py-0.22.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:f980a0640599a74f27fd9d50c84c293f1cb7afc2046c5c6d3efaf8ec7cdbc326"},
{file = "rpds_py-0.22.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ca505fd3767a09a139737f3278bc8a485cb64043062da89bcba27e2f2ea78d33"},
{file = "rpds_py-0.22.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ba235e00e0878ba1080b0f2a761f143b2a2d1c354f3d8e507fbf2f3de401bf18"},
{file = "rpds_py-0.22.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:81e7a27365b02fe70a77f1365376879917235b3fec551d19b4c91b51d0bc1d07"},
{file = "rpds_py-0.22.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:32a0e24cab2daae0503b06666d516e90a080c1a95aff0406b9f03c6489177c4b"},
{file = "rpds_py-0.22.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a73ed43d64209e853bba567a543170267a5cd64f359540b0ca2d597e329ba172"},
{file = "rpds_py-0.22.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0abcce5e874474d3eab5ad53be03dae2abe651d248bdeaabe83708e82969e78"},
{file = "rpds_py-0.22.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f4e9946c8c7def17e4fcb5eddb14c4eb6ebc7f6f309075e6c8d23b133c104607"},
{file = "rpds_py-0.22.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:758098b38c344d9a7f279baf0689261777e601f620078ef5afdc9bd3339965c3"},
{file = "rpds_py-0.22.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:9ad4640a409bc2b7d22b7921e7660f0db96c5c8c69fbb2e8f3261d4f71d33983"},
{file = "rpds_py-0.22.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:8c48fc7458fe3a74dcdf56ba3534ff41bd421f69436df09ff3497fdaac18b431"},
{file = "rpds_py-0.22.0-cp39-cp39-win32.whl", hash = "sha256:fde778947304e55fc732bc8ea5c6063e74244ac1808471cb498983a210aaf62c"},
{file = "rpds_py-0.22.0-cp39-cp39-win_amd64.whl", hash = "sha256:5fdf91a7c07f40e47b193f2acae0ed9da35d09325d7c3c3279f722b7cbf3d264"},
{file = "rpds_py-0.22.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c8fd7a16f7a047e06c747cfcf2acef3ac316132df1c6077445b29ee6f3f3a70b"},
{file = "rpds_py-0.22.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:6b6e4bcfc32f831bfe3d6d8a5acedfbfd5e252a03c83fa24813b277a3a8a13ca"},
{file = "rpds_py-0.22.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eadd2417e83a77ce3ae4a0efd08cb0ebdfd317b6406d11020354a53ad458ec84"},
{file = "rpds_py-0.22.0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f9dc2113e0cf0dd637751ca736186fca63664939ceb9f9f67e93ade88c69c0c9"},
{file = "rpds_py-0.22.0-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dc2c00acdf68f1f69a476b770af311a7dc3955b7de228b04a40bcc51ac4d743b"},
{file = "rpds_py-0.22.0-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dfdabdf8519c93908b2bf0f87c3f86f9e88bab279fb4acfd0907519ca5a1739f"},
{file = "rpds_py-0.22.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8338db3c76833d02dc21c3e2c42534091341d26e4f7ba32c6032bb558a02e07b"},
{file = "rpds_py-0.22.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8ad4dfda52e64af3202ceb2143a62deba97894b71c64a4405ee80f6b3ea77285"},
{file = "rpds_py-0.22.0-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:3b94b074dcce39976db22ea75c7aea8b22d95e6d3b62f76e20e1179a278521d8"},
{file = "rpds_py-0.22.0-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:d4f2af3107fe4dc40c0d1a2409863f5249c6796398a1d83c1d99a0b3fa6cfb8d"},
{file = "rpds_py-0.22.0-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:bb11809b0de643a292a82f728c494a2bbef0e30a7c42d37464abbd6bef7ca7b1"},
{file = "rpds_py-0.22.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c1c21030ed494deb10226f90e2dbd84a012d59810c409832714a3dd576527be2"},
{file = "rpds_py-0.22.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:64a0c965a1e299c9b280006bdb15c276c427c45360aed676305dc36bcaa4d13c"},
{file = "rpds_py-0.22.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:2498ff422823be087b48bc82710deb87ac34f6b7c8034ee39920647647de1e60"},
{file = "rpds_py-0.22.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:59e63da174ff287db05ef7c21d75974a5bac727ed60452aeb3a14278477842a8"},
{file = "rpds_py-0.22.0-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e1c04fb380bc8efaae2fdf17ed6cd5d223da78a8b0b18a610f53d4c5d6e31dfd"},
{file = "rpds_py-0.22.0-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e04919ffa9a728c446b27b6b625fa1d00ece221bdb9d633e978a7e0353a12c0e"},
{file = "rpds_py-0.22.0-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:24c28df05bd284879d0fac850ba697077d2a33b7ebcaea6318d6b6cdfdc86ddc"},
{file = "rpds_py-0.22.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d33622dc63c295788eed09dbb1d11bed178909d3267b02d873116ee6be368244"},
{file = "rpds_py-0.22.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7539dbb8f705e13629ba6f23388976aad809e387f32a6e5c0712e4e8d9bfcce7"},
{file = "rpds_py-0.22.0-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:b8906f537978da3f7f0bd1ba37b69f6a877bb43312023b086582707d2835bf2f"},
{file = "rpds_py-0.22.0-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:62ab12fe03ffc49978d29de9c31bbb216610157f7e5ca8e172fed6642aead3be"},
{file = "rpds_py-0.22.0-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:762206ba3bf1d6c8c9e0055871d3c0d5b074b7c3120193e6c067e7866f106ab1"},
{file = "rpds_py-0.22.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:ed0102146574e5e9f079b2e1a06e6b5b12a691f9c74a65b93b7f3d4feda566c6"},
{file = "rpds_py-0.22.0.tar.gz", hash = "sha256:32de71c393f126d8203e9815557c7ff4d72ed1ad3aa3f52f6c7938413176750a"},
{file = "rpds_py-0.22.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:ab27dd4edd84b13309f268ffcdfc07aef8339135ffab7b6d43f16884307a2a48"},
{file = "rpds_py-0.22.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9d5b925156a746dc1f5f52376fdd1fbdd3f6ffe1fcd6f5e06f77ca79abb940a3"},
{file = "rpds_py-0.22.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:201650b309c419143775c15209c620627de3c09a27c7fb58375325aec5cce260"},
{file = "rpds_py-0.22.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:31264187fc934ff1024a4f56775f33c9252d3f4f3e27ec07d1995a26b52702c3"},
{file = "rpds_py-0.22.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:97c5ffe47ccf92d8b17e10f8a5ce28d015aa1196edc3359684cf31504eae6a14"},
{file = "rpds_py-0.22.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e9ac7280bd045f472b50306d7efeee051b69e3a2dd1b90f46bd7e86e63b1efa2"},
{file = "rpds_py-0.22.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f941fb86195f97be7f6efe04a21b223f05dfe4d1dfb159999e2f8d101e44cc4"},
{file = "rpds_py-0.22.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f91bfc39f7a64168e08ab831fa497ec5438c1d6c6e2f9e12848d95ad11ac8523"},
{file = "rpds_py-0.22.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:effcae2152afe7937a28376dbabb25c770ef99ed4e16a4ffeb8e6a4f7c4f06aa"},
{file = "rpds_py-0.22.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:2177e59c033bf0d1bf7de1ced561205963583caf3242c6c700a723034bfb5f8e"},
{file = "rpds_py-0.22.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:66f4f48a89cdd30ab3a47335df81c76e9a63799d0d84b29c0618371c66fa37b0"},
{file = "rpds_py-0.22.1-cp310-cp310-win32.whl", hash = "sha256:b07fa9e634234e84096adfa4be3828c8f26e238679c122824b2b3d7131bec578"},
{file = "rpds_py-0.22.1-cp310-cp310-win_amd64.whl", hash = "sha256:ca4657e9fd0b1b5376942d403d634ce188f79064f0873aa853ab05b10185ceec"},
{file = "rpds_py-0.22.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:608c84699b2db09c6a8743845b1a3dad36fae53eaaecb241d45b13dff74405fb"},
{file = "rpds_py-0.22.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9dae4eb9b5534e09ba6c6ab496a757e5e394b7e7b08767d25ca37e8d36491114"},
{file = "rpds_py-0.22.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09a1f000c5f6e08b298275bae00921e9fbbf2a35dae0a86db2821c058c2201a9"},
{file = "rpds_py-0.22.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:580ccbf11f02f948add4cb641843030a89f1463d7c0740cbfc9aca91e9dc34b3"},
{file = "rpds_py-0.22.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:96559e05bdf938b2048353e10a7920b98f853cefe4482c2064a718d7d0a50bd7"},
{file = "rpds_py-0.22.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:128cbaed7ba26116820bcb992405d6a13ea18c8fca1b8c4f59906d858e91e979"},
{file = "rpds_py-0.22.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:734783dd7da58f76222f458346ddebdb3621686a1a2a667db5049caf0c9956b9"},
{file = "rpds_py-0.22.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c9ce6b83597d45bec44a2690857ede62fc98223772135f8a7fa90884eb726501"},
{file = "rpds_py-0.22.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bca4428c4a957b78ded3e6e62884ab03f029dce8fa8d34818da0f80f61332b49"},
{file = "rpds_py-0.22.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1ded65691a1d3fd7d2aa89d2c91aa51f941601bb2ce099739909034d957fef4b"},
{file = "rpds_py-0.22.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:72407065ad459db9f3d052ea8c51e02534f02533fc61e51cbab3bd94166f086c"},
{file = "rpds_py-0.22.1-cp311-cp311-win32.whl", hash = "sha256:eb013aa01b404219f28dc973d9e6310fd4db216d7299253dd355629952e0564e"},
{file = "rpds_py-0.22.1-cp311-cp311-win_amd64.whl", hash = "sha256:8bd9ec1db79a664f4cbb12878693b73416f4d2cb425d3e27eccc1bdfbdc826ef"},
{file = "rpds_py-0.22.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:8ec41049c90d204a6561238a9ad6c7263ebb7009d9759c98b58078d9d2fec9ba"},
{file = "rpds_py-0.22.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:102be79c4cc47a4aeb5912401185c404cd2601c15a7163bbecff7f1bfe20b669"},
{file = "rpds_py-0.22.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8a603155db408f773637f9e3a712c6e3cbc521aaa8fa2b99f9ba6106c59a2496"},
{file = "rpds_py-0.22.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5dbff9402c2bdf00bf0df9905694b3c292a3847c725651938a72f554351a5fcb"},
{file = "rpds_py-0.22.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:96b3759d8ab2323324e0a92b2f44834f9d88089b8d1ab6f533b61f4be3411cef"},
{file = "rpds_py-0.22.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c3029f481b31f329b1fdb4ec4b56935d82210ddd9c6f86ea5a87c06f1e97b161"},
{file = "rpds_py-0.22.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d280b4bf09f719b89fd9aab3b71067acc0d0449b7d1eba99a2ade4939cef8296"},
{file = "rpds_py-0.22.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6c8e97e19aa7b0b0d801a159f932ce4435f1049c8c38e2bb372bb5bee559ce50"},
{file = "rpds_py-0.22.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:50e4b5d291105f7063259fe0125b1af902fb34499444d7c5c521dd8328b00939"},
{file = "rpds_py-0.22.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:d3777c446bb1c5fcd82dc3f8776e1a146cd91e80cc1892f8634575ace438d22f"},
{file = "rpds_py-0.22.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:447ae1104fb32197b9262f772d565d38e834cc2e9edd89350b37b88fed636e70"},
{file = "rpds_py-0.22.1-cp312-cp312-win32.whl", hash = "sha256:55d371b9d8b0c2a68a50413a8cb01c3c3ce1ea4f768bf77b66669a9a486e101e"},
{file = "rpds_py-0.22.1-cp312-cp312-win_amd64.whl", hash = "sha256:413a30a99d8683dace3765885920ed27ab662efbb6c98d81db76c397ad1ffd71"},
{file = "rpds_py-0.22.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:aa2ba0176037c915d8660a4e46581d645e2c22b5373e466bc8640a794d45861a"},
{file = "rpds_py-0.22.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4ba6c66fbc6015b2f99e7176fec41793cecb00c4cc357cad038dff85e6ac42ab"},
{file = "rpds_py-0.22.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15fa4ca658f8ad22645d3531682b17e5580832efbfa87304c3e62214c79c1e8a"},
{file = "rpds_py-0.22.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d7833ef6f5d6cb634f296abfd93452fb3eb44c4e9a6ae95c1021eab704c1cee2"},
{file = "rpds_py-0.22.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c0467838c90435b80793cde486a318fc916ee57f2af54e4b10c72b20cbdcbaa9"},
{file = "rpds_py-0.22.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d962e2e89b3a95e3597a34b8c93ced1e98958502c5b8096c9fd69deff279f561"},
{file = "rpds_py-0.22.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ce729f1dc8a4a190c34b69f75377bddc004079b2963ab722ab91fafe040be6d"},
{file = "rpds_py-0.22.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8080467df22feca0fc9c46567001777c6fbc2b4a2683a7137420896051874ca1"},
{file = "rpds_py-0.22.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0f9eb37d3a60b262a98ab51ee899cac039de9ca0ce68dcf1a6518a09719020b0"},
{file = "rpds_py-0.22.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:153248f48d6f90a295a502f53ec544a3ffbd21b0bb32f5dca39c4b93a764d6a2"},
{file = "rpds_py-0.22.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0a53592cdf98cec3dfcdb24ffec8a4797e7656b65700099af43ec7df023b6de4"},
{file = "rpds_py-0.22.1-cp313-cp313-win32.whl", hash = "sha256:e8056adcefa2dcb67e8bc91ea5eee26df66e8b297a8cd6ff0903f85c70908fa0"},
{file = "rpds_py-0.22.1-cp313-cp313-win_amd64.whl", hash = "sha256:a451dba533be77454ebcffc85189108fc05f279100835ac76e7989edacb89156"},
{file = "rpds_py-0.22.1-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:2ea23f1525d4f64286dbe0947c929d45c3ffe963b2dbed1d3844a2e4938bda42"},
{file = "rpds_py-0.22.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3aaa22487477de9618ce3b37f99fbe81219ba96f3c2ca84f576f0ab451b83aba"},
{file = "rpds_py-0.22.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8954b9ffe60f479a0c0ba40987db2546c735ab02a725ea7fd89342152d4d821d"},
{file = "rpds_py-0.22.1-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c8502a02ae3ae67084f5a0bf5a8253b19fa7a887f824e41e016cdb0ac532a06f"},
{file = "rpds_py-0.22.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a083221b6a4ecdef38a60c95d8d3223d99449cb4da2544e9644958dc16664eb9"},
{file = "rpds_py-0.22.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:542eb246d5be31b5e0a9c8ddb9539416f9b31f58f75bd4ee328bff2b5c58d6fd"},
{file = "rpds_py-0.22.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffae97d28ea4f2c613a751d087b75a97fb78311b38cc2e9a2f4587e473ace167"},
{file = "rpds_py-0.22.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d0ff8d5b13ce2357fa8b33a0a2e3775aa71df5bf7c8ba060634c9d15ab12f357"},
{file = "rpds_py-0.22.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:0f057a0c546c42964836b209d8de9ea1a4f4b0432006c6343cbe633d8ca14571"},
{file = "rpds_py-0.22.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:48ee97c7c6027fd423058675b5a39d0b5f7a1648250b671563d5c9f74ff13ff0"},
{file = "rpds_py-0.22.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:babec324e8654a59122aaa66936a9a483faa03276db9792f51332475c2dddc4a"},
{file = "rpds_py-0.22.1-cp313-cp313t-win32.whl", hash = "sha256:e69acdbc132c9592c8dc393af85e38e206ca847c7019a953ff625191c3a12312"},
{file = "rpds_py-0.22.1-cp313-cp313t-win_amd64.whl", hash = "sha256:c783e4ed68200f4e03c125690d23158b1c49c4b186d458a18debc109bbdc3c2e"},
{file = "rpds_py-0.22.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:2143c3aed85992604d758bbe67da839fb4aab3dd2e1c6dddab5b3ca7162b34a2"},
{file = "rpds_py-0.22.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f57e2d0f8022783426121b586d7c842ea40ea832a29e28ca36c881b54c74fb28"},
{file = "rpds_py-0.22.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c0c324879d483504b07f7b18eb1b50567c434263bbe4866ecce33056162668a"},
{file = "rpds_py-0.22.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1c40e02cc4f3e18fd39344edb10eebe04bd11cfd13119606b5771e5ea51630d3"},
{file = "rpds_py-0.22.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f76c6f319e57007ad52e671ec741d801324760a377e3d4992c9bb8200333ebac"},
{file = "rpds_py-0.22.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f5cae9b415ea8a6a563566dbf46650222eccc5971c7daa16fbee63aef92ae543"},
{file = "rpds_py-0.22.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b09209cdfcacf5eba9cf80367130532e6c02e695252e1f64d3cfcc2356e6e19f"},
{file = "rpds_py-0.22.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dbe428d0ac6eacaf05402adbaf137f59ad6063848182d1ff294f95ce0f24005b"},
{file = "rpds_py-0.22.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:626b9feb01bff049a5aec4804f0c58db12585778b4902e5376a95b01f80a7a16"},
{file = "rpds_py-0.22.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ec1ccc2a9f764cd632fb8ab28fdde166250df54fc8d97315a4a6948dc5367639"},
{file = "rpds_py-0.22.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:ef92b1fbe6aa2e7885eb90853cc016b1fc95439a8cc8da6d526880e9e2148695"},
{file = "rpds_py-0.22.1-cp39-cp39-win32.whl", hash = "sha256:c88535f83f7391cf3a45af990237e3939a6fdfbedaed2571633bfdd0bceb36b0"},
{file = "rpds_py-0.22.1-cp39-cp39-win_amd64.whl", hash = "sha256:7839b7528faa4d134c183b1f2dd1ee4dc2ca2f899f4f0cfdf00fc04c255262a7"},
{file = "rpds_py-0.22.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a0ed14a4162c2c2b21a162c9fcf90057e3e7da18cd171ab344c1e1664f75090e"},
{file = "rpds_py-0.22.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:05fdeae9010533e47715c37df83264df0122584e40d691d50cf3607c060952a3"},
{file = "rpds_py-0.22.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4659b2e4a5008715099e216050f5c6976e5a4329482664411789968b82e3f17d"},
{file = "rpds_py-0.22.1-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a18aedc032d6468b73ebbe4437129cb30d54fe543cde2f23671ecad76c3aea24"},
{file = "rpds_py-0.22.1-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:149b4d875ef9b12a8f5e303e86a32a58f8ef627e57ec97a7d0e4be819069d141"},
{file = "rpds_py-0.22.1-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fdaee3947eaaa52dae3ceb9d9f66329e13d8bae35682b1e5dd54612938693934"},
{file = "rpds_py-0.22.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36ce951800ed2acc6772fd9f42150f29d567f0423989748052fdb39d9e2b5795"},
{file = "rpds_py-0.22.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ab784621d3e2a41916e21f13a483602cc989fd45fff637634b9231ba43d4383b"},
{file = "rpds_py-0.22.1-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:c2a214bf5b79bd39a9de1c991353aaaacafda83ba1374178309e92be8e67d411"},
{file = "rpds_py-0.22.1-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:85060e96953647871957d41707adb8d7bff4e977042fd0deb4fc1881b98dd2fe"},
{file = "rpds_py-0.22.1-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:c6f3fd617db422c9d4e12cb8d84c984fe07d6d9cb0950cbf117f3bccc6268d05"},
{file = "rpds_py-0.22.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f2d1b58a0c3a73f0361759642e80260a6d28eee6501b40fe25b82af33ef83f21"},
{file = "rpds_py-0.22.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:76eaa4c087a061a2c8a0a92536405069878a8f530c00e84a9eaf332e70f5561f"},
{file = "rpds_py-0.22.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:959ae04ed30cde606f3a0320f0a1f4167a107e685ef5209cce28c5080590bd31"},
{file = "rpds_py-0.22.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:198067aa6f3d942ff5d0d655bb1e91b59ae85279d47590682cba2834ac1b97d2"},
{file = "rpds_py-0.22.1-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3e7e99e2af59c56c59b6c964d612511b8203480d39d1ef83edc56f2cb42a3f5d"},
{file = "rpds_py-0.22.1-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0545928bdf53dfdfcab284468212efefb8a6608ca3b6910c7fb2e5ed8bdc2dc0"},
{file = "rpds_py-0.22.1-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ef7282d8a14b60dd515e47060638687710b1d518f4b5e961caad43fb3a3606f9"},
{file = "rpds_py-0.22.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe3f245c2f39a5692d9123c174bc48f6f9fe3e96407e67c6d04541a767d99e72"},
{file = "rpds_py-0.22.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:efb2ad60ca8637d5f9f653f9a9a8d73964059972b6b95036be77e028bffc68a3"},
{file = "rpds_py-0.22.1-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:d8306f27418361b788e3fca9f47dec125457f80122e7e31ba7ff5cdba98343f8"},
{file = "rpds_py-0.22.1-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:4c8dc7331e8cbb1c0ea2bcb550adb1777365944ffd125c69aa1117fdef4887f5"},
{file = "rpds_py-0.22.1-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:776a06cb5720556a549829896a49acebb5bdd96c7bba100191a994053546975a"},
{file = "rpds_py-0.22.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:e4f91d702b9ce1388660b3d4a28aa552614a1399e93f718ed0dacd68f23b3d32"},
{file = "rpds_py-0.22.1.tar.gz", hash = "sha256:157a023bded0618a1eea54979fe2e0f9309e9ddc818ef4b8fc3b884ff38fedd5"},
]
[[package]]
@ -7647,4 +7644,4 @@ tesserocr = ["tesserocr"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "33ee730cf750e618ec005ad44ad09617bc8f95632b30ac02b5290a03a33bdf5b"
content-hash = "0d9d498f50601c95a8616797441f00597acdea1e6a70d3b9642c17ffacc1bb45"

View File

@ -26,9 +26,10 @@ packages = [{include = "docling"}]
######################
python = "^3.9"
pydantic = ">=2.0.0,<2.10"
docling-core = "^2.6.1"
docling-ibm-models = "^2.0.6"
deepsearch-glm = "^0.26.1"
docling-core = { git = "ssh://git@github.com/DS4SD/docling-core.git", branch = "feat-add-legacy-convert" }
docling-ibm-models = { git = "ssh://git@github.com/DS4SD/docling-ibm-models.git", branch = "nli/performance" }
deepsearch-glm = { git = "ssh://git@github.com/DS4SD/deepsearch-glm.git", branch = "cau/layout-processing-children-payloads" }
docling-parse = { git = "ssh://git@github.com/DS4SD/docling-parse.git", branch = "dev/expose-cell-sanitisation-via-python" }
filetype = "^1.2.0"
pypdfium2 = "^4.30.0"
pydantic-settings = "^2.3.0"
@ -36,7 +37,6 @@ huggingface_hub = ">=0.23,<1"
requests = "^2.32.3"
easyocr = "^1.7"
tesserocr = { version = "^2.7.1", optional = true }
docling-parse = "^2.0.5"
certifi = ">=2024.7.4"
rtree = "^1.3.0"
scipy = "^1.6.0"