mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-16 08:38:14 +00:00
Merge branch 'cau/experimental-format' of github.com:DS4SD/docling into cau/input-format-abstraction
This commit is contained in:
@@ -3,7 +3,7 @@ from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Iterable, Optional, Union
|
||||
|
||||
from docling_core.types.experimental.base import BoundingBox, Size
|
||||
from docling_core.types.experimental import BoundingBox, Size
|
||||
from PIL import Image
|
||||
|
||||
if TYPE_CHECKING:
|
||||
|
||||
@@ -5,7 +5,7 @@ from pathlib import Path
|
||||
from typing import Iterable, List, Optional, Union
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
from docling_core.types.experimental.base import BoundingBox, CoordOrigin, Size
|
||||
from docling_core.types.experimental import BoundingBox, CoordOrigin, Size
|
||||
from docling_parse.docling_parse import pdf_parser
|
||||
from PIL import Image, ImageDraw
|
||||
from pypdfium2 import PdfPage
|
||||
|
||||
@@ -6,7 +6,7 @@ from typing import Iterable, List, Optional, Union
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
import pypdfium2.raw as pdfium_c
|
||||
from docling_core.types.experimental.base import BoundingBox, CoordOrigin, Size
|
||||
from docling_core.types.experimental import BoundingBox, CoordOrigin, Size
|
||||
from PIL import Image, ImageDraw
|
||||
from pypdfium2 import PdfPage, PdfTextPage
|
||||
from pypdfium2._helpers.misc import PdfiumError
|
||||
|
||||
@@ -9,7 +9,6 @@ from typing import Annotated, Iterable, List, Optional
|
||||
|
||||
import typer
|
||||
from docling_core.utils.file import resolve_file_source
|
||||
from pydantic import AnyUrl
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
@@ -181,58 +180,25 @@ def convert(
|
||||
else:
|
||||
input_doc_paths.append(source)
|
||||
|
||||
###########################################################################
|
||||
match backend:
|
||||
case Backend.PYPDFIUM2:
|
||||
do_cell_matching = ocr # only do cell matching when OCR enabled
|
||||
pdf_backend = PyPdfiumDocumentBackend
|
||||
case Backend.DOCLING:
|
||||
do_cell_matching = True
|
||||
pdf_backend = DoclingParseDocumentBackend
|
||||
case _:
|
||||
raise RuntimeError(f"Unexpected backend type {backend}")
|
||||
|
||||
# The following sections contain a combination of PipelineOptions
|
||||
# and PDF Backends for various configurations.
|
||||
# Uncomment one section at the time to see the differences in the output.
|
||||
|
||||
doc_converter = None
|
||||
if backend == Backend.PYPDFIUM2 and not ocr: # PyPdfium without OCR
|
||||
pipeline_options = PipelineOptions()
|
||||
pipeline_options.do_ocr = False
|
||||
pipeline_options.do_table_structure = True
|
||||
pipeline_options.table_structure_options.do_cell_matching = False
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=PyPdfiumDocumentBackend,
|
||||
)
|
||||
|
||||
elif backend == Backend.PYPDFIUM2.value and ocr: # PyPdfium with OCR
|
||||
pipeline_options = PipelineOptions()
|
||||
pipeline_options.do_ocr = False
|
||||
pipeline_options.do_table_structure = True
|
||||
pipeline_options.table_structure_options.do_cell_matching = True
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=PyPdfiumDocumentBackend,
|
||||
)
|
||||
|
||||
elif backend == Backend.DOCLING.value and not ocr: # Docling Parse without OCR
|
||||
pipeline_options = PipelineOptions()
|
||||
pipeline_options.do_ocr = False
|
||||
pipeline_options.do_table_structure = True
|
||||
pipeline_options.table_structure_options.do_cell_matching = True
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=DoclingParseDocumentBackend,
|
||||
)
|
||||
|
||||
elif backend == Backend.DOCLING.value and ocr: # Docling Parse with OCR
|
||||
pipeline_options = PipelineOptions()
|
||||
pipeline_options.do_ocr = True
|
||||
pipeline_options.do_table_structure = True
|
||||
pipeline_options.table_structure_options.do_cell_matching = True
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=DoclingParseDocumentBackend,
|
||||
)
|
||||
|
||||
###########################################################################
|
||||
pipeline_options = PipelineOptions(
|
||||
do_ocr=ocr,
|
||||
do_table_structure=True,
|
||||
)
|
||||
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
|
||||
doc_converter = DocumentConverter(
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=pdf_backend,
|
||||
)
|
||||
|
||||
# Define input files
|
||||
input = DocumentConversionInput.from_paths(input_doc_paths)
|
||||
|
||||
@@ -4,9 +4,9 @@ from enum import Enum, auto
|
||||
from io import BytesIO
|
||||
from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
from docling_core.types.experimental.base import BoundingBox, Size
|
||||
from docling_core.types.experimental.document import BaseFigureData, TableCell
|
||||
from docling_core.types.experimental.labels import PageLabel
|
||||
from docling_core.types.experimental import BoundingBox, Size
|
||||
from docling_core.types.experimental.document import BasePictureData, TableCell
|
||||
from docling_core.types.experimental.labels import DocItemLabel
|
||||
from PIL.Image import Image
|
||||
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
||||
from typing_extensions import Self
|
||||
@@ -59,14 +59,14 @@ class OcrCell(Cell):
|
||||
|
||||
class Cluster(BaseModel):
|
||||
id: int
|
||||
label: PageLabel
|
||||
label: DocItemLabel
|
||||
bbox: BoundingBox
|
||||
confidence: float = 1.0
|
||||
cells: List[Cell] = []
|
||||
|
||||
|
||||
class BasePageElement(BaseModel):
|
||||
label: PageLabel
|
||||
label: DocItemLabel
|
||||
id: int
|
||||
page_no: int
|
||||
cluster: Cluster
|
||||
@@ -92,7 +92,7 @@ class TextElement(BasePageElement): ...
|
||||
|
||||
|
||||
class FigureElement(BasePageElement):
|
||||
data: Optional[BaseFigureData] = None
|
||||
data: Optional[BasePictureData] = None
|
||||
provenance: Optional[str] = None
|
||||
predicted_class: Optional[str] = None
|
||||
confidence: Optional[float] = None
|
||||
|
||||
@@ -11,8 +11,11 @@ from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
||||
from docling_core.types import Table as DsSchemaTable
|
||||
from docling_core.types.doc.base import BoundingBox as DsBoundingBox
|
||||
from docling_core.types.doc.base import Figure, TableCell
|
||||
from docling_core.types.experimental.document import DoclingDocument, FileInfo
|
||||
from docling_core.types.experimental.labels import PageLabel
|
||||
from docling_core.types.experimental import (
|
||||
DescriptionItem,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
)
|
||||
from pydantic import BaseModel
|
||||
from typing_extensions import deprecated
|
||||
|
||||
@@ -36,21 +39,21 @@ from docling.utils.utils import create_file_hash
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
layout_label_to_ds_type = {
|
||||
PageLabel.TITLE: "title",
|
||||
PageLabel.DOCUMENT_INDEX: "table-of-contents",
|
||||
PageLabel.SECTION_HEADER: "subtitle-level-1",
|
||||
PageLabel.CHECKBOX_SELECTED: "checkbox-selected",
|
||||
PageLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
|
||||
PageLabel.CAPTION: "caption",
|
||||
PageLabel.PAGE_HEADER: "page-header",
|
||||
PageLabel.PAGE_FOOTER: "page-footer",
|
||||
PageLabel.FOOTNOTE: "footnote",
|
||||
PageLabel.TABLE: "table",
|
||||
PageLabel.FORMULA: "equation",
|
||||
PageLabel.LIST_ITEM: "paragraph",
|
||||
PageLabel.CODE: "paragraph",
|
||||
PageLabel.PICTURE: "figure",
|
||||
PageLabel.TEXT: "paragraph",
|
||||
DocItemLabel.TITLE: "title",
|
||||
DocItemLabel.DOCUMENT_INDEX: "table-of-contents",
|
||||
DocItemLabel.SECTION_HEADER: "subtitle-level-1",
|
||||
DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
|
||||
DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
|
||||
DocItemLabel.CAPTION: "caption",
|
||||
DocItemLabel.PAGE_HEADER: "page-header",
|
||||
DocItemLabel.PAGE_FOOTER: "page-footer",
|
||||
DocItemLabel.FOOTNOTE: "footnote",
|
||||
DocItemLabel.TABLE: "table",
|
||||
DocItemLabel.FORMULA: "equation",
|
||||
DocItemLabel.LIST_ITEM: "paragraph",
|
||||
DocItemLabel.CODE: "paragraph",
|
||||
DocItemLabel.PICTURE: "figure",
|
||||
DocItemLabel.TEXT: "paragraph",
|
||||
}
|
||||
|
||||
_EMPTY_DOC = DsDocument(
|
||||
@@ -63,7 +66,7 @@ _EMPTY_DOC = DsDocument(
|
||||
)
|
||||
|
||||
_EMPTY_DOCLING_DOC = DoclingDocument(
|
||||
description={}, file_info=FileInfo(document_hash="123xyz")
|
||||
description=DescriptionItem(), name="dummy"
|
||||
) # TODO: Stub
|
||||
|
||||
_input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
|
||||
@@ -341,8 +344,10 @@ class ConvertedDocument(BaseModel):
|
||||
"paragraph",
|
||||
"caption",
|
||||
"table",
|
||||
"figure",
|
||||
],
|
||||
strict_text: bool = False,
|
||||
image_placeholder: str = "<!-- image -->",
|
||||
):
|
||||
return self.output.export_to_markdown(
|
||||
delim=delim,
|
||||
@@ -350,6 +355,7 @@ class ConvertedDocument(BaseModel):
|
||||
main_text_stop=main_text_stop,
|
||||
main_text_labels=main_text_labels,
|
||||
strict_text=strict_text,
|
||||
image_placeholder=image_placeholder,
|
||||
)
|
||||
|
||||
def render_as_text(
|
||||
|
||||
@@ -5,7 +5,7 @@ from typing import Iterable, List, Tuple
|
||||
|
||||
import numpy
|
||||
import numpy as np
|
||||
from docling_core.types.experimental.base import BoundingBox, CoordOrigin
|
||||
from docling_core.types.experimental import BoundingBox, CoordOrigin
|
||||
from PIL import Image, ImageDraw
|
||||
from rtree import index
|
||||
from scipy.ndimage import find_objects, label
|
||||
|
||||
@@ -11,7 +11,7 @@ from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_mode
|
||||
from docling_core.types import BaseText
|
||||
from docling_core.types import Document as DsDocument
|
||||
from docling_core.types import Ref
|
||||
from docling_core.types.experimental.base import BoundingBox, CoordOrigin
|
||||
from docling_core.types.experimental import BoundingBox, CoordOrigin
|
||||
from docling_core.types.experimental.document import DoclingDocument
|
||||
from PIL import ImageDraw
|
||||
|
||||
@@ -57,7 +57,7 @@ class GlmModel:
|
||||
if arr == "tables":
|
||||
prov = ds_document.tables[index].prov[0]
|
||||
elif arr == "figures":
|
||||
prov = ds_document.figures[index].prov[0]
|
||||
prov = ds_document.pictures[index].prov[0]
|
||||
else:
|
||||
prov = None
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@ import logging
|
||||
from typing import Iterable
|
||||
|
||||
import numpy
|
||||
from docling_core.types.experimental.base import BoundingBox, CoordOrigin
|
||||
from docling_core.types.experimental import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
|
||||
@@ -4,8 +4,8 @@ import random
|
||||
import time
|
||||
from typing import Iterable, List
|
||||
|
||||
from docling_core.types.experimental.base import CoordOrigin
|
||||
from docling_core.types.experimental.labels import PageLabel
|
||||
from docling_core.types.experimental import CoordOrigin
|
||||
from docling_core.types.experimental.labels import DocItemLabel
|
||||
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
||||
from PIL import ImageDraw
|
||||
|
||||
@@ -24,23 +24,23 @@ _log = logging.getLogger(__name__)
|
||||
class LayoutModel:
|
||||
|
||||
TEXT_ELEM_LABELS = [
|
||||
PageLabel.TEXT,
|
||||
PageLabel.FOOTNOTE,
|
||||
PageLabel.CAPTION,
|
||||
PageLabel.CHECKBOX_UNSELECTED,
|
||||
PageLabel.CHECKBOX_SELECTED,
|
||||
PageLabel.SECTION_HEADER,
|
||||
PageLabel.PAGE_HEADER,
|
||||
PageLabel.PAGE_FOOTER,
|
||||
PageLabel.CODE,
|
||||
PageLabel.LIST_ITEM,
|
||||
DocItemLabel.TEXT,
|
||||
DocItemLabel.FOOTNOTE,
|
||||
DocItemLabel.CAPTION,
|
||||
DocItemLabel.CHECKBOX_UNSELECTED,
|
||||
DocItemLabel.CHECKBOX_SELECTED,
|
||||
DocItemLabel.SECTION_HEADER,
|
||||
DocItemLabel.PAGE_HEADER,
|
||||
DocItemLabel.PAGE_FOOTER,
|
||||
DocItemLabel.CODE,
|
||||
DocItemLabel.LIST_ITEM,
|
||||
# "Formula",
|
||||
]
|
||||
PAGE_HEADER_LABELS = [PageLabel.PAGE_HEADER, PageLabel.PAGE_FOOTER]
|
||||
PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]
|
||||
|
||||
TABLE_LABEL = PageLabel.TABLE
|
||||
FIGURE_LABEL = PageLabel.PICTURE
|
||||
FORMULA_LABEL = PageLabel.FORMULA
|
||||
TABLE_LABEL = DocItemLabel.TABLE
|
||||
FIGURE_LABEL = DocItemLabel.PICTURE
|
||||
FORMULA_LABEL = DocItemLabel.FORMULA
|
||||
|
||||
def __init__(self, config):
|
||||
self.config = config
|
||||
@@ -51,27 +51,27 @@ class LayoutModel:
|
||||
def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height):
|
||||
MIN_INTERSECTION = 0.2
|
||||
CLASS_THRESHOLDS = {
|
||||
PageLabel.CAPTION: 0.35,
|
||||
PageLabel.FOOTNOTE: 0.35,
|
||||
PageLabel.FORMULA: 0.35,
|
||||
PageLabel.LIST_ITEM: 0.35,
|
||||
PageLabel.PAGE_FOOTER: 0.35,
|
||||
PageLabel.PAGE_HEADER: 0.35,
|
||||
PageLabel.PICTURE: 0.2, # low threshold adjust to capture chemical structures for examples.
|
||||
PageLabel.SECTION_HEADER: 0.45,
|
||||
PageLabel.TABLE: 0.35,
|
||||
PageLabel.TEXT: 0.45,
|
||||
PageLabel.TITLE: 0.45,
|
||||
PageLabel.DOCUMENT_INDEX: 0.45,
|
||||
PageLabel.CODE: 0.45,
|
||||
PageLabel.CHECKBOX_SELECTED: 0.45,
|
||||
PageLabel.CHECKBOX_UNSELECTED: 0.45,
|
||||
PageLabel.FORM: 0.45,
|
||||
PageLabel.KEY_VALUE_REGION: 0.45,
|
||||
DocItemLabel.CAPTION: 0.35,
|
||||
DocItemLabel.FOOTNOTE: 0.35,
|
||||
DocItemLabel.FORMULA: 0.35,
|
||||
DocItemLabel.LIST_ITEM: 0.35,
|
||||
DocItemLabel.PAGE_FOOTER: 0.35,
|
||||
DocItemLabel.PAGE_HEADER: 0.35,
|
||||
DocItemLabel.PICTURE: 0.2, # low threshold adjust to capture chemical structures for examples.
|
||||
DocItemLabel.SECTION_HEADER: 0.45,
|
||||
DocItemLabel.TABLE: 0.35,
|
||||
DocItemLabel.TEXT: 0.45,
|
||||
DocItemLabel.TITLE: 0.45,
|
||||
DocItemLabel.DOCUMENT_INDEX: 0.45,
|
||||
DocItemLabel.CODE: 0.45,
|
||||
DocItemLabel.CHECKBOX_SELECTED: 0.45,
|
||||
DocItemLabel.CHECKBOX_UNSELECTED: 0.45,
|
||||
DocItemLabel.FORM: 0.45,
|
||||
DocItemLabel.KEY_VALUE_REGION: 0.45,
|
||||
}
|
||||
|
||||
CLASS_REMAPPINGS = {
|
||||
PageLabel.DOCUMENT_INDEX: PageLabel.TABLE,
|
||||
DocItemLabel.DOCUMENT_INDEX: DocItemLabel.TABLE,
|
||||
}
|
||||
|
||||
_log.debug("================= Start postprocess function ====================")
|
||||
@@ -258,7 +258,7 @@ class LayoutModel:
|
||||
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
|
||||
).to_top_left_origin(page_height),
|
||||
confidence=c["confidence"],
|
||||
label=PageLabel(c["type"]),
|
||||
label=DocItemLabel(c["type"]),
|
||||
cells=cluster_cells,
|
||||
)
|
||||
clusters_out_new.append(c_new)
|
||||
@@ -271,7 +271,7 @@ class LayoutModel:
|
||||
for ix, pred_item in enumerate(
|
||||
self.layout_predictor.predict(page.get_image(scale=1.0))
|
||||
):
|
||||
label = PageLabel(
|
||||
label = DocItemLabel(
|
||||
pred_item["label"].lower().replace(" ", "_").replace("-", "_")
|
||||
) # Temporary, until docling-ibm-model uses docling-core types
|
||||
cluster = Cluster(
|
||||
|
||||
@@ -2,8 +2,9 @@ import copy
|
||||
from typing import Iterable, List
|
||||
|
||||
import numpy
|
||||
from docling_core.types.experimental.base import BoundingBox
|
||||
from docling_core.types.experimental import BoundingBox
|
||||
from docling_core.types.experimental.document import TableCell
|
||||
from docling_core.types.experimental.labels import DocItemLabel
|
||||
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
||||
from PIL import ImageDraw
|
||||
|
||||
@@ -74,7 +75,7 @@ class TableStructureModel:
|
||||
],
|
||||
)
|
||||
for cluster in page.predictions.layout.clusters
|
||||
if cluster.label == "Table"
|
||||
if cluster.label == DocItemLabel.TABLE
|
||||
]
|
||||
if not len(in_tables):
|
||||
yield page
|
||||
@@ -138,7 +139,7 @@ class TableStructureModel:
|
||||
id=table_cluster.id,
|
||||
page_no=page.page_no,
|
||||
cluster=table_cluster,
|
||||
label="Table",
|
||||
label=DocItemLabel.TABLE,
|
||||
)
|
||||
|
||||
page.predictions.tablestructure.table_map[table_cluster.id] = tbl
|
||||
|
||||
@@ -2,7 +2,7 @@ import logging
|
||||
from typing import Any, Dict, Iterable, List, Tuple, Union
|
||||
|
||||
from docling_core.types.doc.base import BaseCell, BaseText, BoundingBox, Ref, Table
|
||||
from docling_core.types.experimental.base import CoordOrigin
|
||||
from docling_core.types.experimental import CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell
|
||||
from docling.datamodel.document import ConversionResult, Page
|
||||
|
||||
@@ -2,7 +2,7 @@ import copy
|
||||
import logging
|
||||
|
||||
import networkx as nx
|
||||
from docling_core.types.experimental.labels import PageLabel
|
||||
from docling_core.types.experimental.labels import DocItemLabel
|
||||
|
||||
logger = logging.getLogger("layout_utils")
|
||||
|
||||
@@ -371,7 +371,7 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
|
||||
"Treating cluster " + str(ix) + ", type " + str(new_cluster["type"])
|
||||
)
|
||||
logger.debug(" with cells: " + str(new_cluster["cell_ids"]))
|
||||
if len(cluster["cell_ids"]) == 0 and cluster["type"] != PageLabel.PICTURE:
|
||||
if len(cluster["cell_ids"]) == 0 and cluster["type"] != DocItemLabel.PICTURE:
|
||||
logger.debug(" Empty non-picture, removed")
|
||||
continue ## Skip this former cluster, now without cells.
|
||||
new_bbox = adapt_bbox(raw_cells, new_cluster, orphan_cell_indices)
|
||||
@@ -381,14 +381,14 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
|
||||
|
||||
|
||||
def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
|
||||
if not (cluster["type"] in [PageLabel.TABLE, PageLabel.PICTURE]):
|
||||
if not (cluster["type"] in [DocItemLabel.TABLE, DocItemLabel.PICTURE]):
|
||||
## A text-like cluster. The bbox only needs to be around the text cells:
|
||||
logger.debug(" Initial bbox: " + str(cluster["bbox"]))
|
||||
new_bbox = surrounding_list(
|
||||
[raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
|
||||
)
|
||||
logger.debug(" New bounding box:" + str(new_bbox))
|
||||
if cluster["type"] == PageLabel.PICTURE:
|
||||
if cluster["type"] == DocItemLabel.PICTURE:
|
||||
## We only make the bbox completely comprise included text cells:
|
||||
logger.debug(" Picture")
|
||||
if len(cluster["cell_ids"]) != 0:
|
||||
@@ -588,7 +588,7 @@ def set_orphan_as_text(
|
||||
max_id = -1
|
||||
figures = []
|
||||
for cluster in cluster_predictions:
|
||||
if cluster["type"] == PageLabel.PICTURE:
|
||||
if cluster["type"] == DocItemLabel.PICTURE:
|
||||
figures.append(cluster)
|
||||
|
||||
if cluster["id"] > max_id:
|
||||
@@ -639,13 +639,13 @@ def set_orphan_as_text(
|
||||
# if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans:
|
||||
if fig_flag == False and lines_detector == False:
|
||||
# get class from low confidence detections if not set as text:
|
||||
class_type = PageLabel.TEXT
|
||||
class_type = DocItemLabel.TEXT
|
||||
|
||||
for cluster in cluster_predictions_low:
|
||||
intersection = compute_intersection(
|
||||
orph_cell["bbox"], cluster["bbox"]
|
||||
)
|
||||
class_type = PageLabel.TEXT
|
||||
class_type = DocItemLabel.TEXT
|
||||
if (
|
||||
cluster["confidence"] > 0.1
|
||||
and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4
|
||||
@@ -719,7 +719,9 @@ def merge_cells(cluster_predictions):
|
||||
if cluster["id"] == node:
|
||||
lines.append(cluster)
|
||||
cluster_predictions.remove(cluster)
|
||||
new_merged_cluster = build_cluster_from_lines(lines, PageLabel.TEXT, max_id)
|
||||
new_merged_cluster = build_cluster_from_lines(
|
||||
lines, DocItemLabel.TEXT, max_id
|
||||
)
|
||||
cluster_predictions.append(new_merged_cluster)
|
||||
return cluster_predictions
|
||||
|
||||
@@ -754,9 +756,9 @@ def clean_up_clusters(
|
||||
# remove clusters that might appear inside tables, or images (such as pdf cells in graphs)
|
||||
elif img_table == True:
|
||||
if (
|
||||
cluster_1["type"] == PageLabel.TEXT
|
||||
and cluster_2["type"] == PageLabel.PICTURE
|
||||
or cluster_2["type"] == PageLabel.TABLE
|
||||
cluster_1["type"] == DocItemLabel.TEXT
|
||||
and cluster_2["type"] == DocItemLabel.PICTURE
|
||||
or cluster_2["type"] == DocItemLabel.TABLE
|
||||
):
|
||||
if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > 0.5:
|
||||
DuplicateDeletedClusterIDs.append(cluster_1["id"])
|
||||
@@ -773,7 +775,7 @@ def clean_up_clusters(
|
||||
# remove tables that have one pdf cell
|
||||
if one_cell_table == True:
|
||||
if (
|
||||
cluster_1["type"] == PageLabel.TABLE
|
||||
cluster_1["type"] == DocItemLabel.TABLE
|
||||
and len(cluster_1["cell_ids"]) < 2
|
||||
):
|
||||
DuplicateDeletedClusterIDs.append(cluster_1["id"])
|
||||
|
||||
Reference in New Issue
Block a user