mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-31 14:34:40 +00:00
Add profiling code to all models
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
a00f01cf07
commit
0814f32ae4
@ -6,6 +6,7 @@ from pathlib import Path, PurePath
|
|||||||
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
|
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
|
||||||
|
|
||||||
import filetype
|
import filetype
|
||||||
|
import numpy as np
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DocItem,
|
DocItem,
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
@ -179,6 +180,29 @@ class DocumentFormat(str, Enum):
|
|||||||
V1 = "v1"
|
V1 = "v1"
|
||||||
|
|
||||||
|
|
||||||
|
class ProfilingScope(str, Enum):
    """Scope label attached to a profiling entry.

    Members are plain strings (the class mixes in ``str``), so they compare
    equal to their literal values and serialize as such.
    """

    # Timing label for work scoped to a single page.
    PAGE = "page"
    # Timing label for work scoped to a whole document.
    DOCUMENT = "document"
|
||||||
|
|
||||||
|
|
||||||
|
class ProfilingItem(BaseModel):
    """Collected timing samples for one profiling key.

    ``times`` holds the individual elapsed durations and ``count`` the number
    of recorded samples; both are updated by ``TimeRecorder.__exit__``.
    The helper methods summarize ``times`` with NumPy and return built-in
    ``float`` values.

    NOTE(review): the helpers do not guard against an empty ``times`` list;
    NumPy's result for empty input is presumably NaN (with a warning) --
    confirm before relying on it.
    """

    scope: ProfilingScope
    count: int = 0
    times: List[float] = []

    def avg(self) -> float:
        """Return the average of the recorded times (``np.average``)."""
        return float(np.average(self.times))

    def std(self) -> float:
        """Return the standard deviation of the recorded times (``np.std``)."""
        return float(np.std(self.times))

    def mean(self) -> float:
        """Return the arithmetic mean of the recorded times (``np.mean``)."""
        return float(np.mean(self.times))

    def percentile(self, perc: float) -> float:
        """Return the ``perc``-th percentile of the recorded times.

        ``perc`` is passed straight to ``np.percentile``.
        """
        return float(np.percentile(self.times, perc))
|
||||||
|
|
||||||
|
|
||||||
class ConversionResult(BaseModel):
|
class ConversionResult(BaseModel):
|
||||||
input: InputDocument
|
input: InputDocument
|
||||||
|
|
||||||
@ -187,6 +211,7 @@ class ConversionResult(BaseModel):
|
|||||||
|
|
||||||
pages: List[Page] = []
|
pages: List[Page] = []
|
||||||
assembled: AssembledUnit = AssembledUnit()
|
assembled: AssembledUnit = AssembledUnit()
|
||||||
|
timings: Dict[str, ProfilingItem] = {}
|
||||||
|
|
||||||
document: DoclingDocument = _EMPTY_DOCLING_DOC
|
document: DoclingDocument = _EMPTY_DOCLING_DOC
|
||||||
|
|
||||||
|
@ -32,6 +32,8 @@ class DebugSettings(BaseModel):
|
|||||||
visualize_layout: bool = False
|
visualize_layout: bool = False
|
||||||
visualize_tables: bool = False
|
visualize_tables: bool = False
|
||||||
|
|
||||||
|
profile_pipeline_timings: bool = False
|
||||||
|
|
||||||
|
|
||||||
class AppSettings(BaseSettings):
|
class AppSettings(BaseSettings):
|
||||||
perf: BatchConcurrencySettings
|
perf: BatchConcurrencySettings
|
||||||
|
@ -1,14 +1,19 @@
|
|||||||
|
import time
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Any, Iterable
|
from typing import Any, Callable, Iterable, Type
|
||||||
|
|
||||||
from docling_core.types.doc import DoclingDocument, NodeItem
|
from docling_core.types.doc import DoclingDocument, NodeItem
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page
|
||||||
|
from docling.datamodel.document import ConversionResult, ProfilingItem, ProfilingScope
|
||||||
|
from docling.datamodel.settings import settings
|
||||||
|
|
||||||
|
|
||||||
class BasePageModel(ABC):
|
class BasePageModel(ABC):
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
def __call__(
|
||||||
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||||
|
) -> Iterable[Page]:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
@ -23,3 +28,28 @@ class BaseEnrichmentModel(ABC):
|
|||||||
self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
|
self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
|
||||||
) -> Iterable[Any]:
|
) -> Iterable[Any]:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class TimeRecorder:
    """Context manager that records elapsed wall time into ``conv_res.timings``.

    Recording only happens when ``settings.debug.profile_pipeline_timings`` is
    enabled. The flag is sampled once, at construction, so that ``__enter__``
    and ``__exit__`` always agree with ``__init__`` even if the setting is
    changed while the recorder is alive (previously that could raise
    ``AttributeError`` on ``self.start`` / ``self.conv_res``).

    Args:
        conv_res: Conversion result whose ``timings`` dict receives the sample.
        key: Name of the timing entry to append to.
        scope: Scope stored on the entry when it is first created.
    """

    def __init__(
        self,
        conv_res: ConversionResult,
        key: str,
        scope: ProfilingScope = ProfilingScope.PAGE,
    ):
        # Sample the switch once; every later step uses this cached value.
        self._enabled = bool(settings.debug.profile_pipeline_timings)
        self.conv_res = conv_res
        self.key = key
        if self._enabled and key not in conv_res.timings:
            conv_res.timings[key] = ProfilingItem(scope=scope)

    def __enter__(self):
        if self._enabled:
            self.start = time.monotonic()
        return self

    def __exit__(self, *args):
        # Returns None, so an exception raised in the body still propagates;
        # the elapsed time is recorded either way.
        if self._enabled:
            elapsed = time.monotonic() - self.start
            item = self.conv_res.timings[self.key]
            item.times.append(elapsed)
            item.count += 1
|
||||||
|
@ -10,12 +10,14 @@ from rtree import index
|
|||||||
from scipy.ndimage import find_objects, label
|
from scipy.ndimage import find_objects, label
|
||||||
|
|
||||||
from docling.datamodel.base_models import OcrCell, Page
|
from docling.datamodel.base_models import OcrCell, Page
|
||||||
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import OcrOptions
|
from docling.datamodel.pipeline_options import OcrOptions
|
||||||
|
from docling.models.base_model import BasePageModel
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class BaseOcrModel:
|
class BaseOcrModel(BasePageModel):
|
||||||
def __init__(self, enabled: bool, options: OcrOptions):
|
def __init__(self, enabled: bool, options: OcrOptions):
|
||||||
self.enabled = enabled
|
self.enabled = enabled
|
||||||
self.options = options
|
self.options = options
|
||||||
@ -133,5 +135,7 @@ class BaseOcrModel:
|
|||||||
image.show()
|
image.show()
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
def __call__(
|
||||||
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||||
|
) -> Iterable[Page]:
|
||||||
pass
|
pass
|
||||||
|
@ -27,6 +27,7 @@ from pydantic import BaseModel, ConfigDict
|
|||||||
|
|
||||||
from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
|
from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
|
||||||
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
|
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
|
||||||
|
from docling.models.base_model import TimeRecorder
|
||||||
from docling.utils.utils import create_hash
|
from docling.utils.utils import create_hash
|
||||||
|
|
||||||
|
|
||||||
@ -226,12 +227,13 @@ class GlmModel:
|
|||||||
return ds_doc
|
return ds_doc
|
||||||
|
|
||||||
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
|
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
|
||||||
ds_doc = self._to_legacy_document(conv_res)
|
with TimeRecorder(conv_res, "glm"):
|
||||||
ds_doc_dict = ds_doc.model_dump(by_alias=True)
|
ds_doc = self._to_legacy_document(conv_res)
|
||||||
|
ds_doc_dict = ds_doc.model_dump(by_alias=True)
|
||||||
|
|
||||||
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
||||||
|
|
||||||
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
||||||
|
|
||||||
# DEBUG code:
|
# DEBUG code:
|
||||||
def draw_clusters_and_cells(ds_document, page_no):
|
def draw_clusters_and_cells(ds_document, page_no):
|
||||||
|
@ -1,12 +1,15 @@
|
|||||||
import logging
|
import logging
|
||||||
|
import time
|
||||||
from typing import Iterable
|
from typing import Iterable
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
|
||||||
from docling.datamodel.base_models import OcrCell, Page
|
from docling.datamodel.base_models import OcrCell, Page
|
||||||
|
from docling.datamodel.document import ConversionResult, ProfilingItem
|
||||||
from docling.datamodel.pipeline_options import EasyOcrOptions
|
from docling.datamodel.pipeline_options import EasyOcrOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
|
from docling.models.base_model import TimeRecorder
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -34,56 +37,62 @@ class EasyOcrModel(BaseOcrModel):
|
|||||||
download_enabled=self.options.download_enabled,
|
download_enabled=self.options.download_enabled,
|
||||||
)
|
)
|
||||||
|
|
||||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
def __call__(
|
||||||
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||||
|
) -> Iterable[Page]:
|
||||||
|
|
||||||
if not self.enabled:
|
if not self.enabled:
|
||||||
yield from page_batch
|
yield from page_batch
|
||||||
return
|
return
|
||||||
|
|
||||||
for page in page_batch:
|
for page in page_batch:
|
||||||
|
|
||||||
assert page._backend is not None
|
assert page._backend is not None
|
||||||
if not page._backend.is_valid():
|
if not page._backend.is_valid():
|
||||||
yield page
|
yield page
|
||||||
else:
|
else:
|
||||||
ocr_rects = self.get_ocr_rects(page)
|
with TimeRecorder(conv_res, "ocr"):
|
||||||
|
ocr_rects = self.get_ocr_rects(page)
|
||||||
|
|
||||||
all_ocr_cells = []
|
all_ocr_cells = []
|
||||||
for ocr_rect in ocr_rects:
|
for ocr_rect in ocr_rects:
|
||||||
# Skip zero area boxes
|
# Skip zero area boxes
|
||||||
if ocr_rect.area() == 0:
|
if ocr_rect.area() == 0:
|
||||||
continue
|
continue
|
||||||
high_res_image = page._backend.get_page_image(
|
high_res_image = page._backend.get_page_image(
|
||||||
scale=self.scale, cropbox=ocr_rect
|
scale=self.scale, cropbox=ocr_rect
|
||||||
)
|
|
||||||
im = numpy.array(high_res_image)
|
|
||||||
result = self.reader.readtext(im)
|
|
||||||
|
|
||||||
del high_res_image
|
|
||||||
del im
|
|
||||||
|
|
||||||
cells = [
|
|
||||||
OcrCell(
|
|
||||||
id=ix,
|
|
||||||
text=line[1],
|
|
||||||
confidence=line[2],
|
|
||||||
bbox=BoundingBox.from_tuple(
|
|
||||||
coord=(
|
|
||||||
(line[0][0][0] / self.scale) + ocr_rect.l,
|
|
||||||
(line[0][0][1] / self.scale) + ocr_rect.t,
|
|
||||||
(line[0][2][0] / self.scale) + ocr_rect.l,
|
|
||||||
(line[0][2][1] / self.scale) + ocr_rect.t,
|
|
||||||
),
|
|
||||||
origin=CoordOrigin.TOPLEFT,
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
for ix, line in enumerate(result)
|
im = numpy.array(high_res_image)
|
||||||
]
|
result = self.reader.readtext(im)
|
||||||
all_ocr_cells.extend(cells)
|
|
||||||
|
|
||||||
## Remove OCR cells which overlap with programmatic cells.
|
del high_res_image
|
||||||
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
|
del im
|
||||||
|
|
||||||
page.cells.extend(filtered_ocr_cells)
|
cells = [
|
||||||
|
OcrCell(
|
||||||
|
id=ix,
|
||||||
|
text=line[1],
|
||||||
|
confidence=line[2],
|
||||||
|
bbox=BoundingBox.from_tuple(
|
||||||
|
coord=(
|
||||||
|
(line[0][0][0] / self.scale) + ocr_rect.l,
|
||||||
|
(line[0][0][1] / self.scale) + ocr_rect.t,
|
||||||
|
(line[0][2][0] / self.scale) + ocr_rect.l,
|
||||||
|
(line[0][2][1] / self.scale) + ocr_rect.t,
|
||||||
|
),
|
||||||
|
origin=CoordOrigin.TOPLEFT,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
for ix, line in enumerate(result)
|
||||||
|
]
|
||||||
|
all_ocr_cells.extend(cells)
|
||||||
|
|
||||||
|
## Remove OCR cells which overlap with programmatic cells.
|
||||||
|
filtered_ocr_cells = self.filter_ocr_cells(
|
||||||
|
all_ocr_cells, page.cells
|
||||||
|
)
|
||||||
|
|
||||||
|
page.cells.extend(filtered_ocr_cells)
|
||||||
|
|
||||||
# DEBUG code:
|
# DEBUG code:
|
||||||
if settings.debug.visualize_ocr:
|
if settings.debug.visualize_ocr:
|
||||||
|
@ -16,8 +16,9 @@ from docling.datamodel.base_models import (
|
|||||||
LayoutPrediction,
|
LayoutPrediction,
|
||||||
Page,
|
Page,
|
||||||
)
|
)
|
||||||
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_model import BasePageModel
|
from docling.models.base_model import BasePageModel, TimeRecorder
|
||||||
from docling.utils import layout_utils as lu
|
from docling.utils import layout_utils as lu
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -272,77 +273,86 @@ class LayoutModel(BasePageModel):
|
|||||||
|
|
||||||
return clusters_out_new, cells_out_new
|
return clusters_out_new, cells_out_new
|
||||||
|
|
||||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
def __call__(
|
||||||
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||||
|
) -> Iterable[Page]:
|
||||||
|
|
||||||
for page in page_batch:
|
for page in page_batch:
|
||||||
assert page._backend is not None
|
assert page._backend is not None
|
||||||
if not page._backend.is_valid():
|
if not page._backend.is_valid():
|
||||||
yield page
|
yield page
|
||||||
else:
|
else:
|
||||||
assert page.size is not None
|
with TimeRecorder(conv_res, "layout"):
|
||||||
|
assert page.size is not None
|
||||||
|
|
||||||
clusters = []
|
clusters = []
|
||||||
for ix, pred_item in enumerate(
|
for ix, pred_item in enumerate(
|
||||||
self.layout_predictor.predict(page.get_image(scale=1.0))
|
self.layout_predictor.predict(page.get_image(scale=1.0))
|
||||||
):
|
):
|
||||||
label = DocItemLabel(
|
label = DocItemLabel(
|
||||||
pred_item["label"].lower().replace(" ", "_").replace("-", "_")
|
pred_item["label"]
|
||||||
) # Temporary, until docling-ibm-model uses docling-core types
|
.lower()
|
||||||
cluster = Cluster(
|
.replace(" ", "_")
|
||||||
id=ix,
|
.replace("-", "_")
|
||||||
label=label,
|
) # Temporary, until docling-ibm-model uses docling-core types
|
||||||
confidence=pred_item["confidence"],
|
cluster = Cluster(
|
||||||
bbox=BoundingBox.model_validate(pred_item),
|
id=ix,
|
||||||
cells=[],
|
label=label,
|
||||||
|
confidence=pred_item["confidence"],
|
||||||
|
bbox=BoundingBox.model_validate(pred_item),
|
||||||
|
cells=[],
|
||||||
|
)
|
||||||
|
clusters.append(cluster)
|
||||||
|
|
||||||
|
# Map cells to clusters
|
||||||
|
# TODO: Remove, postprocess should take care of it anyway.
|
||||||
|
for cell in page.cells:
|
||||||
|
for cluster in clusters:
|
||||||
|
if not cell.bbox.area() > 0:
|
||||||
|
overlap_frac = 0.0
|
||||||
|
else:
|
||||||
|
overlap_frac = (
|
||||||
|
cell.bbox.intersection_area_with(cluster.bbox)
|
||||||
|
/ cell.bbox.area()
|
||||||
|
)
|
||||||
|
|
||||||
|
if overlap_frac > 0.5:
|
||||||
|
cluster.cells.append(cell)
|
||||||
|
|
||||||
|
# Pre-sort clusters
|
||||||
|
# clusters = self.sort_clusters_by_cell_order(clusters)
|
||||||
|
|
||||||
|
# DEBUG code:
|
||||||
|
def draw_clusters_and_cells(show: bool = True):
|
||||||
|
image = copy.deepcopy(page.image)
|
||||||
|
if image is not None:
|
||||||
|
draw = ImageDraw.Draw(image)
|
||||||
|
for c in clusters:
|
||||||
|
x0, y0, x1, y1 = c.bbox.as_tuple()
|
||||||
|
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
|
||||||
|
|
||||||
|
cell_color = (
|
||||||
|
random.randint(30, 140),
|
||||||
|
random.randint(30, 140),
|
||||||
|
random.randint(30, 140),
|
||||||
|
)
|
||||||
|
for tc in c.cells: # [:1]:
|
||||||
|
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
||||||
|
draw.rectangle(
|
||||||
|
[(x0, y0), (x1, y1)], outline=cell_color
|
||||||
|
)
|
||||||
|
if show:
|
||||||
|
image.show()
|
||||||
|
|
||||||
|
# draw_clusters_and_cells()
|
||||||
|
|
||||||
|
clusters, page.cells = self.postprocess(
|
||||||
|
clusters, page.cells, page.size.height
|
||||||
)
|
)
|
||||||
clusters.append(cluster)
|
|
||||||
|
|
||||||
# Map cells to clusters
|
page.predictions.layout = LayoutPrediction(clusters=clusters)
|
||||||
# TODO: Remove, postprocess should take care of it anyway.
|
|
||||||
for cell in page.cells:
|
|
||||||
for cluster in clusters:
|
|
||||||
if not cell.bbox.area() > 0:
|
|
||||||
overlap_frac = 0.0
|
|
||||||
else:
|
|
||||||
overlap_frac = (
|
|
||||||
cell.bbox.intersection_area_with(cluster.bbox)
|
|
||||||
/ cell.bbox.area()
|
|
||||||
)
|
|
||||||
|
|
||||||
if overlap_frac > 0.5:
|
|
||||||
cluster.cells.append(cell)
|
|
||||||
|
|
||||||
# Pre-sort clusters
|
|
||||||
# clusters = self.sort_clusters_by_cell_order(clusters)
|
|
||||||
|
|
||||||
# DEBUG code:
|
|
||||||
def draw_clusters_and_cells(show: bool = True):
|
|
||||||
image = copy.deepcopy(page.image)
|
|
||||||
if image is not None:
|
|
||||||
draw = ImageDraw.Draw(image)
|
|
||||||
for c in clusters:
|
|
||||||
x0, y0, x1, y1 = c.bbox.as_tuple()
|
|
||||||
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
|
|
||||||
|
|
||||||
cell_color = (
|
|
||||||
random.randint(30, 140),
|
|
||||||
random.randint(30, 140),
|
|
||||||
random.randint(30, 140),
|
|
||||||
)
|
|
||||||
for tc in c.cells: # [:1]:
|
|
||||||
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
|
||||||
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
|
||||||
if show:
|
|
||||||
image.show()
|
|
||||||
|
|
||||||
# draw_clusters_and_cells()
|
|
||||||
|
|
||||||
clusters, page.cells = self.postprocess(
|
|
||||||
clusters, page.cells, page.size.height
|
|
||||||
)
|
|
||||||
|
|
||||||
if settings.debug.visualize_layout:
|
if settings.debug.visualize_layout:
|
||||||
draw_clusters_and_cells()
|
draw_clusters_and_cells()
|
||||||
|
|
||||||
page.predictions.layout = LayoutPrediction(clusters=clusters)
|
|
||||||
|
|
||||||
yield page
|
yield page
|
||||||
|
@ -12,7 +12,8 @@ from docling.datamodel.base_models import (
|
|||||||
Table,
|
Table,
|
||||||
TextElement,
|
TextElement,
|
||||||
)
|
)
|
||||||
from docling.models.base_model import BasePageModel
|
from docling.datamodel.document import ConversionResult
|
||||||
|
from docling.models.base_model import BasePageModel, TimeRecorder
|
||||||
from docling.models.layout_model import LayoutModel
|
from docling.models.layout_model import LayoutModel
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -51,122 +52,122 @@ class PageAssembleModel(BasePageModel):
|
|||||||
|
|
||||||
return sanitized_text.strip() # Strip any leading or trailing whitespace
|
return sanitized_text.strip() # Strip any leading or trailing whitespace
|
||||||
|
|
||||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
def __call__(
|
||||||
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||||
|
) -> Iterable[Page]:
|
||||||
for page in page_batch:
|
for page in page_batch:
|
||||||
assert page._backend is not None
|
assert page._backend is not None
|
||||||
if not page._backend.is_valid():
|
if not page._backend.is_valid():
|
||||||
yield page
|
yield page
|
||||||
else:
|
else:
|
||||||
assert page.predictions.layout is not None
|
with TimeRecorder(conv_res, "page_assemble"):
|
||||||
|
|
||||||
# assembles some JSON output page by page.
|
assert page.predictions.layout is not None
|
||||||
|
|
||||||
elements: List[PageElement] = []
|
# assembles some JSON output page by page.
|
||||||
headers: List[PageElement] = []
|
|
||||||
body: List[PageElement] = []
|
|
||||||
|
|
||||||
for cluster in page.predictions.layout.clusters:
|
elements: List[PageElement] = []
|
||||||
# _log.info("Cluster label seen:", cluster.label)
|
headers: List[PageElement] = []
|
||||||
if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
|
body: List[PageElement] = []
|
||||||
|
|
||||||
textlines = [
|
for cluster in page.predictions.layout.clusters:
|
||||||
cell.text.replace("\x02", "-").strip()
|
# _log.info("Cluster label seen:", cluster.label)
|
||||||
for cell in cluster.cells
|
if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
|
||||||
if len(cell.text.strip()) > 0
|
|
||||||
]
|
|
||||||
text = self.sanitize_text(textlines)
|
|
||||||
text_el = TextElement(
|
|
||||||
label=cluster.label,
|
|
||||||
id=cluster.id,
|
|
||||||
text=text,
|
|
||||||
page_no=page.page_no,
|
|
||||||
cluster=cluster,
|
|
||||||
)
|
|
||||||
elements.append(text_el)
|
|
||||||
|
|
||||||
if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
|
textlines = [
|
||||||
headers.append(text_el)
|
cell.text.replace("\x02", "-").strip()
|
||||||
else:
|
for cell in cluster.cells
|
||||||
body.append(text_el)
|
if len(cell.text.strip()) > 0
|
||||||
elif cluster.label == LayoutModel.TABLE_LABEL:
|
]
|
||||||
tbl = None
|
text = self.sanitize_text(textlines)
|
||||||
if page.predictions.tablestructure:
|
text_el = TextElement(
|
||||||
tbl = page.predictions.tablestructure.table_map.get(
|
|
||||||
cluster.id, None
|
|
||||||
)
|
|
||||||
if (
|
|
||||||
not tbl
|
|
||||||
): # fallback: add table without structure, if it isn't present
|
|
||||||
tbl = Table(
|
|
||||||
label=cluster.label,
|
label=cluster.label,
|
||||||
id=cluster.id,
|
id=cluster.id,
|
||||||
text="",
|
|
||||||
otsl_seq=[],
|
|
||||||
table_cells=[],
|
|
||||||
cluster=cluster,
|
|
||||||
page_no=page.page_no,
|
|
||||||
)
|
|
||||||
|
|
||||||
elements.append(tbl)
|
|
||||||
body.append(tbl)
|
|
||||||
elif cluster.label == LayoutModel.FIGURE_LABEL:
|
|
||||||
fig = None
|
|
||||||
if page.predictions.figures_classification:
|
|
||||||
fig = (
|
|
||||||
page.predictions.figures_classification.figure_map.get(
|
|
||||||
cluster.id, None
|
|
||||||
)
|
|
||||||
)
|
|
||||||
if (
|
|
||||||
not fig
|
|
||||||
): # fallback: add figure without classification, if it isn't present
|
|
||||||
fig = FigureElement(
|
|
||||||
label=cluster.label,
|
|
||||||
id=cluster.id,
|
|
||||||
text="",
|
|
||||||
data=None,
|
|
||||||
cluster=cluster,
|
|
||||||
page_no=page.page_no,
|
|
||||||
)
|
|
||||||
elements.append(fig)
|
|
||||||
body.append(fig)
|
|
||||||
elif cluster.label == LayoutModel.FORMULA_LABEL:
|
|
||||||
equation = None
|
|
||||||
if page.predictions.equations_prediction:
|
|
||||||
equation = (
|
|
||||||
page.predictions.equations_prediction.equation_map.get(
|
|
||||||
cluster.id, None
|
|
||||||
)
|
|
||||||
)
|
|
||||||
if (
|
|
||||||
not equation
|
|
||||||
): # fallback: add empty formula, if it isn't present
|
|
||||||
text = self.sanitize_text(
|
|
||||||
[
|
|
||||||
cell.text.replace("\x02", "-").strip()
|
|
||||||
for cell in cluster.cells
|
|
||||||
if len(cell.text.strip()) > 0
|
|
||||||
]
|
|
||||||
)
|
|
||||||
equation = TextElement(
|
|
||||||
label=cluster.label,
|
|
||||||
id=cluster.id,
|
|
||||||
cluster=cluster,
|
|
||||||
page_no=page.page_no,
|
|
||||||
text=text,
|
text=text,
|
||||||
|
page_no=page.page_no,
|
||||||
|
cluster=cluster,
|
||||||
)
|
)
|
||||||
elements.append(equation)
|
elements.append(text_el)
|
||||||
body.append(equation)
|
|
||||||
|
|
||||||
page.assembled = AssembledUnit(
|
if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
|
||||||
elements=elements, headers=headers, body=body
|
headers.append(text_el)
|
||||||
)
|
else:
|
||||||
|
body.append(text_el)
|
||||||
|
elif cluster.label == LayoutModel.TABLE_LABEL:
|
||||||
|
tbl = None
|
||||||
|
if page.predictions.tablestructure:
|
||||||
|
tbl = page.predictions.tablestructure.table_map.get(
|
||||||
|
cluster.id, None
|
||||||
|
)
|
||||||
|
if (
|
||||||
|
not tbl
|
||||||
|
): # fallback: add table without structure, if it isn't present
|
||||||
|
tbl = Table(
|
||||||
|
label=cluster.label,
|
||||||
|
id=cluster.id,
|
||||||
|
text="",
|
||||||
|
otsl_seq=[],
|
||||||
|
table_cells=[],
|
||||||
|
cluster=cluster,
|
||||||
|
page_no=page.page_no,
|
||||||
|
)
|
||||||
|
|
||||||
# Remove page images (can be disabled)
|
elements.append(tbl)
|
||||||
if not self.options.keep_images:
|
body.append(tbl)
|
||||||
page._image_cache = {}
|
elif cluster.label == LayoutModel.FIGURE_LABEL:
|
||||||
|
fig = None
|
||||||
|
if page.predictions.figures_classification:
|
||||||
|
fig = page.predictions.figures_classification.figure_map.get(
|
||||||
|
cluster.id, None
|
||||||
|
)
|
||||||
|
if (
|
||||||
|
not fig
|
||||||
|
): # fallback: add figure without classification, if it isn't present
|
||||||
|
fig = FigureElement(
|
||||||
|
label=cluster.label,
|
||||||
|
id=cluster.id,
|
||||||
|
text="",
|
||||||
|
data=None,
|
||||||
|
cluster=cluster,
|
||||||
|
page_no=page.page_no,
|
||||||
|
)
|
||||||
|
elements.append(fig)
|
||||||
|
body.append(fig)
|
||||||
|
elif cluster.label == LayoutModel.FORMULA_LABEL:
|
||||||
|
equation = None
|
||||||
|
if page.predictions.equations_prediction:
|
||||||
|
equation = page.predictions.equations_prediction.equation_map.get(
|
||||||
|
cluster.id, None
|
||||||
|
)
|
||||||
|
if (
|
||||||
|
not equation
|
||||||
|
): # fallback: add empty formula, if it isn't present
|
||||||
|
text = self.sanitize_text(
|
||||||
|
[
|
||||||
|
cell.text.replace("\x02", "-").strip()
|
||||||
|
for cell in cluster.cells
|
||||||
|
if len(cell.text.strip()) > 0
|
||||||
|
]
|
||||||
|
)
|
||||||
|
equation = TextElement(
|
||||||
|
label=cluster.label,
|
||||||
|
id=cluster.id,
|
||||||
|
cluster=cluster,
|
||||||
|
page_no=page.page_no,
|
||||||
|
text=text,
|
||||||
|
)
|
||||||
|
elements.append(equation)
|
||||||
|
body.append(equation)
|
||||||
|
|
||||||
# Unload backend
|
page.assembled = AssembledUnit(
|
||||||
page._backend.unload()
|
elements=elements, headers=headers, body=body
|
||||||
|
)
|
||||||
|
|
||||||
|
# Remove page images (can be disabled)
|
||||||
|
if not self.options.keep_images:
|
||||||
|
page._image_cache = {}
|
||||||
|
|
||||||
|
# Unload backend
|
||||||
|
page._backend.unload()
|
||||||
|
|
||||||
yield page
|
yield page
|
||||||
|
@ -4,7 +4,8 @@ from PIL import ImageDraw
|
|||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.models.base_model import BasePageModel
|
from docling.datamodel.document import ConversionResult
|
||||||
|
from docling.models.base_model import BasePageModel, TimeRecorder
|
||||||
|
|
||||||
|
|
||||||
class PagePreprocessingOptions(BaseModel):
|
class PagePreprocessingOptions(BaseModel):
|
||||||
@ -15,14 +16,17 @@ class PagePreprocessingModel(BasePageModel):
|
|||||||
def __init__(self, options: PagePreprocessingOptions):
|
def __init__(self, options: PagePreprocessingOptions):
|
||||||
self.options = options
|
self.options = options
|
||||||
|
|
||||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
def __call__(
|
||||||
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||||
|
) -> Iterable[Page]:
|
||||||
for page in page_batch:
|
for page in page_batch:
|
||||||
assert page._backend is not None
|
assert page._backend is not None
|
||||||
if not page._backend.is_valid():
|
if not page._backend.is_valid():
|
||||||
yield page
|
yield page
|
||||||
else:
|
else:
|
||||||
page = self._populate_page_images(page)
|
with TimeRecorder(conv_res, "page_parse"):
|
||||||
page = self._parse_page_cells(page)
|
page = self._populate_page_images(page)
|
||||||
|
page = self._parse_page_cells(page)
|
||||||
yield page
|
yield page
|
||||||
|
|
||||||
# Generate the page image and store it in the page object
|
# Generate the page image and store it in the page object
|
||||||
|
@ -8,9 +8,10 @@ from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredic
|
|||||||
from PIL import ImageDraw
|
from PIL import ImageDraw
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
|
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
|
||||||
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
|
from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_model import BasePageModel
|
from docling.models.base_model import BasePageModel, TimeRecorder
|
||||||
|
|
||||||
|
|
||||||
class TableStructureModel(BasePageModel):
|
class TableStructureModel(BasePageModel):
|
||||||
@ -64,7 +65,9 @@ class TableStructureModel(BasePageModel):
|
|||||||
|
|
||||||
image.show()
|
image.show()
|
||||||
|
|
||||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
def __call__(
|
||||||
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||||
|
) -> Iterable[Page]:
|
||||||
|
|
||||||
if not self.enabled:
|
if not self.enabled:
|
||||||
yield from page_batch
|
yield from page_batch
|
||||||
@ -75,96 +78,105 @@ class TableStructureModel(BasePageModel):
|
|||||||
if not page._backend.is_valid():
|
if not page._backend.is_valid():
|
||||||
yield page
|
yield page
|
||||||
else:
|
else:
|
||||||
|
with TimeRecorder(conv_res, "table_structure"):
|
||||||
|
|
||||||
assert page.predictions.layout is not None
|
assert page.predictions.layout is not None
|
||||||
assert page.size is not None
|
assert page.size is not None
|
||||||
|
|
||||||
page.predictions.tablestructure = TableStructurePrediction() # dummy
|
page.predictions.tablestructure = (
|
||||||
|
TableStructurePrediction()
|
||||||
|
) # dummy
|
||||||
|
|
||||||
in_tables = [
|
in_tables = [
|
||||||
(
|
(
|
||||||
cluster,
|
cluster,
|
||||||
[
|
[
|
||||||
round(cluster.bbox.l) * self.scale,
|
round(cluster.bbox.l) * self.scale,
|
||||||
round(cluster.bbox.t) * self.scale,
|
round(cluster.bbox.t) * self.scale,
|
||||||
round(cluster.bbox.r) * self.scale,
|
round(cluster.bbox.r) * self.scale,
|
||||||
round(cluster.bbox.b) * self.scale,
|
round(cluster.bbox.b) * self.scale,
|
||||||
],
|
],
|
||||||
|
)
|
||||||
|
for cluster in page.predictions.layout.clusters
|
||||||
|
if cluster.label == DocItemLabel.TABLE
|
||||||
|
]
|
||||||
|
if not len(in_tables):
|
||||||
|
yield page
|
||||||
|
continue
|
||||||
|
|
||||||
|
tokens = []
|
||||||
|
for c in page.cells:
|
||||||
|
for cluster, _ in in_tables:
|
||||||
|
if c.bbox.area() > 0:
|
||||||
|
if (
|
||||||
|
c.bbox.intersection_area_with(cluster.bbox)
|
||||||
|
/ c.bbox.area()
|
||||||
|
> 0.2
|
||||||
|
):
|
||||||
|
# Only allow non empty stings (spaces) into the cells of a table
|
||||||
|
if len(c.text.strip()) > 0:
|
||||||
|
new_cell = copy.deepcopy(c)
|
||||||
|
new_cell.bbox = new_cell.bbox.scaled(
|
||||||
|
scale=self.scale
|
||||||
|
)
|
||||||
|
|
||||||
|
tokens.append(new_cell.model_dump())
|
||||||
|
|
||||||
|
page_input = {
|
||||||
|
"tokens": tokens,
|
||||||
|
"width": page.size.width * self.scale,
|
||||||
|
"height": page.size.height * self.scale,
|
||||||
|
}
|
||||||
|
page_input["image"] = numpy.asarray(
|
||||||
|
page.get_image(scale=self.scale)
|
||||||
)
|
)
|
||||||
for cluster in page.predictions.layout.clusters
|
|
||||||
if cluster.label == DocItemLabel.TABLE
|
|
||||||
]
|
|
||||||
if not len(in_tables):
|
|
||||||
yield page
|
|
||||||
continue
|
|
||||||
|
|
||||||
tokens = []
|
table_clusters, table_bboxes = zip(*in_tables)
|
||||||
for c in page.cells:
|
|
||||||
for cluster, _ in in_tables:
|
if len(table_bboxes):
|
||||||
if c.bbox.area() > 0:
|
tf_output = self.tf_predictor.multi_table_predict(
|
||||||
if (
|
page_input, table_bboxes, do_matching=self.do_cell_matching
|
||||||
c.bbox.intersection_area_with(cluster.bbox)
|
)
|
||||||
/ c.bbox.area()
|
|
||||||
> 0.2
|
for table_cluster, table_out in zip(table_clusters, tf_output):
|
||||||
):
|
table_cells = []
|
||||||
# Only allow non empty stings (spaces) into the cells of a table
|
for element in table_out["tf_responses"]:
|
||||||
if len(c.text.strip()) > 0:
|
|
||||||
new_cell = copy.deepcopy(c)
|
if not self.do_cell_matching:
|
||||||
new_cell.bbox = new_cell.bbox.scaled(
|
the_bbox = BoundingBox.model_validate(
|
||||||
scale=self.scale
|
element["bbox"]
|
||||||
|
).scaled(1 / self.scale)
|
||||||
|
text_piece = page._backend.get_text_in_rect(
|
||||||
|
the_bbox
|
||||||
)
|
)
|
||||||
|
element["bbox"]["token"] = text_piece
|
||||||
|
|
||||||
tokens.append(new_cell.model_dump())
|
tc = TableCell.model_validate(element)
|
||||||
|
if self.do_cell_matching and tc.bbox is not None:
|
||||||
|
tc.bbox = tc.bbox.scaled(1 / self.scale)
|
||||||
|
table_cells.append(tc)
|
||||||
|
|
||||||
page_input = {
|
# Retrieving cols/rows, after post processing:
|
||||||
"tokens": tokens,
|
num_rows = table_out["predict_details"]["num_rows"]
|
||||||
"width": page.size.width * self.scale,
|
num_cols = table_out["predict_details"]["num_cols"]
|
||||||
"height": page.size.height * self.scale,
|
otsl_seq = table_out["predict_details"]["prediction"][
|
||||||
}
|
"rs_seq"
|
||||||
page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
|
]
|
||||||
|
|
||||||
table_clusters, table_bboxes = zip(*in_tables)
|
tbl = Table(
|
||||||
|
otsl_seq=otsl_seq,
|
||||||
|
table_cells=table_cells,
|
||||||
|
num_rows=num_rows,
|
||||||
|
num_cols=num_cols,
|
||||||
|
id=table_cluster.id,
|
||||||
|
page_no=page.page_no,
|
||||||
|
cluster=table_cluster,
|
||||||
|
label=DocItemLabel.TABLE,
|
||||||
|
)
|
||||||
|
|
||||||
if len(table_bboxes):
|
page.predictions.tablestructure.table_map[
|
||||||
tf_output = self.tf_predictor.multi_table_predict(
|
table_cluster.id
|
||||||
page_input, table_bboxes, do_matching=self.do_cell_matching
|
] = tbl
|
||||||
)
|
|
||||||
|
|
||||||
for table_cluster, table_out in zip(table_clusters, tf_output):
|
|
||||||
table_cells = []
|
|
||||||
for element in table_out["tf_responses"]:
|
|
||||||
|
|
||||||
if not self.do_cell_matching:
|
|
||||||
the_bbox = BoundingBox.model_validate(
|
|
||||||
element["bbox"]
|
|
||||||
).scaled(1 / self.scale)
|
|
||||||
text_piece = page._backend.get_text_in_rect(the_bbox)
|
|
||||||
element["bbox"]["token"] = text_piece
|
|
||||||
|
|
||||||
tc = TableCell.model_validate(element)
|
|
||||||
if self.do_cell_matching and tc.bbox is not None:
|
|
||||||
tc.bbox = tc.bbox.scaled(1 / self.scale)
|
|
||||||
table_cells.append(tc)
|
|
||||||
|
|
||||||
# Retrieving cols/rows, after post processing:
|
|
||||||
num_rows = table_out["predict_details"]["num_rows"]
|
|
||||||
num_cols = table_out["predict_details"]["num_cols"]
|
|
||||||
otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
|
|
||||||
|
|
||||||
tbl = Table(
|
|
||||||
otsl_seq=otsl_seq,
|
|
||||||
table_cells=table_cells,
|
|
||||||
num_rows=num_rows,
|
|
||||||
num_cols=num_cols,
|
|
||||||
id=table_cluster.id,
|
|
||||||
page_no=page.page_no,
|
|
||||||
cluster=table_cluster,
|
|
||||||
label=DocItemLabel.TABLE,
|
|
||||||
)
|
|
||||||
|
|
||||||
page.predictions.tablestructure.table_map[table_cluster.id] = (
|
|
||||||
tbl
|
|
||||||
)
|
|
||||||
|
|
||||||
# For debugging purposes:
|
# For debugging purposes:
|
||||||
if settings.debug.visualize_tables:
|
if settings.debug.visualize_tables:
|
||||||
|
@ -8,8 +8,10 @@ import pandas as pd
|
|||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
|
||||||
from docling.datamodel.base_models import OcrCell, Page
|
from docling.datamodel.base_models import OcrCell, Page
|
||||||
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
|
from docling.models.base_model import TimeRecorder
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -103,7 +105,9 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
|
|
||||||
return df_filtered
|
return df_filtered
|
||||||
|
|
||||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
def __call__(
|
||||||
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||||
|
) -> Iterable[Page]:
|
||||||
|
|
||||||
if not self.enabled:
|
if not self.enabled:
|
||||||
yield from page_batch
|
yield from page_batch
|
||||||
@ -114,60 +118,64 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
if not page._backend.is_valid():
|
if not page._backend.is_valid():
|
||||||
yield page
|
yield page
|
||||||
else:
|
else:
|
||||||
ocr_rects = self.get_ocr_rects(page)
|
with TimeRecorder(conv_res, "ocr"):
|
||||||
|
|
||||||
all_ocr_cells = []
|
ocr_rects = self.get_ocr_rects(page)
|
||||||
for ocr_rect in ocr_rects:
|
|
||||||
# Skip zero area boxes
|
all_ocr_cells = []
|
||||||
if ocr_rect.area() == 0:
|
for ocr_rect in ocr_rects:
|
||||||
continue
|
# Skip zero area boxes
|
||||||
high_res_image = page._backend.get_page_image(
|
if ocr_rect.area() == 0:
|
||||||
scale=self.scale, cropbox=ocr_rect
|
continue
|
||||||
|
high_res_image = page._backend.get_page_image(
|
||||||
|
scale=self.scale, cropbox=ocr_rect
|
||||||
|
)
|
||||||
|
|
||||||
|
with tempfile.NamedTemporaryFile(
|
||||||
|
suffix=".png", mode="w"
|
||||||
|
) as image_file:
|
||||||
|
fname = image_file.name
|
||||||
|
high_res_image.save(fname)
|
||||||
|
|
||||||
|
df = self._run_tesseract(fname)
|
||||||
|
|
||||||
|
# _log.info(df)
|
||||||
|
|
||||||
|
# Print relevant columns (bounding box and text)
|
||||||
|
for ix, row in df.iterrows():
|
||||||
|
text = row["text"]
|
||||||
|
conf = row["conf"]
|
||||||
|
|
||||||
|
l = float(row["left"])
|
||||||
|
b = float(row["top"])
|
||||||
|
w = float(row["width"])
|
||||||
|
h = float(row["height"])
|
||||||
|
|
||||||
|
t = b + h
|
||||||
|
r = l + w
|
||||||
|
|
||||||
|
cell = OcrCell(
|
||||||
|
id=ix,
|
||||||
|
text=text,
|
||||||
|
confidence=conf / 100.0,
|
||||||
|
bbox=BoundingBox.from_tuple(
|
||||||
|
coord=(
|
||||||
|
(l / self.scale) + ocr_rect.l,
|
||||||
|
(b / self.scale) + ocr_rect.t,
|
||||||
|
(r / self.scale) + ocr_rect.l,
|
||||||
|
(t / self.scale) + ocr_rect.t,
|
||||||
|
),
|
||||||
|
origin=CoordOrigin.TOPLEFT,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
all_ocr_cells.append(cell)
|
||||||
|
|
||||||
|
## Remove OCR cells which overlap with programmatic cells.
|
||||||
|
filtered_ocr_cells = self.filter_ocr_cells(
|
||||||
|
all_ocr_cells, page.cells
|
||||||
)
|
)
|
||||||
|
|
||||||
with tempfile.NamedTemporaryFile(
|
page.cells.extend(filtered_ocr_cells)
|
||||||
suffix=".png", mode="w"
|
|
||||||
) as image_file:
|
|
||||||
fname = image_file.name
|
|
||||||
high_res_image.save(fname)
|
|
||||||
|
|
||||||
df = self._run_tesseract(fname)
|
|
||||||
|
|
||||||
# _log.info(df)
|
|
||||||
|
|
||||||
# Print relevant columns (bounding box and text)
|
|
||||||
for ix, row in df.iterrows():
|
|
||||||
text = row["text"]
|
|
||||||
conf = row["conf"]
|
|
||||||
|
|
||||||
l = float(row["left"])
|
|
||||||
b = float(row["top"])
|
|
||||||
w = float(row["width"])
|
|
||||||
h = float(row["height"])
|
|
||||||
|
|
||||||
t = b + h
|
|
||||||
r = l + w
|
|
||||||
|
|
||||||
cell = OcrCell(
|
|
||||||
id=ix,
|
|
||||||
text=text,
|
|
||||||
confidence=conf / 100.0,
|
|
||||||
bbox=BoundingBox.from_tuple(
|
|
||||||
coord=(
|
|
||||||
(l / self.scale) + ocr_rect.l,
|
|
||||||
(b / self.scale) + ocr_rect.t,
|
|
||||||
(r / self.scale) + ocr_rect.l,
|
|
||||||
(t / self.scale) + ocr_rect.t,
|
|
||||||
),
|
|
||||||
origin=CoordOrigin.TOPLEFT,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
all_ocr_cells.append(cell)
|
|
||||||
|
|
||||||
## Remove OCR cells which overlap with programmatic cells.
|
|
||||||
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
|
|
||||||
|
|
||||||
page.cells.extend(filtered_ocr_cells)
|
|
||||||
|
|
||||||
# DEBUG code:
|
# DEBUG code:
|
||||||
if settings.debug.visualize_ocr:
|
if settings.debug.visualize_ocr:
|
||||||
|
@ -4,8 +4,10 @@ from typing import Iterable
|
|||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
|
||||||
from docling.datamodel.base_models import OcrCell, Page
|
from docling.datamodel.base_models import OcrCell, Page
|
||||||
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
|
from docling.models.base_model import TimeRecorder
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -62,7 +64,9 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
# Finalize the tesseractAPI
|
# Finalize the tesseractAPI
|
||||||
self.reader.End()
|
self.reader.End()
|
||||||
|
|
||||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
def __call__(
|
||||||
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||||
|
) -> Iterable[Page]:
|
||||||
|
|
||||||
if not self.enabled:
|
if not self.enabled:
|
||||||
yield from page_batch
|
yield from page_batch
|
||||||
@ -73,57 +77,63 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
if not page._backend.is_valid():
|
if not page._backend.is_valid():
|
||||||
yield page
|
yield page
|
||||||
else:
|
else:
|
||||||
assert self.reader is not None
|
with TimeRecorder(conv_res, "ocr"):
|
||||||
|
|
||||||
ocr_rects = self.get_ocr_rects(page)
|
assert self.reader is not None
|
||||||
|
|
||||||
all_ocr_cells = []
|
ocr_rects = self.get_ocr_rects(page)
|
||||||
for ocr_rect in ocr_rects:
|
|
||||||
# Skip zero area boxes
|
|
||||||
if ocr_rect.area() == 0:
|
|
||||||
continue
|
|
||||||
high_res_image = page._backend.get_page_image(
|
|
||||||
scale=self.scale, cropbox=ocr_rect
|
|
||||||
)
|
|
||||||
|
|
||||||
# Retrieve text snippets with their bounding boxes
|
all_ocr_cells = []
|
||||||
self.reader.SetImage(high_res_image)
|
for ocr_rect in ocr_rects:
|
||||||
boxes = self.reader.GetComponentImages(
|
# Skip zero area boxes
|
||||||
self.reader_RIL.TEXTLINE, True
|
if ocr_rect.area() == 0:
|
||||||
)
|
continue
|
||||||
|
high_res_image = page._backend.get_page_image(
|
||||||
cells = []
|
scale=self.scale, cropbox=ocr_rect
|
||||||
for ix, (im, box, _, _) in enumerate(boxes):
|
|
||||||
# Set the area of interest. Tesseract uses Bottom-Left for the origin
|
|
||||||
self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
|
|
||||||
|
|
||||||
# Extract text within the bounding box
|
|
||||||
text = self.reader.GetUTF8Text().strip()
|
|
||||||
confidence = self.reader.MeanTextConf()
|
|
||||||
left = box["x"] / self.scale
|
|
||||||
bottom = box["y"] / self.scale
|
|
||||||
right = (box["x"] + box["w"]) / self.scale
|
|
||||||
top = (box["y"] + box["h"]) / self.scale
|
|
||||||
|
|
||||||
cells.append(
|
|
||||||
OcrCell(
|
|
||||||
id=ix,
|
|
||||||
text=text,
|
|
||||||
confidence=confidence,
|
|
||||||
bbox=BoundingBox.from_tuple(
|
|
||||||
coord=(left, top, right, bottom),
|
|
||||||
origin=CoordOrigin.TOPLEFT,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# del high_res_image
|
# Retrieve text snippets with their bounding boxes
|
||||||
all_ocr_cells.extend(cells)
|
self.reader.SetImage(high_res_image)
|
||||||
|
boxes = self.reader.GetComponentImages(
|
||||||
|
self.reader_RIL.TEXTLINE, True
|
||||||
|
)
|
||||||
|
|
||||||
## Remove OCR cells which overlap with programmatic cells.
|
cells = []
|
||||||
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
|
for ix, (im, box, _, _) in enumerate(boxes):
|
||||||
|
# Set the area of interest. Tesseract uses Bottom-Left for the origin
|
||||||
|
self.reader.SetRectangle(
|
||||||
|
box["x"], box["y"], box["w"], box["h"]
|
||||||
|
)
|
||||||
|
|
||||||
page.cells.extend(filtered_ocr_cells)
|
# Extract text within the bounding box
|
||||||
|
text = self.reader.GetUTF8Text().strip()
|
||||||
|
confidence = self.reader.MeanTextConf()
|
||||||
|
left = box["x"] / self.scale
|
||||||
|
bottom = box["y"] / self.scale
|
||||||
|
right = (box["x"] + box["w"]) / self.scale
|
||||||
|
top = (box["y"] + box["h"]) / self.scale
|
||||||
|
|
||||||
|
cells.append(
|
||||||
|
OcrCell(
|
||||||
|
id=ix,
|
||||||
|
text=text,
|
||||||
|
confidence=confidence,
|
||||||
|
bbox=BoundingBox.from_tuple(
|
||||||
|
coord=(left, top, right, bottom),
|
||||||
|
origin=CoordOrigin.TOPLEFT,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
# del high_res_image
|
||||||
|
all_ocr_cells.extend(cells)
|
||||||
|
|
||||||
|
## Remove OCR cells which overlap with programmatic cells.
|
||||||
|
filtered_ocr_cells = self.filter_ocr_cells(
|
||||||
|
all_ocr_cells, page.cells
|
||||||
|
)
|
||||||
|
|
||||||
|
page.cells.extend(filtered_ocr_cells)
|
||||||
|
|
||||||
# DEBUG code:
|
# DEBUG code:
|
||||||
if settings.debug.visualize_ocr:
|
if settings.debug.visualize_ocr:
|
||||||
|
@ -15,10 +15,15 @@ from docling.datamodel.base_models import (
|
|||||||
ErrorItem,
|
ErrorItem,
|
||||||
Page,
|
Page,
|
||||||
)
|
)
|
||||||
from docling.datamodel.document import ConversionResult, InputDocument
|
from docling.datamodel.document import (
|
||||||
|
ConversionResult,
|
||||||
|
InputDocument,
|
||||||
|
ProfilingItem,
|
||||||
|
ProfilingScope,
|
||||||
|
)
|
||||||
from docling.datamodel.pipeline_options import PipelineOptions
|
from docling.datamodel.pipeline_options import PipelineOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_model import BaseEnrichmentModel
|
from docling.models.base_model import BaseEnrichmentModel, TimeRecorder
|
||||||
from docling.utils.utils import chunkify
|
from docling.utils.utils import chunkify
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -37,11 +42,11 @@ class BasePipeline(ABC):
|
|||||||
try:
|
try:
|
||||||
# These steps are building and assembling the structure of the
|
# These steps are building and assembling the structure of the
|
||||||
# output DoclingDocument
|
# output DoclingDocument
|
||||||
conv_res = self._build_document(in_doc, conv_res)
|
conv_res = self._build_document(conv_res)
|
||||||
conv_res = self._assemble_document(in_doc, conv_res)
|
conv_res = self._assemble_document(conv_res)
|
||||||
# From this stage, all operations should rely only on conv_res.output
|
# From this stage, all operations should rely only on conv_res.output
|
||||||
conv_res = self._enrich_document(in_doc, conv_res)
|
conv_res = self._enrich_document(conv_res)
|
||||||
conv_res.status = self._determine_status(in_doc, conv_res)
|
conv_res.status = self._determine_status(conv_res)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
conv_res.status = ConversionStatus.FAILURE
|
conv_res.status = ConversionStatus.FAILURE
|
||||||
if raises_on_error:
|
if raises_on_error:
|
||||||
@ -50,19 +55,13 @@ class BasePipeline(ABC):
|
|||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def _build_document(
|
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
||||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
|
||||||
) -> ConversionResult:
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def _assemble_document(
|
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
|
||||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
|
||||||
) -> ConversionResult:
|
|
||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
def _enrich_document(
|
def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
|
||||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
|
||||||
) -> ConversionResult:
|
|
||||||
|
|
||||||
def _filter_elements(
|
def _filter_elements(
|
||||||
doc: DoclingDocument, model: BaseEnrichmentModel
|
doc: DoclingDocument, model: BaseEnrichmentModel
|
||||||
@ -71,24 +70,23 @@ class BasePipeline(ABC):
|
|||||||
if model.is_processable(doc=doc, element=element):
|
if model.is_processable(doc=doc, element=element):
|
||||||
yield element
|
yield element
|
||||||
|
|
||||||
for model in self.enrichment_pipe:
|
with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
|
||||||
for element_batch in chunkify(
|
for model in self.enrichment_pipe:
|
||||||
_filter_elements(conv_res.document, model),
|
for element_batch in chunkify(
|
||||||
settings.perf.elements_batch_size,
|
_filter_elements(conv_res.document, model),
|
||||||
):
|
settings.perf.elements_batch_size,
|
||||||
# TODO: currently we assume the element itself is modified, because
|
):
|
||||||
# we don't have an interface to save the element back to the document
|
# TODO: currently we assume the element itself is modified, because
|
||||||
for element in model(
|
# we don't have an interface to save the element back to the document
|
||||||
doc=conv_res.document, element_batch=element_batch
|
for element in model(
|
||||||
): # Must exhaust!
|
doc=conv_res.document, element_batch=element_batch
|
||||||
pass
|
): # Must exhaust!
|
||||||
|
pass
|
||||||
|
|
||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def _determine_status(
|
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
||||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
|
||||||
) -> ConversionStatus:
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@ -110,66 +108,68 @@ class BasePipeline(ABC):
|
|||||||
|
|
||||||
class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
||||||
|
|
||||||
def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
def _apply_on_pages(
|
||||||
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||||
|
) -> Iterable[Page]:
|
||||||
for model in self.build_pipe:
|
for model in self.build_pipe:
|
||||||
page_batch = model(page_batch)
|
page_batch = model(conv_res, page_batch)
|
||||||
|
|
||||||
yield from page_batch
|
yield from page_batch
|
||||||
|
|
||||||
def _build_document(
|
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
||||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
|
||||||
) -> ConversionResult:
|
|
||||||
|
|
||||||
if not isinstance(in_doc._backend, PdfDocumentBackend):
|
if not isinstance(conv_res.input._backend, PdfDocumentBackend):
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. "
|
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
|
||||||
f"Can not convert this with a PDF pipeline. "
|
f"Can not convert this with a PDF pipeline. "
|
||||||
f"Please check your format configuration on DocumentConverter."
|
f"Please check your format configuration on DocumentConverter."
|
||||||
)
|
)
|
||||||
# conv_res.status = ConversionStatus.FAILURE
|
# conv_res.status = ConversionStatus.FAILURE
|
||||||
# return conv_res
|
# return conv_res
|
||||||
|
|
||||||
for i in range(0, in_doc.page_count):
|
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
||||||
conv_res.pages.append(Page(page_no=i))
|
|
||||||
|
|
||||||
try:
|
for i in range(0, conv_res.input.page_count):
|
||||||
# Iterate batches of pages (page_batch_size) in the doc
|
conv_res.pages.append(Page(page_no=i))
|
||||||
for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
|
|
||||||
start_pb_time = time.time()
|
|
||||||
|
|
||||||
# 1. Initialise the page resources
|
try:
|
||||||
init_pages = map(
|
# Iterate batches of pages (page_batch_size) in the doc
|
||||||
functools.partial(self.initialize_page, in_doc), page_batch
|
for page_batch in chunkify(
|
||||||
|
conv_res.pages, settings.perf.page_batch_size
|
||||||
|
):
|
||||||
|
start_pb_time = time.time()
|
||||||
|
|
||||||
|
# 1. Initialise the page resources
|
||||||
|
init_pages = map(
|
||||||
|
functools.partial(self.initialize_page, conv_res), page_batch
|
||||||
|
)
|
||||||
|
|
||||||
|
# 2. Run pipeline stages
|
||||||
|
pipeline_pages = self._apply_on_pages(conv_res, init_pages)
|
||||||
|
|
||||||
|
for p in pipeline_pages: # Must exhaust!
|
||||||
|
pass
|
||||||
|
|
||||||
|
end_pb_time = time.time() - start_pb_time
|
||||||
|
_log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
conv_res.status = ConversionStatus.FAILURE
|
||||||
|
trace = "\n".join(traceback.format_exception(e))
|
||||||
|
_log.warning(
|
||||||
|
f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n"
|
||||||
|
f"{trace}"
|
||||||
)
|
)
|
||||||
|
raise e
|
||||||
|
|
||||||
# 2. Run pipeline stages
|
finally:
|
||||||
pipeline_pages = self._apply_on_pages(init_pages)
|
# Always unload the PDF backend, even in case of failure
|
||||||
|
if conv_res.input._backend:
|
||||||
for p in pipeline_pages: # Must exhaust!
|
conv_res.input._backend.unload()
|
||||||
pass
|
|
||||||
|
|
||||||
end_pb_time = time.time() - start_pb_time
|
|
||||||
_log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
conv_res.status = ConversionStatus.FAILURE
|
|
||||||
trace = "\n".join(traceback.format_exception(e))
|
|
||||||
_log.warning(
|
|
||||||
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
|
|
||||||
f"{trace}"
|
|
||||||
)
|
|
||||||
raise e
|
|
||||||
|
|
||||||
finally:
|
|
||||||
# Always unload the PDF backend, even in case of failure
|
|
||||||
if in_doc._backend:
|
|
||||||
in_doc._backend.unload()
|
|
||||||
|
|
||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
def _determine_status(
|
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
||||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
|
||||||
) -> ConversionStatus:
|
|
||||||
status = ConversionStatus.SUCCESS
|
status = ConversionStatus.SUCCESS
|
||||||
for page in conv_res.pages:
|
for page in conv_res.pages:
|
||||||
if page._backend is None or not page._backend.is_valid():
|
if page._backend is None or not page._backend.is_valid():
|
||||||
@ -186,5 +186,5 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|||||||
|
|
||||||
# Initialise and load resources for a page
|
# Initialise and load resources for a page
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def initialize_page(self, doc: InputDocument, page: Page) -> Page:
|
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
|
||||||
pass
|
pass
|
||||||
|
@ -5,8 +5,9 @@ from docling.backend.abstract_backend import (
|
|||||||
DeclarativeDocumentBackend,
|
DeclarativeDocumentBackend,
|
||||||
)
|
)
|
||||||
from docling.datamodel.base_models import ConversionStatus
|
from docling.datamodel.base_models import ConversionStatus
|
||||||
from docling.datamodel.document import ConversionResult, InputDocument
|
from docling.datamodel.document import ConversionResult, InputDocument, ProfilingScope
|
||||||
from docling.datamodel.pipeline_options import PipelineOptions
|
from docling.datamodel.pipeline_options import PipelineOptions
|
||||||
|
from docling.models.base_model import TimeRecorder
|
||||||
from docling.pipeline.base_pipeline import BasePipeline
|
from docling.pipeline.base_pipeline import BasePipeline
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -22,13 +23,11 @@ class SimplePipeline(BasePipeline):
|
|||||||
def __init__(self, pipeline_options: PipelineOptions):
|
def __init__(self, pipeline_options: PipelineOptions):
|
||||||
super().__init__(pipeline_options)
|
super().__init__(pipeline_options)
|
||||||
|
|
||||||
def _build_document(
|
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
||||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
|
||||||
) -> ConversionResult:
|
|
||||||
|
|
||||||
if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
|
if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a declarative backend. "
|
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "
|
||||||
f"Can not convert this with simple pipeline. "
|
f"Can not convert this with simple pipeline. "
|
||||||
f"Please check your format configuration on DocumentConverter."
|
f"Please check your format configuration on DocumentConverter."
|
||||||
)
|
)
|
||||||
@ -38,13 +37,11 @@ class SimplePipeline(BasePipeline):
|
|||||||
# Instead of running a page-level pipeline to build up the document structure,
|
# Instead of running a page-level pipeline to build up the document structure,
|
||||||
# the backend is expected to be of type DeclarativeDocumentBackend, which can output
|
# the backend is expected to be of type DeclarativeDocumentBackend, which can output
|
||||||
# a DoclingDocument straight.
|
# a DoclingDocument straight.
|
||||||
|
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
||||||
conv_res.document = in_doc._backend.convert()
|
conv_res.document = conv_res.input._backend.convert()
|
||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
def _determine_status(
|
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
||||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
|
||||||
) -> ConversionStatus:
|
|
||||||
# This is called only if the previous steps didn't raise.
|
# This is called only if the previous steps didn't raise.
|
||||||
# Since we don't have anything else to evaluate, we can
|
# Since we don't have anything else to evaluate, we can
|
||||||
# safely return SUCCESS.
|
# safely return SUCCESS.
|
||||||
|
@ -7,13 +7,14 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
|||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
from docling.datamodel.base_models import AssembledUnit, Page
|
from docling.datamodel.base_models import AssembledUnit, Page
|
||||||
from docling.datamodel.document import ConversionResult, InputDocument
|
from docling.datamodel.document import ConversionResult, InputDocument, ProfilingScope
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
EasyOcrOptions,
|
EasyOcrOptions,
|
||||||
PdfPipelineOptions,
|
PdfPipelineOptions,
|
||||||
TesseractCliOcrOptions,
|
TesseractCliOcrOptions,
|
||||||
TesseractOcrOptions,
|
TesseractOcrOptions,
|
||||||
)
|
)
|
||||||
|
from docling.models.base_model import TimeRecorder
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
||||||
from docling.models.easyocr_model import EasyOcrModel
|
from docling.models.easyocr_model import EasyOcrModel
|
||||||
@ -119,73 +120,75 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
)
|
)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def initialize_page(self, doc: InputDocument, page: Page) -> Page:
|
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
|
||||||
page._backend = doc._backend.load_page(page.page_no) # type: ignore
|
with TimeRecorder(conv_res, "init_page"):
|
||||||
if page._backend is not None and page._backend.is_valid():
|
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
|
||||||
page.size = page._backend.get_size()
|
if page._backend is not None and page._backend.is_valid():
|
||||||
|
page.size = page._backend.get_size()
|
||||||
|
|
||||||
return page
|
return page
|
||||||
|
|
||||||
def _assemble_document(
|
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
|
||||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
|
||||||
) -> ConversionResult:
|
|
||||||
all_elements = []
|
all_elements = []
|
||||||
all_headers = []
|
all_headers = []
|
||||||
all_body = []
|
all_body = []
|
||||||
|
|
||||||
for p in conv_res.pages:
|
with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
|
||||||
if p.assembled is not None:
|
for p in conv_res.pages:
|
||||||
for el in p.assembled.body:
|
if p.assembled is not None:
|
||||||
all_body.append(el)
|
for el in p.assembled.body:
|
||||||
for el in p.assembled.headers:
|
all_body.append(el)
|
||||||
all_headers.append(el)
|
for el in p.assembled.headers:
|
||||||
for el in p.assembled.elements:
|
all_headers.append(el)
|
||||||
all_elements.append(el)
|
for el in p.assembled.elements:
|
||||||
|
all_elements.append(el)
|
||||||
|
|
||||||
conv_res.assembled = AssembledUnit(
|
conv_res.assembled = AssembledUnit(
|
||||||
elements=all_elements, headers=all_headers, body=all_body
|
elements=all_elements, headers=all_headers, body=all_body
|
||||||
)
|
)
|
||||||
|
|
||||||
conv_res.document = self.glm_model(conv_res)
|
conv_res.document = self.glm_model(conv_res)
|
||||||
|
|
||||||
# Generate page images in the output
|
# Generate page images in the output
|
||||||
if self.pipeline_options.generate_page_images:
|
if self.pipeline_options.generate_page_images:
|
||||||
for page in conv_res.pages:
|
for page in conv_res.pages:
|
||||||
assert page.image is not None
|
|
||||||
page_no = page.page_no + 1
|
|
||||||
conv_res.document.pages[page_no].image = ImageRef.from_pil(
|
|
||||||
page.image, dpi=int(72 * self.pipeline_options.images_scale)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Generate images of the requested element types
|
|
||||||
if (
|
|
||||||
self.pipeline_options.generate_picture_images
|
|
||||||
or self.pipeline_options.generate_table_images
|
|
||||||
):
|
|
||||||
scale = self.pipeline_options.images_scale
|
|
||||||
for element, _level in conv_res.document.iterate_items():
|
|
||||||
if not isinstance(element, DocItem) or len(element.prov) == 0:
|
|
||||||
continue
|
|
||||||
if (
|
|
||||||
isinstance(element, PictureItem)
|
|
||||||
and self.pipeline_options.generate_picture_images
|
|
||||||
) or (
|
|
||||||
isinstance(element, TableItem)
|
|
||||||
and self.pipeline_options.generate_table_images
|
|
||||||
):
|
|
||||||
page_ix = element.prov[0].page_no - 1
|
|
||||||
page = conv_res.pages[page_ix]
|
|
||||||
assert page.size is not None
|
|
||||||
assert page.image is not None
|
assert page.image is not None
|
||||||
|
page_no = page.page_no + 1
|
||||||
crop_bbox = (
|
conv_res.document.pages[page_no].image = ImageRef.from_pil(
|
||||||
element.prov[0]
|
page.image, dpi=int(72 * self.pipeline_options.images_scale)
|
||||||
.bbox.scaled(scale=scale)
|
|
||||||
.to_top_left_origin(page_height=page.size.height * scale)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
cropped_im = page.image.crop(crop_bbox.as_tuple())
|
# Generate images of the requested element types
|
||||||
element.image = ImageRef.from_pil(cropped_im, dpi=int(72 * scale))
|
if (
|
||||||
|
self.pipeline_options.generate_picture_images
|
||||||
|
or self.pipeline_options.generate_table_images
|
||||||
|
):
|
||||||
|
scale = self.pipeline_options.images_scale
|
||||||
|
for element, _level in conv_res.document.iterate_items():
|
||||||
|
if not isinstance(element, DocItem) or len(element.prov) == 0:
|
||||||
|
continue
|
||||||
|
if (
|
||||||
|
isinstance(element, PictureItem)
|
||||||
|
and self.pipeline_options.generate_picture_images
|
||||||
|
) or (
|
||||||
|
isinstance(element, TableItem)
|
||||||
|
and self.pipeline_options.generate_table_images
|
||||||
|
):
|
||||||
|
page_ix = element.prov[0].page_no - 1
|
||||||
|
page = conv_res.pages[page_ix]
|
||||||
|
assert page.size is not None
|
||||||
|
assert page.image is not None
|
||||||
|
|
||||||
|
crop_bbox = (
|
||||||
|
element.prov[0]
|
||||||
|
.bbox.scaled(scale=scale)
|
||||||
|
.to_top_left_origin(page_height=page.size.height * scale)
|
||||||
|
)
|
||||||
|
|
||||||
|
cropped_im = page.image.crop(crop_bbox.as_tuple())
|
||||||
|
element.image = ImageRef.from_pil(
|
||||||
|
cropped_im, dpi=int(72 * scale)
|
||||||
|
)
|
||||||
|
|
||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user