perf: Clean up resources with docling-parse v4, no parsed_page output by default (#2105)

* Call PdfDocument.unload_pages from the pipelines where needed, and delete parsed_page data unless the caller asks to keep it

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* pin docling-parse and update lock

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Reinstate pipeline_options.generate_parsed_pages

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2025-08-20 10:46:31 +02:00
committed by GitHub
parent c5f2e2fdd6
commit 5f57ff2a45
8 changed files with 110 additions and 65 deletions

View File

@@ -22,15 +22,52 @@ _log = logging.getLogger(__name__)
class DoclingParseV4PageBackend(PdfPageBackend):
def __init__(self, parsed_page: SegmentedPdfPage, page_obj: PdfPage):
def __init__(
self,
*,
dp_doc: PdfDocument,
page_obj: PdfPage,
page_no: int,
create_words: bool = True,
create_textlines: bool = True,
):
self._ppage = page_obj
self._dpage = parsed_page
self.valid = parsed_page is not None
self._dp_doc = dp_doc
self._page_no = page_no
self._create_words = create_words
self._create_textlines = create_textlines
self._dpage: Optional[SegmentedPdfPage] = None
self._unloaded = False
self.valid = (self._ppage is not None) and (self._dp_doc is not None)
def _ensure_parsed(self) -> None:
    """Parse this page on first use and cache the result in ``self._dpage``.

    Does nothing when ``self._dpage`` is already set. Otherwise it asks
    ``self._dp_doc`` for page ``self._page_no + 1`` (presumably the parser
    numbers pages from 1 -- confirm against docling-parse), forwarding the
    ``create_words`` / ``create_textlines`` flags given at construction,
    and converts every textline, char and word cell to top-left origin.

    NOTE(review): ``unload()`` sets ``self._dp_doc`` to ``None``, so calling
    this on an unloaded page backend fails on the ``get_page`` lookup.
    """
    if self._dpage is not None:
        return

    seg_page = self._dp_doc.get_page(
        self._page_no + 1,
        create_words=self._create_words,
        create_textlines=self._create_textlines,
    )

    # In Docling, all TextCell instances are expected with top-left origin.
    # Plain loops are used because the conversion is called for its side
    # effect only; there is no result list worth building.
    page_height = seg_page.dimension.height
    for tc in seg_page.textline_cells:
        tc.to_top_left_origin(page_height)
    for tc in seg_page.char_cells:
        tc.to_top_left_origin(page_height)
    for tc in seg_page.word_cells:
        tc.to_top_left_origin(page_height)

    self._dpage = seg_page
def is_valid(self) -> bool:
    """Return the ``valid`` flag stored on this page backend."""
    return self.valid
def get_text_in_rect(self, bbox: BoundingBox) -> str:
self._ensure_parsed()
assert self._dpage is not None
# Find intersecting cells on the page
text_piece = ""
page_size = self.get_size()
@@ -56,12 +93,19 @@ class DoclingParseV4PageBackend(PdfPageBackend):
return text_piece
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
    """Return the parsed page, calling ``_ensure_parsed`` first so it is loaded."""
    self._ensure_parsed()
    return self._dpage
def get_text_cells(self) -> Iterable[TextCell]:
    """Return the textline cells of the parsed page, parsing it first if needed."""
    self._ensure_parsed()
    parsed = self._dpage
    assert parsed is not None
    return parsed.textline_cells
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
self._ensure_parsed()
assert self._dpage is not None
AREA_THRESHOLD = 0 # 32 * 32
images = self._dpage.bitmap_resources
@@ -123,8 +167,13 @@ class DoclingParseV4PageBackend(PdfPageBackend):
# )
def unload(self):
    """Release this page from the parser document and drop all references.

    ``unload_pages`` is called at most once per backend (tracked by
    ``self._unloaded``) and only while a parser document is still attached.
    The page, parsed-page and document references are cleared on every call.
    """
    needs_release = self._dp_doc is not None and not self._unloaded
    if needs_release:
        # NOTE(review): the tuple looks like a (start, end) page range using
        # the same +1 numbering as get_page -- confirm against docling-parse.
        first_page = self._page_no + 1
        self._dp_doc.unload_pages((first_page, first_page + 1))
        self._unloaded = True

    self._ppage = None
    self._dpage = None
    self._dp_doc = None
class DoclingParseV4DocumentBackend(PdfDocumentBackend):
@@ -157,30 +206,15 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
self, page_no: int, create_words: bool = True, create_textlines: bool = True
) -> DoclingParseV4PageBackend:
with pypdfium2_lock:
seg_page = self.dp_doc.get_page(
page_no + 1,
create_words=create_words,
create_textlines=create_textlines,
)
ppage = self._pdoc[page_no]
# In Docling, all TextCell instances are expected with top-left origin.
[
tc.to_top_left_origin(seg_page.dimension.height)
for tc in seg_page.textline_cells
]
[
tc.to_top_left_origin(seg_page.dimension.height)
for tc in seg_page.char_cells
]
[
tc.to_top_left_origin(seg_page.dimension.height)
for tc in seg_page.word_cells
]
return DoclingParseV4PageBackend(
seg_page,
self._pdoc[page_no],
)
return DoclingParseV4PageBackend(
dp_doc=self.dp_doc,
page_obj=ppage,
page_no=page_no,
create_words=create_words,
create_textlines=create_textlines,
)
def is_valid(self) -> bool:
    """Report whether the document has at least one page (``page_count() > 0``)."""
    return self.page_count() > 0

View File

@@ -323,9 +323,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
),
)
generate_parsed_pages: Literal[True] = (
True # Always True since parsed_page is now mandatory
)
generate_parsed_pages: bool = False
class ProcessingPipeline(str, Enum):

View File

@@ -20,7 +20,7 @@ from docling.datamodel.base_models import (
Page,
)
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
from docling.datamodel.settings import settings
from docling.models.base_model import GenericEnrichmentModel
from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -168,6 +168,12 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
# Cleanup page backends
if not self.keep_backend and p._backend is not None:
p._backend.unload()
if (
isinstance(self.pipeline_options, PdfPipelineOptions)
and not self.pipeline_options.generate_parsed_pages
):
del p.parsed_page
p.parsed_page = None
end_batch_time = time.monotonic()
total_elapsed_time += end_batch_time - start_batch_time

View File

@@ -565,10 +565,12 @@ class ThreadedStandardPdfPipeline(BasePipeline):
if not self.keep_images:
for p in conv_res.pages:
p._image_cache = {}
if not self.keep_backend:
for p in conv_res.pages:
if p._backend is not None:
p._backend.unload()
for p in conv_res.pages:
if not self.keep_backend and p._backend is not None:
p._backend.unload()
if not self.pipeline_options.generate_parsed_pages:
del p.parsed_page
p.parsed_page = None
# ---------------------------------------------------------------- assemble
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult: