mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-10 13:48:13 +00:00
perf: Clean up resources with docling-parse v4, no parsed_page output by default (#2105)
* Call PdfDocument.unload_pages from the pipelines where needed, delete parsed_page data unless requested to keep Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * pin docling-parse and update lock Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * Reinstate pipeline_options.generate_parsed_page Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
@@ -22,15 +22,52 @@ _log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DoclingParseV4PageBackend(PdfPageBackend):
|
||||
def __init__(self, parsed_page: SegmentedPdfPage, page_obj: PdfPage):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
dp_doc: PdfDocument,
|
||||
page_obj: PdfPage,
|
||||
page_no: int,
|
||||
create_words: bool = True,
|
||||
create_textlines: bool = True,
|
||||
):
|
||||
self._ppage = page_obj
|
||||
self._dpage = parsed_page
|
||||
self.valid = parsed_page is not None
|
||||
self._dp_doc = dp_doc
|
||||
self._page_no = page_no
|
||||
self._create_words = create_words
|
||||
self._create_textlines = create_textlines
|
||||
|
||||
self._dpage: Optional[SegmentedPdfPage] = None
|
||||
self._unloaded = False
|
||||
self.valid = (self._ppage is not None) and (self._dp_doc is not None)
|
||||
|
||||
def _ensure_parsed(self) -> None:
|
||||
if self._dpage is not None:
|
||||
return
|
||||
|
||||
seg_page = self._dp_doc.get_page(
|
||||
self._page_no + 1,
|
||||
create_words=self._create_words,
|
||||
create_textlines=self._create_textlines,
|
||||
)
|
||||
|
||||
# In Docling, all TextCell instances are expected with top-left origin.
|
||||
[
|
||||
tc.to_top_left_origin(seg_page.dimension.height)
|
||||
for tc in seg_page.textline_cells
|
||||
]
|
||||
[tc.to_top_left_origin(seg_page.dimension.height) for tc in seg_page.char_cells]
|
||||
[tc.to_top_left_origin(seg_page.dimension.height) for tc in seg_page.word_cells]
|
||||
|
||||
self._dpage = seg_page
|
||||
|
||||
def is_valid(self) -> bool:
|
||||
return self.valid
|
||||
|
||||
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
||||
self._ensure_parsed()
|
||||
assert self._dpage is not None
|
||||
|
||||
# Find intersecting cells on the page
|
||||
text_piece = ""
|
||||
page_size = self.get_size()
|
||||
@@ -56,12 +93,19 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
||||
return text_piece
|
||||
|
||||
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
||||
self._ensure_parsed()
|
||||
return self._dpage
|
||||
|
||||
def get_text_cells(self) -> Iterable[TextCell]:
|
||||
self._ensure_parsed()
|
||||
assert self._dpage is not None
|
||||
|
||||
return self._dpage.textline_cells
|
||||
|
||||
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
||||
self._ensure_parsed()
|
||||
assert self._dpage is not None
|
||||
|
||||
AREA_THRESHOLD = 0 # 32 * 32
|
||||
|
||||
images = self._dpage.bitmap_resources
|
||||
@@ -123,8 +167,13 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
||||
# )
|
||||
|
||||
def unload(self):
|
||||
if not self._unloaded and self._dp_doc is not None:
|
||||
self._dp_doc.unload_pages((self._page_no + 1, self._page_no + 2))
|
||||
self._unloaded = True
|
||||
|
||||
self._ppage = None
|
||||
self._dpage = None
|
||||
self._dp_doc = None
|
||||
|
||||
|
||||
class DoclingParseV4DocumentBackend(PdfDocumentBackend):
|
||||
@@ -157,30 +206,15 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
|
||||
self, page_no: int, create_words: bool = True, create_textlines: bool = True
|
||||
) -> DoclingParseV4PageBackend:
|
||||
with pypdfium2_lock:
|
||||
seg_page = self.dp_doc.get_page(
|
||||
page_no + 1,
|
||||
create_words=create_words,
|
||||
create_textlines=create_textlines,
|
||||
)
|
||||
ppage = self._pdoc[page_no]
|
||||
|
||||
# In Docling, all TextCell instances are expected with top-left origin.
|
||||
[
|
||||
tc.to_top_left_origin(seg_page.dimension.height)
|
||||
for tc in seg_page.textline_cells
|
||||
]
|
||||
[
|
||||
tc.to_top_left_origin(seg_page.dimension.height)
|
||||
for tc in seg_page.char_cells
|
||||
]
|
||||
[
|
||||
tc.to_top_left_origin(seg_page.dimension.height)
|
||||
for tc in seg_page.word_cells
|
||||
]
|
||||
|
||||
return DoclingParseV4PageBackend(
|
||||
seg_page,
|
||||
self._pdoc[page_no],
|
||||
)
|
||||
return DoclingParseV4PageBackend(
|
||||
dp_doc=self.dp_doc,
|
||||
page_obj=ppage,
|
||||
page_no=page_no,
|
||||
create_words=create_words,
|
||||
create_textlines=create_textlines,
|
||||
)
|
||||
|
||||
def is_valid(self) -> bool:
|
||||
return self.page_count() > 0
|
||||
|
||||
@@ -323,9 +323,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
||||
),
|
||||
)
|
||||
|
||||
generate_parsed_pages: Literal[True] = (
|
||||
True # Always True since parsed_page is now mandatory
|
||||
)
|
||||
generate_parsed_pages: bool = False
|
||||
|
||||
|
||||
class ProcessingPipeline(str, Enum):
|
||||
|
||||
@@ -20,7 +20,7 @@ from docling.datamodel.base_models import (
|
||||
Page,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult, InputDocument
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import GenericEnrichmentModel
|
||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||
@@ -168,6 +168,12 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
||||
# Cleanup page backends
|
||||
if not self.keep_backend and p._backend is not None:
|
||||
p._backend.unload()
|
||||
if (
|
||||
isinstance(self.pipeline_options, PdfPipelineOptions)
|
||||
and not self.pipeline_options.generate_parsed_pages
|
||||
):
|
||||
del p.parsed_page
|
||||
p.parsed_page = None
|
||||
|
||||
end_batch_time = time.monotonic()
|
||||
total_elapsed_time += end_batch_time - start_batch_time
|
||||
|
||||
@@ -565,10 +565,12 @@ class ThreadedStandardPdfPipeline(BasePipeline):
|
||||
if not self.keep_images:
|
||||
for p in conv_res.pages:
|
||||
p._image_cache = {}
|
||||
if not self.keep_backend:
|
||||
for p in conv_res.pages:
|
||||
if p._backend is not None:
|
||||
p._backend.unload()
|
||||
for p in conv_res.pages:
|
||||
if not self.keep_backend and p._backend is not None:
|
||||
p._backend.unload()
|
||||
if not self.pipeline_options.generate_parsed_pages:
|
||||
del p.parsed_page
|
||||
p.parsed_page = None
|
||||
|
||||
# ---------------------------------------------------------------- assemble
|
||||
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
|
||||
|
||||
Reference in New Issue
Block a user