From b49d1ad4f1af6eeadc3f8d0e35123dc52c6e228e Mon Sep 17 00:00:00 2001 From: "Peter W. J. Staar" <91719829+PeterStaar-IBM@users.noreply.github.com> Date: Fri, 5 Sep 2025 14:06:21 +0200 Subject: [PATCH] feat: updating default parameters to get better performance with docling-parse (#2208) * updated the code Signed-off-by: Peter Staar * updated the parameters Signed-off-by: Peter Staar --------- Signed-off-by: Peter Staar --- docling/backend/docling_parse_v4_backend.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/docling/backend/docling_parse_v4_backend.py b/docling/backend/docling_parse_v4_backend.py index 5b42e531..6875ca1f 100644 --- a/docling/backend/docling_parse_v4_backend.py +++ b/docling/backend/docling_parse_v4_backend.py @@ -30,13 +30,21 @@ class DoclingParseV4PageBackend(PdfPageBackend): page_no: int, create_words: bool = True, create_textlines: bool = True, + keep_chars: bool = False, + keep_lines: bool = False, + keep_images: bool = True, ): self._ppage = page_obj self._dp_doc = dp_doc self._page_no = page_no + self._create_words = create_words self._create_textlines = create_textlines + self._keep_chars = keep_chars + self._keep_lines = keep_lines + self._keep_images = keep_images + self._dpage: Optional[SegmentedPdfPage] = None self._unloaded = False self.valid = (self._ppage is not None) and (self._dp_doc is not None) @@ -47,9 +55,9 @@ class DoclingParseV4PageBackend(PdfPageBackend): seg_page = self._dp_doc.get_page( self._page_no + 1, - keep_chars=True, - keep_lines=True, - keep_bitmaps=True, + keep_chars=self._keep_chars, + keep_lines=self._keep_lines, + keep_bitmaps=self._keep_images, create_words=self._create_words, create_textlines=self._create_textlines, enforce_same_font=True,