feat: updating default parameters to get better performance with docling-parse (#2208)

* updated the code

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* updated the parameters

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter W. J. Staar
2025-09-05 14:06:21 +02:00
committed by GitHub
parent a9f41b088e
commit b49d1ad4f1

View File

@@ -30,13 +30,21 @@ class DoclingParseV4PageBackend(PdfPageBackend):
 page_no: int,
 create_words: bool = True,
 create_textlines: bool = True,
+keep_chars: bool = False,
+keep_lines: bool = False,
+keep_images: bool = True,
 ):
 self._ppage = page_obj
 self._dp_doc = dp_doc
 self._page_no = page_no
 self._create_words = create_words
 self._create_textlines = create_textlines
+self._keep_chars = keep_chars
+self._keep_lines = keep_lines
+self._keep_images = keep_images
 self._dpage: Optional[SegmentedPdfPage] = None
 self._unloaded = False
 self.valid = (self._ppage is not None) and (self._dp_doc is not None)
@@ -47,9 +55,9 @@ class DoclingParseV4PageBackend(PdfPageBackend):
 seg_page = self._dp_doc.get_page(
 self._page_no + 1,
-keep_chars=True,
+keep_chars=self._keep_chars,
-keep_lines=True,
+keep_lines=self._keep_lines,
-keep_bitmaps=True,
+keep_bitmaps=self._keep_images,
 create_words=self._create_words,
 create_textlines=self._create_textlines,
 enforce_same_font=True,