mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
feat: updating default parameters to get better performance with docling-parse (#2208)
* updated the code

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* updated the parameters

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
committed by
GitHub
parent
a9f41b088e
commit
b49d1ad4f1
@@ -30,13 +30,21 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
||||
page_no: int,
|
||||
create_words: bool = True,
|
||||
create_textlines: bool = True,
|
||||
keep_chars: bool = False,
|
||||
keep_lines: bool = False,
|
||||
keep_images: bool = True,
|
||||
):
|
||||
self._ppage = page_obj
|
||||
self._dp_doc = dp_doc
|
||||
self._page_no = page_no
|
||||
|
||||
self._create_words = create_words
|
||||
self._create_textlines = create_textlines
|
||||
|
||||
self._keep_chars = keep_chars
|
||||
self._keep_lines = keep_lines
|
||||
self._keep_images = keep_images
|
||||
|
||||
self._dpage: Optional[SegmentedPdfPage] = None
|
||||
self._unloaded = False
|
||||
self.valid = (self._ppage is not None) and (self._dp_doc is not None)
|
||||
@@ -47,9 +55,9 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
||||
|
||||
seg_page = self._dp_doc.get_page(
|
||||
self._page_no + 1,
|
||||
keep_chars=True,
|
||||
keep_lines=True,
|
||||
keep_bitmaps=True,
|
||||
keep_chars=self._keep_chars,
|
||||
keep_lines=self._keep_lines,
|
||||
keep_bitmaps=self._keep_images,
|
||||
create_words=self._create_words,
|
||||
create_textlines=self._create_textlines,
|
||||
enforce_same_font=True,
|
||||
|
||||
Reference in New Issue
Block a user