mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Heuristic updates
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
16c90b64f5
commit
75b284f35a
@ -125,8 +125,8 @@ class PagePreprocessingModel(BasePageModel):
|
|||||||
penalty += 0.1 * len(frag_matches)
|
penalty += 0.1 * len(frag_matches)
|
||||||
|
|
||||||
# Additional heuristic: if the average token length is below 2, add a penalty.
|
# Additional heuristic: if the average token length is below 2, add a penalty.
|
||||||
tokens = text.split()
|
# tokens = text.split()
|
||||||
if tokens and (sum(map(len, tokens)) / len(tokens)) < 2:
|
# if tokens and (sum(map(len, tokens)) / len(tokens)) < 2:
|
||||||
penalty += 0.2
|
# penalty += 0.2
|
||||||
|
|
||||||
return max(1.0 - penalty, 0.0)
|
return max(1.0 - penalty, 0.0)
|
||||||
|
Loading…
Reference in New Issue
Block a user