Heuristic updates: disable the average-token-length penalty

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-04-22 08:48:22 +02:00
parent 16c90b64f5
commit 75b284f35a

View File

@@ -125,8 +125,8 @@ class PagePreprocessingModel(BasePageModel):
penalty += 0.1 * len(frag_matches) penalty += 0.1 * len(frag_matches)
# Additional heuristic: if the average token length is below 2, add a penalty. # Additional heuristic: if the average token length is below 2, add a penalty.
tokens = text.split() # tokens = text.split()
if tokens and (sum(map(len, tokens)) / len(tokens)) < 2: # if tokens and (sum(map(len, tokens)) / len(tokens)) < 2:
penalty += 0.2 # penalty += 0.2
return max(1.0 - penalty, 0.0) return max(1.0 - penalty, 0.0)