From 923f766adad3f9ebdcd5a10fcbf6eb12c782bac0 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Mon, 24 Feb 2025 16:54:59 +0100 Subject: [PATCH] Replaced remaining strings to appropriate enums Signed-off-by: Maksym Lysak --- docling/pipeline/vlm_pipeline.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index 5e641ddc..6c79860f 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -313,16 +313,28 @@ class VlmPipeline(PaginatedPipeline): tokens = [ token for token in tokens - if not (token.startswith("", ""]) - # if not (token.startswith(DocumentToken.BEG_LOC) or token in [DocumentToken.BEG_OTSL, DocumentToken.END_OTSL]) + if not ( + token.startswith(rf"<{DocumentToken.LOC.value}") + or token + in [ + rf"<{DocumentToken.OTSL.value}>", + rf"", + ] + ) ] # Split the string by those tokens to get the in-between text text_parts = re.split(pattern, s) text_parts = [ token for token in text_parts - if not (token.startswith("", ""]) - # if not (token.startswith(DocumentToken.BEG_LOC) or token in [DocumentToken.BEG_OTSL, DocumentToken.END_OTSL]) + if not ( + token.startswith(rf"<{DocumentToken.LOC.value}") + or token + in [ + rf"<{DocumentToken.OTSL.value}>", + rf"", + ] + ) ] # Remove any empty or purely whitespace strings from text_parts text_parts = [part for part in text_parts if part.strip()] @@ -372,8 +384,9 @@ class VlmPipeline(PaginatedPipeline): rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|" rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|" rf"{DocItemLabel.LIST_ITEM}|{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|" - rf"{DocItemLabel.SECTION_HEADER}_level_1|otsl)>.*?" + rf"{DocItemLabel.SECTION_HEADER}_level_1|{DocumentToken.OTSL.value})>.*?" ) + # DocumentToken.OTSL pattern = re.compile(tag_pattern, re.DOTALL) @@ -390,11 +403,11 @@ class VlmPipeline(PaginatedPipeline): if bbox: bounding_boxes.append((bbox, color)) - if tag_name == "otsl": + if tag_name == DocumentToken.OTSL.value: table_data = parse_table_content(full_chunk) doc.add_table(data=table_data) - elif tag_name == "picture": + elif tag_name == DocItemLabel.PICTURE: text_caption_content = extract_inner_text(full_chunk) if image: if bbox: