From 923f766adad3f9ebdcd5a10fcbf6eb12c782bac0 Mon Sep 17 00:00:00 2001
From: Maksym Lysak <mly@zurich.ibm.com>
Date: Mon, 24 Feb 2025 16:54:59 +0100
Subject: [PATCH] Replaced remaining hard-coded string literals with the appropriate enums

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
---
 docling/pipeline/vlm_pipeline.py | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)
diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index 5e641ddc..6c79860f 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -313,16 +313,28 @@ class VlmPipeline(PaginatedPipeline):
             tokens = [
                 token
                 for token in tokens
-                if not (token.startswith("<loc_") or token in ["<otsl>", "</otsl>"])
-                # if not (token.startswith(DocumentToken.BEG_LOC) or token in [DocumentToken.BEG_OTSL, DocumentToken.END_OTSL])
+                if not (
+                    token.startswith(rf"<{DocumentToken.LOC.value}")
+                    or token
+                    in [
+                        rf"<{DocumentToken.OTSL.value}>",
+                        rf"</{DocumentToken.OTSL.value}>",
+                    ]
+                )
             ]
             # Split the string by those tokens to get the in-between text
             text_parts = re.split(pattern, s)
             text_parts = [
                 token
                 for token in text_parts
-                if not (token.startswith("<loc_") or token in ["<otsl>", "</otsl>"])
-                # if not (token.startswith(DocumentToken.BEG_LOC) or token in [DocumentToken.BEG_OTSL, DocumentToken.END_OTSL])
+                if not (
+                    token.startswith(rf"<{DocumentToken.LOC.value}")
+                    or token
+                    in [
+                        rf"<{DocumentToken.OTSL.value}>",
+                        rf"</{DocumentToken.OTSL.value}>",
+                    ]
+                )
             ]
             # Remove any empty or purely whitespace strings from text_parts
             text_parts = [part for part in text_parts if part.strip()]
@@ -372,8 +384,9 @@ class VlmPipeline(PaginatedPipeline):
                 rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
                 rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
                 rf"{DocItemLabel.LIST_ITEM}|{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
-                rf"{DocItemLabel.SECTION_HEADER}_level_1|otsl)>.*?</(?P=tag)>"
+                rf"{DocItemLabel.SECTION_HEADER}_level_1|{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
             )
+
             # DocumentToken.OTSL
             pattern = re.compile(tag_pattern, re.DOTALL)
 
@@ -390,11 +403,11 @@ class VlmPipeline(PaginatedPipeline):
                 if bbox:
                     bounding_boxes.append((bbox, color))
 
-                if tag_name == "otsl":
+                if tag_name == DocumentToken.OTSL.value:
                     table_data = parse_table_content(full_chunk)
                     doc.add_table(data=table_data)
 
-                elif tag_name == "picture":
+                elif tag_name == DocItemLabel.PICTURE:
                     text_caption_content = extract_inner_text(full_chunk)
                     if image:
                         if bbox: