### Document Loading using Docling & adding Table and Figure Descriptions

###### This notebook generates descriptions (enrichments) for tables and images and inserts them back into the Markdown, to create more meaningful search indexes and to reduce false positives in semantic searches over tables and images

In [2]:
"""
This code loads environment variables from a `.env` file using the `dotenv` library and reads the Azure OpenAI settings (endpoint, API key, deployment name, API version) used by the rest of the notebook.
The `.env` file is expected to be in the same directory as this notebook.
"""

import os
from dotenv import load_dotenv
from openai import AzureOpenAI

# Pull variables from the .env file into the process environment.
load_dotenv()

# Azure OpenAI connection settings; each one is None when its variable is unset.
aoai_api_base = os.environ.get("AZURE_OPENAI_ENDPOINT")
aoai_api_key = os.environ.get("AZURE_OPENAI_API_KEY")
aoai_deployment_name = os.environ.get("AZURE_OPENAI_DEPLOYMENT_NAME")
aoai_api_version = os.environ.get("AZURE_OPENAI_API_VERSION")

In [3]:
import base64
from mimetypes import guess_type

# Function to encode a local image into data URL 
def local_image_to_data_url(image_path):
    """
    Encode a local image file as a base64 `data:` URL.

    Parameters:
    - image_path: Path of the file to read; its extension is used to guess the MIME type.

    Returns:
    - str: A string of the form `data:<mime>;base64,<payload>`. When the MIME
      type cannot be guessed, `application/octet-stream` is used.
    """
    # Only the type is needed; the encoding half of the guess is ignored.
    detected_type = guess_type(image_path)[0]
    mime_type = 'application/octet-stream' if detected_type is None else detected_type

    # Read the raw bytes first, then encode once the file is closed.
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_payload = base64.b64encode(raw_bytes).decode('utf-8')

    return f"data:{mime_type};base64,{encoded_payload}"


##### Use Azure OpenAI (GPT-4V model) to understand the semantics of the figure content

In [52]:
# Upper bound on the number of tokens the model may generate per description.
MAX_TOKENS = 2000

def understand_image_with_gptv(api_base, api_key, deployment_name, api_version, image_path):
    """
    Generates an English description for an image using the GPT-4V model.

    The image file is embedded in the request as a base64 data URL (built by
    `local_image_to_data_url`) and sent in a single chat-completion call.

    Parameters:
    - api_base (str): The base URL of the API.
    - api_key (str): The API key for authentication.
    - deployment_name (str): The name of the deployment.
    - api_version (str): The version of the API.
    - image_path (str or path-like): The path to the image file.

    Returns:
    - img_description (str): The generated description for the image, or an
      empty string when the response carries no text content.
    """
    client = AzureOpenAI(
        api_key=api_key,
        api_version=api_version,
        base_url=f"{api_base}/openai/deployments/{deployment_name}"
    )

    data_url = local_image_to_data_url(image_path)

    response = client.chat.completions.create(
        model=deployment_name,
        messages=[
            { "role": "system", "content": "You are a helpful assistant." },
            { "role": "user", "content": [
                {
                    "type": "text",
                    "text": "Describe this image in English:"
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": data_url
                    }
                }
            ] }
        ],
        max_tokens=MAX_TOKENS
    )

    # The message content is nullable in the API response; normalise to ""
    # so the documented `str` return type always holds.
    img_description = response.choices[0].message.content or ""

    return img_description

##### Use Azure OpenAI (GPT-4V model) to understand the semantics of the table content (Abstractions of the table content and images)

In [53]:
# Upper bound on the number of tokens the model may generate per description.
MAX_TOKENS = 2000

def understand_table_with_gptv(api_base, api_key, deployment_name, api_version, image_path):
    """
    Generates an English description for a table image using the GPT-4V model.

    The table image file is embedded in the request as a base64 data URL
    (built by `local_image_to_data_url`) and sent in a single chat-completion
    call whose prompt asks for a description of a table image.

    Parameters:
    - api_base (str): The base URL of the API.
    - api_key (str): The API key for authentication.
    - deployment_name (str): The name of the deployment.
    - api_version (str): The version of the API.
    - image_path (str or path-like): The path to the table image file.

    Returns:
    - img_description (str): The generated description for the table image, or
      an empty string when the response carries no text content.
    """
    client = AzureOpenAI(
        api_key=api_key,
        api_version=api_version,
        base_url=f"{api_base}/openai/deployments/{deployment_name}"
    )

    data_url = local_image_to_data_url(image_path)

    response = client.chat.completions.create(
        model=deployment_name,
        messages=[
            { "role": "system", "content": "You are a helpful assistant." },
            { "role": "user", "content": [
                {
                    "type": "text",
                    "text": "Describe this Table image in English:"
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": data_url
                    }
                }
            ] }
        ],
        max_tokens=MAX_TOKENS
    )

    # The message content is nullable in the API response; normalise to ""
    # so the documented `str` return type always holds.
    img_description = response.choices[0].message.content or ""

    return img_description

##### Update the Markdown with the image abstractions generated earlier; each update is placed exactly above its respective image in the Markdown

In [54]:
def update_figure_description(md_content, img_description, idx):
    """
    Updates the figure description in the Markdown content.

    Finds the first occurrence of a start marker, then the next end marker
    after it, and replaces the text between the two with a replacement string.
    Both markers themselves are kept. If either marker is missing, the content
    is returned unchanged.

    NOTE(review): as written here, the three literals below are empty strings,
    so neither `img_description` nor `idx` is interpolated anywhere. This looks
    like markup that was lost when the notebook was exported -- confirm the
    intended marker text against the original notebook.

    Args:
        md_content (str): The original Markdown content.
        img_description (str): The new description for the image.
        idx (int): The index of the figure.

    Returns:
        str: The updated Markdown content with the new figure description.
    """

    # The substring you're looking for
    start_substring = f""
    end_substring = ""
    new_string = f""

    # Default result when no replacement takes place.
    new_md_content = md_content
    # Find the start and end indices of the part to replace
    start_index = md_content.find(start_substring)
    if start_index != -1: # if start_substring is found
        start_index += len(start_substring) # move the index to the end of start_substring
        # Search for the end marker only after the start marker.
        end_index = md_content.find(end_substring, start_index)
        if end_index != -1: # if end_substring is found
            # Replace the old string with the new string
            new_md_content = md_content[:start_index] + new_string + md_content[end_index:]

    return new_md_content


##### Update the Markdown with the table abstractions generated earlier; each update is placed exactly above its respective table in the Markdown

In [55]:
def update_table_description(md_content, img_description, idx):
    """
    Inserts a table description after a table marker in the Markdown content.

    Every occurrence of `start_substring` in `md_content` is replaced by the
    marker itself, a newline, and `new_string` (`str.replace` rewrites all
    occurrences, not only the first).

    NOTE(review): as written here, both literals below are empty strings, so
    neither `img_description` nor `idx` is interpolated anywhere. This looks
    like markup that was lost when the notebook was exported -- confirm the
    intended marker text against the original notebook.

    Args:
        md_content (str): The original Markdown content.
        img_description (str): The new description for the table.
        idx (int): The index of the table.

    Returns:
        str: The updated Markdown content with the new table description.
    """

    # The substring you're looking for
    start_substring = f""
    new_string = f""

    new_md_content = md_content
    # Keep the marker and append the description on the following line.
    new_md_content = new_md_content.replace(start_substring,f'{start_substring}\n{new_string}')

    return new_md_content


##### Tagging figures and tables with their order - which makes traceability and explainability easier

In [56]:
import re
def add_figures_indexes(string):
    """
    Replaces every match of the figure pattern in `string` with a replacement block.

    Matches are located on the original `string`; each one is spliced out of
    `updated_string` and the running `offset` compensates for the length
    changes made by earlier replacements.

    NOTE(review): the regex and the replacement literal below appear to have
    lost their markup during export (the replacement contains only newlines and
    does not use `index`) -- confirm against the original notebook.

    NOTE(review): `offset` is adjusted by `len(match.group(1))`, while the span
    that is actually replaced is the whole match (`match.start()` to
    `match.end()`). The two agree only when group 1 covers the entire match;
    otherwise later replacements would land at shifted positions -- verify.

    Args:
        string (str): The Markdown content to process.

    Returns:
        str: The content with each match replaced.
    """
    figures = re.finditer(r"(.*?)", string, re.DOTALL)
    updated_string = string
    offset = 0
    for index, match in enumerate(figures):
        full_match = match.group(1) # text captured by the pattern's only group
        replacement = f"\n\n\n\n"
        # Translate positions in the original string to positions in the edited one.
        start, end = match.start() + offset, match.end() + offset
        updated_string = updated_string[:start] + replacement + updated_string[end:]
        offset += len(replacement) - len(full_match)

    return updated_string


In [57]:
import re

def add_tables_indexes(content):
    """
    Prefixes every Markdown table found in `content` with a replacement header.

    Tables are located on the original `content`; for each one, the text of the
    "table" group is replaced in `updated_content` by a prefix followed by the
    same table text, and the running `offset` compensates for the growth caused
    by earlier insertions.

    NOTE(review): as written here the pattern contains `(?P` without a
    `<name>` part, which is not valid `re` syntax, while the loop reads a group
    called "table". The group names (and presumably the markup in the
    replacement prefix, which does not use `index`) appear to have been lost
    during export -- restore them from the original notebook.

    Args:
        content (str): The Markdown content to process.

    Returns:
        str: The content with a prefix inserted before each matched table.
    """
    # Match markdown tables using a pattern that looks for at least two pipe-separated rows
    # - one for headers (with optional alignment)
    # - one or more for data
    table_pattern = re.compile(
        r"(?P\n\n|^)(?P(?:\|.*\|\n)+\|(?: *[-:]+[-| :]*?)\|\n(?:\|.*\|\n?)+)",
        re.MULTILINE
    )

    updated_content = content
    offset = 0

    for index, match in enumerate(table_pattern.finditer(content)):
        # Translate positions in the original content to positions in the edited one.
        start, end = match.start("table") + offset, match.end("table") + offset
        replacement = f"\n" + match.group("table")
        updated_content = updated_content[:start] + replacement + updated_content[end:]
        # Only the inserted prefix changes the length; the table text is kept.
        offset += len(replacement) - len(match.group("table"))

    return updated_content


##### docling reader - calls the above functions and generates the markdowns

In [58]:
from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
# Scale factor assigned to the pipeline's `images_scale` option.
IMAGE_RESOLUTION_SCALE = 2.0

def docling_reader(input_file_path):
    """
    Convert a PDF with docling and write Markdown enriched with table/figure descriptions.

    Steps, in order:
    1. Convert the file with page, table and picture image generation enabled.
    2. Export the document to Markdown and save it as
       `Output_Markdowns/<stem>/md_content_<stem>.md`.
    3. Run `add_figures_indexes` and `add_tables_indexes` on the Markdown.
    4. Save one PNG per page.
    5. For every table and picture item, save its PNG, request a description
       from Azure OpenAI, and merge it into the Markdown through
       `update_table_description` / `update_figure_description`.
    6. Save the result as `md_content_tf_description_<stem>.md`.

    The Azure OpenAI settings are read from the module-level `aoai_*` variables.

    Parameters:
    - input_file_path: Path of the PDF to process.

    Returns:
    - str: The enriched Markdown content.
    """
    source_path = Path(f"{input_file_path}")

    pdf_options = PdfPipelineOptions()
    pdf_options.images_scale = IMAGE_RESOLUTION_SCALE
    pdf_options.generate_page_images = True
    pdf_options.generate_table_images = True
    pdf_options.generate_picture_images = True

    converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)}
    )

    conv_res = converter.convert(source_path)
    md_content = conv_res.document.export_to_markdown(image_placeholder='',page_break_placeholder='')

    doc_filename = conv_res.input.file.stem
    output_dir = Path(f"Output_Markdowns/{doc_filename}")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Keep a copy of the Markdown before any enrichment is applied.
    plain_md_path = output_dir / f"md_content_{doc_filename}.md"
    with plain_md_path.open("w", encoding="utf-8") as fp:
        fp.write(md_content)

    md_content = add_tables_indexes(add_figures_indexes(md_content))

    # Save page images
    for page in conv_res.document.pages.values():
        page_png_path = output_dir / f"{doc_filename}-{page.page_no}.png"
        with page_png_path.open("wb") as fp:
            page.image.pil_image.save(fp, format="PNG")

    # Connection settings shared by both description calls below.
    aoai_settings = (aoai_api_base, aoai_api_key, aoai_deployment_name, aoai_api_version)

    # Save images of figures and tables
    table_counter = 0
    picture_counter = 0
    for element, _level in conv_res.document.iterate_items():
        if isinstance(element, TableItem):
            table_png_path = output_dir / f"{doc_filename}-table-{table_counter}.png"
            with table_png_path.open("wb") as fp:
                element.image.pil_image.save(fp, "PNG")
            table_desc = understand_table_with_gptv(*aoai_settings, table_png_path)
            md_content = update_table_description(md_content, table_desc, table_counter)
            table_counter += 1

        if isinstance(element, PictureItem):
            picture_png_path = output_dir / f"{doc_filename}-picture-{picture_counter}.png"
            with picture_png_path.open("wb") as fp:
                element.image.pil_image.save(fp, "PNG")
            picture_desc = understand_image_with_gptv(*aoai_settings, picture_png_path)
            md_content = update_figure_description(md_content, picture_desc, picture_counter)
            picture_counter += 1

    # Save the Markdown with the table and figure descriptions merged in
    enriched_md_path = output_dir / f"md_content_tf_description_{doc_filename}.md"
    with enriched_md_path.open("w", encoding="utf-8") as fp:
        fp.write(md_content)
    return md_content

In [59]:
if __name__ == "__main__":
    # Datasheets to process, resolved relative to the working directory.
    input_file_paths = [
        "Datasheet_AR71UW2SW.pdf",
        "Datasheet_EOE8P39X.pdf",
        "Datasheet_ESS47400SX.pdf",
        "Datasheet_EWUS052B5B.pdf",
    ]
    # Process sequentially and report each file as it finishes.
    for file in input_file_paths:
        docling_reader(file)
        print(f'Completed process for {file}')

Completed process for Datasheet_AR71UW2SW.pdf
Completed process for Datasheet_EOE8P39X.pdf
Completed process for Datasheet_ESS47400SX.pdf
Completed process for Datasheet_EWUS052B5B.pdf
