feat: Introduce plugin support for document conversion

This change enables users to extend the document conversion process with custom logic through plugins.

- Introduced a PluginManager to handle preprocessing and postprocessing plugins in DocumentConverter.
- Updated DocumentConverter to accept and initialize a list of plugins.
- Implemented plugin execution within the document processing pipeline, enabling custom modifications before and after conversion.
- Updated ConversionResult to include metadata about the plugins used during conversion.
- Updated the CLI to accept plugin paths and load them dynamically.
- Expanded documentation with examples for creating and using plugins.
- Added test cases to verify plugin integration and ensure correct execution in various scenarios.

Signed-off-by: Ayoub El Bouchtili <Ayoub.elbouchtili@gmail.com>
This commit is contained in:
Ayoub El Bouchtili
2025-01-18 15:49:51 +01:00
parent 2cbc5ce521
commit 224d633b7e
11 changed files with 665 additions and 6 deletions

View File

@@ -0,0 +1,66 @@
from datetime import datetime
from docling.datamodel.document import InputDocument, ConversionResult
from docling.document_converter import DocumentConverter
from docling.plugins import DoclingPlugin, PluginMetadata
class BasicPlugin(DoclingPlugin):
    """Demonstration plugin that records timestamps and tags converted text.

    ``preprocess`` stores a timestamp in the plugin's own metadata;
    ``postprocess`` prefixes the first text item of the converted document
    with a marker, records what was done, and registers the plugin metadata
    on the conversion result.
    """

    def __init__(self):
        """Register the plugin as ``BasicPlugin`` with empty per-stage metadata."""
        plugin_info = PluginMetadata(
            version="0.1.0",
            description="A basic plugin that adds processing metadata and modifies text after conversion.",
            author="Ayoub EL BOUCHTILI",
            preprocess={},
            postprocess={},
        )
        super().__init__(name="BasicPlugin", metadata=plugin_info)

    def preprocess(self, input_doc: InputDocument) -> InputDocument:
        """Stamp the preprocessing time and hand the input document back.

        An empty ``_plugin_metadata`` dict is attached to ``input_doc`` when
        the attribute is absent. The timestamp itself is stored on the
        plugin's metadata, not on the document.

        Args:
            input_doc: The document about to be converted.

        Returns:
            The same ``input_doc`` instance.
        """
        if not hasattr(input_doc, "_plugin_metadata"):
            input_doc._plugin_metadata = {}

        stamp = datetime.now().isoformat()
        self.metadata.preprocess = {"timestamp": stamp}
        return input_doc

    def postprocess(self, result: ConversionResult) -> ConversionResult:
        """Tag the first text item, then record and publish postprocess metadata.

        Args:
            result: The conversion result produced by the converter.

        Returns:
            The same ``result`` instance, modified in place.
        """
        marker = f"[Processed by {self.name}]"

        # Only the first text item is tagged; documents without text are left untouched.
        document = result.document
        if document and document.texts:
            head = document.texts[0]
            head.text = f"{marker} {head.text}"

        self.metadata.postprocess = {
            "appended_text": marker,
            "timestamp": datetime.now().isoformat(),
        }

        # An entry already registered under this plugin's name is left as it is.
        already_registered = self.name in result.plugins
        if not already_registered:
            result.plugins[self.name] = self.metadata.model_dump()
        return result
def main():
    """Convert the sample DOCX with ``BasicPlugin`` attached and print the outcome."""
    # The converter receives the plugin instance directly in its plugin list.
    converter = DocumentConverter(plugins=[BasicPlugin()])

    sample_path = "./tests/data/docx/word_sample.docx"
    result = converter.convert(sample_path)

    print(f"Conversion completed with status: {result.status}")
    print(f"Plugins metadata: {result.plugins}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,105 @@
from typing import Optional
from datetime import datetime
from docling.datamodel.document import InputDocument, ConversionResult
from docling.plugins import DoclingPlugin, PluginMetadata
from docling_core.types.doc import TextItem, TableItem
class TranslationPlugin(DoclingPlugin):
    """Plugin that translates document text to a target language.

    Translation happens in ``postprocess``: every ``TextItem`` and every cell
    of every ``TableItem`` yielded by ``document.iterate_items()`` has its
    ``text`` replaced by the output of ``translate_text``.
    """

    def __init__(self, target_lang: str, source_lang: Optional[str] = None):
        """Initialize the translation plugin.

        Args:
            target_lang: Target language code (e.g. 'fr' for French).
            source_lang: Optional source language code. If not provided,
                the translation backend is expected to auto-detect it.
        """
        super().__init__(
            name="TranslationPlugin",
            metadata=PluginMetadata(
                version="0.1.0",
                description=f"Translates document text to {target_lang}",
                author="Ayoub EL BOUCHTILI",
                preprocess={},
                postprocess={},
            ),
        )
        self.target_lang = target_lang
        self.source_lang = source_lang

    def translate_text(self, text: str) -> tuple[str, str]:
        """Translate text to the target language.

        This is a placeholder: it returns ``text`` unchanged together with the
        configured source language (or ``"en"`` when none was configured).
        Replace the body with a real backend to get actual translations.

        Args:
            text: Text to translate.

        Returns:
            Tuple of (translated_text, detected_source_lang).
        """
        # IMPLEMENT YOUR TRANSLATION LOGIC HERE
        # FOR EXAMPLE USING GOOGLE TRANSLATE:
        # from googletrans import Translator
        # translator = Translator()
        # if self.source_lang:
        #     result = translator.translate(text, src=self.source_lang, dest=self.target_lang)
        # else:
        #     result = translator.translate(text, dest=self.target_lang)
        # return result.text, result.src
        # END OF PLACEHOLDER IMPLEMENTATION
        return text, self.source_lang or "en"

    def postprocess(self, result: ConversionResult) -> ConversionResult:
        """Translate document text after conversion.

        Text items and table cells are translated in place. Afterwards the
        target language, the detected source languages, and a timestamp are
        stored in the plugin's postprocess metadata, which is registered on
        ``result.plugins`` unless an entry for this plugin already exists.

        Args:
            result: The conversion result whose document should be translated.

        Returns:
            The same ``result`` instance, modified in place.
        """
        document = result.document
        if not document:
            return result

        # Deliberately not gated on ``document.texts``: a document made up
        # solely of tables has no text items but still has cells to translate.
        detected_langs: set[str] = set()

        for item, _level in document.iterate_items():
            if isinstance(item, TextItem):
                targets = (item,)
            elif isinstance(item, TableItem):
                targets = item.data.table_cells
            else:
                continue

            for target in targets:
                # Blank strings (common in table cells) carry nothing to
                # translate, so they are not sent to the backend.
                if not target.text or not target.text.strip():
                    continue
                translated, detected = self.translate_text(target.text)
                target.text = translated
                detected_langs.add(detected)

        self.metadata.postprocess = {
            "target_language": self.target_lang,
            # Sorted so the metadata is stable across runs (set order is not).
            "source_languages": sorted(detected_langs),
            "timestamp": datetime.now().isoformat(),
        }

        # An entry already registered under this plugin's name is left as it is.
        if self.name not in result.plugins:
            result.plugins[self.name] = self.metadata.model_dump()

        return result
def main():
    """Convert the sample DOCX with a French ``TranslationPlugin`` and print the outcome."""
    # Imported here, as in the original example, so the module itself only
    # needs the plugin base classes at import time.
    from docling.document_converter import DocumentConverter

    to_french = TranslationPlugin(target_lang="fr")
    converter = DocumentConverter(plugins=[to_french])

    sample_path = "./tests/data/docx/word_sample.docx"
    result = converter.convert(sample_path)

    print(f"Conversion completed with status: {result.status}")
    print(f"Plugin metadata: {result.plugins}")


if __name__ == "__main__":
    main()