docling/docs/examples/plugin_translation.py
Ayoub El Bouchtili 224d633b7e feat: Introduce plugin support for document conversion
This change enables users to extend the document conversion process with custom logic through plugins.

- Introduced a PluginManager to handle preprocessing and postprocessing plugins in DocumentConverter.
- Updated DocumentConverter to accept and initialize a list of plugins.
- Implemented plugin execution within the document processing pipeline, enabling custom modifications before and after conversion.
- Updated ConversionResult to include metadata about the plugins used during conversion.
- Updated the CLI to accept plugin paths and load them dynamically.
- Expanded documentation with examples for creating and using plugins.
- Added test cases to verify plugin integration and ensure correct execution in various scenarios.

Signed-off-by: Ayoub El Bouchtili <Ayoub.elbouchtili@gmail.com>
2025-01-20 12:11:05 +01:00

106 lines
3.8 KiB
Python

from typing import Optional
from datetime import datetime
from docling.datamodel.document import InputDocument, ConversionResult
from docling.plugins import DoclingPlugin, PluginMetadata
from docling_core.types.doc import TextItem, TableItem
class TranslationPlugin(DoclingPlugin):
    """Plugin that translates document text to a target language.

    The work happens in :meth:`postprocess`, which rewrites the text of every
    ``TextItem`` and every table cell of the converted document in place.
    The translation backend itself lives in :meth:`translate_text`, which is
    shipped as a pass-through placeholder to be replaced by a real service.
    """

    def __init__(self, target_lang: str, source_lang: Optional[str] = None):
        """Initialize the translation plugin.

        Args:
            target_lang: Target language code (e.g. 'fr' for French).
            source_lang: Optional source language code. If not provided,
                it is left to the translation backend to detect it.
        """
        super().__init__(
            name="TranslationPlugin",
            metadata=PluginMetadata(
                version="0.1.0",
                description=f"Translates document text to {target_lang}",
                author="Ayoub EL BOUCHTILI",
                preprocess={},
                postprocess={},
            ),
        )
        self.target_lang = target_lang
        self.source_lang = source_lang

    def translate_text(self, text: str) -> tuple[str, str]:
        """Translate text to the target language.

        This is a placeholder: it returns ``text`` unchanged and reports
        ``self.source_lang`` (or ``"en"`` when none was configured) as the
        detected language. Replace the body with a real translation call.

        Args:
            text: Text to translate.

        Returns:
            Tuple of (translated_text, detected_source_lang).
        """
        # IMPLEMENT YOUR TRANSLATION LOGIC HERE
        # FOR EXAMPLE USING GOOGLE TRANSLATE:
        # from googletrans import Translator
        # translator = Translator()
        # if self.source_lang:
        #     result = translator.translate(text, src=self.source_lang, dest=self.target_lang)
        # else:
        #     result = translator.translate(text, dest=self.target_lang)
        # return result.text, result.src
        # END OF PLACEHOLDER IMPLEMENTATION
        return text, self.source_lang or "en"

    def _translate_in_place(self, holder, detected_langs: set) -> None:
        """Translate ``holder.text`` in place and record the detected language.

        Empty or whitespace-only text is left untouched: there is nothing to
        translate and no language to detect from it.
        """
        text = holder.text
        if not text or not text.strip():
            return
        translated, detected = self.translate_text(text)
        holder.text = translated
        detected_langs.add(detected)

    def postprocess(self, result: ConversionResult) -> ConversionResult:
        """Translate document text after conversion.

        Every ``TextItem`` and every cell of every ``TableItem`` yielded by
        ``result.document.iterate_items()`` is translated in place. The
        plugin's ``postprocess`` metadata is then updated with the target
        language, the detected source languages and a timestamp, and the
        plugin metadata is stored in ``result.plugins`` under the plugin name
        unless an entry with that name already exists.

        Args:
            result: Conversion result whose document should be translated.

        Returns:
            The same ``result`` object, modified in place.
        """
        document = result.document
        # Only the document itself is required. Checking ``document.texts`` as
        # well would skip documents that contain tables but no text items.
        if not document:
            return result

        detected_langs: set = set()
        for item, _level in document.iterate_items():
            if isinstance(item, TextItem):
                self._translate_in_place(item, detected_langs)
            elif isinstance(item, TableItem):
                # Table text lives in the individual cells, not on the item.
                for cell in item.data.table_cells:
                    self._translate_in_place(cell, detected_langs)

        # Add translation metadata
        self.metadata.postprocess = {
            "target_language": self.target_lang,
            # Sorted so the recorded metadata is deterministic (set order is not).
            "source_languages": sorted(detected_langs),
            # Local time with an explicit UTC offset, so the value is unambiguous.
            "timestamp": datetime.now().astimezone().isoformat(),
        }

        # Add plugin metadata to result
        if self.name not in result.plugins:
            result.plugins[self.name] = self.metadata.model_dump()

        return result
def main():
    """Convert a sample DOCX file with the translation plugin and print the outcome."""
    # Local import: the converter is only needed when the example is run.
    from docling.document_converter import DocumentConverter

    # One plugin instance translating to French, handed to the converter.
    plugins = [TranslationPlugin(target_lang="fr")]
    converter = DocumentConverter(plugins=plugins)

    # Run the conversion on the sample document.
    sample_path = "./tests/data/docx/word_sample.docx"
    result = converter.convert(sample_path)

    # Report the conversion status and the metadata recorded by plugins.
    print(f"Conversion completed with status: {result.status}")
    print(f"Plugin metadata: {result.plugins}")
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()