mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 07:22:14 +00:00
This change enables users to extend the document conversion process with custom logic through plugins. - Introduced a PluginManager to handle preprocessing and postprocessing plugins in DocumentConverter. - Updated DocumentConverter to accept and initialize a list of plugins. - Implemented plugin execution within the document processing pipeline, enabling custom modifications before and after conversion. - Updated ConversionResult to include metadata about the plugins used during conversion. - Updated the CLI to accept plugin paths and load them dynamically. - Expanded documentation with examples for creating and using plugins. - Added test cases to verify plugin integration and ensure correct execution in various scenarios. Signed-off-by: Ayoub El Bouchtili <Ayoub.elbouchtili@gmail.com>
106 lines
3.8 KiB
Python
106 lines
3.8 KiB
Python
from typing import Optional
|
|
from datetime import datetime
|
|
|
|
from docling.datamodel.document import InputDocument, ConversionResult
|
|
from docling.plugins import DoclingPlugin, PluginMetadata
|
|
from docling_core.types.doc import TextItem, TableItem
|
|
|
|
class TranslationPlugin(DoclingPlugin):
    """Plugin that translates document text to a target language.

    Translation is delegated to :meth:`translate_text`, which is shipped as a
    pass-through placeholder and is meant to be replaced with a real
    translation backend.
    """

    def __init__(self, target_lang: str, source_lang: Optional[str] = None):
        """Initialize the translation plugin.

        Args:
            target_lang: Target language code (e.g. 'fr' for French)
            source_lang: Optional source language code. If not provided,
                it is expected to be auto-detected during translation
                (the placeholder implementation reports "en" instead).
        """
        super().__init__(
            name="TranslationPlugin",
            metadata=PluginMetadata(
                version="0.1.0",
                description=f"Translates document text to {target_lang}",
                author="Ayoub EL BOUCHTILI",
                preprocess={},
                postprocess={},
            ),
        )
        self.target_lang = target_lang
        self.source_lang = source_lang

    def translate_text(self, text: str) -> tuple[str, str]:
        """Translate text to target language.

        This is a placeholder: it returns ``text`` unchanged, paired with
        ``self.source_lang`` (or ``"en"`` when no source language was set).

        Args:
            text: Text to translate

        Returns:
            Tuple of (translated_text, detected_source_lang)
        """
        # IMPLEMENT YOUR TRANSLATION LOGIC HERE
        # For example, using Google Translate:
        #
        #     from googletrans import Translator
        #     translator = Translator()
        #     if self.source_lang:
        #         result = translator.translate(
        #             text, src=self.source_lang, dest=self.target_lang
        #         )
        #     else:
        #         result = translator.translate(text, dest=self.target_lang)
        #     return result.text, result.src
        #
        # END OF PLACEHOLDER IMPLEMENTATION
        return text, self.source_lang or "en"

    def _translate_in_place(self, holder, detected_langs: set) -> None:
        """Replace ``holder.text`` with its translation and record the source language."""
        translated, detected = self.translate_text(holder.text)
        holder.text = translated
        detected_langs.add(detected)

    def postprocess(self, result: ConversionResult) -> ConversionResult:
        """Translate document text after conversion.

        Every ``TextItem`` and every cell of every ``TableItem`` yielded by
        ``result.document.iterate_items()`` is translated in place. When at
        least one string was translated, ``self.metadata.postprocess`` is
        updated with the target language, the detected source languages and a
        timestamp. The plugin metadata is then registered under ``self.name``
        in ``result.plugins`` unless an entry with that name already exists.

        Args:
            result: Conversion result whose document should be translated.

        Returns:
            The same ``result`` object, modified in place.
        """
        if result.document:
            detected_langs = set()

            # Do not gate on ``document.texts``: a document made only of
            # tables still has cells that need translating.
            for item, _level in result.document.iterate_items():
                if isinstance(item, TextItem):
                    self._translate_in_place(item, detected_langs)
                elif isinstance(item, TableItem):
                    for cell in item.data.table_cells:
                        self._translate_in_place(cell, detected_langs)

            # Record translation metadata only if something was translated.
            if detected_langs:
                self.metadata.postprocess = {
                    "target_language": self.target_lang,
                    # Sorted so the metadata is reproducible across runs
                    # (set iteration order is not).
                    "source_languages": sorted(detected_langs),
                    # Timezone-aware local time, so the value is unambiguous.
                    "timestamp": datetime.now().astimezone().isoformat(),
                }

        # Add plugin metadata to result
        if self.name not in result.plugins:
            result.plugins[self.name] = self.metadata.model_dump()

        return result
|
|
|
|
def main():
    """Convert a sample DOCX with a French TranslationPlugin and print the outcome.

    Prints the conversion status followed by the plugin metadata stored on
    the conversion result.
    """
    # The converter is imported inside the function, as in the original example.
    from docling.document_converter import DocumentConverter

    # Build the converter with a single French-targeting translation plugin.
    french_plugin = TranslationPlugin(target_lang="fr")
    doc_converter = DocumentConverter(plugins=[french_plugin])

    # Run the conversion on the sample document and report what happened.
    result = doc_converter.convert("./tests/data/docx/word_sample.docx")
    print(f"Conversion completed with status: {result.status}")
    print(f"Plugin metadata: {result.plugins}")
|
|
|
|
# Run the example conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()
|