Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(plugin): Introduce plugin support for document conversion #772

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions docling/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
from docling.plugins import DoclingPlugin

warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
Expand Down Expand Up @@ -146,6 +147,26 @@ def _split_list(raw: Optional[str]) -> Optional[List[str]]:
return re.split(r"[;,]", raw)


def _load_plugin(plugin_spec: str) -> DoclingPlugin:
    """Load and instantiate a plugin from a module path specification.

    Format: 'module.path:PluginClass'
    Example: 'myapp.plugins:CustomPlugin'

    Args:
        plugin_spec: Import path of the module and the name of the plugin
            class inside it, separated by a single colon.

    Returns:
        An instance of the plugin class, created by calling it with no
        arguments.

    Raises:
        typer.Abort: If the spec is malformed, the module or class cannot be
            resolved, the class is not a DoclingPlugin subclass, or
            instantiation fails. The reason is printed to ``err_console``
            and the original exception is chained as the cause.
    """
    try:
        module_path, separator, class_name = plugin_spec.partition(":")
        # Reject a missing colon, empty halves and extra colons up front so
        # the user sees the expected format rather than an unpacking error.
        if (
            not separator
            or not module_path
            or not class_name
            or ":" in class_name
        ):
            raise ValueError(
                "Plugin spec must be in the format 'module.path:PluginClass'"
            )

        module = importlib.import_module(module_path)
        plugin_class = getattr(module, class_name)

        # issubclass() raises TypeError for non-class objects, so make sure
        # the attribute is a class before checking its ancestry.
        if not (
            isinstance(plugin_class, type)
            and issubclass(plugin_class, DoclingPlugin)
        ):
            raise ValueError(f"Class {class_name} is not a DoclingPlugin subclass")

        return plugin_class()
    except Exception as e:
        # CLI boundary: report any failure uniformly and abort the command.
        err_console.print(f"[red]Error loading plugin {plugin_spec}: {str(e)}[/red]")
        raise typer.Abort() from e


@app.command(no_args_is_help=True)
def convert(
input_sources: Annotated[
Expand Down Expand Up @@ -268,6 +289,14 @@ def convert(
device: Annotated[
AcceleratorDevice, typer.Option(..., help="Accelerator device")
] = AcceleratorDevice.AUTO,
plugins: Annotated[
Optional[List[str]],
typer.Option(
None,
"--plugin", "-p",
help="Names of plugins to use during conversion. Must be in the format 'module.path:PluginClass'",
),
] = None,
):
if verbose == 0:
logging.basicConfig(level=logging.WARNING)
Expand Down Expand Up @@ -394,9 +423,23 @@ def convert(
InputFormat.PDF: pdf_format_option,
InputFormat.IMAGE: pdf_format_option,
}

loaded_plugins = []
if plugins:
for plugin_spec in plugins:
try:
plugin = _load_plugin(plugin_spec)
loaded_plugins.append(plugin)
except Exception as e:
if abort_on_error:
raise
_log.warning(f"Failed to load plugin {plugin_spec}: {e}")
continue

doc_converter = DocumentConverter(
allowed_formats=from_formats,
format_options=format_options,
plugins=loaded_plugins,
)

start_time = time.time()
Expand Down
8 changes: 6 additions & 2 deletions docling/datamodel/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pathlib import Path, PurePath
from typing import (
TYPE_CHECKING,
Any,
Dict,
Iterable,
List,
Expand Down Expand Up @@ -44,7 +45,7 @@
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
from docling_core.utils.file import resolve_source_to_stream
from docling_core.utils.legacy import docling_document_to_legacy
from pydantic import BaseModel
from pydantic import BaseModel, Field
from typing_extensions import deprecated

from docling.backend.abstract_backend import (
Expand Down Expand Up @@ -104,7 +105,7 @@ class InputDocument(BaseModel):

filesize: Optional[int] = None
page_count: int = 0

_backend: AbstractDocumentBackend # Internal PDF backend used

def __init__(
Expand Down Expand Up @@ -198,6 +199,9 @@ class ConversionResult(BaseModel):
timings: Dict[str, ProfilingItem] = {}

document: DoclingDocument = _EMPTY_DOCLING_DOC

# Metadata object for tracking plugins details and pre/post processing related data
plugins: Dict[str, Any] = Field(default_factory=dict)

@property
@deprecated("Use document instead.")
Expand Down
29 changes: 25 additions & 4 deletions docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.utils.utils import chunkify
from docling.plugins import PluginManager, DoclingPlugin

_log = logging.getLogger(__name__)

Expand Down Expand Up @@ -148,8 +149,9 @@ class DocumentConverter:

def __init__(
self,
allowed_formats: Optional[List[InputFormat]] = None,
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
allowed_formats: Optional[List[InputFormat]] = None,
plugins: Optional[List[DoclingPlugin]] = None,
):
self.allowed_formats = (
allowed_formats if allowed_formats is not None else [e for e in InputFormat]
Expand All @@ -163,6 +165,11 @@ def __init__(
for format in self.allowed_formats
}
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
self.plugin_manager = PluginManager()

if plugins:
for plugin in plugins:
self.plugin_manager.register_plugin(plugin)

def initialize_pipeline(self, format: InputFormat):
"""Initialize the conversion pipeline for the selected format."""
Expand Down Expand Up @@ -280,6 +287,14 @@ def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
def _process_document(
self, in_doc: InputDocument, raises_on_error: bool
) -> ConversionResult:
conv_res = ConversionResult(input=in_doc)

try:
in_doc = self.plugin_manager.execute_preprocessors(in_doc)
except Exception as e:
if raises_on_error:
raise e
_log.error(f"Plugin preprocessing failed: {str(e)}")

valid = (
self.allowed_formats is not None and in_doc.format in self.allowed_formats
Expand All @@ -296,9 +311,15 @@ def _process_document(
module_name="",
error_message=error_message,
)
conv_res = ConversionResult(
input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
)
conv_res.status = ConversionStatus.SKIPPED
conv_res.errors.append(error_item)

try:
conv_res = self.plugin_manager.execute_postprocessors(conv_res)
except Exception as e:
if raises_on_error:
raise e
_log.error(f"Plugin postprocessing failed: {str(e)}")

return conv_res

Expand Down
7 changes: 7 additions & 0 deletions docling/plugins/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""Docling plugin system for extending document processing capabilities."""

from .base import DoclingPlugin
from .manager import PluginManager
from .models import PluginMetadata

# Names re-exported as this package's public API.
__all__ = ["DoclingPlugin", "PluginManager", "PluginMetadata"]
20 changes: 20 additions & 0 deletions docling/plugins/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""Base plugin class for Docling plugins."""

from docling.datamodel.document import InputDocument, ConversionResult
from docling.plugins.models import PluginMetadata

class DoclingPlugin:
    """Common base for Docling plugins.

    Both processing hooks defined here are pass-throughs; a concrete plugin
    overrides ``preprocess`` and/or ``postprocess`` to do actual work.
    """

    def __init__(self, name: str, metadata: PluginMetadata):
        """Store the plugin's name and its descriptive metadata.

        Args:
            name: Name the plugin is known by.
            metadata: Metadata describing the plugin.
        """
        self.name, self.metadata = name, metadata

    def preprocess(self, input_doc: InputDocument) -> InputDocument:
        """Hook applied to an input document; the base version is a no-op.

        Args:
            input_doc: The document about to be processed.

        Returns:
            The very same ``input_doc`` object, untouched.
        """
        return input_doc

    def postprocess(self, result: ConversionResult) -> ConversionResult:
        """Hook applied to a conversion result; the base version is a no-op.

        Args:
            result: The conversion result to post-process.

        Returns:
            The very same ``result`` object, untouched.
        """
        return result
90 changes: 90 additions & 0 deletions docling/plugins/manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
"""Plugin manager for Docling plugins."""

import re
from typing import List, Dict
from docling.datamodel.document import InputDocument, ConversionResult
from docling.plugins.base import DoclingPlugin
from docling.plugins.models import PluginMetadata

class PluginManager:
    """Manages the registration and execution of Docling plugins.

    Registered plugins are kept in registration order. Every plugin is added
    to both the preprocessor and the postprocessor chain; a hook the plugin
    does not override falls back to the pass-through on ``DoclingPlugin``.
    """

    # A valid name starts with a letter, followed by letters, digits,
    # underscores or hyphens.
    NAME_PATTERN = re.compile(r'^[a-zA-Z][a-zA-Z0-9_-]*$')

    def __init__(self):
        """Create a manager with no plugins registered."""
        self.preprocessors: List[DoclingPlugin] = []
        self.postprocessors: List[DoclingPlugin] = []
        # Maps each registered plugin name to its instance; used to reject
        # duplicate names.
        self._registered_names: Dict[str, DoclingPlugin] = {}

    def _validate_plugin_name(self, name: str) -> None:
        """Validate plugin name format.

        Raises:
            ValueError: If the name is not a string, is empty or whitespace,
                or does not match ``NAME_PATTERN``.
        """
        # Guard against non-string names so callers get the documented
        # ValueError rather than an AttributeError/TypeError further down.
        if not isinstance(name, str):
            raise ValueError("Plugin name must be a string")
        if not name or name.isspace():
            raise ValueError("Plugin name cannot be empty or whitespace")
        if not self.NAME_PATTERN.match(name):
            raise ValueError(
                "Plugin name must start with a letter and contain only "
                "letters, numbers, underscores, or hyphens"
            )

    @staticmethod
    def _overrides_hook(plugin: DoclingPlugin, hook_name: str) -> bool:
        """Return True if ``plugin`` provides its own ``hook_name`` implementation."""
        hook = getattr(plugin, hook_name)
        base_hook = getattr(DoclingPlugin, hook_name)
        # Bound methods expose the underlying function as __func__; plain
        # functions (e.g. a staticmethod override) are compared directly.
        return getattr(hook, "__func__", hook) is not base_hook

    def _validate_plugin(self, plugin: DoclingPlugin) -> None:
        """Validate all aspects of a plugin.

        Checks the plugin type, its name (format and uniqueness), its
        metadata, and that at least one processing hook is overridden.

        Raises:
            ValueError: If any of the checks fails.
        """
        if not isinstance(plugin, DoclingPlugin):
            raise ValueError(f"Plugin must be an instance of DoclingPlugin, got {type(plugin)}")

        self._validate_plugin_name(plugin.name)

        if plugin.name in self._registered_names:
            raise ValueError(f"A plugin with name '{plugin.name}' is already registered")

        if not plugin.metadata:
            raise ValueError(f"Plugin '{plugin.name}' must have metadata")

        # Validate metadata against PluginMetadata model
        try:
            # Convert metadata to dict if it's already a PluginMetadata instance
            metadata_dict = (
                plugin.metadata.model_dump()
                if isinstance(plugin.metadata, PluginMetadata)
                else plugin.metadata
            )
            PluginMetadata(**metadata_dict)
        except Exception as e:
            raise ValueError(f"Invalid metadata for plugin '{plugin.name}': {str(e)}") from e

        # Check if the plugin implements at least one of the processing steps
        if not (
            self._overrides_hook(plugin, "preprocess")
            or self._overrides_hook(plugin, "postprocess")
        ):
            raise ValueError(
                f"Plugin '{plugin.name}' must implement at least a preprocessing or postprocessing step"
            )

    def register_plugin(self, plugin: DoclingPlugin) -> None:
        """Register a plugin after validating it.

        Raises:
            ValueError: If ``plugin`` is None or fails validation.
        """
        if plugin is None:
            raise ValueError("Plugin cannot be None")

        self._validate_plugin(plugin)
        self._registered_names[plugin.name] = plugin
        self.preprocessors.append(plugin)
        self.postprocessors.append(plugin)

    def _execute_plugins(self, items: List[DoclingPlugin], data, stage: str = ""):
        """Execute a sequence of plugins, feeding each one the previous output.

        Args:
            items: Plugins to run, in order.
            data: Initial value handed to the first plugin.
            stage: ``"preprocessor"`` or ``"postprocessor"``. When left
                empty, the stage is inferred once from the type of ``data``.

        Returns:
            The value returned by the last plugin, or ``data`` unchanged if
            ``items`` is empty.

        Raises:
            RuntimeError: If a plugin hook raises; the original exception is
                chained as the cause.
        """
        # Fix the stage before the loop: choosing the hook from the current
        # value's type would switch hooks mid-chain if a plugin returned an
        # unexpected object.
        if not stage:
            stage = "preprocessor" if isinstance(data, InputDocument) else "postprocessor"
        run_preprocess = stage == "preprocessor"

        for plugin in items:
            try:
                hook = plugin.preprocess if run_preprocess else plugin.postprocess
                data = hook(data)
            except Exception as e:
                raise RuntimeError(f"Error in {stage} {plugin.__class__.__name__}: {str(e)}") from e
        return data

    def execute_preprocessors(self, input_doc: InputDocument) -> InputDocument:
        """Execute all preprocessors.

        Raises:
            ValueError: If ``input_doc`` is None.
            RuntimeError: If a plugin's ``preprocess`` hook fails.
        """
        if input_doc is None:
            raise ValueError("Input document cannot be None")
        return self._execute_plugins(self.preprocessors, input_doc, "preprocessor")

    def execute_postprocessors(self, result: ConversionResult) -> ConversionResult:
        """Execute all postprocessors.

        Raises:
            ValueError: If ``result`` is None.
            RuntimeError: If a plugin's ``postprocess`` hook fails.
        """
        if result is None:
            raise ValueError("Conversion result cannot be None")
        return self._execute_plugins(self.postprocessors, result, "postprocessor")
36 changes: 36 additions & 0 deletions docling/plugins/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""Data models for DoclingPlugin."""

from typing import Dict, Any
from pydantic import BaseModel, Field

class PluginMetadata(BaseModel):
    """Model for plugin metadata validation.

    Attributes:
        version: The plugin version following semantic versioning
        description: A brief description of the plugin's functionality
        author: The plugin author's name
        preprocess: Metadata for preprocessing step
        postprocess: Metadata for postprocessing step
    """

    # NOTE(review): the "" default does not match the pattern below; pydantic
    # presumably skips validation of defaults unless validate_default is
    # enabled -- confirm whether an empty version is meant to be acceptable.
    version: str = Field(
        default="", pattern=r"^\d+\.\d+\.\d+$", description="Plugin version (semantic versioning)"
    )

    # Free-form descriptive fields.
    description: str = Field(default="", description="Brief description of the plugin")
    author: str = Field(default="", description="Plugin author's name")

    # Per-stage metadata; each instance gets its own fresh dict.
    preprocess: Dict[str, Any] = Field(
        default_factory=dict, description="Preprocessing related metadata"
    )
    postprocess: Dict[str, Any] = Field(
        default_factory=dict, description="Postprocessing related metadata"
    )
Loading