diff --git a/.gitignore b/.gitignore index eeec90ba..e73e8c1e 100644 --- a/.gitignore +++ b/.gitignore @@ -32,4 +32,7 @@ node_modules/ .DS_Store .venv examples/venv/ -error_log.txt \ No newline at end of file +error_log.txt +docs/* +!docs/*.rst +!docs/conf.py diff --git a/README.md b/README.md index 9cba0f27..e258934a 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@

- Open-source DevSecOps for Generative AI Systems.
+ Open-source PII Detection & Anonymization.

diff --git a/datafog/client.py b/datafog/client.py index 42c0ec1e..9ae88012 100644 --- a/datafog/client.py +++ b/datafog/client.py @@ -1,4 +1,8 @@ -# client.py +""" +Client module for DataFog. + +Provides CLI commands for scanning images and text using DataFog's OCR and PII detection capabilities. +""" import asyncio import logging @@ -25,7 +29,18 @@ def scan_image( ), operations: str = typer.Option("annotate_pii", help="Operation to perform"), ): - """Extract text from images.""" + """ + Scan images for text and PII. + + Extracts text from images using OCR, then detects PII entities. + Handles both remote URLs and local file paths. + + Args: + image_urls: List of image URLs or file paths + operations: Pipeline operations to run (default: annotate_pii) + + Prints results or exits with error on failure. + """ if not image_urls: typer.echo("No image URLs or file paths provided. Please provide at least one.") raise typer.Exit(code=1) @@ -48,7 +63,17 @@ def scan_text( ), operations: str = typer.Option("annotate_pii", help="Operation to perform"), ): - """Annotate texts to detect PII entities.""" + """ + Scan texts for PII. + + Detects PII entities in a list of input texts. + + Args: + str_list: List of texts to analyze + operations: Pipeline operations to run (default: annotate_pii) + + Prints results or exits with error on failure. + """ if not str_list: typer.echo("No texts provided.") raise typer.Exit(code=1) @@ -66,19 +91,34 @@ def scan_text( @app.command() def health(): - """Check DataFog service health.""" + """ + Check DataFog service health. + + Prints a message indicating that DataFog is running. + """ typer.echo("DataFog is running.") @app.command() def show_config(): - """Show current configuration.""" + """ + Show current configuration. + + Prints the current DataFog configuration. 
+ """ typer.echo(get_config()) @app.command() def download_model(model_name: str = typer.Argument(..., help="Model to download")): - """Download a model.""" + """ + Download a spaCy model. + + Args: + model_name: Name of the model to download. + + Prints a confirmation message after downloading. + """ SpacyAnnotator.download_model(model_name) typer.echo(f"Model {model_name} downloaded.") @@ -87,21 +127,36 @@ def download_model(model_name: str = typer.Argument(..., help="Model to download def show_spacy_model_directory( model_name: str = typer.Argument(..., help="Model to check") ): - """Show model path.""" + """ + Show the directory path for a spaCy model. + + Args: + model_name: Name of the model to check. + + Prints the directory path of the specified model. + """ annotator = SpacyAnnotator(model_name) typer.echo(annotator.show_model_path()) @app.command() def list_spacy_models(): - """List available models.""" + """ + List available spaCy models. + + Prints a list of all available spaCy models. + """ annotator = SpacyAnnotator() typer.echo(annotator.list_models()) @app.command() def list_entities(): - """List available entities.""" + """ + List available entities. + + Prints a list of all available entities that can be recognized. + """ annotator = SpacyAnnotator() typer.echo(annotator.list_entities()) diff --git a/datafog/config.py b/datafog/config.py index 4c7f9f84..9f2f6df2 100644 --- a/datafog/config.py +++ b/datafog/config.py @@ -1,3 +1,10 @@ +""" +Configuration module for DataFog. + +Defines settings and provides a global config instance. +Includes API keys, URLs, timeouts, and other options. +""" + import os from enum import Enum from typing import Optional @@ -7,7 +14,17 @@ class DataFogConfig(BaseSettings): - """Configuration settings for DataFog SDK""" + """ + Configuration settings for DataFog SDK. + + This class defines all the configuration options used throughout the DataFog SDK. 
+ It includes settings for API authentication, service URLs, timeouts, retries, + rate limiting, and logging. The configuration can be updated at runtime using + environment variables or programmatically via the update method. + + All settings have default values that can be overridden as needed. The class + uses Pydantic for data validation and settings management. + """ # API Keys and Authentication api_key: str = os.environ.get("DATAFOG_API_KEY", "") @@ -60,6 +77,15 @@ def configure(**kwargs): class OperationType(str, Enum): + """ + Enum for supported DataFog operations. + + ANNOTATE_PII: Detect and annotate PII in text + EXTRACT_TEXT: Extract text from images + REDACT_PII: Remove PII from text + ANONYMIZE_PII: Replace PII with fake data + """ + ANNOTATE_PII = "annotate_pii" EXTRACT_TEXT = "extract_text" REDACT_PII = "redact_pii" diff --git a/datafog/exceptions.py b/datafog/exceptions.py index 36014de1..af74432a 100644 --- a/datafog/exceptions.py +++ b/datafog/exceptions.py @@ -1,31 +1,78 @@ -# exceptions.py +""" +Exceptions module for DataFog SDK. + +This module defines custom exceptions and utility functions for error handling in the DataFog SDK. +""" class DataFogException(Exception): - """Base exception for DataFog SDK""" + """ + Base exception for DataFog SDK. + + Attributes: + message (str): The error message. + status_code (int, optional): The HTTP status code associated with the error. + """ def __init__(self, message: str, status_code: int = None): + """ + Initialize a DataFogException. + + Args: + message (str): The error message. + status_code (int, optional): The HTTP status code associated with the error. + """ self.message = message self.status_code = status_code super().__init__(self.message) class BadRequestError(DataFogException): - """Exception raised for 400 Bad Request errors""" + """ + Exception raised for 400 Bad Request errors. + + Inherits from DataFogException and sets the status code to 400. 
+ """ def __init__(self, message: str): + """ + Initialize a BadRequestError. + + Args: + message (str): The error message. + """ super().__init__(message, status_code=400) class UnprocessableEntityError(DataFogException): - """Exception raised for 422 Unprocessable Entity errors""" + """ + Exception raised for 422 Unprocessable Entity errors. + + Inherits from DataFogException and sets the status code to 422. + """ def __init__(self, message: str): + """ + Initialize an UnprocessableEntityError. + + Args: + message (str): The error message. + """ super().__init__(message, status_code=422) def raise_for_status_code(status_code: int, error_message: str): - """Raise the appropriate exception based on the status code""" + """ + Raise the appropriate exception based on the status code. + + Args: + status_code (int): The HTTP status code. + error_message (str): The error message to include in the exception. + + Raises: + BadRequestError: If the status code is 400. + UnprocessableEntityError: If the status code is 422. + """ if status_code == 400: raise BadRequestError(error_message) elif status_code == 422: diff --git a/datafog/main.py b/datafog/main.py index 7c5fcd45..e338c1e7 100644 --- a/datafog/main.py +++ b/datafog/main.py @@ -1,3 +1,14 @@ +""" +Main module for DataFog. + +This module contains the core classes for DataFog: +- DataFog: Main class for running OCR and text processing pipelines. +- TextPIIAnnotator: Class for annotating PII in text. + +These classes provide high-level interfaces for image and text processing, +including OCR, PII detection, and annotation. +""" + import json import logging from typing import List @@ -13,6 +24,18 @@ class DataFog: + """ + Main class for running OCR and text processing pipelines. + + Handles image and text processing operations, including OCR and PII detection. + + Attributes: + image_service: Service for image processing and OCR. + text_service: Service for text processing and annotation. 
+ spark_service: Optional Spark service for distributed processing. + operations: List of operations to perform. + """ + def __init__( self, image_service=ImageService(), @@ -36,7 +59,26 @@ def __init__( self.logger.info(f"Operations: {operations}") async def run_ocr_pipeline(self, image_urls: List[str]): - """Run the OCR pipeline asynchronously on a list of images provided via url.""" + """ + Run the OCR pipeline asynchronously on a list of images provided via URL. + + This method performs optical character recognition (OCR) on the images specified by the URLs. + If PII annotation is enabled, it also annotates the extracted text for personally identifiable information. + + Args: + image_urls (List[str]): A list of URLs pointing to the images to be processed. + + Returns: + List: If PII annotation is enabled, returns a list of annotated text results. + Otherwise, returns a list of extracted text from the images. + + Warning: + Errors are caught rather than raised: on failure the method returns a single-item list of the form ``["Error: <message>"]``. + + Note: + The method logs various stages of the process, including completion of OCR extraction + and text annotation, as well as any errors encountered. + """ try: extracted_text = await self.image_service.ocr_extract(image_urls) self.logger.info(f"OCR extraction completed for {len(image_urls)} images.") @@ -59,7 +101,26 @@ async def run_ocr_pipeline(self, image_urls: List[str]): return [f"Error: {str(e)}"] async def run_text_pipeline(self, str_list: List[str]): - """Run the text pipeline asynchronously on a list of input text.""" + """ + Run the text pipeline asynchronously on a list of input text. + + This method processes a list of text strings, potentially annotating them for personally + identifiable information (PII) if the ANNOTATE_PII operation is enabled. + + Args: + str_list (List[str]): A list of text strings to be processed. + + Returns: + List: If PII annotation is enabled, returns a list of annotated text results. 
+ Otherwise, returns the original list of text strings. + + Raises: + Exception: Any error encountered during the text processing or annotation. + + Note: + The method logs the start of the pipeline, the completion of text annotation if applicable, + and any errors encountered during processing. + """ try: self.logger.info(f"Starting text pipeline with {len(str_list)} texts.") if OperationType.ANNOTATE_PII in self.operations: @@ -78,7 +139,28 @@ async def run_text_pipeline(self, str_list: List[str]): raise def run_text_pipeline_sync(self, str_list: List[str]): - """Run the text pipeline synchronously on a list of input text.""" + """ + Run the text pipeline synchronously on a list of input text. + + This method processes a list of text strings in a synchronous manner, potentially + annotating them for personally identifiable information (PII) if the ANNOTATE_PII + operation is enabled. + + Args: + str_list (List[str]): A list of text strings to be processed. + + Returns: + List: If PII annotation is enabled, returns a list of annotated text results. + Otherwise, returns the original list of text strings. + + Raises: + Exception: Any error encountered during the text processing or annotation. + + Note: + The method logs the start of the pipeline, the completion of text annotation if applicable, + and any errors encountered during processing. This synchronous version may be preferred + for smaller datasets or when immediate results are required. + """ try: self.logger.info(f"Starting text pipeline with {len(str_list)} texts.") if OperationType.ANNOTATE_PII in self.operations: @@ -95,12 +177,37 @@ def run_text_pipeline_sync(self, str_list: List[str]): raise def _add_attributes(self, attributes: dict): - """Add multiple attributes.""" + """ + Add multiple attributes to the DataFog instance. + + This private method allows for the dynamic addition of multiple attributes to the + DataFog instance. 
It iterates through the provided dictionary of attributes, but the loop + body is currently a placeholder (``pass``), so no attribute is actually set. + + Args: + attributes (dict): A dictionary where keys are attribute names and values are + the corresponding attribute values to be added. + + Note: + This method is intended for internal use and may be used for extending the + functionality of the DataFog class dynamically. Care should be taken when + using this method to avoid overwriting existing attributes. + """ for key, value in attributes.items(): pass class TextPIIAnnotator: + """ + Class for annotating PII in text. + + Provides functionality to detect and annotate Personally Identifiable Information (PII) in text. + + Attributes: + text_annotator: SpacyPIIAnnotator instance for text annotation. + spark_processor: Optional SparkService for distributed processing. + """ + def __init__(self): self.text_annotator = SpacyPIIAnnotator.create() self.spark_processor: SparkService = None diff --git a/datafog/models/annotator.py b/datafog/models/annotator.py index bc245116..88703960 100644 --- a/datafog/models/annotator.py +++ b/datafog/models/annotator.py @@ -1,4 +1,10 @@ -# models/annotator.py +""" +Defines data models for annotation requests and results. + +Contains Pydantic models for structuring input, output, and explanations +in the annotation process. Ensures type safety and consistent data handling. +""" + from typing import List, Optional from pydantic import BaseModel, field_validator @@ -7,6 +13,13 @@ class AnnotatorRequest(BaseModel): + """ + Represents an annotation request. + + Contains text to annotate, language, and optional parameters + to customize the annotation process. + """ + text: str language: str correlation_id: Optional[str] @@ -18,6 +31,12 @@ class AnnotatorRequest(BaseModel): class AnnotationResult(BaseModel): + """ + Represents the result of an annotation. + + Includes position, score, entity type, and optional metadata. 
+ """ + start: int end: int score: float @@ -33,6 +52,13 @@ def validate_entity_type(cls, v): class AnalysisExplanation(BaseModel): + """ + Provides detailed explanation of an annotation analysis. + + Includes information about the recognizer, patterns, scores, + and context improvements. + """ + recognizer: str pattern_name: Optional[str] pattern: Optional[str] @@ -45,4 +71,8 @@ class AnalysisExplanation(BaseModel): class AnnotationResultWithAnaysisExplanation(AnnotationResult): + """ + Extends AnnotationResult with detailed analysis explanation. + """ + analysis_explanation: Optional[AnalysisExplanation] diff --git a/datafog/models/common.py b/datafog/models/common.py index 55aa6307..0a9da2cf 100644 --- a/datafog/models/common.py +++ b/datafog/models/common.py @@ -1,4 +1,9 @@ -# models/common.py +""" +Common models for DataFog PII detection and annotation. + +Defines entity types, patterns, and metadata structures used across the library. +""" + from enum import Enum from typing import List, Optional @@ -6,7 +11,8 @@ class EntityTypes(str, Enum): - # Define your entity types here + """PII entity types recognized by DataFog.""" + PERSON = "Names similar to John Doe, Joe Biden, Donald Trump, Kamala Harris" LOCATION = "Full or partial name of a location" ORGANIZATION = "Full or partial name of an organization" @@ -21,12 +27,16 @@ class EntityTypes(str, Enum): class Pattern(BaseModel): + """Regex pattern for entity recognition.""" + name: str regex: str score: float class PatternRecognizer(BaseModel): + """Configuration for a pattern-based entity recognizer.""" + name: str supported_language: str patterns: List[Pattern] @@ -36,4 +46,6 @@ class PatternRecognizer(BaseModel): class AnnotatorMetadata(BaseModel): + """Metadata for annotation results.""" + recognizer_name: str diff --git a/datafog/models/spacy_nlp.py b/datafog/models/spacy_nlp.py index 3a70fa01..6ac37c68 100644 --- a/datafog/models/spacy_nlp.py +++ b/datafog/models/spacy_nlp.py @@ -1,3 +1,10 @@ +""" 
+Provides spaCy-based NLP functionality for entity recognition and annotation. + +This module implements a SpacyAnnotator class that uses spaCy models for +text annotation, entity recognition, and related NLP tasks. +""" + from typing import List, Optional from uuid import uuid4 @@ -8,6 +15,13 @@ class SpacyAnnotator: + """ + Handles text annotation using spaCy NLP models. + + Provides methods for loading models, annotating text, and managing spaCy resources. + Supports various NLP tasks including entity recognition and model management. + """ + def __init__(self, model_name: str = "en_core_web_lg"): self.model_name = model_name self.nlp = None diff --git a/datafog/processing/image_processing/donut_processor.py b/datafog/processing/image_processing/donut_processor.py index 19282d17..b3554140 100644 --- a/datafog/processing/image_processing/donut_processor.py +++ b/datafog/processing/image_processing/donut_processor.py @@ -1,3 +1,11 @@ +""" +Provides functionality for processing images using the Donut model. + +This module implements a DonutProcessor class that uses the Donut model +for document understanding tasks, particularly OCR and information extraction +from images of documents. +""" + import importlib import json import re @@ -13,6 +21,14 @@ class DonutProcessor: + """ + Handles image processing using the Donut model. + + Provides methods for loading models, preprocessing images, parsing images + for text extraction, and managing dependencies. Supports processing both + local images and images from URLs. 
+ """ + def __init__(self, model_path="naver-clova-ix/donut-base-finetuned-cord-v2"): self.ensure_installed("torch") self.ensure_installed("transformers") diff --git a/datafog/processing/image_processing/image_downloader.py b/datafog/processing/image_processing/image_downloader.py index cdbbc6ca..90a14a20 100644 --- a/datafog/processing/image_processing/image_downloader.py +++ b/datafog/processing/image_processing/image_downloader.py @@ -1,3 +1,10 @@ +""" +Asynchronous image downloader for fetching images from URLs. + +This module provides functionality to download single or multiple images +asynchronously from given URLs using aiohttp. +""" + import asyncio from io import BytesIO from typing import List @@ -7,10 +14,18 @@ class ImageDownloader: + """ + Asynchronous image downloader. + + Provides methods to download single or multiple images from URLs. + Uses aiohttp for efficient asynchronous network operations. + """ + def __init__(self): pass async def download_image(self, image_url: str) -> Image.Image: + """Download a single image from a URL.""" async with aiohttp.ClientSession() as session: async with session.get(image_url) as response: if response.status == 200: @@ -20,4 +35,5 @@ async def download_image(self, image_url: str) -> Image.Image: raise Exception(f"Failed to download image from {image_url}") async def download_images(self, urls: List[str]) -> List[Image.Image]: + """Download multiple images from a list of URLs concurrently.""" return await asyncio.gather(*[self.download_image(url) for url in urls]) diff --git a/datafog/processing/image_processing/pytesseract_processor.py b/datafog/processing/image_processing/pytesseract_processor.py index 8c11ae04..f7291470 100644 --- a/datafog/processing/image_processing/pytesseract_processor.py +++ b/datafog/processing/image_processing/pytesseract_processor.py @@ -1,3 +1,10 @@ +""" +Provides OCR functionality using Pytesseract. 
+ +This module contains a PytesseractProcessor class for extracting text from images +using the Pytesseract OCR engine. +""" + import logging import pytesseract @@ -5,6 +12,13 @@ class PytesseractProcessor: + """ + Processes images to extract text using Pytesseract OCR. + + Provides an asynchronous method to convert image content to text. + Handles errors and logs issues during text extraction. + """ + async def extract_text_from_image(self, image: Image.Image) -> str: try: return pytesseract.image_to_string(image) diff --git a/datafog/processing/spark_processing/pyspark_udfs.py b/datafog/processing/spark_processing/pyspark_udfs.py index a51f119a..81d6986f 100644 --- a/datafog/processing/spark_processing/pyspark_udfs.py +++ b/datafog/processing/spark_processing/pyspark_udfs.py @@ -1,3 +1,12 @@ +""" +PySpark UDFs for PII annotation and related utilities. + +This module provides functions for PII (Personally Identifiable Information) annotation +using SpaCy models in a PySpark environment. It includes utilities for installing +dependencies, creating and broadcasting PII annotator UDFs, and performing PII annotation +on text data. +""" + import importlib import subprocess import sys diff --git a/datafog/services/image_service.py b/datafog/services/image_service.py index 83d64555..37945428 100644 --- a/datafog/services/image_service.py +++ b/datafog/services/image_service.py @@ -1,3 +1,11 @@ +""" +Image processing service for OCR and other operations. + +This module provides classes for downloading images and performing OCR using +either Tesseract or Donut models. It supports processing both local images +and images from URLs. 
+""" + import asyncio import io import logging @@ -16,6 +24,8 @@ class ImageDownloader: + """Asynchronous image downloader with SSL support.""" + async def download_image(self, url: str) -> Image.Image: ssl_context = ssl.create_default_context(cafile=certifi.where()) async with aiohttp.ClientSession( @@ -32,6 +42,13 @@ async def download_image(self, url: str) -> Image.Image: class ImageService: + """ + Service for image processing and OCR. + + Supports Tesseract and Donut OCR models, image downloading, + and various image processing operations. + """ + def __init__(self, use_donut: bool = False, use_tesseract: bool = True): self.downloader = ImageDownloader() diff --git a/datafog/services/spark_service.py b/datafog/services/spark_service.py index bc83fd30..04bfcaf4 100644 --- a/datafog/services/spark_service.py +++ b/datafog/services/spark_service.py @@ -1,3 +1,10 @@ +""" +Spark service for data processing and analysis. + +Provides a wrapper around PySpark functionality, including session creation, +JSON reading, and package management. +""" + import importlib import json import subprocess @@ -6,6 +13,13 @@ class SparkService: + """ + Manages Spark operations and dependencies. + + Initializes a Spark session, handles imports, and provides methods for + data reading and package installation. + """ + def __init__(self): self.spark = self.create_spark_session() self.ensure_installed("pyspark") diff --git a/datafog/services/text_service.py b/datafog/services/text_service.py index 722fa56e..0ac993e2 100644 --- a/datafog/services/text_service.py +++ b/datafog/services/text_service.py @@ -1,3 +1,9 @@ +""" +Text processing service for PII annotation. + +Provides synchronous and asynchronous methods for annotating text with personally identifiable information (PII) using SpaCy. Supports chunking long texts and batch processing. +""" + import asyncio from typing import Dict, List @@ -5,6 +11,12 @@ class TextService: + """ + Manages text annotation operations. 
+ + Handles text chunking, PII annotation, and result combination for both single texts and batches. Offers both synchronous and asynchronous interfaces. + """ + def __init__(self, text_chunk_length: int = 1000): self.annotator = SpacyPIIAnnotator.create() self.text_chunk_length = text_chunk_length diff --git a/docs/cli.rst b/docs/cli.rst new file mode 100644 index 00000000..a4c67272 --- /dev/null +++ b/docs/cli.rst @@ -0,0 +1,17 @@ +=========== +DataFog CLI +=========== + +Overview +-------- +The main entrypoint for the CLI is through the DataFog client file, defined in :mod:`datafog.client`. +We use Typer to build the CLI, with each command defined as a separate function. + +Definitions +----------- +.. automodule:: datafog.client + :members: + +.. autosummary:: + :toctree: generated/ + :template: class.rst diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 00000000..f0e3828c --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,32 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "DataFog" +copyright = "2024, DataFog Inc." 
+author = "Sid Mohan" +release = "v4.0.0-beta.1" + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = ["sphinx.ext.autodoc", "sphinx.ext.autosummary", "sphinx.ext.napoleon"] + +templates_path = ["_templates"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "alabaster" +html_static_path = ["_static"] + +autosummary_generate = True +napoleon_use_rtype = False +napoleon_use_ivar = False +napoleon_use_param = False diff --git a/docs/definitions.rst b/docs/definitions.rst new file mode 100644 index 00000000..6ac214f7 --- /dev/null +++ b/docs/definitions.rst @@ -0,0 +1,19 @@ +Class Definitions +================= + +.. toctree:: + :maxdepth: 2 + :caption: Class Definitions: + + generated/datafog.models.annotator.AnalysisExplanation + generated/datafog.models.annotator.AnnotationResult + generated/datafog.models.annotator.AnnotatorRequest + generated/datafog.models.common.AnnotatorMetadata + generated/datafog.models.common.EntityTypes + generated/datafog.models.common.Pattern + generated/datafog.models.common.PatternRecognizer + generated/datafog.models.spacy_nlp.SpacyAnnotator + generated/datafog.services.image_service.ImageDownloader + generated/datafog.services.image_service.ImageService + generated/datafog.services.spark_service.SparkService + generated/datafog.services.text_service.TextService \ No newline at end of file diff --git a/docs/important-concepts.rst b/docs/important-concepts.rst new file mode 100644 index 00000000..791fa0a0 --- /dev/null +++ b/docs/important-concepts.rst @@ -0,0 +1,119 @@ +================== +Important Concepts +================== + +Overview +-------- + + +Data Models +^^^^^^^^^^^ +Key data models to support PII annotation and 
OCR analysis. + +* AnalysisExplanation +* AnnotationResult +* AnnotatorRequest +* EntityTypes +* Pattern +* PatternRecognizer + +Processors +^^^^^^^^^^^ +Main processors: +* SpacyAnnotator + Text annotation with spaCy +* DonutProcessor + Image processing +* PytesseractProcessor + OCR + +Services +^^^^^^^^^^^ +Core services: +* ImageService + Image handling and OCR +* SparkService + PySpark wrapper +* TextService + PII annotation + + +Data Models +------------------------- + +.. automodule:: datafog.models.annotator + :members: + +.. autosummary:: + :toctree: generated/ + :template: class.rst + AnnotatorRequest + AnnotationResult + AnalysisExplanation + +.. automodule:: datafog.models.common + :members: + +.. autosummary:: + :toctree: generated/ + :template: class.rst + EntityTypes + Pattern + PatternRecognizer + AnnotatorMetadata + +.. automodule:: datafog.models.spacy_nlp + :members: + +.. autosummary:: + :toctree: generated/ + :template: class.rst + SpacyAnnotator + +Processors +------------------------- + +.. automodule:: datafog.processing.image_processing.donut_processor + :members: + +.. automodule:: datafog.processing.image_processing.image_downloader + :members: + +.. automodule:: datafog.processing.image_processing.pytesseract_processor + :members: + +.. automodule:: datafog.processing.text_processing.spacy_pii_annotator + :members: + +.. automodule:: datafog.processing.spark_processing.pyspark_udfs + :members: + + +Services +------------------------- + +.. automodule:: datafog.services.image_service + :members: + +.. autosummary:: + :toctree: generated/ + :template: class.rst + ImageDownloader + ImageService + +.. automodule:: datafog.services.spark_service + :members: + +.. autosummary:: + :toctree: generated/ + :template: class.rst + SparkService + + +.. automodule:: datafog.services.text_service + :members: + +.. 
autosummary:: + :toctree: generated/ + :template: class.rst + TextService diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 00000000..6ac16084 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,106 @@ +===================== +DataFog Documentation +===================== + +DataFog is an open-source tool for PII detection and anonymization of unstructured data. This documentation covers the CLI and Python SDK. + +.. toctree:: + :maxdepth: 2 + + important-concepts + cli + python-sdk + definitions + +===================== +Getting Started +===================== + +--------------------- +Installation +--------------------- + +Install DataFog via pip: + +.. code-block:: bash + + pip install datafog + +This installs the latest stable version with CLI support. + +--------------------- +CLI Usage +--------------------- + +For a list of available operations, run: + +.. code-block:: bash + + datafog --help + +Scan text for PII: + +.. code-block:: bash + + datafog scan-text "Your text here" + +Extract text from image: + +.. code-block:: bash + + datafog scan-image "path/to/image.png" --operations extract_text + +Scan for PII in image: + +.. code-block:: bash + + datafog scan-image "path/to/image.png" --operations annotate_pii + +For more information on the CLI, see :doc:`cli`. + +--------------------- +Python SDK Usage +--------------------- + +Scan text for PII: + +.. code-block:: python + + + import requests + from datafog import DataFog + + # For text annotation + client = DataFog(operations="annotate_pii") + + # Fetch sample medical record + doc_url = "https://gist.githubusercontent.com/sidmohan0/b43b72693226422bac5f083c941ecfdb/raw/b819affb51796204d59987893f89dee18428ed5d/note1.txt" + response = requests.get(doc_url) + text_lines = [line for line in response.text.splitlines() if line.strip()] + + # Run annotation + annotations = client.run_text_pipeline_sync(str_list=text_lines) + print(annotations) + +Scan image for PII: + +.. 
code-block:: python + + + import asyncio + from datafog import DataFog + + # For OCR and PII annotation + ocr_client = DataFog(operations="extract_text,annotate_pii") + + async def run_ocr_pipeline_demo(): + image_url = "https://s3.amazonaws.com/thumbnails.venngage.com/template/dc377004-1c2d-49f2-8ddf-d63f11c8d9c2.png" + results = await ocr_client.run_ocr_pipeline(image_urls=[image_url]) + print("OCR Pipeline Results:", results) + + # Run the async function + asyncio.run(run_ocr_pipeline_demo()) + +For detailed information on the Python SDK, see :doc:`python-sdk`. + + diff --git a/docs/python-sdk.rst b/docs/python-sdk.rst new file mode 100644 index 00000000..dbf1982d --- /dev/null +++ b/docs/python-sdk.rst @@ -0,0 +1,17 @@ +================== +DataFog Python SDK +================== + +Overview +-------- +The main entrypoint for the SDK is through the DataFog class, defined in :mod:`datafog.main`. +Here you can initialize the different services, including TextService, ImageService, and SparkService. + +Definitions +----------- +.. automodule:: datafog.main + :members: + +.. autosummary:: + :toctree: generated/ + :template: class.rst diff --git a/requirements.txt b/requirements.txt index a7d2432a..2c4c191e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,4 @@ asyncio setuptools pydantic-settings==2.3.4 typer==0.12.3 +sphinx \ No newline at end of file diff --git a/setup.py b/setup.py index 4817566c..9d0f2920 100644 --- a/setup.py +++ b/setup.py @@ -41,6 +41,7 @@ "setuptools", "pydantic-settings==2.3.4", "typer==0.12.3", + "sphinx", ], python_requires=">=3.10,<3.13", entry_points={