Skip to content

Commit

Permalink
fixed justformat
Browse files Browse the repository at this point in the history
  • Loading branch information
Sid Mohan authored and Sid Mohan committed May 18, 2024
1 parent 0781aaf commit 8adff1a
Show file tree
Hide file tree
Showing 404 changed files with 1,657 additions and 2,907 deletions.
3 changes: 1 addition & 2 deletions datafog/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .__about__ import __version__
from .config import OperationType
from .main import DataFog, OCRPIIAnnotator, TextPIIAnnotator
from .processing.image_processing.donut_processor import DonutProcessor
Expand All @@ -9,8 +10,6 @@
from .services.text_service import TextService
from .telemetry import OTelLogger

from .__about__ import __version__

__all__ = [
"DonutProcessor",
"DataFog",
Expand Down
1 change: 1 addition & 0 deletions datafog/config.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from enum import Enum


class OperationType(str, Enum):
ANNOTATE_PII = "annotate_pii"
EXTRACT_TEXT = "extract_text"
Expand Down
2 changes: 1 addition & 1 deletion datafog/processing/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from .image_processing.donut_processor import DonutProcessor
from .image_processing.image_downloader import ImageDownloader
from .image_processing.pytesseract_processor import PytesseractProcessor

# from .spark_processing.pyspark_udfs import broadcast_pii_annotator_udf, pii_annotator
from .spark_processing import get_pyspark_udfs
from .text_processing.spacy_pii_annotator import SpacyPIIAnnotator

2 changes: 2 additions & 0 deletions datafog/processing/spark_processing/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# from .pyspark_udfs import broadcast_pii_annotator_udf, pii_annotator


def get_pyspark_udfs():
from .pyspark_udfs import broadcast_pii_annotator_udf, pii_annotator

return broadcast_pii_annotator_udf, pii_annotator
12 changes: 6 additions & 6 deletions datafog/processing/spark_processing/pyspark_udfs.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import requests
import spacy
import importlib
import subprocess
import sys

import requests
import spacy

PII_ANNOTATION_LABELS = ["DATE_TIME", "LOC", "NRP", "ORG", "PER"]
MAXIMAL_STRING_SIZE = 1000000

Expand Down Expand Up @@ -40,13 +41,14 @@ def pii_annotator(text: str, broadcasted_nlp) -> list[list[str]]:


def broadcast_pii_annotator_udf(
spark_session = None, spacy_model: str = "en_spacy_pii_fast"
spark_session=None, spacy_model: str = "en_spacy_pii_fast"
):
"""Broadcast PII annotator across Spark cluster and create UDF"""
ensure_installed("pyspark")
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType, StructField, StructType

if not spark_session:
spark_session = SparkSession.builder.getOrCreate()
broadcasted_nlp = spark_session.sparkContext.broadcast(spacy.load(spacy_model))
Expand All @@ -62,6 +64,4 @@ def ensure_installed(self, package_name):
try:
importlib.import_module(package_name)
except ImportError:
subprocess.check_call(
[sys.executable, "-m", "pip", "install", package_name]
)
subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
9 changes: 4 additions & 5 deletions datafog/services/spark_service.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import json
from typing import Any, List
import importlib
import json
import subprocess
import sys

from typing import Any, List


class SparkService:
Expand All @@ -14,6 +13,7 @@ def __init__(self):
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

self.SparkSession = SparkSession
self.DataFrame = DataFrame
self.udf = udf
Expand All @@ -22,7 +22,7 @@ def __init__(self):

def create_spark_session(self):
return self.SparkSession.builder.appName("datafog").getOrCreate()

def read_json(self, path: str) -> List[dict]:
return self.spark.read.json(path).collect()

Expand All @@ -33,4 +33,3 @@ def ensure_installed(self, package_name):
subprocess.check_call(
[sys.executable, "-m", "pip", "install", package_name]
)

2 changes: 1 addition & 1 deletion datafog/telemetry/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@

from .open_telemetry import OTelLogger
42 changes: 32 additions & 10 deletions datafog/telemetry/open_telemetry.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,23 @@
import logging
import asyncio
import json
import logging
import os
import platform
from typing import Any

import pkg_resources
from azure.monitor.opentelemetry.exporter import AzureMonitorTraceExporter
from opentelemetry import trace
from opentelemetry.sdk.resources import SERVICE_NAME, Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.trace import Status, StatusCode
from azure.monitor.opentelemetry.exporter import AzureMonitorTraceExporter


class OTelLogger:
"""A class to handle anonymous telemetry for the DataFog package."""

def __init__(self, instrumentation_key: str = os.getenv('INSTRUMENTATION_KEY')):
def __init__(self, instrumentation_key: str = os.getenv("INSTRUMENTATION_KEY")):
self.ready = False
self.trace_set = False
try:
Expand All @@ -29,7 +30,10 @@ def __init__(self, instrumentation_key: str = os.getenv('INSTRUMENTATION_KEY')):
self.provider.add_span_processor(processor)
self.ready = True
except BaseException as e:
if isinstance(e, (SystemExit, KeyboardInterrupt, GeneratorExit, asyncio.CancelledError)):
if isinstance(
e,
(SystemExit, KeyboardInterrupt, GeneratorExit, asyncio.CancelledError),
):
raise
self.ready = False

Expand All @@ -48,8 +52,14 @@ def log_system_info(self):
try:
tracer = trace.get_tracer("datafog.telemetry")
with tracer.start_as_current_span("System Info") as span:
self._add_attribute(span, "datafog_version", pkg_resources.get_distribution("datafog").version)
self._add_attribute(span, "python_version", platform.python_version())
self._add_attribute(
span,
"datafog_version",
pkg_resources.get_distribution("datafog").version,
)
self._add_attribute(
span, "python_version", platform.python_version()
)
self._add_attribute(span, "os", platform.system())
self._add_attribute(span, "platform_version", platform.version())
self._add_attribute(span, "cpus", os.cpu_count())
Expand All @@ -63,8 +73,14 @@ def pipeline_execution(self, datafog, input_data, output_data):
try:
tracer = trace.get_tracer("datafog.telemetry")
with tracer.start_as_current_span("Pipeline Execution") as span:
self._add_attribute(span, "datafog_version", pkg_resources.get_distribution("datafog").version)
self._add_attribute(span, "pipeline_type", datafog.__class__.__name__)
self._add_attribute(
span,
"datafog_version",
pkg_resources.get_distribution("datafog").version,
)
self._add_attribute(
span, "pipeline_type", datafog.__class__.__name__
)
self._add_attribute(span, "input_data", input_data)
self._add_attribute(span, "output_data", output_data)
span.set_status(Status(StatusCode.OK))
Expand All @@ -76,8 +92,14 @@ def end_pipeline(self, datafog, output):
try:
tracer = trace.get_tracer("datafog.telemetry")
with tracer.start_as_current_span("Pipeline Ended") as span:
self._add_attribute(span, "datafog_version", pkg_resources.get_distribution("datafog").version)
self._add_attribute(span, "pipeline_type", datafog.__class__.__name__)
self._add_attribute(
span,
"datafog_version",
pkg_resources.get_distribution("datafog").version,
)
self._add_attribute(
span, "pipeline_type", datafog.__class__.__name__
)
self._add_attribute(span, "output", output)
span.set_status(Status(StatusCode.OK))
except Exception:
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
en_spacy_pii_fast==0.0.0
# transformers==4.40.1
spacy==3.4.4
spacy==3.7.4
# torch==2.2.2
# pyspark==3.4.1
pytest==8.0.2
Expand Down
2 changes: 2 additions & 0 deletions tests/.datafog_env/bin/pip
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
# -*- coding: utf-8 -*-
import re
import sys

from pip._internal.cli.main import main

if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())
2 changes: 2 additions & 0 deletions tests/.datafog_env/bin/pip3
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
# -*- coding: utf-8 -*-
import re
import sys

from pip._internal.cli.main import main

if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())
2 changes: 2 additions & 0 deletions tests/.datafog_env/bin/pip3.11
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
# -*- coding: utf-8 -*-
import re
import sys

from pip._internal.cli.main import main

if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# don't import any costly modules
import sys
import os

import sys

is_pypy = '__pypy__' in sys.builtin_module_names

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,18 @@
import textwrap
from collections import OrderedDict
from types import TracebackType
from typing import TYPE_CHECKING, Iterable, List, Optional, Set, Tuple, Type, Union

from pip._vendor.certifi import where
from pip._vendor.packaging.requirements import Requirement
from pip._vendor.packaging.version import Version
from typing import (TYPE_CHECKING, Iterable, List, Optional, Set, Tuple, Type,
Union)

from pip import __file__ as pip_location
from pip._internal.cli.spinners import open_spinner
from pip._internal.locations import get_platlib, get_purelib, get_scheme
from pip._internal.metadata import get_default_environment, get_environment
from pip._internal.utils.subprocess import call_subprocess
from pip._internal.utils.temp_dir import TempDirectory, tempdir_kinds
from pip._vendor.certifi import where
from pip._vendor.packaging.requirements import Requirement
from pip._vendor.packaging.version import Version

if TYPE_CHECKING:
from pip._internal.index.package_finder import PackageFinder
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@
from pathlib import Path
from typing import Any, Dict, List, Optional

from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version
from pip._vendor.packaging.utils import canonicalize_name

from pip._internal.exceptions import InvalidWheelFilename
from pip._internal.models.direct_url import DirectUrl
from pip._internal.models.link import Link
from pip._internal.models.wheel import Wheel
from pip._internal.utils.temp_dir import TempDirectory, tempdir_kinds
from pip._internal.utils.urls import path_to_url
from pip._vendor.packaging.tags import (Tag, interpreter_name,
interpreter_version)
from pip._vendor.packaging.utils import canonicalize_name

logger = logging.getLogger(__name__)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,32 +10,27 @@
from optparse import Values
from typing import Any, Callable, List, Optional, Tuple

from pip._vendor.rich import traceback as rich_traceback

from pip._internal.cli import cmdoptions
from pip._internal.cli.command_context import CommandContextMixIn
from pip._internal.cli.parser import ConfigOptionParser, UpdatingDefaultsHelpFormatter
from pip._internal.cli.status_codes import (
ERROR,
PREVIOUS_BUILD_DIR_ERROR,
UNKNOWN_ERROR,
VIRTUALENV_NOT_FOUND,
)
from pip._internal.exceptions import (
BadCommand,
CommandError,
DiagnosticPipError,
InstallationError,
NetworkConnectionError,
PreviousBuildDirError,
UninstallationError,
)
from pip._internal.cli.parser import (ConfigOptionParser,
UpdatingDefaultsHelpFormatter)
from pip._internal.cli.status_codes import (ERROR, PREVIOUS_BUILD_DIR_ERROR,
UNKNOWN_ERROR,
VIRTUALENV_NOT_FOUND)
from pip._internal.exceptions import (BadCommand, CommandError,
DiagnosticPipError, InstallationError,
NetworkConnectionError,
PreviousBuildDirError,
UninstallationError)
from pip._internal.utils.filesystem import check_path_owner
from pip._internal.utils.logging import BrokenStdoutLoggingError, setup_logging
from pip._internal.utils.misc import get_prog, normalize_path
from pip._internal.utils.temp_dir import TempDirectoryTypeRegistry as TempDirRegistry
from pip._internal.utils.temp_dir import global_tempdir_manager, tempdir_registry
from pip._internal.utils.temp_dir import \
TempDirectoryTypeRegistry as TempDirRegistry
from pip._internal.utils.temp_dir import (global_tempdir_manager,
tempdir_registry)
from pip._internal.utils.virtualenv import running_under_virtualenv
from pip._vendor.rich import traceback as rich_traceback

__all__ = ["Command"]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@
from textwrap import dedent
from typing import Any, Callable, Dict, Optional, Tuple

from pip._vendor.packaging.utils import canonicalize_name

from pip._internal.cli.parser import ConfigOptionParser
from pip._internal.exceptions import CommandError
from pip._internal.locations import USER_CACHE_DIR, get_src_prefix
Expand All @@ -29,6 +27,7 @@
from pip._internal.models.target_python import TargetPython
from pip._internal.utils.hashes import STRONG_HASHES
from pip._internal.utils.misc import strtobool
from pip._vendor.packaging.utils import canonicalize_name

logger = logging.getLogger(__name__)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@

from pip._internal.build_env import get_runnable_pip
from pip._internal.cli import cmdoptions
from pip._internal.cli.parser import ConfigOptionParser, UpdatingDefaultsHelpFormatter
from pip._internal.cli.parser import (ConfigOptionParser,
UpdatingDefaultsHelpFormatter)
from pip._internal.commands import commands_dict, get_similar_commands
from pip._internal.exceptions import CommandError
from pip._internal.utils.misc import get_pip_version, get_prog
Expand Down
Original file line number Diff line number Diff line change
@@ -1,20 +1,13 @@
import functools
from typing import Callable, Generator, Iterable, Iterator, Optional, Tuple

from pip._vendor.rich.progress import (
BarColumn,
DownloadColumn,
FileSizeColumn,
Progress,
ProgressColumn,
SpinnerColumn,
TextColumn,
TimeElapsedColumn,
TimeRemainingColumn,
TransferSpeedColumn,
)

from pip._internal.utils.logging import get_indentation
from pip._vendor.rich.progress import (BarColumn, DownloadColumn,
FileSizeColumn, Progress,
ProgressColumn, SpinnerColumn,
TextColumn, TimeElapsedColumn,
TimeRemainingColumn,
TransferSpeedColumn)

DownloadProgressRenderer = Callable[[Iterable[bytes]], Iterator[bytes]]

Expand Down
Loading

0 comments on commit 8adff1a

Please sign in to comment.