Merge pull request #51 from DataFog/feature/py312-support
python 3.10, 3.11, 3.12 support | model
sidmohan0 authored Aug 6, 2024
2 parents 1e5f14b + 924b47b commit 492ab5c
Showing 14 changed files with 333 additions and 298 deletions.
22 changes: 20 additions & 2 deletions .github/workflows/dev-cicd.yml
@@ -27,10 +27,24 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.10"]
+        python-version: ["3.10", "3.11", "3.12"]
     steps:
       - name: Check out repo
         uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: false
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: true
+          swap-storage: true
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
@@ -54,11 +68,15 @@ jobs:
           pip install -e .
           pip install tox just pre-commit
       - name: Run Tests with tox
-        run: tox -- --cov datafog --cov-report xml --cov-report term --codeblocks
+        run: tox -- --cov datafog --cov-report xml --cov-report term -v -s --cov-report=term-missing
       - name: Submit to Codecov
         uses: codecov/codecov-action@v3
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
           files: ./coverage.xml
           flags: unittests
           name: codecov-umbrella
+      - name: Clean up pip cache
+        run: |
+          pip cache purge
+          rm -rf ~/.cache/pip
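
The matrix now exercises three interpreters instead of one. As a minimal illustration of what that coverage promises downstream (a hypothetical helper, not part of this diff):

```python
# Hypothetical local guard mirroring the CI matrix above; not part of this commit.
import sys

SUPPORTED = {(3, 10), (3, 11), (3, 12)}

if sys.version_info[:2] not in SUPPORTED:
    raise RuntimeError(f"Tested on Python 3.10-3.12, found {sys.version.split()[0]}")
```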
23 changes: 20 additions & 3 deletions .github/workflows/feature-cicd.yml
@@ -31,6 +31,20 @@ jobs:
     steps:
       - name: Check out repo
         uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: false
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: true
+          swap-storage: true
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
@@ -51,10 +65,13 @@ jobs:
       - name: Install Dependencies
         run: |
           pip install -U pip
-          pip install -e .
-          pip install tox just pre-commit
+          pip install --no-cache-dir -e .
+          pip install --no-cache-dir tox just pre-commit
+      - name: Free up disk space
+        run: |
+          sudo apt-get clean
       - name: Run Tests with tox
-        run: tox -- --cov datafog --cov-report xml --cov-report term --codeblocks
+        run: tox -- --cov datafog --cov-report xml --cov-report term -v -s --cov-report=term-missing
       - name: Submit to Codecov
         uses: codecov/codecov-action@v3
         with:
2 changes: 1 addition & 1 deletion .github/workflows/main-cicd.yml
@@ -54,7 +54,7 @@ jobs:
           pip install -e .
           pip install tox just pre-commit
       - name: Run Tests with tox
-        run: tox -- --cov datafog --cov-report xml --cov-report term --codeblocks
+        run: tox -- --cov datafog --cov-report xml --cov-report term -v -s --cov-report=term-missing
       - name: Submit to Codecov
         uses: codecov/codecov-action@v3
         with:
2 changes: 1 addition & 1 deletion README.md
@@ -116,7 +116,7 @@ For local development:
    ```
 5. Install the package in editable mode:
    ```
-   pip install -e .
+   pip install -r requirements-dev.txt
    ```
 6. Set up the project:
    ```
26 changes: 22 additions & 4 deletions datafog/processing/image_processing/donut_processor.py
@@ -5,6 +5,7 @@
 import sys
 from io import BytesIO
 
+import numpy as np
 import requests
 from PIL import Image
 
@@ -13,7 +14,6 @@
 
 class DonutProcessor:
     def __init__(self, model_path="naver-clova-ix/donut-base-finetuned-cord-v2"):
-
         self.ensure_installed("torch")
         self.ensure_installed("transformers")
 
@@ -36,13 +36,31 @@ def ensure_installed(self, package_name):
             [sys.executable, "-m", "pip", "install", package_name]
         )
 
-    async def parse_image(self, image: Image) -> str:
+    def preprocess_image(self, image: Image.Image) -> np.ndarray:
+        # Convert to RGB if the image is not already in RGB mode
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+
+        # Convert to numpy array
+        image_np = np.array(image)
+
+        # Ensure the image is 3D (height, width, channels)
+        if image_np.ndim == 2:
+            image_np = np.expand_dims(image_np, axis=-1)
+            image_np = np.repeat(image_np, 3, axis=-1)
+
+        return image_np
+
+    async def parse_image(self, image: Image.Image) -> str:
         """Process w/ DonutProcessor and VisionEncoderDecoderModel"""
+        # Preprocess the image
+        image_np = self.preprocess_image(image)
+
         task_prompt = "<s_cord-v2>"
         decoder_input_ids = self.processor.tokenizer(
             task_prompt, add_special_tokens=False, return_tensors="pt"
         ).input_ids
-        pixel_values = self.processor(image, return_tensors="pt").pixel_values
+        pixel_values = self.processor(images=image_np, return_tensors="pt").pixel_values
 
         outputs = self.model.generate(
             pixel_values.to(self.device),
@@ -71,7 +89,7 @@ def process_url(self, url: str) -> str:
         image = self.downloader.download_image(url)
         return self.parse_image(image)
 
-    def download_image(self, url: str) -> Image:
+    def download_image(self, url: str) -> Image.Image:
         """Download an image from URL."""
         response = requests.get(url)
         image = Image.open(BytesIO(response.content))
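
The new preprocess_image method normalizes every input to a 3-channel RGB array before it reaches the Donut processor, which otherwise chokes on grayscale scans. The same logic, pulled out of the class as a standalone, runnable sketch:

```python
# Standalone sketch of the grayscale-to-RGB normalization added above.
import numpy as np
from PIL import Image


def preprocess_image(image: Image.Image) -> np.ndarray:
    if image.mode != "RGB":  # e.g. "L" (grayscale) or "RGBA"
        image = image.convert("RGB")
    image_np = np.array(image)
    if image_np.ndim == 2:  # defensive: still 2-D after conversion
        image_np = np.expand_dims(image_np, axis=-1)
        image_np = np.repeat(image_np, 3, axis=-1)
    return image_np


gray = Image.new("L", (32, 32), color=128)
assert preprocess_image(gray).shape == (32, 32, 3)
```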
4 changes: 2 additions & 2 deletions datafog/processing/spark_processing/pyspark_udfs.py
@@ -7,7 +7,7 @@
 
 
 def pii_annotator(text: str, broadcasted_nlp) -> list[list[str]]:
-    """Extract features using en_spacy_pii_fast model.
+    """Extract features using en_core_web_lg model.
 
     Returns:
         list[list[str]]: Values as arrays in order defined in the PII_ANNOTATION_LABELS.
@@ -40,7 +40,7 @@ def pii_annotator(text: str, broadcasted_nlp) -> list[list[str]]:
 
 
 def broadcast_pii_annotator_udf(
-    spark_session=None, spacy_model: str = "en_spacy_pii_fast"
+    spark_session=None, spacy_model: str = "en_core_web_lg"
 ):
     """Broadcast PII annotator across Spark cluster and create UDF"""
     ensure_installed("pyspark")
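
For readers unfamiliar with the broadcast pattern these UDFs rely on, here is an illustrative sketch of how the swapped-in en_core_web_lg model is typically shipped to Spark executors. The session, column, and function names are hypothetical, not taken from this diff:

```python
# Illustrative sketch of the broadcast-model pattern behind pii_annotator.
# Session, column, and function names are hypothetical.
import spacy
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

spark = SparkSession.builder.appName("pii-annotation").getOrCreate()
# Broadcast once; each executor deserializes the model a single time.
broadcasted_nlp = spark.sparkContext.broadcast(spacy.load("en_core_web_lg"))


def annotate(text: str) -> list:
    doc = broadcasted_nlp.value(text)
    return [ent.text for ent in doc.ents]


annotate_udf = udf(annotate, ArrayType(StringType()))
df = spark.createDataFrame([("Sarah lives in Berlin",)], ["text"])
df.withColumn("entities", annotate_udf("text")).show(truncate=False)
```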
55 changes: 41 additions & 14 deletions datafog/processing/text_processing/spacy_pii_annotator.py
@@ -3,7 +3,26 @@
 
 from pydantic import BaseModel
 
-PII_ANNOTATION_LABELS = ["DATE_TIME", "LOC", "NRP", "ORG", "PER"]
+PII_ANNOTATION_LABELS = [
+    "CARDINAL",
+    "DATE",
+    "EVENT",
+    "FAC",
+    "GPE",
+    "LANGUAGE",
+    "LAW",
+    "LOC",
+    "MONEY",
+    "NORP",
+    "ORDINAL",
+    "ORG",
+    "PERCENT",
+    "PERSON",
+    "PRODUCT",
+    "QUANTITY",
+    "TIME",
+    "WORK_OF_ART",
+]
 MAXIMAL_STRING_SIZE = 1000000
 
 
@@ -12,21 +31,29 @@ class SpacyPIIAnnotator(BaseModel):
 
     @classmethod
     def create(cls) -> "SpacyPIIAnnotator":
-        try:
-            # Try loading as a spaCy model first
-            import spacy
+        import spacy
+
-            nlp = spacy.load("en_spacy_pii_fast")
+        try:
+            nlp = spacy.load("en_core_web_lg")
         except OSError:
-            # If that fails, try importing as a module
-            try:
-                import en_spacy_pii_fast
-
-                nlp = en_spacy_pii_fast.load()
-            except ImportError:
-                raise ImportError(
-                    "Failed to load en_spacy_pii_fast. Make sure it's installed correctly."
-                )
+            import subprocess
+            import sys
+
+            interpreter_location = sys.executable
+            subprocess.run(
+                [
+                    interpreter_location,
+                    "-m",
+                    "pip",
+                    "install",
+                    "--no-deps",
+                    "--no-cache-dir",
+                    "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl",
+                ],
+                check=True,
+            )
+            nlp = spacy.load("en_core_web_lg")
 
         return cls(nlp=nlp)
 
     def annotate(self, text: str) -> Dict[str, List[str]]:
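
The create classmethod now self-heals a missing model: if en_core_web_lg is not installed, it installs the pinned wheel with the current interpreter and retries. The same load-or-install fallback, reduced to a standalone sketch:

```python
# Standalone sketch of the load-or-install fallback used by SpacyPIIAnnotator.create.
import subprocess
import sys

import spacy

WHEEL = (
    "https://github.com/explosion/spacy-models/releases/download/"
    "en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl"
)


def load_model() -> "spacy.language.Language":
    try:
        return spacy.load("en_core_web_lg")
    except OSError:
        # Model missing: install the pinned wheel into the current
        # interpreter's environment, then retry once.
        subprocess.run(
            [sys.executable, "-m", "pip", "install",
             "--no-deps", "--no-cache-dir", WHEEL],
            check=True,
        )
        return spacy.load("en_core_web_lg")
```

Pinning the wheel URL (rather than running `spacy download`) keeps the model version deterministic across the 3.10-3.12 CI matrix.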
27 changes: 25 additions & 2 deletions datafog/services/image_service.py
@@ -1,15 +1,34 @@
+import asyncio
+import io
+import ssl
 from typing import List
 
+import aiohttp
+import certifi
 from PIL import Image
 
 from datafog.processing.image_processing.donut_processor import DonutProcessor
-from datafog.processing.image_processing.image_downloader import ImageDownloader
 from datafog.processing.image_processing.pytesseract_processor import (
     PytesseractProcessor,
 )
 
 
+class ImageDownloader:
+    async def download_image(self, url: str) -> Image.Image:
+        ssl_context = ssl.create_default_context(cafile=certifi.where())
+        async with aiohttp.ClientSession(
+            connector=aiohttp.TCPConnector(ssl=ssl_context)
+        ) as session:
+            async with session.get(url) as response:
+                if response.status == 200:
+                    image_data = await response.read()
+                    return Image.open(io.BytesIO(image_data))
+                else:
+                    raise Exception(
+                        f"Failed to download image. Status code: {response.status}"
+                    )
+
 
 class ImageService:
     def __init__(self, use_donut: bool = False, use_tesseract: bool = True):
         self.downloader = ImageDownloader()
@@ -21,7 +40,11 @@ def __init__(self, use_donut: bool = False, use_tesseract: bool = True):
         )
 
     async def download_images(self, urls: List[str]) -> List[Image.Image]:
-        return await self.downloader.download_images(urls)
+        async def download_image(url: str) -> Image.Image:
+            return await self.downloader.download_image(url)
+
+        tasks = [asyncio.create_task(download_image(url)) for url in urls]
+        return await asyncio.gather(*tasks, return_exceptions=True)
 
     async def ocr_extract(
         self,
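
download_images now fans out one task per URL and gathers with return_exceptions=True, so a single bad URL surfaces as an Exception object in the results instead of aborting the whole batch. A usage sketch based on the signatures in this diff (the URLs are placeholders):

```python
# Usage sketch for the concurrent downloader above; URLs are placeholders.
import asyncio

from datafog.services.image_service import ImageService


async def main() -> None:
    service = ImageService(use_donut=False, use_tesseract=True)
    results = await service.download_images(
        ["https://example.com/a.png", "https://example.com/b.png"]
    )
    # With return_exceptions=True, failures arrive as Exception objects
    # alongside successfully downloaded PIL images.
    images = [r for r in results if not isinstance(r, Exception)]
    print(f"{len(images)} of {len(results)} downloads succeeded")


asyncio.run(main())
```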
4 changes: 3 additions & 1 deletion requirements-dev.txt
@@ -6,10 +6,12 @@ just
 isort
 black
 blacken-docs
+certifi
 flake8
 prettier
 tox
-pytest
+pytest==7.4.0
+pytest-asyncio==0.21.0
 pytest-cov
 mypy
 autoflake
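
The pytest-asyncio pin supports testing the new async code paths, such as download_images, directly. A minimal hedged example of the kind of test this enables (the test name and URL are illustrative, not from this commit):

```python
# Hypothetical async test enabled by the pytest-asyncio pin; names are illustrative.
import pytest

from datafog.services.image_service import ImageService


@pytest.mark.asyncio
async def test_download_images_returns_exceptions_for_bad_urls():
    service = ImageService()
    results = await service.download_images(["https://invalid.example/nope.png"])
    # return_exceptions=True means the failure is reported, not raised.
    assert isinstance(results[0], Exception)
```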