
Commit

Added Sentiment Analysis
Added Sentiment Analysis example
williammanning committed Dec 31, 2024
1 parent 9b1eb26 commit 173250d
Showing 6 changed files with 4,813 additions and 0 deletions.
100 changes: 100 additions & 0 deletions churn_model_new/main/.ipynb_checkpoints/model-checkpoint.py
@@ -0,0 +1,100 @@
import os

import pandas as pd
import qwak
import xgboost as xgb
from qwak.model.base import QwakModel
from qwak.model.schema import ExplicitFeature, ModelSchema, InferenceOutput
from sklearn.model_selection import train_test_split


class XGBoostChurnPredictionModel(QwakModel):

    def __init__(self):
        self.params = {
            'n_estimators': int(os.getenv('n_estimators', 300)),
            'learning_rate': float(os.getenv('learning_rate', 0.05)),
            'objective': 'binary:logistic'
        }

        # Create an XGBoost classifier with the specified parameters
        self.model = xgb.XGBClassifier(**self.params)

        # Log model parameters to Qwak for tracking purposes
        qwak.log_param(self.params)

    def build(self):
        file_absolute_path = os.path.dirname(os.path.abspath(__file__))
        df = pd.read_csv(f"{file_absolute_path}/data.csv")

        # Creating the X and y variables
        y = df['churn']
        X = df.drop(['churn', 'User_Id', '__index_level_0__',
                     'event date', 'Phone', 'State'], axis=1)

        # Splitting X and y into train and validation sets
        X_train, X_validation, y_train, y_validation = train_test_split(
            X, y, test_size=0.25, random_state=42
        )

        # Training our XGBoost model
        self.model.fit(
            X_train,
            y_train,
            eval_set=[(X_validation, y_validation)]
        )

        # Log metrics into Qwak
        accuracy = self.model.score(X_validation, y_validation)
        qwak.log_metric({"val_accuracy": accuracy})
        qwak.log_data(dataframe=X, tag="train_data")

    @qwak.api()
    def predict(self, df):
        """
        The predict(df) method is the actual inference method.
        """
        # Get the original column order used during training
        feature_order = self.model.get_booster().feature_names

        # Reorder the prediction data to match the training feature order
        prediction_data = df.drop(
            ['User_Id', 'State'], axis=1
        ).reindex(columns=feature_order)

        # Return the positive-class probability, matching the declared output schema
        predictions = self.model.predict_proba(prediction_data)[:, 1]

        return pd.DataFrame(
            predictions,
            columns=['Churn_Probability']
        )

    def schema(self):
        """
        schema() defines the model input structure.
        Use it to enforce the structure of incoming requests.
        """
        model_schema = ModelSchema(
            inputs=[
                ExplicitFeature(name="User_Id", type=str),
                ExplicitFeature(name="State", type=str),
                ExplicitFeature(name="Account_Length", type=int),
                ExplicitFeature(name="Area_Code", type=str),
                ExplicitFeature(name="Intl_Plan", type=int),
                ExplicitFeature(name="VMail_Plan", type=int),
                ExplicitFeature(name="VMail_Message", type=int),
                ExplicitFeature(name="Day_Mins", type=float),
                ExplicitFeature(name="Day_Calls", type=int),
                ExplicitFeature(name="Eve_Mins", type=float),
                ExplicitFeature(name="Eve_Calls", type=int),
                ExplicitFeature(name="Night_Mins", type=float),
                ExplicitFeature(name="Night_Calls", type=int),
                ExplicitFeature(name="Intl_Mins", type=float),
                ExplicitFeature(name="Intl_Calls", type=int),
                ExplicitFeature(name="CustServ_Calls", type=int),
                ExplicitFeature(name="Agitation_Level", type=int),
            ],
            outputs=[
                InferenceOutput(name="Churn_Probability", type=float)
            ])
        return model_schema
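A quick way to sanity-check this model before pushing a build is to exercise it locally. Below is a minimal smoke-test sketch, assuming the class lives in `main/model.py` (this diff shows the notebook checkpoint copy), that `data.csv` contains the schema columns above plus the dropped bookkeeping columns, and that the `@qwak.api()` decorator passes calls through outside the Qwak runtime:

```python
import pandas as pd

from model import XGBoostChurnPredictionModel

# Hypothetical local smoke test: train the model, then score one record.
model = XGBoostChurnPredictionModel()
model.build()

sample = pd.DataFrame([{
    "User_Id": "user-123", "State": "NY", "Account_Length": 120,
    "Area_Code": "415", "Intl_Plan": 0, "VMail_Plan": 1, "VMail_Message": 12,
    "Day_Mins": 180.5, "Day_Calls": 100, "Eve_Mins": 200.1, "Eve_Calls": 90,
    "Night_Mins": 210.7, "Night_Calls": 80, "Intl_Mins": 10.2, "Intl_Calls": 3,
    "CustServ_Calls": 2, "Agitation_Level": 1,
}])

print(model.predict(sample))  # DataFrame with a Churn_Probability column
```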
48 changes: 48 additions & 0 deletions sentiment_analysis/README.md
@@ -0,0 +1,48 @@
# Sentiment Analysis Model with JFrog ML

## Overview

This project demonstrates a sentiment analysis model built and served with [JFrog ML](https://docs.qwak.com/docs/introduction).

### Features

- Fine-tunes a Transformer-based sentiment classifier (`main/finetuning.py`)
- Packages the model as a JFrog ML `QwakModel` for building and serving (`main/model.py`)
- Deploys as a real-time endpoint on the JFrog ML platform

<br>

## How to Run Remotely on JFrog ML

1. **Build on the JFrog ML Platform**:

Create a new model on JFrog ML using the command:

```bash
qwak models create "Sentiment Analysis" --project "Sample Project"
```

Initiate a model build with:

```bash
qwak models build --model-id <your-model-id> ./main
```

2. **Deploy the Model on the JFrog ML Platform with a Real-Time Endpoint**:

To deploy your model via the CLI, use the following command:

```bash
qwak models deploy realtime --model-id <your-model-id> --build-id <your-build-id>
```

<br>
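Once the real-time deployment is live, the endpoint can be called from Python. The sketch below assumes the `qwak-inference` client package is installed and that the model's schema expects a single `text` field (a placeholder name, since the schema is not shown in this README):

```python
from qwak_inference import RealTimeClient

# <your-model-id> is the same ID used in the build and deploy commands above.
client = RealTimeClient(model_id="<your-model-id>")

# The payload shape must match the model's schema; "text" is a placeholder field name.
response = client.predict([{"text": "The build was fast and the deployment just worked."}])
print(response)
```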

## Project Structure

```bash
.
├── main                  # Main directory containing core code
│   ├── finetuning.py     # Fine-tuning script
│   ├── model.py          # Defines the Sentiment Analysis Model
│   ├── poetry.yaml       # Poetry configuration file
│   └── pyproject.toml    # Project dependencies and metadata
└── README.md             # Documentation
```
<br>
114 changes: 114 additions & 0 deletions sentiment_analysis/main/finetuning.py
@@ -0,0 +1,114 @@
# Fine-tuning utilities: dataset wrapping, tokenization, and train/eval loops
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset


# Define PyTorch Dataset
class CustomDataset(Dataset):
    def __init__(self, examples):
        self.examples = examples

    def __getitem__(self, idx):
        item = self.examples[idx]
        return {
            'input_ids': torch.tensor(item['input_ids']),
            'attention_mask': torch.tensor(item['attention_mask']),
            'label': torch.tensor(item['label']),
        }

    def __len__(self):
        return len(self.examples)


def tokenize_function(examples, tokenizer):
    return tokenizer(examples["sentence"], padding="max_length", truncation=True)


def generate_dataset(tokenizer, dataset) -> tuple[CustomDataset, CustomDataset]:
    tokenized_datasets = dataset.map(lambda examples: tokenize_function(examples, tokenizer), batched=True)

    # Train-validation split
    train_dataset, eval_dataset = (
        tokenized_datasets["train"],
        tokenized_datasets["validation"],
    )

    train_dataset = CustomDataset(train_dataset)
    eval_dataset = CustomDataset(eval_dataset)
    return train_dataset, eval_dataset

def eval_model(model, device, eval_loader):
    print("Running model evaluation")
    # Evaluation
    model.eval()
    eval_loss = 0
    total_eval_batches = len(eval_loader)
    # Log roughly every 10% of evaluation batches (at least every batch for tiny loaders)
    log_interval = max(1, total_eval_batches // 10)
    for batch_idx, batch in enumerate(eval_loader):
        with torch.no_grad():
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            eval_loss += loss.item()

        if batch_idx % log_interval == 0:
            print(f"Evaluation Batch {batch_idx}/{total_eval_batches}, Eval Loss: {eval_loss / (batch_idx + 1):.4f}")
    avg_eval_loss = eval_loss / total_eval_batches
    return avg_eval_loss

def train_model(
    model, device, lr, num_epochs, train_loader, eval_loader, early_stopping, logger
):
    # Early stopping configuration
    patience = 3
    best_eval_loss = float("inf")
    epochs_no_improve = 0
    # Log roughly every 10% of training batches (at least every batch for tiny loaders)
    log_interval = max(1, len(train_loader) // 10)
    # Define the optimizer
    optimizer = AdamW(model.parameters(), lr=lr)
    # Fine-tuning loop
    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss = 0
        for batch_idx, batch in enumerate(train_loader):
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            train_loss += loss.item()
            loss.backward()
            optimizer.step()
            # Log every 10% of batches
            if batch_idx % log_interval == 0:
                print(f"Epoch {epoch + 1}/{num_epochs}, Batch {batch_idx}/{len(train_loader)}, Train Loss: {train_loss / (batch_idx + 1):.4f}")

        avg_train_loss = train_loss / len(train_loader)

        avg_eval_loss = eval_model(model, device, eval_loader)
        print(
            f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Eval Loss: {avg_eval_loss:.4f}"
        )

        # Early stopping: stop when eval loss has not improved for `patience` epochs
        if avg_eval_loss < best_eval_loss:
            best_eval_loss = avg_eval_loss
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1

        if early_stopping and epochs_no_improve >= patience:
            print(f"Early stopping after {epoch + 1} epochs.")
            break
    return model
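For context, here is a hypothetical sketch of how these helpers could be wired together, for example from a `QwakModel.build()` method. The base model, batch sizes, and the GLUE SST-2 dataset are assumptions, not taken from this commit:

```python
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from finetuning import generate_dataset, train_model

# Assumed base model and dataset; SST-2 provides the "sentence" and "label"
# columns that tokenize_function and CustomDataset expect.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

dataset = load_dataset("glue", "sst2")
train_dataset, eval_dataset = generate_dataset(tokenizer, dataset)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=16)

model = train_model(
    model, device, lr=2e-5, num_epochs=3,
    train_loader=train_loader, eval_loader=eval_loader,
    early_stopping=True, logger=None,
)
```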
