Showing 6 changed files with 4,813 additions and 0 deletions.
churn_model_new/main/.ipynb_checkpoints/model-checkpoint.py (100 additions, 0 deletions)
import os

import pandas as pd
import qwak
import xgboost as xgb
from qwak.model.base import QwakModel
from qwak.model.schema import ExplicitFeature, ModelSchema, InferenceOutput
from sklearn.model_selection import train_test_split


class XGBoostChurnPredictionModel(QwakModel):

    def __init__(self):
        self.params = {
            'n_estimators': int(os.getenv('n_estimators', 300)),
            'learning_rate': float(os.getenv('learning_rate', 0.05)),
            'objective': 'binary:logistic'
        }

        # Create an XGBoost classifier with the specified parameters
        self.model = xgb.XGBClassifier(**self.params)

        # Log model parameters to Qwak for tracking purposes
        qwak.log_param(self.params)

    def build(self):
        file_absolute_path = os.path.dirname(os.path.abspath(__file__))
        df = pd.read_csv(f"{file_absolute_path}/data.csv")

        # Creating the X and y variables
        y = df['churn']
        X = df.drop(['churn', 'User_Id', '__index_level_0__',
                     'event date', 'Phone', 'State'], axis=1)

        # Splitting X and y into train and validation sets
        X_train, X_validation, y_train, y_validation = train_test_split(
            X, y, test_size=0.25, random_state=42
        )

        # Training our XGBoost model with a validation set for evaluation
        self.model.fit(
            X_train,
            y_train,
            eval_set=[(X_validation, y_validation)]
        )

        # Log metrics and the training data into Qwak
        accuracy = self.model.score(X_validation, y_validation)
        qwak.log_metric({"val_accuracy": accuracy})
        qwak.log_data(dataframe=X, tag="train_data")

    @qwak.api()
    def predict(self, df):
        """
        The predict(df) method is the actual inference method.
        """
        # Getting the original column order used at training time
        feature_order = self.model.get_booster().feature_names

        # Reordering the prediction data to match the training features
        prediction_data = df.drop(
            ['User_Id', 'State'], axis=1
        ).reindex(columns=feature_order)

        # Return the positive-class probability so the output matches the
        # Churn_Probability float declared in schema() (predict() alone
        # would return hard 0/1 labels)
        predictions = self.model.predict_proba(prediction_data)[:, 1]

        return pd.DataFrame(
            predictions,
            columns=['Churn_Probability']
        )

    def schema(self):
        """
        schema() defines the model input structure.
        Use it to enforce the structure of incoming requests.
        """
        model_schema = ModelSchema(
            inputs=[
                ExplicitFeature(name="User_Id", type=str),
                ExplicitFeature(name="State", type=str),
                ExplicitFeature(name="Account_Length", type=int),
                ExplicitFeature(name="Area_Code", type=str),
                ExplicitFeature(name="Intl_Plan", type=int),
                ExplicitFeature(name="VMail_Plan", type=int),
                ExplicitFeature(name="VMail_Message", type=int),
                ExplicitFeature(name="Day_Mins", type=float),
                ExplicitFeature(name="Day_Calls", type=int),
                ExplicitFeature(name="Eve_Mins", type=float),
                ExplicitFeature(name="Eve_Calls", type=int),
                ExplicitFeature(name="Night_Mins", type=float),
                ExplicitFeature(name="Night_Calls", type=int),
                ExplicitFeature(name="Intl_Mins", type=float),
                ExplicitFeature(name="Intl_Calls", type=int),
                ExplicitFeature(name="CustServ_Calls", type=int),
                ExplicitFeature(name="Agitation_Level", type=int),
            ],
            outputs=[
                InferenceOutput(name="Churn_Probability", type=float)
            ])
        return model_schema
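
For a quick sanity check of this churn model outside the Qwak build pipeline, something like the sketch below could work. The field names and types come straight from schema() above; the sample values, the module name in the import, and calling build() and predict() directly instead of through the platform runtime are illustrative assumptions.

```python
# Hypothetical local smoke test -- assumes data.csv sits next to the model file
# and that calling build()/predict() directly approximates the Qwak runtime.
import pandas as pd

from model import XGBoostChurnPredictionModel  # assumed module name

model = XGBoostChurnPredictionModel()
model.build()  # trains on data.csv and logs params/metrics to Qwak

# One sample row containing every field declared in schema(); values are made up
# and may need to mirror the dtypes used in data.csv for XGBoost to accept them.
sample = pd.DataFrame([{
    "User_Id": "user-123", "State": "NY", "Account_Length": 120,
    "Area_Code": "415", "Intl_Plan": 0, "VMail_Plan": 1, "VMail_Message": 24,
    "Day_Mins": 180.5, "Day_Calls": 100, "Eve_Mins": 200.1, "Eve_Calls": 90,
    "Night_Mins": 210.0, "Night_Calls": 80, "Intl_Mins": 10.2, "Intl_Calls": 3,
    "CustServ_Calls": 2, "Agitation_Level": 1,
}])

print(model.predict(sample))  # DataFrame with a single Churn_Probability column
```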
README.md (48 additions, 0 deletions)
# Sentiment Analysis Model with JFrog ML

## Overview

This project implements a sentiment analysis model using [JFrog ML](https://docs.qwak.com/docs/introduction).

### Features

<br>
## How to Run Remotely on JFrog ML

1. **Build on the JFrog ML Platform**:

    Create a new model on JFrog ML using the command:

    ```bash
    qwak models create "Sentiment Analysis" --project "Sample Project"
    ```

    Initiate a model build with:

    ```bash
    qwak models build --model-id <your-model-id> ./main
    ```

2. **Deploy the Model on the JFrog ML Platform with a Real-Time Endpoint**:

    To deploy your model via the CLI, use the following command:

    ```bash
    qwak models deploy realtime --model-id <your-model-id> --build-id <your-build-id>
    ```
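
Once the real-time deployment is live, the endpoint can also be called from Python. The snippet below is only a sketch: it assumes the qwak-inference package's RealTimeClient and a single "text" input column, since the sentiment model's schema is not shown in this commit.

```python
# Hypothetical client call; RealTimeClient usage and the "text" column are assumptions.
import pandas as pd
from qwak_inference import RealTimeClient

client = RealTimeClient(model_id="<your-model-id>")
df = pd.DataFrame([{"text": "The build finished quickly and the docs were clear."}])
print(client.predict(df))
```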
<br>

## Project Structure

```bash
.
├── main                   # Main directory containing core code
│   ├── finetuning.py      # Fine-tuning script
│   ├── model.py           # Defines the Sentiment Analysis Model
│   ├── poetry.yaml        # Poetry configuration file
│   └── pyproject.toml     # Project metadata and dependencies
└── README.md              # Documentation
```

<br>
finetuning.py (114 additions, 0 deletions)
# Fine-tuning utilities: dataset wrapping, tokenization, training and evaluation
import torch
from torch.optim import AdamW  # replaces the deprecated transformers.AdamW
from torch.utils.data import Dataset
from tqdm import tqdm


# Define PyTorch Dataset
class CustomDataset(Dataset):
    def __init__(self, examples):
        self.examples = examples

    def __getitem__(self, idx):
        item = self.examples[idx]
        return {
            'input_ids': torch.tensor(item['input_ids']),
            'attention_mask': torch.tensor(item['attention_mask']),
            'label': torch.tensor(item['label']),
        }

    def __len__(self):
        return len(self.examples)


def tokenize_function(examples, tokenizer):
    return tokenizer(examples["sentence"], padding="max_length", truncation=True)


def generate_dataset(tokenizer, dataset) -> tuple[CustomDataset, CustomDataset]:
    # Tokenize the data
    tokenized_datasets = dataset.map(lambda examples: tokenize_function(examples, tokenizer), batched=True)

    # Train-validation split
    train_dataset, eval_dataset = (
        tokenized_datasets["train"],
        tokenized_datasets["validation"],
    )

    train_dataset = CustomDataset(train_dataset)
    eval_dataset = CustomDataset(eval_dataset)
    return train_dataset, eval_dataset


def eval_model(model, device, eval_loader):
    print("Running model evaluation")
    # Evaluation
    model.eval()
    eval_loss = 0
    total_eval_batches = len(eval_loader)
    log_every = max(1, total_eval_batches // 10)  # guard against fewer than 10 batches
    for batch_idx, batch in enumerate(eval_loader):
        with torch.no_grad():
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            eval_loss += loss.item()

        # Log every 10% of evaluation batches
        if batch_idx % log_every == 0:
            print(f"Evaluation Batch {batch_idx}/{total_eval_batches}, Eval Loss: {eval_loss / (batch_idx + 1):.4f}")
    avg_eval_loss = eval_loss / total_eval_batches
    return avg_eval_loss


def train_model(
    model, device, lr, num_epochs, train_loader, eval_loader, early_stopping, logger
):
    # Early stopping configuration
    patience = 3
    best_eval_loss = float("inf")
    epochs_no_improve = 0
    log_interval = max(1, len(train_loader) // 10)  # guard against fewer than 10 batches
    # Define optimizer
    optimizer = AdamW(model.parameters(), lr=lr)
    # Fine-tuning loop
    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss = 0
        for batch_idx, batch in enumerate(train_loader):
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            train_loss += loss.item()
            loss.backward()
            optimizer.step()
            # Log every 10% of batches
            if batch_idx % log_interval == 0:
                print(f"Epoch {epoch + 1}/{num_epochs}, Batch {batch_idx}/{len(train_loader)}, Train Loss: {train_loss / (batch_idx + 1):.4f}")

        avg_train_loss = train_loss / len(train_loader)

        avg_eval_loss = eval_model(model, device, eval_loader)
        print(
            f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Eval Loss: {avg_eval_loss:.4f}"
        )

        # Early stopping on the validation loss
        if early_stopping:
            if avg_eval_loss < best_eval_loss:
                best_eval_loss = avg_eval_loss
                epochs_no_improve = 0
            else:
                epochs_no_improve += 1

            if epochs_no_improve == patience:
                print(f"Early stopping after {epoch + 1} epochs.")
                break
    return model
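
For context, here is one way these helpers could be wired together into a complete fine-tuning run. The base checkpoint, the GLUE/SST-2 dataset (chosen because its "sentence"/"label" columns and train/validation splits match what tokenize_function and generate_dataset expect), batch sizes, and hyperparameters are all assumptions rather than values taken from this repository; the model.py that actually drives this script is not part of the excerpt shown here.

```python
# Hypothetical driver for the helpers above; checkpoint, dataset and
# hyperparameters are illustrative assumptions.
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from finetuning import generate_dataset, train_model  # module name per the project tree

checkpoint = "distilbert-base-uncased"  # assumed base model
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# SST-2 provides a "sentence" column and train/validation splits,
# matching what tokenize_function and generate_dataset expect.
dataset = load_dataset("glue", "sst2")
train_dataset, eval_dataset = generate_dataset(tokenizer, dataset)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=16)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

model = train_model(
    model, device, lr=2e-5, num_epochs=3,
    train_loader=train_loader, eval_loader=eval_loader,
    early_stopping=True, logger=None,
)
```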