
Commit

Added Sentiment Analysis
Added Sentiment Analysis example
williammanning committed Dec 31, 2024
1 parent 9b1eb26 commit 173250d
Showing 6 changed files with 4,813 additions and 0 deletions.
100 changes: 100 additions & 0 deletions churn_model_new/main/.ipynb_checkpoints/model-checkpoint.py
@@ -0,0 +1,100 @@
import os

import pandas as pd
import qwak
import xgboost as xgb
from qwak.model.base import QwakModel
from qwak.model.schema import ExplicitFeature, ModelSchema, InferenceOutput
from sklearn.model_selection import train_test_split


class XGBoostChurnPredictionModel(QwakModel):

    def __init__(self):
        self.params = {
            'n_estimators': int(os.getenv('n_estimators', 300)),
            'learning_rate': float(os.getenv('learning_rate', 0.05)),
            'objective': 'binary:logistic'
        }

        # Create an XGBoost classifier with the specified parameters
        self.model = xgb.XGBClassifier(**self.params)

        # Log model parameters to Qwak for tracking purposes
        qwak.log_param(self.params)

    def build(self):
        file_absolute_path = os.path.dirname(os.path.abspath(__file__))
        df = pd.read_csv(f"{file_absolute_path}/data.csv")

        # Creating the X and y variables
        y = df['churn']
        X = df.drop(['churn', 'User_Id', '__index_level_0__',
                     'event date', 'Phone', 'State'], axis=1)

        # Splitting X and y into train and validation sets
        X_train, X_validation, y_train, y_validation = train_test_split(
            X, y, test_size=0.25, random_state=42
        )

        # Training our XGBoost model
        self.model.fit(
            X_train,
            y_train,
            eval_set=[(X_validation, y_validation)]
        )

        # Log metrics into Qwak
        accuracy = self.model.score(X_validation, y_validation)
        qwak.log_metric({"val_accuracy": accuracy})
        qwak.log_data(dataframe=X, tag="train_data")

    @qwak.api()
    def predict(self, df):
        """
        The predict(df) method is the actual inference method.
        """
        # Get the original column order used during training
        feature_order = self.model.get_booster().feature_names

        # Reorder the prediction data to match the training feature order
        prediction_data = df.drop(
            ['User_Id', 'State'], axis=1
        ).reindex(columns=feature_order)

        # Return the positive-class probability, matching the declared output schema
        predictions = self.model.predict_proba(prediction_data)[:, 1]

        return pd.DataFrame(
            predictions,
            columns=['Churn_Probability']
        )

    def schema(self):
        """
        schema() defines the model input structure.
        Use it to enforce the structure of incoming requests.
        """
        model_schema = ModelSchema(
            inputs=[
                ExplicitFeature(name="User_Id", type=str),
                ExplicitFeature(name="State", type=str),
                ExplicitFeature(name="Account_Length", type=int),
                ExplicitFeature(name="Area_Code", type=str),
                ExplicitFeature(name="Intl_Plan", type=int),
                ExplicitFeature(name="VMail_Plan", type=int),
                ExplicitFeature(name="VMail_Message", type=int),
                ExplicitFeature(name="Day_Mins", type=float),
                ExplicitFeature(name="Day_Calls", type=int),
                ExplicitFeature(name="Eve_Mins", type=float),
                ExplicitFeature(name="Eve_Calls", type=int),
                ExplicitFeature(name="Night_Mins", type=float),
                ExplicitFeature(name="Night_Calls", type=int),
                ExplicitFeature(name="Intl_Mins", type=float),
                ExplicitFeature(name="Intl_Calls", type=int),
                ExplicitFeature(name="CustServ_Calls", type=int),
                ExplicitFeature(name="Agitation_Level", type=int),
            ],
            outputs=[
                InferenceOutput(name="Churn_Probability", type=float)
            ])
        return model_schema
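A quick way to sanity-check this model before pushing a build is to exercise it locally. Below is a minimal smoke-test sketch, assuming the class lives in `main/model.py` (this diff shows the notebook checkpoint copy), that `data.csv` contains the schema columns above plus the dropped bookkeeping columns, and that the `@qwak.api()` decorator passes calls through outside the Qwak runtime:

```python
import pandas as pd

from model import XGBoostChurnPredictionModel

# Hypothetical local smoke test: train the model, then score one record.
model = XGBoostChurnPredictionModel()
model.build()

sample = pd.DataFrame([{
    "User_Id": "user-123", "State": "NY", "Account_Length": 120,
    "Area_Code": "415", "Intl_Plan": 0, "VMail_Plan": 1, "VMail_Message": 12,
    "Day_Mins": 180.5, "Day_Calls": 100, "Eve_Mins": 200.1, "Eve_Calls": 90,
    "Night_Mins": 210.7, "Night_Calls": 80, "Intl_Mins": 10.2, "Intl_Calls": 3,
    "CustServ_Calls": 2, "Agitation_Level": 1,
}])

print(model.predict(sample))  # DataFrame with a Churn_Probability column
```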
48 changes: 48 additions & 0 deletions sentiment_analysis/README.md
@@ -0,0 +1,48 @@
# Sentiment Analysis Model with JFrog ML

## Overview

This project demonstrates a sentiment analysis model built and served with [JFrog ML](https://docs.qwak.com/docs/introduction).

### Features

- Fine-tunes a Transformer-based sentiment classifier (`main/finetuning.py`)
- Packages the model as a JFrog ML `QwakModel` for building and serving (`main/model.py`)
- Deploys as a real-time endpoint on the JFrog ML platform

<br>

## How to Run Remotely on JFrog ML

1. **Build on the JFrog ML Platform**:

Create a new model on JFrog ML using the command:

```bash
qwak models create "Sentiment Analysis" --project "Sample Project"
```

Initiate a model build with:

```bash
qwak models build --model-id <your-model-id> ./main
```

2. **Deploy the Model on the JFrog ML Platform with a Real-Time Endpoint**:

To deploy your model via the CLI, use the following command:

```bash
qwak models deploy realtime --model-id <your-model-id> --build-id <your-build-id>
```

<br>
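Once the real-time deployment is live, the endpoint can be called from Python. The sketch below assumes the `qwak-inference` client package is installed and that the model's schema expects a single `text` field (a placeholder name, since the schema is not shown in this README):

```python
from qwak_inference import RealTimeClient

# <your-model-id> is the same ID used in the build and deploy commands above.
client = RealTimeClient(model_id="<your-model-id>")

# The payload shape must match the model's schema; "text" is a placeholder field name.
response = client.predict([{"text": "The build was fast and the deployment just worked."}])
print(response)
```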

## Project Structure

```bash
.
├── main                  # Main directory containing core code
│   ├── finetuning.py     # Fine-tuning script
│   ├── model.py          # Defines the Sentiment Analysis Model
│   ├── poetry.yaml       # Poetry configuration file
│   └── pyproject.toml    # Project dependencies and metadata
└── README.md             # Documentation
```
<br>
114 changes: 114 additions & 0 deletions sentiment_analysis/main/finetuning.py
@@ -0,0 +1,114 @@
# Fine-tuning utilities: dataset wrapping, tokenization, and train/eval loops
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset


# Define PyTorch Dataset
class CustomDataset(Dataset):
    def __init__(self, examples):
        self.examples = examples

    def __getitem__(self, idx):
        item = self.examples[idx]
        return {
            'input_ids': torch.tensor(item['input_ids']),
            'attention_mask': torch.tensor(item['attention_mask']),
            'label': torch.tensor(item['label']),
        }

    def __len__(self):
        return len(self.examples)


def tokenize_function(examples, tokenizer):
    return tokenizer(examples["sentence"], padding="max_length", truncation=True)


def generate_dataset(tokenizer, dataset) -> tuple[CustomDataset, CustomDataset]:
    tokenized_datasets = dataset.map(lambda examples: tokenize_function(examples, tokenizer), batched=True)

    # Train-validation split
    train_dataset, eval_dataset = (
        tokenized_datasets["train"],
        tokenized_datasets["validation"],
    )

    train_dataset = CustomDataset(train_dataset)
    eval_dataset = CustomDataset(eval_dataset)
    return train_dataset, eval_dataset

def eval_model(model, device, eval_loader):
    print("Running model evaluation")
    # Evaluation
    model.eval()
    eval_loss = 0
    total_eval_batches = len(eval_loader)
    # Log roughly every 10% of evaluation batches (at least every batch for tiny loaders)
    log_interval = max(1, total_eval_batches // 10)
    for batch_idx, batch in enumerate(eval_loader):
        with torch.no_grad():
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            eval_loss += loss.item()

        if batch_idx % log_interval == 0:
            print(f"Evaluation Batch {batch_idx}/{total_eval_batches}, Eval Loss: {eval_loss / (batch_idx + 1):.4f}")
    avg_eval_loss = eval_loss / total_eval_batches
    return avg_eval_loss

def train_model(
    model, device, lr, num_epochs, train_loader, eval_loader, early_stopping, logger
):
    # Early stopping configuration
    patience = 3
    best_eval_loss = float("inf")
    epochs_no_improve = 0
    # Log roughly every 10% of training batches (at least every batch for tiny loaders)
    log_interval = max(1, len(train_loader) // 10)
    # Define the optimizer
    optimizer = AdamW(model.parameters(), lr=lr)
    # Fine-tuning loop
    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss = 0
        for batch_idx, batch in enumerate(train_loader):
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            train_loss += loss.item()
            loss.backward()
            optimizer.step()
            # Log every 10% of batches
            if batch_idx % log_interval == 0:
                print(f"Epoch {epoch + 1}/{num_epochs}, Batch {batch_idx}/{len(train_loader)}, Train Loss: {train_loss / (batch_idx + 1):.4f}")

        avg_train_loss = train_loss / len(train_loader)

        avg_eval_loss = eval_model(model, device, eval_loader)
        print(
            f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Eval Loss: {avg_eval_loss:.4f}"
        )

        # Early stopping: stop when eval loss has not improved for `patience` epochs
        if avg_eval_loss < best_eval_loss:
            best_eval_loss = avg_eval_loss
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1

        if early_stopping and epochs_no_improve >= patience:
            print(f"Early stopping after {epoch + 1} epochs.")
            break
    return model
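For context, here is a hypothetical sketch of how these helpers could be wired together, for example from a `QwakModel.build()` method. The base model, batch sizes, and the GLUE SST-2 dataset are assumptions, not taken from this commit:

```python
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from finetuning import generate_dataset, train_model

# Assumed base model and dataset; SST-2 provides the "sentence" and "label"
# columns that tokenize_function and CustomDataset expect.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

dataset = load_dataset("glue", "sst2")
train_dataset, eval_dataset = generate_dataset(tokenizer, dataset)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=16)

model = train_model(
    model, device, lr=2e-5, num_epochs=3,
    train_loader=train_loader, eval_loader=eval_loader,
    early_stopping=True, logger=None,
)
```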
