VectorInstitute · Adibvafa · Apr 17, 2024 · Apr 16, 2024 · Apr 16, 2024 · Apr 16, 2024
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -16,7 +16,7 @@ repos:
     - id: check-toml
 
   - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: 'v0.3.1'
+    rev: 'v0.3.7'
     hooks:
     - id: ruff
       args: [--fix, --exit-non-zero-on-fix]

diff --git a/evaluation/AttentionVisualization.ipynb b/evaluation/AttentionVisualization.ipynb
@@ -45,53 +45,28 @@
    "outputs": [],
    "source": [
     "import os\n",
-    "import sys\n",
-    "from typing import Any, Dict\n",
     "\n",
     "import numpy as np\n",
-    "import matplotlib.pyplot as plt\n",
-    "from matplotlib.patches import Rectangle\n",
-    "import plotly.figure_factory as ff\n",
     "import plotly.graph_objects as go\n",
-    "import seaborn as sns\n",
-    "\n",
-    "import pytorch_lightning as pl\n",
     "import torch\n",
-    "from torch.utils.data import Subset\n",
-    "from lightning.pytorch.loggers import WandbLogger\n",
-    "from pytorch_lightning.callbacks import (\n",
-    "    EarlyStopping,\n",
-    "    LearningRateMonitor,\n",
-    "    ModelCheckpoint,\n",
-    ")\n",
-    "from pytorch_lightning.strategies.ddp import DDPStrategy\n",
-    "from pytorch_lightning.strategies import DeepSpeedStrategy\n",
-    "from sklearn.model_selection import train_test_split\n",
-    "from torch.utils.data import DataLoader\n",
-    "from torch.utils.data import Subset\n",
-    "\n",
-    "from bertviz.transformers_neuron_view import BertModel, BertTokenizer\n",
+    "from bertviz import head_view, model_view\n",
     "from bertviz.neuron_view import show\n",
-    "from transformers import AutoTokenizer, AutoModel, utils\n",
-    "from bertviz import model_view, head_view\n",
+    "from torch.utils.data import DataLoader, Subset\n",
+    "from transformers import utils\n",
+    "\n",
     "\n",
     "utils.logging.set_verbosity_error()  # Suppress standard warnings\n",
     "\n",
     "\n",
     "ROOT = \"/fs01/home/afallah/odyssey/odyssey\"\n",
     "os.chdir(ROOT)\n",
     "\n",
-    "from lib.data import FinetuneDataset\n",
-    "from lib.tokenizer import ConceptTokenizer\n",
-    "from lib.utils import (\n",
-    "    get_run_id,\n",
-    "    load_config,\n",
+    "from odyssey.data.dataset import FinetuneDataset\n",
+    "from odyssey.data.tokenizer import ConceptTokenizer\n",
+    "from odyssey.models.prediction import load_finetuned_model, predict_patient_outcomes\n",
+    "from odyssey.models.utils import (\n",
     "    load_finetune_data,\n",
-    "    seed_everything,\n",
-    ")\n",
-    "from lib.prediction import load_finetuned_model, predict_patient_outcomes\n",
-    "from models.big_bird_cehr.model import BigBirdFinetune, BigBirdPretrain\n",
-    "from models.cehr_bert.model import BertFinetune, BertPretrain"
+    ")"
    ]
   },
   {
@@ -996,7 +971,7 @@
     "\n",
     "for i in range(len(attention_matrix)):\n",
     "    truncated_attention_matrix.append(\n",
-    "        attention_matrix[i][:, :, :truncate_at, :truncate_at]\n",
+    "        attention_matrix[i][:, :, :truncate_at, :truncate_at],\n",
     "    )\n",
     "\n",
     "truncated_attention_matrix = tuple(truncated_attention_matrix)\n",
@@ -3218,7 +3193,7 @@
     "                    textangle=-90,\n",
     "                    bgcolor=\"red\",\n",
     "                    opacity=0.8,\n",
-    "                )\n",
+    "                ),\n",
     "            )\n",
     "\n",
     "    # Plot the attention matrix as a heatmap\n",
@@ -3231,7 +3206,7 @@
     "            hoverinfo=\"text\",\n",
     "            text=hover_text,\n",
     "            colorscale=\"YlGnBu\",\n",
-    "        )\n",
+    "        ),\n",
     "    )\n",
     "\n",
     "    fig.update_layout(\n",
@@ -3256,7 +3231,7 @@
     "        print(\n",
     "            f\"Token {tokenizer.id_to_token(concept_ids[token1])} \"\n",
     "            f\"with Token {tokenizer.id_to_token(concept_ids[token2])}: \"\n",
-    "            f\"Attention Value {attention_value:.3f}\"\n",
+    "            f\"Attention Value {attention_value:.3f}\",\n",
     "        )\n",
     "\n",
     "    fig.show()\n",

diff --git a/evaluation/TestAnalysis.ipynb b/evaluation/TestAnalysis.ipynb
@@ -0,0 +1,266 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-04-10 12:13:14,754] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "\n",
+    "import torch\n",
+    "from sklearn.metrics import (\n",
+    "    auc,\n",
+    "    average_precision_score,\n",
+    "    balanced_accuracy_score,\n",
+    "    f1_score,\n",
+    "    precision_recall_curve,\n",
+    "    precision_score,\n",
+    "    recall_score,\n",
+    "    roc_auc_score,\n",
+    ")\n",
+    "from transformers import utils\n",
+    "\n",
+    "\n",
+    "utils.logging.set_verbosity_error()  # Suppress standard warnings\n",
+    "\n",
+    "\n",
+    "ROOT = \"/fs01/home/afallah/odyssey/odyssey\"\n",
+    "os.chdir(ROOT)\n",
+    "\n",
+    "from odyssey.data.tokenizer import ConceptTokenizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class config:\n",
+    "    \"\"\"Save the configuration arguments.\"\"\"\n",
+    "\n",
+    "    model_path = \"test_epoch_end.ckpt\"\n",
+    "    vocab_dir = \"data/vocab\"\n",
+    "    data_dir = \"data/bigbird_data\"\n",
+    "    sequence_file = \"patient_sequences/patient_sequences_2048_mortality.parquet\"\n",
+    "    id_file = \"patient_id_dict/dataset_2048_mortality_1month.pkl\"\n",
+    "    valid_scheme = \"few_shot\"\n",
+    "    num_finetune_patients = \"20000\"\n",
+    "    label_name = \"label_mortality_1month\"\n",
+    "\n",
+    "    max_len = 2048\n",
+    "    batch_size = 1\n",
+    "    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenizer = ConceptTokenizer(data_dir=config.vocab_dir)\n",
+    "tokenizer.fit_on_vocab()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "dict_keys(['epoch', 'global_step', 'pytorch-lightning_version', 'state_dict', 'loops', 'callbacks', 'optimizer_states', 'lr_schedulers', 'MixedPrecision'])"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model = torch.load(config.model_path, map_location=config.device)\n",
+    "model.keys()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'loss': tensor(0.1638, dtype=torch.float64),\n",
+       " 'preds': tensor([6, 7, 0,  ..., 7, 7, 7]),\n",
+       " 'labels': tensor([[1., 0., 0.,  ..., 0., 0., 0.],\n",
+       "         [0., 0., 0.,  ..., 0., 0., 0.],\n",
+       "         [1., 0., 0.,  ..., 0., 0., 0.],\n",
+       "         ...,\n",
+       "         [0., 0., 0.,  ..., 0., 0., 0.],\n",
+       "         [0., 0., 0.,  ..., 0., 0., 0.],\n",
+       "         [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64),\n",
+       " 'logits': tensor([[ 2.3418, -2.0781,  0.2194,  ..., -5.3945, -7.1797, -3.4180],\n",
+       "         [-1.6533, -3.4277, -6.8086,  ..., -6.8359, -5.2266, -5.6484],\n",
+       "         [ 1.0947, -3.7930, -6.1094,  ..., -6.3867, -6.6836, -5.5508],\n",
+       "         ...,\n",
+       "         [-2.8223, -3.7285, -4.6797,  ..., -7.9922, -5.6992, -6.7812],\n",
+       "         [-3.7148, -5.6328, -6.7188,  ..., -9.4062, -7.6445, -7.6016],\n",
+       "         [-2.5840, -2.2871, -4.6484,  ..., -7.8633, -4.7539, -6.4648]],\n",
+       "        dtype=torch.float16)}"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "test_outputs = torch.load(\"test_outputs.pt\")\n",
+    "test_outputs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'Balanced Accuracy': 0.5, 'F1 Score': 0.0, 'Precision': 0.0, 'Recall': 0.0, 'AUROC': 0.8100258785715974, 'Average Precision Score': 0.001364147006900979, 'AUC-PR': 0.5006820735034505}\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/fs01/home/afallah/light/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1497: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
+     ]
+    }
+   ],
+   "source": [
+    "def calculate_metrics(y_true, y_pred, y_prob):\n",
+    "    \"\"\"Calculate and return performance metrics for one binary target.\n",
+    "\n",
+    "    Threshold metrics are computed from the hard 0/1 predictions `y_pred`;\n",
+    "    AUROC, average precision and AUC-PR are computed from the scores `y_prob`.\n",
+    "    \"\"\"\n",
+    "    metrics = {\n",
+    "        \"Balanced Accuracy\": balanced_accuracy_score(y_true, y_pred),\n",
+    "        \"F1 Score\": f1_score(y_true, y_pred),\n",
+    "        \"Precision\": precision_score(y_true, y_pred),\n",
+    "        \"Recall\": recall_score(y_true, y_pred),\n",
+    "        \"AUROC\": roc_auc_score(y_true, y_prob),\n",
+    "        \"Average Precision Score\": average_precision_score(y_true, y_prob),\n",
+    "    }\n",
+    "    precision, recall, _ = precision_recall_curve(y_true, y_prob)\n",
+    "    metrics[\"AUC-PR\"] = auc(recall, precision)\n",
+    "    return metrics\n",
+    "\n",
+    "\n",
+    "targets = [10]\n",
+    "\n",
+    "for i in targets:\n",
+    "    labels = test_outputs[\"labels\"][:, i]\n",
+    "    logits = torch.sigmoid(test_outputs[\"logits\"][:, i])\n",
+    "    preds = (logits >= 0.5).int()\n",
+    "\n",
+    "    print(calculate_metrics(labels, preds, logits))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor(0)"
+      ]
+     },
+     "execution_count": 68,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "preds.sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor(34., dtype=torch.float64)"
+      ]
+     },
+     "execution_count": 69,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "labels.sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([0, 0, 0,  ..., 0, 0, 0], dtype=torch.int32)"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "preds"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}