diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/formatting-tests.yaml
similarity index 62%
rename from .github/workflows/unit-tests.yaml
rename to .github/workflows/formatting-tests.yaml
index 40c5c10..5d09fa9 100644
--- a/.github/workflows/unit-tests.yaml
+++ b/.github/workflows/formatting-tests.yaml
@@ -1,4 +1,4 @@
-name: unit tests
+name: formatting tests
 
 on:
   push:
@@ -7,35 +7,30 @@ on:
     branches: [ develop ]
 
 jobs:
-  build:
-
+  formatting:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
         os: [ubuntu-latest, macos-latest]
-        python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"]
 
     steps:
     - uses: actions/checkout@v2
-    - name: Set up Python ${{ matrix.python-version }}
+    - name: Set up Python 3.9
       uses: actions/setup-python@v2
       with:
-        python-version: ${{ matrix.python-version }}
+        python-version: 3.9
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
         pip install flake8 pytest
-        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
     - name: Update black
-      if: ${{ matrix.python-version == 3.9 }}
       run: |
         pip install --upgrade black
     - name: Lint and Format Check with flake8 and black
-      if: ${{ matrix.python-version == 3.9 }}
       run: |
         black --diff --check .
         flake8
diff --git a/.github/workflows/nvidia-tests.yaml b/.github/workflows/nvidia-tests.yaml
new file mode 100644
index 0000000..7722155
--- /dev/null
+++ b/.github/workflows/nvidia-tests.yaml
@@ -0,0 +1,34 @@
+name: nvidia-rtx-3090 tests
+
+on:
+  push:
+    branches: [ develop ]
+  pull_request:
+    branches: [ develop ]
+
+jobs:
+  mnist-trainer:
+    runs-on: [ nvidia ]
+
+    strategy:
+      matrix:
+        ginter: [ 1, 2 ]
+        memopt: [ '0', '1' ]
+    steps:
+    - uses: actions/checkout@v3
+    - name: Install AxoNN
+      run: |
+        pip install -r requirements.txt
+    - name: Download dataset
+      run: |
+        python -c "import torchvision; torchvision.datasets.MNIST(root=\"./axonn/tests\", download=True, train=True)"
+    - name: Train
+      run: |
+        export G_inter=${{ matrix.ginter }}
+        export G_data=$(( 2 / G_inter ))
+        export memopt=${{ matrix.memopt }}
+        echo "training with G_inter = ${G_inter}, G_data = ${G_data}, memopt = ${memopt}"
+        mpirun -n 2 pytest --with-mpi
+    - name: Uninstall AxoNN
+      run: |
+        pip uninstall --yes axonn
diff --git a/README.md b/README.md
index 409bc4e..ac1fac7 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,14 @@
 [![Build Status](https://github.com/hpcgroup/axonn/actions/workflows/unit-tests.yaml/badge.svg)](https://github.com/hpcgroup/axonn/actions)
 [![Code Style: Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 
-A parallel framework for training deep neural networks.
+AxoNN is a parallel framework for training deep neural networks.
+
+### Installation
+Before installing AxoNN, make sure [PyTorch](https://pytorch.org/get-started/locally/) is installed.
+
+```bash
+pip install axonn
+```
 
 ### Contributing
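The test added below (`axonn/tests/test_vit.py`) doubles as a usage example of AxoNN's user-facing API. Here is a minimal sketch of that flow; only the `ax.*` calls are taken from this patch, while the `Linear` model, the batch/micro-batch sizes, and the launch command are illustrative stand-ins:

```python
# Minimal AxoNN training-loop sketch distilled from axonn/tests/test_vit.py.
# Launch with e.g. `mpirun -n 2 python sketch.py` so that G_inter * G_data
# matches the number of GPUs.
import torch
import torchvision
from torchvision.transforms import ToTensor
from axonn import axonn as ax

ax.init(G_data=2, G_inter=1, mixed_precision=True, fp16_allreduce=True)

model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(784, 10)).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
ax.register_model_and_optimizer(model, optimizer)  # syncs params across data-parallel ranks
ax.register_loss_fn(torch.nn.CrossEntropyLoss())

train_dataset = torchvision.datasets.MNIST(root="./axonn/tests", train=True, transform=ToTensor())
train_loader = ax.create_dataloader(train_dataset, 64, 16, 0)  # batch size 64, micro-batch size 16

for x, y in train_loader:
    optimizer.zero_grad()
    x, y = x.cuda(), y.cuda()  # the test does this on the first pipeline stage only
    batch_loss = ax.run_batch(x, y, eval_mode=False)  # pipelined forward + backward
    optimizer.step()
```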
diff --git a/axonn/__init__.py b/axonn/__init__.py
new file mode 100644
index 0000000..8abdbbe
--- /dev/null
+++ b/axonn/__init__.py
@@ -0,0 +1,4 @@
+# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
+# See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/axonn/axonn.py b/axonn/axonn.py
index ca8648d..d2f50b1 100644
--- a/axonn/axonn.py
+++ b/axonn/axonn.py
@@ -366,6 +366,7 @@ def _initialize_mixed_precision_with_cpu_offload(
     return model, optimizer
 
 
+@torch.no_grad()
 def register_model_and_optimizer(model_shard, optimizer):
     """AxoNN's user facing function to register a model shard and
     the corresponding optimizer.
@@ -397,6 +398,9 @@ def register_model_and_optimizer(model_shard, optimizer):
         model_params.div_(config.G_data), async_op=False
     )  # sync all parameters across data parallel ranks
 
+    if computation_dtype == torch.float16:
+        model_params_fp32.copy_(model_params_fp16)
+
     fp32_optimizer = optimizer
     fp32_optimizer.skip_next_step = False
diff --git a/axonn/optim.py b/axonn/optim.py
index e341c29..29e00a1 100644
--- a/axonn/optim.py
+++ b/axonn/optim.py
@@ -1,3 +1,8 @@
+# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
+# See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
 import torch
 from torch.optim.optimizer import Optimizer
 from . import axonn as ax
@@ -15,7 +20,7 @@ def __init__(
         lr=1e-3,
         betas=(0.9, 0.999),
         eps=1e-8,
-        weight_decay=1e-2,
+        weight_decay=0,
         bucket_size=16000000,
         coalescing_factor=4,
     ):
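For context on the `weight_decay` change above: the new default of 0 makes `CPUAdam` match `torch.optim.Adam` (the non-offload branch of the test below, which this patch also switches to from `AdamW`). A hedged pairing sketch, with illustrative `G_inter`/`G_data` values and a placeholder model; only `ax.init` and `optim.CPUAdam` are taken from this patch:

```python
# CPUAdam pairs with cpu_offload=True in ax.init, as in axonn/tests/test_vit.py.
# Pass weight_decay=1e-2 explicitly to recover the old AdamW-style default.
import torch
from axonn import axonn as ax
from axonn import optim

ax.init(G_data=1, G_inter=2, mixed_precision=True, fp16_allreduce=True, cpu_offload=True)
model = torch.nn.Linear(8, 8).cuda()  # placeholder for a real model shard
optimizer = optim.CPUAdam(model.parameters(), lr=0.001)
ax.register_model_and_optimizer(model, optimizer)
```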
diff --git a/examples/test_vit.py b/axonn/tests/test_vit.py
similarity index 62%
rename from examples/test_vit.py
rename to axonn/tests/test_vit.py
index a0933bb..4c8b7fd 100644
--- a/examples/test_vit.py
+++ b/axonn/tests/test_vit.py
@@ -4,27 +4,32 @@
-from axonn import axonn as ax
-from axonn import optim
 import torchvision
 from external.models.vit import DistributedViT
 from torchvision.transforms import ToTensor
 import torch
 from tqdm import tqdm
+import pytest
+import os
 
 
+@pytest.mark.mpi
 def test_vit_mnist():
-    bs_per_gpu = 64
-    num_gpus = 6
-    bs = num_gpus * bs_per_gpu
-    mbs = bs_per_gpu
-    epochs = 10
-    cpu_offload = True
-    N, D, H = 12, 768, 12
+    from axonn import axonn as ax
+    from axonn import optim
+
+    G_inter = int(os.environ.get("G_inter"))
+    assert 6 % G_inter == 0
+    G_data = int(os.environ.get("G_data"))
+    bs = int(os.environ.get("batch_size", 64))
+    mbs = int(os.environ.get("micro_batch_size", 16))
+    epochs = int(os.environ.get("epochs", 10))
+    cpu_offload = bool(int(os.environ.get("memopt", "0")))
+    N, D, H = 6, 128, 8
     ax.init(
-        G_data=2,
-        G_inter=3,
+        G_data=G_data,
+        G_inter=G_inter,
         mixed_precision=True,
         fp16_allreduce=True,
         cpu_offload=cpu_offload,
@@ -52,22 +57,19 @@ def test_vit_mnist():
     if cpu_offload:
         optimizer = optim.CPUAdam(model.parameters(), lr=0.001)
     else:
-        optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
+        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
 
     ax.register_model_and_optimizer(model, optimizer)
     ax.register_loss_fn(torch.nn.CrossEntropyLoss())
 
     train_dataset = torchvision.datasets.MNIST(
-        root="./examples/dataset/", train=True, transform=ToTensor()
+        root="./axonn/tests", train=True, transform=ToTensor()
     )
     train_loader = ax.create_dataloader(train_dataset, bs, mbs, 0)
-
+    previous_model_state_memory = None
     for epoch_number in range(epochs):
         epoch_loss = 0
-        for x, y in tqdm(
-            train_loader,
-            disable=not (ilp_rank == 0 and ax.config.data_parallel_rank == 0),
-        ):
+        for x, y in tqdm(train_loader, disable=True):
             optimizer.zero_grad()
             if ilp_rank == 0:
                 x, y = x.cuda(), y.cuda()
@@ -80,10 +82,19 @@ def test_vit_mnist():
             batch_loss = ax.run_batch(x, y, eval_mode=False)
             optimizer.step()
             epoch_loss += batch_loss
+            current_model_state_memory = torch.cuda.memory_allocated()
+            assert (not previous_model_state_memory) or (
+                current_model_state_memory == previous_model_state_memory
+            ), "model state memory should stay the same throughout training"
+            previous_model_state_memory = current_model_state_memory
         if ilp_rank == G_inter - 1:
             ax.print_status(
                 f"Epoch {epoch_number+1} : epoch loss {epoch_loss/len(train_loader)}"
+                f": model state memory = {torch.cuda.memory_allocated()/2**30} GB"
             )
+    assert epoch_loss / len(train_loader) < 0.1, "model did not converge"
+
 
-test_vit_mnist()
+if __name__ == "__main__":
+    test_vit_mnist()
diff --git a/examples/ptb_loader.py b/examples/ptb_loader.py
index 0009ded..ebbbbfa 100644
--- a/examples/ptb_loader.py
+++ b/examples/ptb_loader.py
@@ -1,3 +1,8 @@
+# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
+# See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
 from torch.utils.data import Dataset
 import torch
 import os
diff --git a/examples/test_lm.py b/examples/test_lm.py
index 8080630..18399bf 100644
--- a/examples/test_lm.py
+++ b/examples/test_lm.py
@@ -1,3 +1,8 @@
+# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
+# See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
 from axonn import axonn as ax
 from axonn import optim
 from external.models.nvidia_transformer import DistributedGPT
diff --git a/examples/wikitext_loader.py b/examples/wikitext_loader.py
index 01e62a1..32bf608 100644
--- a/examples/wikitext_loader.py
+++ b/examples/wikitext_loader.py
@@ -1,3 +1,8 @@
+# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
+# See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
 import transformers
 import os
 from tqdm import tqdm
diff --git a/external/models/nvidia_transformer.py b/external/models/nvidia_transformer.py
index c18df05..9de9e81 100644
--- a/external/models/nvidia_transformer.py
+++ b/external/models/nvidia_transformer.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import sys
 import os
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..5b8afdc
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,9 @@
+# Copyright 2022 Parallel Software and Systems Group, University of Maryland.
+# See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+[pytest]
+addopts = --durations=20 -ra
+testpaths = axonn/tests
+python_files = *.py
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c3dead2
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+torchvision
+einops
+tqdm
+
+-e .
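The new `requirements.txt` installs the package itself in editable mode via `-e .`, so the CI job above translates to roughly the following local workflow. This is a sketch under stated assumptions: a 2-GPU machine, an MPI launcher, and the `pytest-mpi` plugin that provides `--with-mpi` (CI's self-hosted runner is expected to supply pytest and pytest-mpi, since `requirements.txt` does not list them):

```bash
pip install -r requirements.txt  # torchvision, einops, tqdm, plus `pip install -e .` for axonn itself
python -c "import torchvision; torchvision.datasets.MNIST(root='./axonn/tests', download=True, train=True)"
export G_inter=2 G_data=1 memopt=0  # G_inter * G_data must equal the MPI world size
mpirun -n 2 pytest --with-mpi
```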
diff --git a/setup.py b/setup.py
index b31d008..8e41548 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@
 
 setup(
     name="axonn",
-    version="0.0.1",
+    version="0.1.0",
     description="A parallel library for extreme-scale deep learning",
     long_description="""An asynchronous, message-driven parallel framework
                      for extreme-scale deep learning""",
@@ -18,4 +18,5 @@
     classifiers=["Development Status :: 2 - Pre-Alpha"],
     keywords="deep learning, distributed computing, parallel computing",
     packages=find_packages(),
+    install_requires=["torch", "mpi4py"],
 )
diff --git a/train.sh b/train.sh
deleted file mode 100755
index 614345b..0000000
--- a/train.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-SUMMIT_FS_HOME=/gpfs/alpine/csc452/scratch/ssingh37/
-export LC_CTYPE=en_US.UTF-8
-export PAMI_ENABLE_STRIPING=1
-export PAMI_IBV_ADAPTER_AFFINITY=1
-export PAMI_IBV_DEVICE_NAME="mlx5_0:1,mlx5_3:1"
-export PAMI_IBV_DEVICE_NAME_1="mlx5_3:1,mlx5_0:1"
-export PYTHONPATH="/gpfs/alpine/csc452/scratch/ssingh37/axonn:$PYTHONPATH"
-
-
-nodes=($(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch))
-head=${nodes[0]}
-export RANK=$OMPI_COMM_WORLD_RANK
-export LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
-export WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
-export MASTER_ADDR=$head
-export MASTER_PORT=29500
-
-G_inter=12
-G_data=8
-mbs=4
-bs=16384
-transformer_args='-N 48 -D 6336 -H 36'
-
-jsrun --smpiargs='-gpu' -n 16 -a 6 -g 6 -c 42 -r 1 python -u examples/test_lm.py --G-inter $G_inter --G-data $G_data --micro-batch-size $mbs --batch-size $bs $transformer_args --dataset wikitext --cpu-offload
-
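The deleted `train.sh` was Summit-specific (jsrun, PAMI tuning, LSF hostfiles, a 96-GPU run with `-N 48 -D 6336 -H 36`). The `examples/test_lm.py` script it drove remains, so a hedged single-node replacement might look like the sketch below; the `run_lm.sh` filename and the scaled-down sizes are hypothetical, the flags and the rank/rendezvous variables are the ones `train.sh` used, and it assumes an Open MPI launcher that sets the `OMPI_COMM_WORLD_*` environment:

```bash
#!/bin/bash
# run_lm.sh -- hypothetical generic-MPI stand-in for the deleted Summit launcher.
# Launch with: mpirun -n 2 bash run_lm.sh
export RANK=$OMPI_COMM_WORLD_RANK
export LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
export WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
export MASTER_ADDR=localhost   # torch.distributed rendezvous, as train.sh set up
export MASTER_PORT=29500

python -u examples/test_lm.py --G-inter 2 --G-data 1 \
    --micro-batch-size 4 --batch-size 16 -N 12 -D 768 -H 12 \
    --dataset wikitext --cpu-offload
```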