diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/formatting-tests.yaml
similarity index 62%
rename from .github/workflows/unit-tests.yaml
rename to .github/workflows/formatting-tests.yaml
index 40c5c10..5d09fa9 100644
--- a/.github/workflows/unit-tests.yaml
+++ b/.github/workflows/formatting-tests.yaml
@@ -1,4 +1,4 @@
-name: unit tests
+name: formatting tests
 
 on:
   push:
@@ -7,35 +7,30 @@ on:
     branches: [ develop ]
 
 jobs:
-  build:
-
+  formatting:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
         os: [ubuntu-latest, macos-latest]
-        python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"]
 
     steps:
     - uses: actions/checkout@v2
-    - name: Set up Python ${{ matrix.python-version }}
+    - name: Set up Python 3.9
       uses: actions/setup-python@v2
       with:
-        python-version: ${{ matrix.python-version }}
+        python-version: 3.9
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
         pip install flake8 pytest
-        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
     - name: Update black
-      if: ${{ matrix.python-version == 3.9 }}
       run: |
         pip install --upgrade black
     - name: Lint and Format Check with flake8 and black
-      if: ${{ matrix.python-version == 3.9 }}
       run: |
         black --diff --check .
         flake8
diff --git a/.github/workflows/nvidia-tests.yaml b/.github/workflows/nvidia-tests.yaml
new file mode 100644
index 0000000..7722155
--- /dev/null
+++ b/.github/workflows/nvidia-tests.yaml
@@ -0,0 +1,34 @@
+name: nvidia-rtx-3090 tests
+
+on:
+  push:
+    branches: [ develop ]
+  pull_request:
+    branches: [ develop ]
+
+jobs:
+  mnist-trainer:
+    runs-on: [ nvidia ]
+
+    strategy:
+      matrix:
+        ginter: [ 1, 2 ]
+        memopt: [ '0', '1' ]
+    steps:
+    - uses: actions/checkout@v3
+    - name: Install AxoNN
+      run: |
+        pip install -r requirements.txt
+    - name: Download dataset
+      run: |
+        python -c "import torchvision; torchvision.datasets.MNIST(root=\"./axonn/tests\", download=True, train=True)"
+    - name: Train
+      run: |
+        export G_inter=${{ matrix.ginter }}
+        export G_data=$(( 2 / G_inter ))
+        export memopt=${{ matrix.memopt }}
+        echo "training with G_inter = ${G_inter}, G_data = ${G_data}, memopt = ${memopt}"
+        mpirun -n 2 pytest --with-mpi
+    - name: Uninstall AxoNN
+      run: |
+        pip uninstall --yes axonn
diff --git a/README.md b/README.md
index 409bc4e..ac1fac7 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,14 @@
 [![Build Status](https://github.com/hpcgroup/axonn/actions/workflows/unit-tests.yaml/badge.svg)](https://github.com/hpcgroup/axonn/actions)
 [![Code Style: Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 
-A parallel framework for training deep neural networks.
+AxoNN is a parallel framework for training deep neural networks.
+
+### Installation
+Before installing AxoNN, make sure [PyTorch](https://pytorch.org/get-started/locally/) is installed.
+
+```bash
+pip install axonn
+```
 
 ### Contributing
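The test added below (`axonn/tests/test_vit.py`) doubles as a usage example of AxoNN's user-facing API. Here is a minimal sketch of that flow; only the `ax.*` calls are taken from this patch, while the `Linear` model, the batch/micro-batch sizes, and the launch command are illustrative stand-ins:

```python
# Minimal AxoNN training-loop sketch distilled from axonn/tests/test_vit.py.
# Launch with e.g. `mpirun -n 2 python sketch.py` so that G_inter * G_data
# matches the number of GPUs.
import torch
import torchvision
from torchvision.transforms import ToTensor
from axonn import axonn as ax

ax.init(G_data=2, G_inter=1, mixed_precision=True, fp16_allreduce=True)

model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(784, 10)).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
ax.register_model_and_optimizer(model, optimizer)  # syncs params across data-parallel ranks
ax.register_loss_fn(torch.nn.CrossEntropyLoss())

train_dataset = torchvision.datasets.MNIST(root="./axonn/tests", train=True, transform=ToTensor())
train_loader = ax.create_dataloader(train_dataset, 64, 16, 0)  # batch size 64, micro-batch size 16

for x, y in train_loader:
    optimizer.zero_grad()
    x, y = x.cuda(), y.cuda()  # the test does this on the first pipeline stage only
    batch_loss = ax.run_batch(x, y, eval_mode=False)  # pipelined forward + backward
    optimizer.step()
```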
diff --git a/axonn/__init__.py b/axonn/__init__.py
new file mode 100644
index 0000000..8abdbbe
--- /dev/null
+++ b/axonn/__init__.py
@@ -0,0 +1,4 @@
+# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
+# See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/axonn/axonn.py b/axonn/axonn.py
index ca8648d..d2f50b1 100644
--- a/axonn/axonn.py
+++ b/axonn/axonn.py
@@ -366,6 +366,7 @@ def _initialize_mixed_precision_with_cpu_offload(
     return model, optimizer
 
 
+@torch.no_grad()
 def register_model_and_optimizer(model_shard, optimizer):
     """AxoNN's user facing function to register a model shard and
     the corresponding optimizer.
@@ -397,6 +398,9 @@ def register_model_and_optimizer(model_shard, optimizer):
         model_params.div_(config.G_data), async_op=False
     )  # sync all parameters across data parallel ranks
 
+    if computation_dtype == torch.float16:
+        model_params_fp32.copy_(model_params_fp16)
+
     fp32_optimizer = optimizer
     fp32_optimizer.skip_next_step = False
diff --git a/axonn/optim.py b/axonn/optim.py
index e341c29..29e00a1 100644
--- a/axonn/optim.py
+++ b/axonn/optim.py
@@ -1,3 +1,8 @@
+# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
+# See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
 import torch
 from torch.optim.optimizer import Optimizer
 from . import axonn as ax
@@ -15,7 +20,7 @@ def __init__(
         lr=1e-3,
         betas=(0.9, 0.999),
         eps=1e-8,
-        weight_decay=1e-2,
+        weight_decay=0,
         bucket_size=16000000,
         coalescing_factor=4,
     ):
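For context on the `weight_decay` change above: the new default of 0 makes `CPUAdam` match `torch.optim.Adam` (the non-offload branch of the test below, which this patch also switches to from `AdamW`). A hedged pairing sketch, with illustrative `G_inter`/`G_data` values and a placeholder model; only `ax.init` and `optim.CPUAdam` are taken from this patch:

```python
# CPUAdam pairs with cpu_offload=True in ax.init, as in axonn/tests/test_vit.py.
# Pass weight_decay=1e-2 explicitly to recover the old AdamW-style default.
import torch
from axonn import axonn as ax
from axonn import optim

ax.init(G_data=1, G_inter=2, mixed_precision=True, fp16_allreduce=True, cpu_offload=True)
model = torch.nn.Linear(8, 8).cuda()  # placeholder for a real model shard
optimizer = optim.CPUAdam(model.parameters(), lr=0.001)
ax.register_model_and_optimizer(model, optimizer)
```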
diff --git a/examples/test_vit.py b/axonn/tests/test_vit.py
similarity index 62%
rename from examples/test_vit.py
rename to axonn/tests/test_vit.py
index a0933bb..4c8b7fd 100644
--- a/examples/test_vit.py
+++ b/axonn/tests/test_vit.py
@@ -4,27 +4,32 @@
-from axonn import axonn as ax
-from axonn import optim
 import torchvision
 from external.models.vit import DistributedViT
 from torchvision.transforms import ToTensor
 import torch
 from tqdm import tqdm
+import pytest
+import os
 
 
+@pytest.mark.mpi
 def test_vit_mnist():
-    bs_per_gpu = 64
-    num_gpus = 6
-    bs = num_gpus * bs_per_gpu
-    mbs = bs_per_gpu
-    epochs = 10
-    cpu_offload = True
-    N, D, H = 12, 768, 12
+    from axonn import axonn as ax
+    from axonn import optim
+
+    G_inter = int(os.environ.get("G_inter"))
+    assert 6 % G_inter == 0
+    G_data = int(os.environ.get("G_data"))
+    bs = int(os.environ.get("batch_size", 64))
+    mbs = int(os.environ.get("micro_batch_size", 16))
+    epochs = int(os.environ.get("epochs", 10))
+    cpu_offload = bool(int(os.environ.get("memopt", "0")))
+    N, D, H = 6, 128, 8
     ax.init(
-        G_data=2,
-        G_inter=3,
+        G_data=G_data,
+        G_inter=G_inter,
         mixed_precision=True,
         fp16_allreduce=True,
         cpu_offload=cpu_offload,
@@ -52,22 +57,19 @@ def test_vit_mnist():
     if cpu_offload:
         optimizer = optim.CPUAdam(model.parameters(), lr=0.001)
     else:
-        optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
+        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
 
     ax.register_model_and_optimizer(model, optimizer)
     ax.register_loss_fn(torch.nn.CrossEntropyLoss())
 
     train_dataset = torchvision.datasets.MNIST(
-        root="./examples/dataset/", train=True, transform=ToTensor()
+        root="./axonn/tests", train=True, transform=ToTensor()
     )
     train_loader = ax.create_dataloader(train_dataset, bs, mbs, 0)
-
+    previous_model_state_memory = None
     for epoch_number in range(epochs):
         epoch_loss = 0
-        for x, y in tqdm(
-            train_loader,
-            disable=not (ilp_rank == 0 and ax.config.data_parallel_rank == 0),
-        ):
+        for x, y in tqdm(train_loader, disable=True):
             optimizer.zero_grad()
             if ilp_rank == 0:
                 x, y = x.cuda(), y.cuda()
@@ -80,10 +82,19 @@ def test_vit_mnist():
             batch_loss = ax.run_batch(x, y, eval_mode=False)
             optimizer.step()
             epoch_loss += batch_loss
+            current_model_state_memory = torch.cuda.memory_allocated()
+            assert (not previous_model_state_memory) or (
+                current_model_state_memory == previous_model_state_memory
+            ), "model state memory should stay the same throughout training"
+            previous_model_state_memory = current_model_state_memory
         if ilp_rank == G_inter - 1:
             ax.print_status(
                 f"Epoch {epoch_number+1} : epoch loss {epoch_loss/len(train_loader)}"
+                f": model state memory = {torch.cuda.memory_allocated()/2**30} GB"
             )
+    assert epoch_loss / len(train_loader) < 0.1, "model did not converge"
+
 
-test_vit_mnist()
+if __name__ == "__main__":
+    test_vit_mnist()
diff --git a/examples/ptb_loader.py b/examples/ptb_loader.py
index 0009ded..ebbbbfa 100644
--- a/examples/ptb_loader.py
+++ b/examples/ptb_loader.py
@@ -1,3 +1,8 @@
+# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
+# See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
 from torch.utils.data import Dataset
 import torch
 import os
diff --git a/examples/test_lm.py b/examples/test_lm.py
index 8080630..18399bf 100644
--- a/examples/test_lm.py
+++ b/examples/test_lm.py
@@ -1,3 +1,8 @@
+# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
+# See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
 from axonn import axonn as ax
 from axonn import optim
 from external.models.nvidia_transformer import DistributedGPT
diff --git a/examples/wikitext_loader.py b/examples/wikitext_loader.py
index 01e62a1..32bf608 100644
--- a/examples/wikitext_loader.py
+++ b/examples/wikitext_loader.py
@@ -1,3 +1,8 @@
+# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
+# See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
 import transformers
 import os
 from tqdm import tqdm
diff --git a/external/models/nvidia_transformer.py b/external/models/nvidia_transformer.py
index c18df05..9de9e81 100644
--- a/external/models/nvidia_transformer.py
+++ b/external/models/nvidia_transformer.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import sys
 import os
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..5b8afdc
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,9 @@
+# Copyright 2022 Parallel Software and Systems Group, University of Maryland.
+# See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+[pytest]
+addopts = --durations=20 -ra
+testpaths = axonn/tests
+python_files = *.py
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c3dead2
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+torchvision
+einops
+tqdm
+
+-e .
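The new `requirements.txt` installs the package itself in editable mode via `-e .`, so the CI job above translates to roughly the following local workflow. This is a sketch under stated assumptions: a 2-GPU machine, an MPI launcher, and the `pytest-mpi` plugin that provides `--with-mpi` (CI's self-hosted runner is expected to supply pytest and pytest-mpi, since `requirements.txt` does not list them):

```bash
pip install -r requirements.txt  # torchvision, einops, tqdm, plus `pip install -e .` for axonn itself
python -c "import torchvision; torchvision.datasets.MNIST(root='./axonn/tests', download=True, train=True)"
export G_inter=2 G_data=1 memopt=0  # G_inter * G_data must equal the MPI world size
mpirun -n 2 pytest --with-mpi
```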
diff --git a/setup.py b/setup.py
index b31d008..8e41548 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@
 
 setup(
     name="axonn",
-    version="0.0.1",
+    version="0.1.0",
     description="A parallel library for extreme-scale deep learning",
     long_description="""An asynchronous, message-driven parallel framework
                      for extreme-scale deep learning""",
@@ -18,4 +18,5 @@
     classifiers=["Development Status :: 2 - Pre-Alpha"],
     keywords="deep learning, distributed computing, parallel computing",
     packages=find_packages(),
+    install_requires=["torch", "mpi4py"],
 )
diff --git a/train.sh b/train.sh
deleted file mode 100755
index 614345b..0000000
--- a/train.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-SUMMIT_FS_HOME=/gpfs/alpine/csc452/scratch/ssingh37/
-export LC_CTYPE=en_US.UTF-8
-export PAMI_ENABLE_STRIPING=1
-export PAMI_IBV_ADAPTER_AFFINITY=1
-export PAMI_IBV_DEVICE_NAME="mlx5_0:1,mlx5_3:1"
-export PAMI_IBV_DEVICE_NAME_1="mlx5_3:1,mlx5_0:1"
-export PYTHONPATH="/gpfs/alpine/csc452/scratch/ssingh37/axonn:$PYTHONPATH"
-
-
-nodes=($(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch))
-head=${nodes[0]}
-export RANK=$OMPI_COMM_WORLD_RANK
-export LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
-export WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
-export MASTER_ADDR=$head
-export MASTER_PORT=29500
-
-G_inter=12
-G_data=8
-mbs=4
-bs=16384
-transformer_args='-N 48 -D 6336 -H 36'
-
-jsrun --smpiargs='-gpu' -n 16 -a 6 -g 6 -c 42 -r 1 python -u examples/test_lm.py --G-inter $G_inter --G-data $G_data --micro-batch-size $mbs --batch-size $bs $transformer_args --dataset wikitext --cpu-offload
-
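The deleted `train.sh` was Summit-specific (jsrun, PAMI tuning, LSF hostfiles, a 96-GPU run with `-N 48 -D 6336 -H 36`). The `examples/test_lm.py` script it drove remains, so a hedged single-node replacement might look like the sketch below; the `run_lm.sh` filename and the scaled-down sizes are hypothetical, the flags and the rank/rendezvous variables are the ones `train.sh` used, and it assumes an Open MPI launcher that sets the `OMPI_COMM_WORLD_*` environment:

```bash
#!/bin/bash
# run_lm.sh -- hypothetical generic-MPI stand-in for the deleted Summit launcher.
# Launch with: mpirun -n 2 bash run_lm.sh
export RANK=$OMPI_COMM_WORLD_RANK
export LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
export WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
export MASTER_ADDR=localhost   # torch.distributed rendezvous, as train.sh set up
export MASTER_PORT=29500

python -u examples/test_lm.py --G-inter 2 --G-data 1 \
    --micro-batch-size 4 --batch-size 16 -N 12 -D 768 -H 12 \
    --dataset wikitext --cpu-offload
```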