Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Compile compatibility #789

Closed
wants to merge 45 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
ec808a7
init
vmoens May 21, 2024
34c9107
amend
vmoens May 21, 2024
c418e3f
Merge remote-tracking branch 'origin/main' into compile-compat
vmoens May 22, 2024
f26209d
amend
vmoens May 22, 2024
3db5b59
Merge branch 'main' into compile-compat
vmoens May 22, 2024
2c644d3
amend
vmoens May 22, 2024
e817ad5
Merge remote-tracking branch 'origin/main' into compile-compat
vmoens May 23, 2024
f120175
Merge remote-tracking branch 'origin/main' into compile-compat
vmoens May 23, 2024
93ed29a
amend
vmoens May 23, 2024
8f1eb8e
amend
vmoens May 24, 2024
f0c3e8d
amend
vmoens May 24, 2024
5f55d48
amend
vmoens May 24, 2024
efdd277
amend
vmoens May 24, 2024
efb09ed
init
vmoens May 24, 2024
f1e9744
amend
vmoens May 24, 2024
af19749
Merge remote-tracking branch 'origin/main' into faster-tc
vmoens May 24, 2024
187c2d8
amend
vmoens May 24, 2024
70296ae
Merge remote-tracking branch 'origin/main' into compile-compat
vmoens May 24, 2024
e89bcb3
Merge branch 'faster-tc' into compile-compat-fastertc
vmoens May 24, 2024
e4cf49f
amend
vmoens May 25, 2024
0142d8d
Merge remote-tracking branch 'origin/main' into compile-compat
vmoens May 25, 2024
80965e0
Merge branch 'compile-compat-fastertc' into compile-compat
vmoens May 25, 2024
8519835
amend
vmoens May 25, 2024
e1428b9
amend
vmoens May 25, 2024
81eaac4
amend
vmoens May 25, 2024
a4512ad
Merge remote-tracking branch 'origin/main' into compile-compat
vmoens May 31, 2024
2d8bef5
amend
vmoens May 31, 2024
19f9545
amend
vmoens May 31, 2024
c17c7b1
amend
vmoens May 31, 2024
fcc9131
amend
vmoens Jun 4, 2024
f7c7760
Merge remote-tracking branch 'origin/main' into compile-compat
vmoens Jun 4, 2024
ba5c247
amend
vmoens Jun 4, 2024
181da2a
amend
vmoens Jun 4, 2024
3b74b7e
amend
vmoens Jun 4, 2024
a1f9cdf
Merge remote-tracking branch 'origin/main' into compile-compat
vmoens Jun 5, 2024
44a9901
Merge branch 'main' into compile-compat
vmoens Jun 19, 2024
4cda3a9
amend
vmoens Jun 19, 2024
da0b99f
Merge remote-tracking branch 'origin/main' into compile-compat
vmoens Jun 27, 2024
b5cb40a
amend
vmoens Jun 27, 2024
7252fcf
amend
vmoens Jun 27, 2024
dd3560d
amend
vmoens Jun 29, 2024
cc8267d
Merge remote-tracking branch 'origin/main' into compile-compat
vmoens Jul 2, 2024
8fad24d
amend
vmoens Jul 2, 2024
674c16f
Merge branch 'main' into compile-compat
vmoens Jul 3, 2024
fc55e29
Merge branch 'main' into compile-compat
vmoens Jul 9, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/unittest/linux/scripts/run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ lib_dir="${env_dir}/lib"
# solves ImportError: /lib64/libstdc++.so.6: version `GLIBCXX_3.4.21' not found
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$lib_dir
export MKL_THREADING_LAYER=GNU
export TORCHDYNAMO_INLINE_INBUILT_NN_MODULES=1

coverage run -m pytest test/smoke_test.py -v --durations 20
coverage run -m pytest --instafail -v --durations 20
Expand Down
1 change: 1 addition & 0 deletions .github/unittest/linux_torchrec/scripts/run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ lib_dir="${env_dir}/lib"
# solves ImportError: /lib64/libstdc++.so.6: version `GLIBCXX_3.4.21' not found
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$lib_dir
export MKL_THREADING_LAYER=GNU
export TORCHDYNAMO_INLINE_INBUILT_NN_MODULES=1

coverage run -m pytest test/smoke_test.py -v --durations 20
coverage run -m pytest --instafail -v --durations 20
Expand Down
1 change: 1 addition & 0 deletions .github/unittest/rl_linux_optdeps/scripts/run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ git config --global --add safe.directory '*'
root_dir="$(git rev-parse --show-toplevel)"
export MKL_THREADING_LAYER=GNU
export CKPT_BACKEND=torch
export TORCHDYNAMO_INLINE_INBUILT_NN_MODULES=1

#MUJOCO_GL=glfw pytest --cov=torchrl --junitxml=test-results/junit.xml -v --durations 20
MUJOCO_GL=egl python -m pytest rl/test --instafail -v --durations 20 --ignore rl/test/test_distributed.py
4 changes: 2 additions & 2 deletions .github/workflows/benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ jobs:
- name: Run benchmarks
run: |
cd benchmarks/
python -m pytest -vvv --rank 0 --benchmark-json output.json
TORCHDYNAMO_INLINE_INBUILT_NN_MODULES=1 python -m pytest -vvv --rank 0 --benchmark-json output.json
- name: Store benchmark results
uses: benchmark-action/github-action-benchmark@v1
if: ${{ github.ref == 'refs/heads/main' || github.event_name == 'workflow_dispatch' }}
Expand Down Expand Up @@ -114,7 +114,7 @@ jobs:
- name: Run benchmarks
run: |
cd benchmarks/
python -m pytest -vvv --rank 0 --benchmark-json output.json
TORCHDYNAMO_INLINE_INBUILT_NN_MODULES=1 python -m pytest -vvv --rank 0 --benchmark-json output.json
- name: Store benchmark results
uses: benchmark-action/github-action-benchmark@v1
if: ${{ github.ref == 'refs/heads/main' || github.event_name == 'workflow_dispatch' }}
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/benchmarks_pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
- name: Run benchmarks
run: |
cd benchmarks/
RUN_BENCHMARK="pytest -vvv --rank 0 --benchmark-json "
# `env` is needed here: a VAR=value prefix that comes out of a variable expansion is
# not treated as an assignment, so the shell would try to run it as a command name.
RUN_BENCHMARK="env TORCHDYNAMO_INLINE_INBUILT_NN_MODULES=1 pytest -vvv --rank 0 --benchmark-json "
git checkout ${{ github.event.pull_request.base.sha }}
$RUN_BENCHMARK ${{ env.BASELINE_JSON }}
git checkout ${{ github.event.pull_request.head.sha }}
Expand Down Expand Up @@ -128,7 +128,7 @@ jobs:
- name: Run benchmarks
run: |
cd benchmarks/
RUN_BENCHMARK="pytest -vvv --rank 0 --benchmark-json "
# `env` is needed here: a VAR=value prefix that comes out of a variable expansion is
# not treated as an assignment, so the shell would try to run it as a command name.
RUN_BENCHMARK="env TORCHDYNAMO_INLINE_INBUILT_NN_MODULES=1 pytest -vvv --rank 0 --benchmark-json "
git checkout ${{ github.event.pull_request.base.sha }}
$RUN_BENCHMARK ${{ env.BASELINE_JSON }}
git checkout ${{ github.event.pull_request.head.sha }}
Expand Down
297 changes: 297 additions & 0 deletions benchmarks/compile/compile_td_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,297 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse

import pytest
import torch
from tensordict import LazyStackedTensorDict, tensorclass, TensorDict
from torch.utils._pytree import tree_map


@tensorclass
class MyTensorClass:
    """Flat tensorclass with six tensor fields, used by the flat benchmarks."""

    a: torch.Tensor
    b: torch.Tensor
    c: torch.Tensor
    d: torch.Tensor
    e: torch.Tensor
    f: torch.Tensor


# Functions
def add_one(td):
    """Return ``td + 1`` using the container's own ``+`` operator."""
    incremented = td + 1
    return incremented


def add_one_pytree(td):
    """Return a new pytree in which every leaf of ``td`` has been incremented by 1."""

    def _plus_one(leaf):
        return leaf + 1

    return tree_map(_plus_one, td)


def add_self(td):
    """Return ``td + td`` using the container's own ``+`` operator."""
    doubled = td + td
    return doubled


def add_self_pytree(td):
    """Return a new pytree in which every leaf of ``td`` has been added to itself."""

    def _double(leaf):
        return leaf + leaf

    return tree_map(_double, td)


def copy(td):
    """Return the result of calling ``td.copy()``."""
    duplicate = td.copy()
    return duplicate


def copy_pytree(td):
    """Rebuild the pytree ``td`` by mapping the identity function over its leaves."""

    def _identity(leaf):
        return leaf

    return tree_map(_identity, td)


def assign_and_add(td, k):
    """Write 100 integer entries into ``td`` in place, then return ``td + 1``.

    The entries are keyed by ``str(i)`` for ``i`` in ``[k, k + 100)`` and hold
    the integer ``i`` itself.
    """
    stop = k + 100
    for value in range(k, stop):
        key = str(value)
        td[key] = value
    return td + 1


def assign_and_add_pytree(td, k, device):
    """Write 100 scalar tensors into ``td`` in place, then add 1 to every leaf.

    The entries are keyed by ``str(i)`` for ``i`` in ``[k, k + 100)``; each is
    ``torch.tensor(i, device=device)``. The return value is a new pytree
    produced by ``tree_map``.
    """

    def _plus_one(leaf):
        return leaf + 1

    stop = k + 100
    for value in range(k, stop):
        td[str(value)] = torch.tensor(value, device=device)
    return tree_map(_plus_one, td)


def assign_and_add_stack(td, k):
    """Write 100 two-element tensors into ``td`` in place, then return ``td + 1``.

    The entries are keyed by ``str(i)`` for ``i`` in ``[k, k + 100)``; each is a
    tensor of shape ``(2,)`` filled with ``i`` and created on ``td.device``.
    """
    stop = k + 100
    for value in range(k, stop):
        filled = torch.full((2,), value, device=td.device)
        td[str(value)] = filled
    return td + 1


def index(td, idx):
    """Return ``td[idx]``."""
    selected = td[idx]
    return selected


def index_pytree(td, idx):
    """Return a new pytree in which every leaf of ``td`` has been indexed with ``idx``."""

    def _select(leaf):
        return leaf[idx]

    return tree_map(_select, td)


def get_nested_td():
    """Build a TensorDict made of a chain of nested dicts.

    Levels 0 to 9 each hold a scalar ``"a"`` tensor of ones plus a child dict
    keyed by the level number as a string; the innermost dict holds only
    ``"a"``. Tensors live on ``cuda:0`` when CUDA is available, else on CPU.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    root = {}
    cursor = root
    for depth in range(10):
        cursor["a"] = torch.ones((), device=device)
        child = {}
        cursor[str(depth)] = child
        # Descend so the next iteration fills the freshly created child.
        cursor = child
    # The innermost level gets its own leaf as well.
    cursor["a"] = torch.ones((), device=device)
    return TensorDict(root, device=device)


def get_flat_td():
    """Build a flat TensorDict with 50 scalar entries.

    Keys are ``"0"`` to ``"49"`` and each value is a scalar tensor filled with
    the key's integer value. Tensors live on ``cuda:0`` when CUDA is available,
    else on CPU.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    entries = {str(i): torch.full((), i, device=device) for i in range(50)}
    return TensorDict(entries, device=device)


def get_flat_tc():
    """Build a ``MyTensorClass`` whose six fields are each a ``(15,)`` tensor of ones.

    Tensors live on ``cuda:0`` when CUDA is available, else on CPU.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # One independent tensor per field, in declaration order a..f.
    fields = {name: torch.ones((15,), device=device) for name in "abcdef"}
    return MyTensorClass(**fields, device=device)


# Benchmarks a simple arithmetic op (+1) over a highly nested tensordict
@pytest.mark.parametrize("mode", ["compile", "eager"])
@pytest.mark.parametrize("dict_type", ["tensordict", "pytree"])
def test_compile_add_one_nested(mode, dict_type, benchmark):
    """Time ``add_one`` / ``add_one_pytree`` on the nested structure, eager or compiled."""
    is_tensordict = dict_type == "tensordict"
    func = add_one if is_tensordict else add_one_pytree
    if mode == "compile":
        func = torch.compile(func, fullgraph=True)
    td = get_nested_td()
    if not is_tensordict:
        # The pytree variant operates on the plain nested-dict form.
        td = td.to_dict()
    # One call before timing (presumably to keep compile/warm-up cost out of the measurement).
    func(td)
    benchmark(func, td)


# Benchmarks copying a nested tensordict
@pytest.mark.parametrize("mode", ["compile", "eager"])
@pytest.mark.parametrize("dict_type", ["tensordict", "pytree"])
def test_compile_copy_nested(mode, dict_type, benchmark):
    """Time ``copy`` / ``copy_pytree`` on the nested structure, eager or compiled."""
    if dict_type == "tensordict":
        func, td = copy, get_nested_td()
    else:
        # The pytree variant operates on the plain nested-dict form.
        func, td = copy_pytree, get_nested_td().to_dict()
    if mode == "compile":
        func = torch.compile(func, fullgraph=True)
    # One call before timing (presumably to keep compile/warm-up cost out of the measurement).
    func(td)
    benchmark(func, td)


# Benchmarks a simple arithmetic op (+1) over a flat tensordict
@pytest.mark.parametrize("mode", ["compile", "eager"])
@pytest.mark.parametrize("dict_type", ["tensordict", "tensorclass", "pytree"])
def test_compile_add_one_flat(mode, dict_type, benchmark):
    """Time ``add_one`` / ``add_one_pytree`` on flat data, eager or compiled."""
    if dict_type == "tensordict":
        func, make_data = add_one, get_flat_td
    elif dict_type == "tensorclass":
        func, make_data = add_one, get_flat_tc
    else:
        func, make_data = add_one_pytree, None
    if mode == "compile":
        func = torch.compile(func, fullgraph=True)
    # The pytree variant operates on the plain dict form of the flat tensordict.
    td = make_data() if make_data is not None else get_flat_td().to_dict()
    # One call before timing (presumably to keep compile/warm-up cost out of the measurement).
    func(td)
    benchmark(func, td)


@pytest.mark.parametrize("mode", ["eager", "compile"])
@pytest.mark.parametrize("dict_type", ["tensordict", "tensorclass", "pytree"])
def test_compile_add_self_flat(mode, dict_type, benchmark):
    """Time ``add_self`` / ``add_self_pytree`` on flat data, eager or compiled."""
    if dict_type == "tensordict":
        func, make_data = add_self, get_flat_td
    elif dict_type == "tensorclass":
        func, make_data = add_self, get_flat_tc
    else:
        func, make_data = add_self_pytree, None
    if mode == "compile":
        func = torch.compile(func, fullgraph=True)
    # The pytree variant operates on the plain dict form of the flat tensordict.
    td = make_data() if make_data is not None else get_flat_td().to_dict()
    # One call before timing (presumably to keep compile/warm-up cost out of the measurement).
    func(td)
    benchmark(func, td)


# Tests the speed of copying a flat tensordict
@pytest.mark.parametrize("mode", ["compile", "eager"])
# "tensorclass" was missing from this list, which made the tensorclass branch
# below unreachable; it is now listed like in the other flat benchmarks.
@pytest.mark.parametrize("dict_type", ["tensordict", "tensorclass", "pytree"])
def test_compile_copy_flat(mode, dict_type, benchmark):
    """Time ``copy`` / ``copy_pytree`` on flat data, eager or compiled.

    Args:
        mode: ``"compile"`` wraps the function in ``torch.compile(fullgraph=True)``;
            any other value runs it eagerly.
        dict_type: ``"tensordict"`` and ``"tensorclass"`` use ``copy`` on the
            respective container; any other value uses ``copy_pytree`` on the
            plain dict form of the flat tensordict.
        benchmark: the benchmark fixture, called with the function and its input.
    """
    if dict_type == "tensordict":
        if mode == "compile":
            func = torch.compile(copy, fullgraph=True)
        else:
            func = copy
        td = get_flat_td()
    elif dict_type == "tensorclass":
        if mode == "compile":
            func = torch.compile(copy, fullgraph=True)
        else:
            func = copy
        td = get_flat_tc()
    else:
        if mode == "compile":
            func = torch.compile(copy_pytree, fullgraph=True)
        else:
            func = copy_pytree
        td = get_flat_td().to_dict()
    # One call before timing (presumably to keep compile/warm-up cost out of the measurement).
    func(td)
    benchmark(func, td)


# Benchmarks assigning entries to an initially empty tensordict, then adding 1
@pytest.mark.parametrize("mode", ["compile", "eager"])
@pytest.mark.parametrize("dict_type", ["tensordict", "pytree"])
def test_compile_assign_and_add(mode, dict_type, benchmark):
    """Time ``assign_and_add`` / ``assign_and_add_pytree``, eager or compiled.

    The same container object is passed to the pre-timing call and to every
    benchmark call.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    td = TensorDict(device=device)
    if dict_type == "tensordict":
        func, kwargs = assign_and_add, {}
    else:
        func, kwargs = assign_and_add_pytree, {"device": device}
        # The pytree variant operates on the plain dict form.
        td = td.to_dict()
    if mode == "compile":
        func = torch.compile(func, fullgraph=True)
    # One call before timing (presumably to keep compile/warm-up cost out of the measurement).
    func(td, 5, **kwargs)
    benchmark(func, td, 5, **kwargs)


# Tests the speed of assigning entries to a lazy stacked tensordict


@pytest.mark.parametrize("mode", ["compile", "eager"])
def test_compile_assign_and_add_stack(mode, benchmark):
    """Time ``assign_and_add_stack`` on a lazy stack of two empty tensordicts."""
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    td = LazyStackedTensorDict(TensorDict(device=device), TensorDict(device=device))
    func = (
        torch.compile(assign_and_add_stack, fullgraph=True)
        if mode == "compile"
        else assign_and_add_stack
    )
    # One call before timing (presumably to keep compile/warm-up cost out of the measurement).
    func(td, 5)
    benchmark(func, td, 5)


# Tests indexing speed
@pytest.mark.parametrize("mode", ["compile", "eager"])
@pytest.mark.parametrize("dict_type", ["tensordict", "tensorclass", "pytree"])
@pytest.mark.parametrize("index_type", ["tensor", "slice", "int"])
def test_compile_indexing(mode, dict_type, index_type, benchmark):
    """Time ``index`` / ``index_pytree`` with an int, slice or tensor index.

    Args:
        mode: ``"compile"`` wraps the function in ``torch.compile(fullgraph=True)``;
            any other value runs it eagerly.
        dict_type: ``"tensordict"`` indexes the TensorDict directly; any other
            value indexes the leaves of its plain dict form.
        index_type: ``"int"`` uses ``5``, ``"slice"`` uses ``slice(None, None, 2)``
            and ``"tensor"`` uses the tensor ``[0, 2, 4, 6, 8]``.
        benchmark: the benchmark fixture, called with the function and its inputs.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    td = TensorDict(
        {"a": torch.arange(100), "b": {"c": torch.arange(100)}},
        batch_size=[100],
        device=device,
    )
    if dict_type == "tensordict":
        if mode == "compile":
            func = torch.compile(index, fullgraph=True)
        else:
            func = index
    else:
        # NOTE(review): "tensorclass" has no dedicated branch and currently runs
        # the pytree path -- confirm whether a tensorclass input was intended.
        if mode == "compile":
            func = torch.compile(index_pytree, fullgraph=True)
        else:
            func = index_pytree
        td = td.to_dict()
    # The parameter is the string "int"; comparing it with the builtin type
    # ``int`` was always False, so the integer case silently used a slice.
    if index_type == "int":
        idx = 5
    else:
        idx = slice(None, None, 2)
    if index_type == "tensor":
        # Materialise the slice over the first 10 positions as an index tensor.
        idx = torch.tensor(range(*idx.indices(10)))

    # One call before timing (presumably to keep compile/warm-up cost out of the measurement).
    func(td, idx)
    benchmark(func, td, idx)


if __name__ == "__main__":
    # Arguments argparse does not recognise are forwarded to pytest unchanged.
    parser = argparse.ArgumentParser()
    _, passthrough = parser.parse_known_args()
    pytest.main([__file__, "--capture", "no", "--exitfirst", *passthrough])
Loading
Loading