Harden tar extraction in the dataset downloaders against path traversal.

Each download_data() replaces a bare extractall() with a safe_extract()
helper that rejects the whole archive if any member path resolves outside
the extraction directory. Containment is checked with os.path.commonpath(),
which compares whole path components; os.path.commonprefix() compares
character by character and would treat a sibling directory such as
"<out_dir>_evil" as being inside "<out_dir>".

NOTE(review): only member names are checked; link targets stored in the
archive are not validated by this patch.
NOTE(review): the index lines below are carried over unchanged from the
original patch.

diff --git a/tf_euler/python/dataset/citeseer.py b/tf_euler/python/dataset/citeseer.py
index abef2f6..8c8a1aa 100644
--- a/tf_euler/python/dataset/citeseer.py
+++ b/tf_euler/python/dataset/citeseer.py
@@ -58,7 +58,31 @@ def download_data(self, source_url, out_dir):
         DataSet.download_file(source_url, citeseer_tgz_dir)
         with tarfile.open(citeseer_tgz_dir) as citeseer_file:
             print('unzip data..')
-            citeseer_file.extractall(out_dir)
+            def is_within_directory(directory, target):
+                # Compare whole path components: a character-wise
+                # commonprefix() would accept a sibling such as
+                # "<directory>_evil" as being inside "<directory>".
+                abs_directory = os.path.abspath(directory)
+                abs_target = os.path.abspath(target)
+                try:
+                    common = os.path.commonpath([abs_directory, abs_target])
+                except ValueError:
+                    # Raised e.g. for different drives: not nested.
+                    return False
+                return common == abs_directory
+
+            def safe_extract(tar, path=".", members=None, *,
+                             numeric_owner=False):
+                # Reject the whole archive before extracting anything
+                # if any member would land outside `path`.
+                for member in tar.getmembers():
+                    member_path = os.path.join(path, member.name)
+                    if not is_within_directory(path, member_path):
+                        raise Exception("Attempted Path Traversal in Tar File")
+
+                tar.extractall(path, members, numeric_owner=numeric_owner)
+
+            safe_extract(citeseer_file, out_dir)
 
     def convert2json(self, convert_dir, out_dir):
         def add_node(id, type, weight, label, feature):
diff --git a/tf_euler/python/dataset/cora.py b/tf_euler/python/dataset/cora.py
index a04edd6..17d2f8c 100644
--- a/tf_euler/python/dataset/cora.py
+++ b/tf_euler/python/dataset/cora.py
@@ -57,7 +57,31 @@ def download_data(self, source_url, out_dir):
         DataSet.download_file(source_url, cora_tgz_dir)
         with tarfile.open(cora_tgz_dir) as cora_file:
             print('unzip data..')
-            cora_file.extractall(out_dir)
+            def is_within_directory(directory, target):
+                # Compare whole path components: a character-wise
+                # commonprefix() would accept a sibling such as
+                # "<directory>_evil" as being inside "<directory>".
+                abs_directory = os.path.abspath(directory)
+                abs_target = os.path.abspath(target)
+                try:
+                    common = os.path.commonpath([abs_directory, abs_target])
+                except ValueError:
+                    # Raised e.g. for different drives: not nested.
+                    return False
+                return common == abs_directory
+
+            def safe_extract(tar, path=".", members=None, *,
+                             numeric_owner=False):
+                # Reject the whole archive before extracting anything
+                # if any member would land outside `path`.
+                for member in tar.getmembers():
+                    member_path = os.path.join(path, member.name)
+                    if not is_within_directory(path, member_path):
+                        raise Exception("Attempted Path Traversal in Tar File")
+
+                tar.extractall(path, members, numeric_owner=numeric_owner)
+
+            safe_extract(cora_file, out_dir)
 
     def convert2json(self, convert_dir, out_dir):
         def add_node(id, type, weight, label, feature):
diff --git a/tf_euler/python/dataset/fb15k.py b/tf_euler/python/dataset/fb15k.py
index 4af35e8..a2ac98b 100644
--- a/tf_euler/python/dataset/fb15k.py
+++ b/tf_euler/python/dataset/fb15k.py
@@ -57,7 +57,31 @@ def download_data(self, source_url, out_dir):
         DataSet.download_file(source_url, fb_tgz_dir)
         with tarfile.open(fb_tgz_dir) as fb_file:
             print('unzip data..')
-            fb_file.extractall(out_dir)
+            def is_within_directory(directory, target):
+                # Compare whole path components: a character-wise
+                # commonprefix() would accept a sibling such as
+                # "<directory>_evil" as being inside "<directory>".
+                abs_directory = os.path.abspath(directory)
+                abs_target = os.path.abspath(target)
+                try:
+                    common = os.path.commonpath([abs_directory, abs_target])
+                except ValueError:
+                    # Raised e.g. for different drives: not nested.
+                    return False
+                return common == abs_directory
+
+            def safe_extract(tar, path=".", members=None, *,
+                             numeric_owner=False):
+                # Reject the whole archive before extracting anything
+                # if any member would land outside `path`.
+                for member in tar.getmembers():
+                    member_path = os.path.join(path, member.name)
+                    if not is_within_directory(path, member_path):
+                        raise Exception("Attempted Path Traversal in Tar File")
+
+                tar.extractall(path, members, numeric_owner=numeric_owner)
+
+            safe_extract(fb_file, out_dir)
 
     def convert2json(self, convert_dir, out_dir):
         def add_node(id, type, weight):
diff --git a/tf_euler/python/dataset/pubmed.py b/tf_euler/python/dataset/pubmed.py
index 6339730..a5d106f 100644
--- a/tf_euler/python/dataset/pubmed.py
+++ b/tf_euler/python/dataset/pubmed.py
@@ -58,7 +58,31 @@ def download_data(self, source_url, out_dir):
         DataSet.download_file(source_url, pubmed_tgz_dir)
         with tarfile.open(pubmed_tgz_dir) as pubmed_file:
             print('unzip data..')
-            pubmed_file.extractall(out_dir)
+            def is_within_directory(directory, target):
+                # Compare whole path components: a character-wise
+                # commonprefix() would accept a sibling such as
+                # "<directory>_evil" as being inside "<directory>".
+                abs_directory = os.path.abspath(directory)
+                abs_target = os.path.abspath(target)
+                try:
+                    common = os.path.commonpath([abs_directory, abs_target])
+                except ValueError:
+                    # Raised e.g. for different drives: not nested.
+                    return False
+                return common == abs_directory
+
+            def safe_extract(tar, path=".", members=None, *,
+                             numeric_owner=False):
+                # Reject the whole archive before extracting anything
+                # if any member would land outside `path`.
+                for member in tar.getmembers():
+                    member_path = os.path.join(path, member.name)
+                    if not is_within_directory(path, member_path):
+                        raise Exception("Attempted Path Traversal in Tar File")
+
+                tar.extractall(path, members, numeric_owner=numeric_owner)
+
+            safe_extract(pubmed_file, out_dir)
 
     def convert2json(self, convert_dir, out_dir):
         def add_node(id, type, weight, label, feature):
diff --git a/tf_euler/python/dataset/wn18.py b/tf_euler/python/dataset/wn18.py
index 47e5982..caae0b6 100644
--- a/tf_euler/python/dataset/wn18.py
+++ b/tf_euler/python/dataset/wn18.py
@@ -56,7 +56,31 @@ def download_data(self, source_url, out_dir):
         DataSet.download_file(self.source_url, fb_tgz_dir)
         with tarfile.open(fb_tgz_dir) as fb_file:
             print('unzip data..')
-            fb_file.extractall(out_dir)
+            def is_within_directory(directory, target):
+                # Compare whole path components: a character-wise
+                # commonprefix() would accept a sibling such as
+                # "<directory>_evil" as being inside "<directory>".
+                abs_directory = os.path.abspath(directory)
+                abs_target = os.path.abspath(target)
+                try:
+                    common = os.path.commonpath([abs_directory, abs_target])
+                except ValueError:
+                    # Raised e.g. for different drives: not nested.
+                    return False
+                return common == abs_directory
+
+            def safe_extract(tar, path=".", members=None, *,
+                             numeric_owner=False):
+                # Reject the whole archive before extracting anything
+                # if any member would land outside `path`.
+                for member in tar.getmembers():
+                    member_path = os.path.join(path, member.name)
+                    if not is_within_directory(path, member_path):
+                        raise Exception("Attempted Path Traversal in Tar File")
+
+                tar.extractall(path, members, numeric_owner=numeric_owner)
+
+            safe_extract(fb_file, out_dir)
 
     def convert2json(self, convert_dir, out_dir):
         def add_node(id, type, weight):