Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Model] Implement model Unimp #83

Open
wants to merge 25 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions gammagl/datasets/OgbGraphData.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
,ogbg-molbace,ogbg-molbbbp,ogbg-molclintox,ogbg-molmuv,ogbg-molpcba,ogbg-molsider,ogbg-moltox21,ogbg-moltoxcast,ogbg-molhiv,ogbg-molesol,ogbg-molfreesolv,ogbg-mollipo,ogbg-molchembl,ogbg-ppa,ogbg-code2
num tasks,1,1,2,17,128,27,12,617,1,1,1,1,1310,1,1
eval metric,rocauc,rocauc,rocauc,ap,ap,rocauc,rocauc,rocauc,rocauc,rmse,rmse,rmse,rocauc,acc,F1
download_name,bace,bbbp,clintox,muv,pcba,sider,tox21,toxcast,hiv,esol,freesolv,lipophilicity,chembl,ogbg_ppi_medium,code2
version,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
url,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/bace.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/bbbp.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/clintox.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/muv.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/pcba.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/sider.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/tox21.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/toxcast.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/hiv.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/esol.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/freesolv.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/lipophilicity.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/chembl.zip,http://snap.stanford.edu/ogb/data/graphproppred/ogbg_ppi_medium.zip,http://snap.stanford.edu/ogb/data/graphproppred/code2.zip
add_inverse_edge,True,True,True,True,True,True,True,True,True,True,True,True,True,True,False
data type,mol,mol,mol,mol,mol,mol,mol,mol,mol,mol,mol,mol,mol,,
has_node_attr,True,True,True,True,True,True,True,True,True,True,True,True,True,False,True
has_edge_attr,True,True,True,True,True,True,True,True,True,True,True,True,True,True,False
task type,binary classification,binary classification,binary classification,binary classification,binary classification,binary classification,binary classification,binary classification,binary classification,regression,regression,regression,binary classification,multiclass classification,subtoken prediction
num classes,2,2,2,2,2,2,2,2,2,-1,-1,-1,2,37,-1
split,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,species,project
additional node files,None,None,None,None,None,None,None,None,None,None,None,None,None,None,"node_is_attributed,node_dfs_order,node_depth"
additional edge files,None,None,None,None,None,None,None,None,None,None,None,None,None,None,None
binary,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
14 changes: 14 additions & 0 deletions gammagl/datasets/OgbLinkData.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
,ogbl-ppa,ogbl-collab,ogbl-citation2,ogbl-wikikg2,ogbl-ddi,ogbl-biokg,ogbl-vessel
eval metric,hits@100,hits@50,mrr,mrr,hits@20,mrr,rocauc
task type,link prediction,link prediction,link prediction,KG completion,link prediction,KG completion,link prediction
download_name,ppassoc,collab,citation-v2,wikikg-v2,ddi,biokg,vessel
version,1,1,1,1,1,1,1
url,http://snap.stanford.edu/ogb/data/linkproppred/ppassoc.zip,http://snap.stanford.edu/ogb/data/linkproppred/collab.zip,http://snap.stanford.edu/ogb/data/linkproppred/citation-v2.zip,http://snap.stanford.edu/ogb/data/linkproppred/wikikg-v2.zip,http://snap.stanford.edu/ogb/data/linkproppred/ddi.zip,http://snap.stanford.edu/ogb/data/linkproppred/biokg.zip,http://snap.stanford.edu/ogb/data/linkproppred/vessel.zip
add_inverse_edge,True,True,False,False,True,False,False
has_node_attr,True,True,True,False,False,False,True
has_edge_attr,False,False,False,False,False,False,True
split,throughput,time,time,time,target,random,spatial
additional node files,None,None,node_year,None,None,None,None
additional edge files,None,"edge_weight,edge_year",None,edge_reltype,None,edge_reltype,None
is hetero,False,False,False,False,False,True,False
binary,False,False,False,False,False,False,True
16 changes: 16 additions & 0 deletions gammagl/datasets/OgbNodeData.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
,ogbn-proteins,ogbn-products,ogbn-arxiv,ogbn-mag,ogbn-papers100M
num tasks,112,1,1,1,1
num classes,2,47,40,349,172
eval metric,rocauc,acc,acc,acc,acc
task type,binary classification,multiclass classification,multiclass classification,multiclass classification,multiclass classification
download_name,proteins,products,arxiv,mag,papers100M-bin
version,1,1,1,2,1
url,http://snap.stanford.edu/ogb/data/nodeproppred/proteins.zip,http://snap.stanford.edu/ogb/data/nodeproppred/products.zip,http://snap.stanford.edu/ogb/data/nodeproppred/arxiv.zip,http://snap.stanford.edu/ogb/data/nodeproppred/mag.zip,http://snap.stanford.edu/ogb/data/nodeproppred/papers100M-bin.zip
add_inverse_edge,True,True,False,False,False
has_node_attr,False,True,True,True,True
has_edge_attr,True,False,False,False,False
split,species,sales_ranking,time,time,time
additional node files,node_species,None,node_year,node_year,node_year
additional edge files,None,None,None,edge_reltype,None
is hetero,False,False,False,True,False
binary,False,False,False,False,True
161 changes: 161 additions & 0 deletions gammagl/datasets/ogb_graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
import os
import os.path as osp
import shutil

import numpy as np
import pandas as pd

from gammagl.data import InMemoryDataset
from gammagl.io.read_ogb import read_graph
from gammagl.utils.ogb_url import decide_download, download_url, extract_zip


class OgbGraphDataset(InMemoryDataset):
    """OGB graph property prediction dataset (e.g. ``ogbg-molhiv``).

    Meta-information (number of tasks, download URL, split scheme, ...) is
    read from ``OgbGraphData.csv`` shipped next to this module, unless an
    explicit ``meta_dict`` is supplied.

    Parameters
    ----------
    name: str
        name of the dataset, e.g. ``'ogbg-molhiv'``.
    root: str
        root directory under which the dataset folder is stored.
    transform: callable, optional
        transform applied to each graph on access.
    pre_transform: callable, optional
        transform applied once to every graph before the processed file
        is saved.
    meta_dict: dict, optional
        dictionary that stores all the meta-information about the data.
        Default is None; when something is passed, its information is used
        instead of the CSV lookup. Useful for debugging for external
        contributors.

    Raises
    ------
    ValueError
        if ``name`` is not a known OGB graph dataset.
    """

    def __init__(self, name, root='dataset', transform=None, pre_transform=None, meta_dict=None):
        self.name = name  # original name, e.g., ogbg-molhiv

        if meta_dict is None:
            self.dir_name = '_'.join(name.split('-'))

            # Reuse a previously-downloaded folder if one exists.
            # NOTE(review): the '_pyg' suffix is inherited from the PyG
            # loader this code is ported from — consider a gammagl-specific
            # suffix instead.
            if osp.exists(osp.join(root, self.dir_name + '_pyg')):
                self.dir_name = self.dir_name + '_pyg'

            self.original_root = root
            self.root = osp.join(root, self.dir_name)

            master = pd.read_csv(os.path.join(os.path.dirname(__file__), 'OgbGraphData.csv'), index_col=0)
            if self.name not in master:
                error_mssg = 'Invalid dataset name {}.\n'.format(self.name)
                error_mssg += 'Available datasets are as follows:\n'
                error_mssg += '\n'.join(master.keys())
                raise ValueError(error_mssg)
            self.meta_info = master[self.name]

        else:
            self.dir_name = meta_dict['dir_path']
            self.original_root = ''
            self.root = meta_dict['dir_path']
            self.meta_info = meta_dict

        # Version check: if the dataset folder already exists but lacks the
        # RELEASE_v<version>.txt marker for the current version, it is stale;
        # offer to delete it so the newest version gets re-downloaded.
        if osp.isdir(self.root) and (not osp.exists(osp.join(self.root, 'RELEASE_v' + str(self.meta_info['version']) + '.txt'))):
            print(self.name + ' has been updated.')
            if input('Will you update the dataset now? (y/N)\n').lower() == 'y':
                shutil.rmtree(self.root)

        self.download_name = self.meta_info['download_name']  # name of downloaded file, e.g., tox21

        self.num_tasks = int(self.meta_info['num tasks'])
        self.eval_metric = self.meta_info['eval metric']
        self.task_type = self.meta_info['task type']
        self.__num_classes__ = int(self.meta_info['num classes'])
        self.binary = self.meta_info['binary'] == 'True'

        # BUG FIX: the original called super(PygGraphPropPredDataset, self),
        # a name that does not exist in this module and raises NameError.
        super(OgbGraphDataset, self).__init__(self.root, transform, pre_transform)

        self.data, self.slices = self.load_data(self.processed_paths[0])

    def get_idx_split(self, split_type=None):
        """Return the dataset split as ``{'train': ..., 'valid': ..., 'test': ...}``.

        ``split_type`` defaults to the scheme recorded in the meta-information
        (e.g. ``'scaffold'``). Each value is a 1-D numpy array of graph indices.
        """
        if split_type is None:
            split_type = self.meta_info['split']

        path = osp.join(self.root, 'split', split_type)

        # Short-cut if a pre-built split_dict.pt exists.
        if os.path.isfile(os.path.join(path, 'split_dict.pt')):
            return self.load_data(os.path.join(path, 'split_dict.pt'))

        # Otherwise read the three per-split index CSVs (one index per row).
        split_idx = {}
        for split_name in ('train', 'valid', 'test'):
            csv_path = osp.join(path, split_name + '.csv.gz')
            split_idx[split_name] = pd.read_csv(csv_path, compression='gzip', header=None).values.T[0]
        return split_idx

    @property
    def num_classes(self):
        """Number of classes per task (``-1`` for regression datasets)."""
        return self.__num_classes__

    @property
    def raw_file_names(self):
        """Raw file names expected in ``raw_dir`` (drives the download step)."""
        if self.binary:
            return ['data.npz']
        file_names = ['edge']
        if self.meta_info['has_node_attr'] == 'True':
            file_names.append('node-feat')
        if self.meta_info['has_edge_attr'] == 'True':
            file_names.append('edge-feat')
        return [file_name + '.csv.gz' for file_name in file_names]

    @property
    def processed_file_names(self):
        return 'geometric_data_processed.pt'

    def download(self):
        """Download and extract the dataset zip into ``self.root``."""
        url = self.meta_info['url']
        if decide_download(url):
            path = download_url(url, self.original_root)
            extract_zip(path, self.original_root)
            os.unlink(path)
            # Replace any stale dataset folder with the freshly extracted one.
            shutil.rmtree(self.root)
            shutil.move(osp.join(self.original_root, self.download_name), self.root)
        else:
            print('Stop downloading.')
            shutil.rmtree(self.root)
            exit(-1)

    def process(self):
        """Read the raw graph list, attach labels, and save processed data."""
        add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'

        if self.meta_info['additional node files'] == 'None':
            additional_node_files = []
        else:
            additional_node_files = self.meta_info['additional node files'].split(',')

        if self.meta_info['additional edge files'] == 'None':
            additional_edge_files = []
        else:
            additional_edge_files = self.meta_info['additional edge files'].split(',')

        data_list = read_graph(self.raw_dir, add_inverse_edge=add_inverse_edge,
                               additional_node_files=additional_node_files,
                               additional_edge_files=additional_edge_files,
                               binary=self.binary)

        if self.task_type == 'subtoken prediction':
            # Each label row is a space-separated sequence of subtokens.
            graph_label_notparsed = pd.read_csv(osp.join(self.raw_dir, 'graph-label.csv.gz'), compression='gzip', header=None).values
            graph_label = [str(graph_label_notparsed[i][0]).split(' ') for i in range(len(graph_label_notparsed))]
        else:
            if self.binary:
                graph_label = np.load(osp.join(self.raw_dir, 'graph-label.npz'))['graph_label']
            else:
                graph_label = pd.read_csv(osp.join(self.raw_dir, 'graph-label.csv.gz'), compression='gzip', header=None).values
            # NOTE: labels of some multi-task datasets may contain NaN for
            # missing tasks; the original computed `has_nan` here but never
            # used it, so the dead variable was removed.

        for i, g in enumerate(data_list):
            g.y = graph_label[i]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        data, slices = self.collate(data_list)

        print('Saving...')
        self.save_data((data, slices), self.processed_paths[0])


142 changes: 142 additions & 0 deletions gammagl/datasets/ogb_link.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import os
import os.path as osp
import shutil

import numpy as np
import pandas as pd

from gammagl.data import InMemoryDataset
from gammagl.io.read_ogb import read_graph, read_heterograph
from gammagl.utils.ogb_url import decide_download, download_url, extract_zip

class OgbLinkDataset(InMemoryDataset):
    """OGB link property prediction dataset (e.g. ``ogbl-ppa``).

    Meta-information (download URL, split scheme, hetero flag, ...) is read
    from ``OgbLinkData.csv`` shipped next to this module, unless an explicit
    ``meta_dict`` is supplied.
    """

    def __init__(self, name, root = 'dataset', transform=None, pre_transform=None, meta_dict = None):
        '''
        - name (str): name of the dataset, e.g. 'ogbl-ppa'
        - root (str): root directory to store the dataset folder
        - transform, pre_transform (callable, optional): transform / pre-transform graph objects

        - meta_dict (dict, optional): dictionary that stores all the meta-information about data.
          Default is None; when something is passed, its information is used instead of the
          CSV lookup. Useful for debugging for external contributors.

        Raises ValueError if `name` is not a known OGB link dataset.
        '''

        self.name = name ## original name, e.g., ogbl-ppa

        if meta_dict is None:
            self.dir_name = '_'.join(name.split('-'))

            # check if previously-downloaded folder exists.
            # If so, use that one.
            # NOTE(review): the '_pyg' suffix is inherited from the PyG loader
            # this code is ported from — consider a gammagl-specific suffix.
            if osp.exists(osp.join(root, self.dir_name + '_pyg')):
                self.dir_name = self.dir_name + '_pyg'

            self.original_root = root
            self.root = osp.join(root, self.dir_name)

            # Look up the dataset's meta-information by column name in the
            # bundled CSV (one column per dataset).
            master = pd.read_csv(os.path.join(os.path.dirname(__file__), 'OgbLinkData.csv'), index_col = 0)
            if not self.name in master:
                error_mssg = 'Invalid dataset name {}.\n'.format(self.name)
                error_mssg += 'Available datasets are as follows:\n'
                error_mssg += '\n'.join(master.keys())
                raise ValueError(error_mssg)
            self.meta_info = master[self.name]

        else:
            self.dir_name = meta_dict['dir_path']
            self.original_root = ''
            self.root = meta_dict['dir_path']
            self.meta_info = meta_dict

        # check version
        # First check whether the dataset has been already downloaded or not.
        # If so, check whether the dataset version is the newest or not.
        # If the dataset is not the newest version, notify this to the user.
        # Deleting the stale folder triggers a fresh download below.
        if osp.isdir(self.root) and (not osp.exists(osp.join(self.root, 'RELEASE_v' + str(self.meta_info['version']) + '.txt'))):
            print(self.name + ' has been updated.')
            if input('Will you update the dataset now? (y/N)\n').lower() == 'y':
                shutil.rmtree(self.root)

        self.download_name = self.meta_info['download_name'] ## name of downloaded file, e.g., ppassoc

        self.task_type = self.meta_info['task type']
        self.eval_metric = self.meta_info['eval metric']
        self.is_hetero = self.meta_info['is hetero'] == 'True'
        self.binary = self.meta_info['binary'] == 'True'

        # The base-class constructor triggers download()/process() as needed.
        super(OgbLinkDataset, self).__init__(self.root, transform, pre_transform)
        self.data, self.slices = self.load_data(self.processed_paths[0])

    def get_edge_split(self, split_type = None):
        """Return the edge split as ``{'train': ..., 'valid': ..., 'test': ...}``.

        ``split_type`` defaults to the scheme recorded in the meta-information
        (e.g. 'time', 'target').
        """
        if split_type is None:
            split_type = self.meta_info['split']

        path = osp.join(self.root, 'split', split_type)

        # short-cut if split_dict.pt exists
        if os.path.isfile(os.path.join(path, 'split_dict.pt')):
            return self.load_data(os.path.join(path, 'split_dict.pt'))

        # Otherwise load the three per-split files individually.
        train = self.load_data(osp.join(path, 'train.pt'))
        valid = self.load_data(osp.join(path, 'valid.pt'))
        test = self.load_data(osp.join(path, 'test.pt'))

        return {'train': train, 'valid': valid, 'test': test}

    @property
    def raw_file_names(self):
        """Raw file names expected in ``raw_dir`` (drives the download step)."""
        if self.binary:
            if self.is_hetero:
                return ['edge_index_dict.npz']
            else:
                return ['data.npz']
        else:
            if self.is_hetero:
                return ['num-node-dict.csv.gz', 'triplet-type-list.csv.gz']
            else:
                file_names = ['edge']
                if self.meta_info['has_node_attr'] == 'True':
                    file_names.append('node-feat')
                if self.meta_info['has_edge_attr'] == 'True':
                    file_names.append('edge-feat')
                return [file_name + '.csv.gz' for file_name in file_names]

    @property
    def processed_file_names(self):
        # NOTE(review): osp.join with a single argument is a no-op; a plain
        # string literal would do (cf. OgbGraphDataset in this PR).
        return osp.join('geometric_data_processed.pt')

    def download(self):
        """Download and extract the dataset zip into ``self.root``."""
        url = self.meta_info['url']
        if decide_download(url):
            path = download_url(url, self.original_root)
            extract_zip(path, self.original_root)
            os.unlink(path)
            # Replace any stale dataset folder with the freshly extracted one.
            shutil.rmtree(self.root)
            shutil.move(osp.join(self.original_root, self.download_name), self.root)
        else:
            print('Stop downloading.')
            shutil.rmtree(self.root)
            exit(-1)

    def process(self):
        """Read the single raw (hetero)graph and save the processed data."""
        add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'

        # 'additional node/edge files' are comma-separated lists in the CSV,
        # or the literal string 'None' when absent.
        if self.meta_info['additional node files'] == 'None':
            additional_node_files = []
        else:
            additional_node_files = self.meta_info['additional node files'].split(',')

        if self.meta_info['additional edge files'] == 'None':
            additional_edge_files = []
        else:
            additional_edge_files = self.meta_info['additional edge files'].split(',')

        # Link datasets hold exactly one graph — take the first element.
        if self.is_hetero:
            data = read_heterograph(self.raw_dir, add_inverse_edge = add_inverse_edge, additional_node_files = additional_node_files, additional_edge_files = additional_edge_files, binary=self.binary)[0]
        else:
            data = read_graph(self.raw_dir, add_inverse_edge = add_inverse_edge, additional_node_files = additional_node_files, additional_edge_files = additional_edge_files, binary=self.binary)[0]

        data = data if self.pre_transform is None else self.pre_transform(data)

        print('Saving...')
        self.save_data(self.collate([data]), self.processed_paths[0])

    def __repr__(self):
        return '{}()'.format(self.__class__.__name__)


Loading