diff --git a/dicee/analyse_experiments.py b/dicee/analyse_experiments.py index 65cb7ab6..09130c3e 100644 --- a/dicee/analyse_experiments.py +++ b/dicee/analyse_experiments.py @@ -1,4 +1,7 @@ -""" This script should be moved to dicee/scripts""" +""" This script should be moved to dicee/scripts +Example: +python dicee/analyse_experiments.py --dir Experiments --features "model" "trainMRR" "testMRR" +""" import os import json import pandas as pd @@ -120,19 +123,13 @@ def analyse(args): if os.path.isdir(full_path) is False: continue - with open(f'{full_path}/configuration.json', 'r') as f: config = json.load(f) - - try: - with open(f'{full_path}/report.json', 'r') as f: - report = json.load(f) - report = {i: report[i] for i in ['Runtime', 'NumParam']} - with open(f'{full_path}/eval_report.json', 'r') as f: - eval_report = json.load(f) - except FileNotFoundError: - print("NOT found") - continue + with open(f'{full_path}/report.json', 'r') as f: + report = json.load(f) + report = {i: report[i] for i in ['Runtime', 'NumParam']} + with open(f'{full_path}/eval_report.json', 'r') as f: + eval_report = json.load(f) config.update(eval_report) config.update(report) if "Train" in config: diff --git a/examples/LP_BenchmarkResults.md b/examples/LP_BenchmarkResults.md deleted file mode 100644 index 2db8188b..00000000 --- a/examples/LP_BenchmarkResults.md +++ /dev/null @@ -1,225 +0,0 @@ -# Link Prediction on Benchmark Datasets - -Here, we show that generalization performance of knowledge graph embedding models do not differ much if they are trained -well. - -# Hyperparameter Setting - -Hyperparameters play an important role in the successful applications of knowledge graph embedding models. In our -experiments, we selected such hyperparameter configuration so that experiments can be done less than a minute on UMLS -dataset - -# Link Prediction Performance Analysis on KINSHIP - -| model_name | train_mrr | train_h1 | train_h3 | train_h10 | val_mrr | val_h1 | val_h3 | val_h10 | test_mrr | test_h1 | test_h3 | test_h10 | runtime | -|:-------------|------------:|-----------:|-----------:|------------:|----------:|---------:|---------:|----------:|-----------:|----------:|----------:|-----------:|----------:| -| DistMult | 0.781619 | 0.681706 | 0.853406 | 0.967228 | 0.656955 | 0.516386 | 0.744382 | 0.941948 | 0.640159 | 0.496741 | 0.724395 | 0.938547 | 43.4192 | -| ComplEx | 0.883142 | 0.81291 | 0.947507 | 0.99011 | 0.757905 | 0.636236 | 0.860019 | 0.96161 | 0.762234 | 0.646182 | 0.851955 | 0.972533 | 45.6701 | -| QMult | 0.864761 | 0.789033 | 0.930185 | 0.985194 | 0.742049 | 0.61985 | 0.83661 | 0.955993 | 0.75211 | 0.636406 | 0.841713 | 0.956238 | 51.2721 | -| OMult | 0.85052 | 0.771126 | 0.917427 | 0.981449 | 0.744363 | 0.628277 | 0.834738 | 0.94897 | 0.745078 | 0.625233 | 0.838454 | 0.963687 | 47.2538 | -| ConEx | 0.726664 | 0.603406 | 0.815485 | 0.952891 | 0.68413 | 0.549157 | 0.779494 | 0.941479 | 0.676157 | 0.544693 | 0.768156 | 0.934358 | 52.2601 | -| ConvQ | 0.715911 | 0.586142 | 0.807584 | 0.957104 | 0.669007 | 0.525749 | 0.768258 | 0.948502 | 0.667853 | 0.524209 | 0.770019 | 0.935754 | 60.0243 | -| ConvO | 0.589155 | 0.433462 | 0.677434 | 0.918305 | 0.558573 | 0.401685 | 0.639513 | 0.902154 | 0.55491 | 0.39851 | 0.634078 | 0.902235 | 51.1115 | -| AConvO | 0.855045 | 0.770365 | 0.931355 | 0.986189 | 0.73474 | 0.609551 | 0.831461 | 0.950843 | 0.741717 | 0.61825 | 0.836127 | 0.962291 | 50.5469 | -| AConEx | 0.888418 | 0.8204 | 0.95055 | 0.989993 | 0.75704 | 0.637172 | 0.858614 | 0.955993 | 0.753787 | 
0.634078 | 0.847765 | 0.967877 | 48.5488 | -| AConvQ | 0.888232 | 0.822156 | 0.947507 | 0.989876 | 0.74992 | 0.625468 | 0.852528 | 0.961142 | 0.752967 | 0.625698 | 0.856145 | 0.969274 | 46.3772 | - -# Link Prediction Performance Analysis on UMLS - -1. Multiplicative models fit the training dataset split of UMLS better than convolutional neural network based models. -2. Replacing the multiplicative connections of conv(h,r) with additive connections leads convolutional neural network - based models to fit better the training data. -3. Additive connections decrease the runtimes of the neural network based models. - -| model_name | train_mrr | train_h1 | train_h3 | train_h10 | val_mrr | val_h1 | val_h3 | val_h10 | test_mrr | test_h1 | test_h3 | test_h10 | runtime | -|:-------------|------------:|-----------:|-----------:|------------:|----------:|---------:|---------:|----------:|-----------:|----------:|----------:|-----------:|----------:| -| DistMult | 0.997396 | 0.994919 | 1 | 1 | 0.739 | 0.622699 | 0.821319 | 0.954755 | 0.738951 | 0.620272 | 0.824508 | 0.947806 | 32.1166 | -| ComplEx | 0.998107 | 0.996357 | 0.999904 | 1 | 0.725287 | 0.604294 | 0.809816 | 0.950153 | 0.748761 | 0.633888 | 0.83056 | 0.958396 | 39.2883 | -| QMult | 0.997148 | 0.994824 | 0.999712 | 1 | 0.733427 | 0.611963 | 0.82592 | 0.958589 | 0.732836 | 0.611195 | 0.826778 | 0.960666 | 37.2082 | -| OMult | 0.985969 | 0.972776 | 0.999233 | 1 | 0.754292 | 0.629601 | 0.855828 | 0.959356 | 0.762545 | 0.636914 | 0.857035 | 0.96823 | 38.2151 | -| ConEx | 0.963993 | 0.932707 | 0.995399 | 0.999041 | 0.80534 | 0.705521 | 0.883436 | 0.957822 | 0.810177 | 0.70121 | 0.906203 | 0.968986 | 46.6404 | -| ConvO | 0.912093 | 0.837615 | 0.98495 | 0.997316 | 0.832247 | 0.721626 | 0.934049 | 0.977761 | 0.830195 | 0.714826 | 0.933434 | 0.984871 | 58.9782 | -| ConvQ | 0.904769 | 0.831959 | 0.975748 | 0.993194 | 0.834325 | 0.730828 | 0.927147 | 0.976227 | 0.827083 | 0.711044 | 0.939486 | 0.977307 | 42.0177 | -| AConEx | 0.995367 | 0.991181 | 0.999617 | 1 | 0.719798 | 0.58589 | 0.824387 | 0.943252 | 0.734349 | 0.610439 | 0.824508 | 0.95764 | 40.6111 | -| AConvQ | 0.995162 | 0.990702 | 0.999808 | 1 | 0.709874 | 0.568252 | 0.819018 | 0.956288 | 0.737209 | 0.607413 | 0.850983 | 0.953858 | 42.8576 | -| AConvO | 0.986693 | 0.974214 | 0.999041 | 1 | 0.698491 | 0.548313 | 0.817485 | 0.946319 | 0.732535 | 0.594554 | 0.83888 | 0.961422 | 44.2366 | - -# Link Prediction Performance Analysis on FB15K-237 - -| model_name | train_mrr | train_h1 | train_h3 | train_h10 | val_mrr | val_h1 | val_h3 | val_h10 | test_mrr | test_h1 | test_h3 | test_h10 | runtime | -|:-----------|----------:|---------:|---------:|----------:|---------:|---------:|---------:|---------:|---------:|---------:|---------:|---------:|--------:| -| DistMult | 0.593914 | 0.499502 | 0.647212 | 0.773351 | 0.26749 | 0.189136 | 0.290077 | 0.425463 | 0.264973 | 0.1867 | 0.2854 | 0.426024 | 1006.55 | -| ComplEx | 0.574006 | 0.476723 | 0.627397 | 0.760223 | 0.261119 | 0.182948 | 0.282606 | 0.41996 | 0.259313 | 0.180592 | 0.281809 | 0.421602 | 1031.37 | -| QMult | 0.549874 | 0.448279 | 0.605929 | 0.743963 | 0.266945 | 0.188822 | 0.290248 | 0.423667 | 0.265085 | 0.186114 | 0.288161 | 0.424265 | 1038.14 | -| OMult | 0.497422 | 0.389729 | 0.55521 | 0.70461 | 0.276785 | 0.197149 | 0.302509 | 0.435329 | 0.27315 | 0.193809 | 0.2972 | 0.432791 | 1098.29 | -| ConEx | 0.16171 | 0.10981 | 0.171578 | 0.260664 | 0.185139 | 0.130026 | 0.197177 | 0.292159 | 0.184531 | 0.128286 | 0.197743 | 0.293633 | 1096.44 | -| 
AConEx | 0.559767 | 0.458587 | 0.615846 | 0.75322 | 0.271471 | 0.192273 | 0.294953 | 0.430539 | 0.270874 | 0.191977 | 0.293975 | 0.431227 | 1106.65 | -| AConvQ | 0.49182 | 0.383151 | 0.546962 | 0.702877 | 0.263918 | 0.184631 | 0.285458 | 0.424636 | 0.263824 | 0.184306 | 0.286988 | 0.424411 | 1118.75 | -| AConvO | 0.481607 | 0.374538 | 0.535467 | 0.690759 | 0.275071 | 0.197092 | 0.296664 | 0.435672 | 0.27353 | 0.194225 | 0.298568 | 0.433744 | 1203.71 | - - -## Rerun Experiments - -**Datasets: KGs/UMLS, KGs/KINSHIP, and KGs/FB15k-237** -## ConEx (Convolutional Complex Knowledge Graph Embeddings) - -```bash -python main.py --path_dataset_folder KGs/UMLS --model "ConEx" --optim Adam --embedding_dim 32 --num_epochs 256 --batch_size 1024 --lr 0.1 --backend 'pandas' --trainer 'PL' --scoring_technique 'KvsAll' --weight_decay 0.0 --input_dropout_rate 0.0 --hidden_dropout_rate 0.0 --feature_map_dropout_rate 0.0 --normalization LayerNorm --init_param xavier_normal --label_smoothing_rate 0.0 --kernel_size 3 --num_of_output_channels 3 --seed_for_computation 0 -``` -```bash -Total computation time: 38.546 seconds -Evaluate ConEx on Train set: Evaluate ConEx on Train set -{'H@1': 0.932707055214724, 'H@3': 0.995398773006135, 'H@10': 0.9990414110429447, 'MRR': 0.9639930469284392} -Evaluate ConEx on Validation set: Evaluate ConEx on Validation set -{'H@1': 0.7055214723926381, 'H@3': 0.8834355828220859, 'H@10': 0.9578220858895705, 'MRR': 0.8053400891316801} -Evaluate ConEx on Test set: Evaluate ConEx on Test set -{'H@1': 0.7012102874432677, 'H@3': 0.9062027231467473, 'H@10': 0.9689863842662633, 'MRR': 0.8101772584084878} -``` - -## AConEx (ConEx with Additive Connections) - -```bash -python main.py --path_dataset_folder KGs/UMLS --model "AConEx" --optim Adam --embedding_dim 32 --num_epochs 256 --batch_size 1024 --lr 0.1 --backend 'pandas' --trainer 'PL' --scoring_technique 'KvsAll' --weight_decay 0.0 --input_dropout_rate 0.0 --hidden_dropout_rate 0.0 --feature_map_dropout_rate 0.0 --normalization LayerNorm --init_param xavier_normal --label_smoothing_rate 0.0 --kernel_size 3 --num_of_output_channels 3 --seed_for_computation 0 --num_core 4 -``` - -```bash -Total computation time: 38.555 seconds -Evaluate AConEx on Train set: Evaluate AConEx on Train set -{'H@1': 0.991180981595092, 'H@3': 0.9996165644171779, 'H@10': 1.0, 'MRR': 0.9953668200408997} -Evaluate AConEx on Validation set: Evaluate AConEx on Validation set -{'H@1': 0.5858895705521472, 'H@3': 0.8243865030674846, 'H@10': 0.9432515337423313, 'MRR': 0.7197978156406648} -Evaluate AConEx on Test set: Evaluate AConEx on Test set -{'H@1': 0.6104387291981845, 'H@3': 0.8245083207261724, 'H@10': 0.9576399394856279, 'MRR': 0.734348890030545} -``` - -## ConvQ - -```bash -python main.py --path_dataset_folder KGs/UMLS --model "ConvQ" --optim Adam --embedding_dim 32 --num_epochs 256 --batch_size 1024 --lr 0.1 --backend 'pandas' --trainer 'PL' --scoring_technique 'KvsAll' --weight_decay 0.0 --input_dropout_rate 0.0 --hidden_dropout_rate 0.0 --feature_map_dropout_rate 0.0 --normalization LayerNorm --init_param xavier_normal --label_smoothing_rate 0.0 --kernel_size 3 --num_of_output_channels 3 --seed_for_computation 0 --num_core 4 -``` - -```bash -Total computation time: 41.013 seconds -Evaluate ConvQ on Train set: Evaluate ConvQ on Train set -{'H@1': 0.8319593558282209, 'H@3': 0.9757476993865031, 'H@10': 0.993194018404908, 'MRR': 0.9047693553188044} -Evaluate ConvQ on Validation set: Evaluate ConvQ on Validation set -{'H@1': 0.7308282208588958, 'H@3': 
0.9271472392638037, 'H@10': 0.9762269938650306, 'MRR': 0.834325039087529} -Evaluate ConvQ on Test set: Evaluate ConvQ on Test set -{'H@1': 0.7110438729198184, 'H@3': 0.9394856278366112, 'H@10': 0.9773071104387292, 'MRR': 0.8270827146655849} -``` - -## AConvQ (ConvQ with with Additive Connections) - -```bash -python main.py --path_dataset_folder KGs/UMLS --model "AConvQ" --optim Adam --embedding_dim 32 --num_epochs 256 --batch_size 1024 --lr 0.1 --backend 'pandas' --trainer 'PL' --scoring_technique 'KvsAll' --weight_decay 0.0 --input_dropout_rate 0.0 --hidden_dropout_rate 0.0 --feature_map_dropout_rate 0.0 --normalization LayerNorm --init_param xavier_normal --label_smoothing_rate 0.0 --kernel_size 3 --num_of_output_channels 3 --seed_for_computation 0 --num_core 4 -``` - -```bash -Total computation time: 41.238 seconds -Evaluate AConvQ on Train set: Evaluate AConvQ on Train set -{'H@1': 0.9907016871165644, 'H@3': 0.999808282208589, 'H@10': 1.0, 'MRR': 0.9951623210633946} -Evaluate AConvQ on Validation set: Evaluate AConvQ on Validation set -{'H@1': 0.5682515337423313, 'H@3': 0.8190184049079755, 'H@10': 0.9562883435582822, 'MRR': 0.7098740236669477} -Evaluate AConvQ on Test set: Evaluate AConvQ on Test set -{'H@1': 0.6074130105900152, 'H@3': 0.850983358547655, 'H@10': 0.953857791225416, 'MRR': 0.737209298931209} -``` - -## ConvO - -```bash -python main.py --path_dataset_folder KGs/UMLS --model "ConvO" --optim Adam --embedding_dim 32 --num_epochs 256 --batch_size 1024 --lr 0.1 --backend 'pandas' --trainer 'PL' --scoring_technique 'KvsAll' --weight_decay 0.0 --input_dropout_rate 0.0 --hidden_dropout_rate 0.0 --feature_map_dropout_rate 0.0 --normalization LayerNorm --init_param xavier_normal --label_smoothing_rate 0.0 --kernel_size 3 --num_of_output_channels 3 --seed_for_computation 0 --num_core 4 -``` - -```bash -Total computation time: 54.756 seconds -Evaluate ConvO on Train set: Evaluate ConvO on Train set -{'H@1': 0.8376150306748467, 'H@3': 0.9849501533742331, 'H@10': 0.9973159509202454, 'MRR': 0.9120930363683276} -Evaluate ConvO on Validation set: Evaluate ConvO on Validation set -{'H@1': 0.7216257668711656, 'H@3': 0.9340490797546013, 'H@10': 0.977760736196319, 'MRR': 0.8322465353766179} -Evaluate ConvO on Test set: Evaluate ConvO on Test set -{'H@1': 0.7148260211800302, 'H@3': 0.9334341906202723, 'H@10': 0.9848714069591528, 'MRR': 0.8301952668172322} - -``` - -## AConvO - -```bash -python main.py --path_dataset_folder KGs/UMLS --model "AConvO" --optim Adam --embedding_dim 32 --num_epochs 256 --batch_size 1024 --lr 0.1 --backend 'pandas' --trainer 'PL' --scoring_technique 'KvsAll' --weight_decay 0.0 --input_dropout_rate 0.0 --hidden_dropout_rate 0.0 --feature_map_dropout_rate 0.0 --normalization LayerNorm --init_param xavier_normal --label_smoothing_rate 0.0 --kernel_size 3 --num_of_output_channels 3 --seed_for_computation 0 --num_core 4 -``` - -```bash -Total computation time: 42.910 seconds -Evaluate AConvO on Train set: Evaluate AConvO on Train set -{'H@1': 0.9742139570552147, 'H@3': 0.9990414110429447, 'H@10': 1.0, 'MRR': 0.986693187627812} -Evaluate AConvO on Validation set: Evaluate AConvO on Validation set -{'H@1': 0.5483128834355828, 'H@3': 0.8174846625766872, 'H@10': 0.946319018404908, 'MRR': 0.6984911756060336} -Evaluate AConvO on Test set: Evaluate AConvO on Test set -{'H@1': 0.594553706505295, 'H@3': 0.8388804841149773, 'H@10': 0.9614220877458396, 'MRR': 0.7325348232341585} -``` - -## DistMult - -```bash -python main.py --path_dataset_folder KGs/UMLS --model "DistMult" --optim 
Adam --embedding_dim 32 --num_epochs 256 --batch_size 1024 --lr 0.1 --backend 'pandas' --trainer 'PL' --scoring_technique 'KvsAll' --weight_decay 0.0 --input_dropout_rate 0.0 --hidden_dropout_rate 0.0 --feature_map_dropout_rate 0.0 --normalization LayerNorm --init_param xavier_normal --label_smoothing_rate 0.0 --seed_for_computation 0 --num_core 4 -``` - -```bash -Total computation time: 43.789 seconds -Evaluate DistMult on Train set: Evaluate DistMult on Train set -{'H@1': 0.9949194785276073, 'H@3': 1.0, 'H@10': 1.0, 'MRR': 0.9973958333333333} -Evaluate DistMult on Validation set: Evaluate DistMult on Validation set -{'H@1': 0.6226993865030674, 'H@3': 0.821319018404908, 'H@10': 0.9547546012269938, 'MRR': 0.7389999406726034} -Evaluate DistMult on Test set: Evaluate DistMult on Test set -{'H@1': 0.6202723146747352, 'H@3': 0.8245083207261724, 'H@10': 0.9478063540090772, 'MRR': 0.7389508344813291} -``` - -## ComplEx - -```bash -python main.py --path_dataset_folder KGs/UMLS --model "ComplEx" --optim Adam --embedding_dim 32 --num_epochs 256 --batch_size 1024 --lr 0.1 --backend 'pandas' --trainer 'PL' --scoring_technique 'KvsAll' --weight_decay 0.0 --input_dropout_rate 0.0 --hidden_dropout_rate 0.0 --feature_map_dropout_rate 0.0 --normalization LayerNorm --init_param xavier_normal --label_smoothing_rate 0.0 --seed_for_computation 0 --num_core 4 -``` - -```bash -Total computation time: 37.448 seconds -Evaluate ComplEx on Train set: Evaluate ComplEx on Train set -{'H@1': 0.9963573619631901, 'H@3': 0.9999041411042945, 'H@10': 1.0, 'MRR': 0.9981067868098159} -Evaluate ComplEx on Validation set: Evaluate ComplEx on Validation set -{'H@1': 0.6042944785276073, 'H@3': 0.8098159509202454, 'H@10': 0.9501533742331288, 'MRR': 0.725286550123257} -Evaluate ComplEx on Test set: Evaluate ComplEx on Test set -{'H@1': 0.6338880484114977, 'H@3': 0.8305597579425114, 'H@10': 0.9583963691376702, 'MRR': 0.7487605914404327} - -``` - -## QMult - -```bash -python main.py --path_dataset_folder KGs/UMLS --model "QMult" --optim Adam --embedding_dim 32 --num_epochs 256 --batch_size 1024 --lr 0.1 --backend 'pandas' --trainer 'PL' --scoring_technique 'KvsAll' --weight_decay 0.0 --input_dropout_rate 0.0 --hidden_dropout_rate 0.0 --normalization LayerNorm --init_param xavier_normal --label_smoothing_rate 0.0 --seed_for_computation 0 --num_core 4 -``` - -```bash -Total computation time: 35.082 seconds -Evaluate QMult on Train set: Evaluate QMult on Train set -{'H@1': 0.9948236196319018, 'H@3': 0.9997124233128835, 'H@10': 1.0, 'MRR': 0.9971481978527608} -Evaluate QMult on Validation set: Evaluate QMult on Validation set -{'H@1': 0.6119631901840491, 'H@3': 0.825920245398773, 'H@10': 0.9585889570552147, 'MRR': 0.7334272790319509} -Evaluate QMult on Test set: Evaluate QMult on Test set -{'H@1': 0.6111951588502269, 'H@3': 0.8267776096822995, 'H@10': 0.9606656580937972, 'MRR': 0.7328362412430357} - -``` - -## OMult - -```bash -python main.py --path_dataset_folder KGs/UMLS --model "OMult" --optim Adam --embedding_dim 32 --num_epochs 256 --batch_size 1024 --lr 0.1 --backend 'pandas' --trainer 'PL' --scoring_technique 'KvsAll' --weight_decay 0.0 --input_dropout_rate 0.0 --hidden_dropout_rate 0.0 --normalization LayerNorm --init_param xavier_normal --label_smoothing_rate 0.0 --seed_for_computation 0 --num_core 4 -``` - -```bash -Total computation time: 52.970 seconds -Evaluate OMult on Train set: Evaluate OMult on Train set -{'H@1': 0.9727760736196319, 'H@3': 0.9992331288343558, 'H@10': 1.0, 'MRR': 0.985969452965235} -Evaluate OMult on 
Validation set: Evaluate OMult on Validation set
-{'H@1': 0.629601226993865, 'H@3': 0.8558282208588958, 'H@10': 0.9593558282208589, 'MRR': 0.754292154154622}
-Evaluate OMult on Test set: Evaluate OMult on Test set
-{'H@1': 0.6369137670196672, 'H@3': 0.857034795763994, 'H@10': 0.9682299546142209, 'MRR': 0.7625450316560369}
-```
-
diff --git a/examples/Pykeen.md b/examples/Pykeen.md
index 42a70fab..ebe99e21 100644
--- a/examples/Pykeen.md
+++ b/examples/Pykeen.md
@@ -1,18 +1,29 @@
 # Using Pykeen
-# Training a Pykeen Model with Pytorch-lightning
+# Training a Few Pykeen Models with KvsAll
+
+```
+dicee --dataset_dir KGs/UMLS --model Pykeen_MuRE --num_epochs 100 --batch_size 256 --lr 0.1 --scoring_technique KvsAll
+dicee --dataset_dir KGs/UMLS --model Pykeen_HolE --num_epochs 100 --batch_size 256 --lr 0.1 --scoring_technique KvsAll
+dicee --dataset_dir KGs/UMLS --model Pykeen_DistMult --num_epochs 100 --batch_size 256 --lr 0.1 --scoring_technique KvsAll
+dicee --dataset_dir KGs/UMLS --model Pykeen_ComplEx --num_epochs 100 --batch_size 256 --lr 0.1 --scoring_technique KvsAll
+dicee --dataset_dir KGs/UMLS --model Pykeen_QuatE --num_epochs 100 --batch_size 256 --lr 0.1 --scoring_technique KvsAll
+```
+
 ```
-python main.py --model Pykeen_MuRE --num_epochs 10 --batch_size 256 --lr 0.1 --trainer "PL" --num_core 4 --scoring_technique KvsAll --pykeen_model_kwargs embedding_dim=64
-python main.py --model Pykeen_HolE --num_epochs 10 --batch_size 256 --lr 0.1 --trainer "PL" --num_core 4 --scoring_technique KvsAll --pykeen_model_kwargs embedding_dim=64
-python main.py --model Pykeen_DistMult --num_epochs 10 --batch_size 256 --lr 0.1 --trainer "PL" --num_core 4 --scoring_technique KvsAll --pykeen_model_kwargs embedding_dim=64
-python main.py --model Pykeen_ComplEx --num_epochs 10 --batch_size 256 --lr 0.1 --trainer "PL" --num_core 4 --scoring_technique KvsAll --pykeen_model_kwargs embedding_dim=32
-python main.py --model Pykeen_QuatE --num_epochs 10 --batch_size 256 --lr 0.1 --trainer "PL" --num_core 4 --scoring_technique KvsAll --pykeen_model_kwargs embedding_dim=16
-python analyse_experiments.py
-| model_name | train_mrr | train_h1 | train_h3 | train_h10 | val_mrr | val_h1 | val_h3 | val_h10 | test_mrr | test_h1 | test_h3 | test_h10 | runtime | params |
-|:----------------|------------:|-----------:|-----------:|------------:|----------:|---------:|---------:|----------:|-----------:|----------:|----------:|-----------:|----------:|---------:|
-| Pykeen_DistMult | 0.827423 | 0.707918 | 0.938267 | 0.992331 | 0.794849 | 0.661043 | 0.918712 | 0.980061 | 0.776859 | 0.639183 | 0.901664 | 0.982602 | 3.9479 | 14528 |
-| Pykeen_MuRE | 0.96874 | 0.94306 | 0.995878 | 1 | 0.79331 | 0.685583 | 0.882669 | 0.969325 | 0.804933 | 0.705749 | 0.887292 | 0.967474 | 4.89179 | 20686 |
-| Pykeen_ComplEx | 0.771764 | 0.673792 | 0.846817 | 0.925422 | 0.709708 | 0.598926 | 0.784509 | 0.895706 | 0.701002 | 0.58472 | 0.7882 | 0.8941 | 3.13179 | 14528 |
-| Pykeen_HolE | 0.945462 | 0.901169 | 0.990798 | 0.999425 | 0.778686 | 0.667945 | 0.868098 | 0.960123 | 0.760102 | 0.641452 | 0.854766 | 0.962935 | 5.46916 | 14528 |
-| Pykeen_QuatE | 0.92043 | 0.870207 | 0.96434 | 0.988976 | 0.817052 | 0.72546 | 0.898006 | 0.95092 | 0.819355 | 0.726929 | 0.894856 | 0.955371 | 4.24787 | 14528 |
+python dicee/analyse_experiments.py --dir Experiments --features "model" "trainMRR" "testMRR"
+
+\begin{tabular}{lrrr}
+\toprule
+model & trainMRR & testMRR & NumParam \\
+\midrule
+Pykeen_MuRE & 0.879 & 0.836 & 10478 \\
+Pykeen_HolE & 0.830 & 0.689 & 7264 \\
+Pykeen_QuatE & 1.000 & 0.683 & 29056 \\ +Pykeen_ComplEx & 1.000 & 0.648 & 14528 \\ +Pykeen_DistMult & 0.818 & 0.588 & 7264 \\ +\bottomrule +\end{tabular} + + ``` \ No newline at end of file diff --git a/examples/continual_training.py b/examples/continual_training.py deleted file mode 100644 index 88038333..00000000 --- a/examples/continual_training.py +++ /dev/null @@ -1,26 +0,0 @@ -from dicee.executer import ContinuousExecute -import argparse -def argparse_default(description=None): - parser = argparse.ArgumentParser(add_help=False) - # Dataset and storage related - parser.add_argument("--path_experiment_folder", type=str, default="Experiments/2023-01-07 18:44:47.703307", - help="The path of a folder containing pretrained model") - parser.add_argument("--num_epochs", type=int, default=1, help='Number of epochs for training.') - parser.add_argument("--lr", type=float, default=None) - parser.add_argument("--num_core", type=int, default=None, help='Number of cores to be used.') - parser.add_argument('--scoring_technique', default=None, help="KvsSample, 1vsAll, KvsAll, NegSample") - parser.add_argument('--neg_ratio', type=int, default=None, - help='The number of negative triples generated per positive triple.') - parser.add_argument('--optim', type=str, default=None, help='[NAdam, Adam, SGD]') - parser.add_argument('--batch_size', type=int, default=None, help='Mini batch size') - parser.add_argument("--seed_for_computation", type=int, default=0, help='Seed for all, see pl seed_everything().') - parser.add_argument("--trainer", type=str, default=None, - help='PL (pytorch lightning trainer), torchDDP (custom ddp), torchCPUTrainer (custom cpu only)') - if description is None: - return parser.parse_args() - return parser.parse_args(description) - - -if __name__ == '__main__': - args = argparse_default() - ContinuousExecute(args).continual_start() diff --git a/examples/large_scale_training.py b/examples/large_scale_training.py deleted file mode 100644 index 7b1dd9a4..00000000 --- a/examples/large_scale_training.py +++ /dev/null @@ -1,321 +0,0 @@ -""" -Example script to train KGE models via model parallel with GPU-off-loading -""" -from typing import Tuple -import os -import torch -import dicee -from dicee import Keci -from dicee import NegSampleDataset -import polars as pl -import time -import pandas as pd -import numpy as np -from tqdm import tqdm -import argparse -from argparse import ArgumentParser -import pickle - -def input_arguments(): - parser = ArgumentParser() - parser.add_argument("--path_kg", type=str, default="dbpedia-2022-12-nt-wo-lit-polars.parquet.snappy", - help="path parquet formatted polars dataframe") - parser.add_argument("--path_idx_kg", type=str, default="data.npy", - help="path to numpy ndarray") - parser.add_argument("--path_checkpoint", type=str, default="Keci_1_14.torch" - ) - parser.add_argument("--path_checkpoint2", type=str, default="Keci_1_14.torch") - - parser.add_argument("--batch_size", type=int, default=10_000_000) - parser.add_argument("--neg_sample_ratio", type=float, default=1.0) - parser.add_argument("--embedding_dim", type=int, default=20) - parser.add_argument("--num_epochs", type=int, default=1) - parser.add_argument("--read_only", default=None) - parser.add_argument("--lr", type=float, default=0.1) - - return parser.parse_args() - -class MultiEpochsDataLoader(torch.utils.data.DataLoader): - """ To avoid the excessive time spent to fetch the first batch at each new epoch - See https://discuss.pytorch.org/t/enumerate-dataloader-slow/87778/2 - """ - def 
__init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._DataLoader__initialized = False - self.batch_sampler = _RepeatSampler(self.batch_sampler) - self._DataLoader__initialized = True - self.iterator = super().__iter__() - - def __len__(self): - return len(self.batch_sampler.sampler) - - def __iter__(self): - for i in range(len(self)): - yield next(self.iterator) - - -class _RepeatSampler(object): - """ Sampler that repeats forever. - - Args: - sampler (Sampler) - """ - - def __init__(self, sampler): - self.sampler = sampler - - def __iter__(self): - while True: - yield from iter(self.sampler) - -def get_data(args) -> Tuple[np.ndarray, int, int]: - if args.path_kg: - """ do this """ - print("Reading KG...\n") - start_time = time.time() - data = pl.read_parquet(source=args.path_kg,n_rows=args.read_only) - print(f"took {time.time() - start_time}") - print("Unique entities...") - start_time = time.time() - unique_entities = pl.concat((data.get_column('subject'), data.get_column('object'))).unique().rename( - 'entity') - print(f"Number of unique entities:{len(unique_entities)}") - unique_entities = unique_entities.to_list() - print(f"took {time.time() - start_time}") - - print("Unique relations...") - start_time = time.time() - unique_relations = data.unique(subset=["relation"]).select("relation").to_series() - print(f"Number of unique relations:{len(unique_relations)}") - unique_relations = unique_relations.to_list() - print(f"took {time.time() - start_time}") - - print("Entity index mapping...") - start_time = time.time() - entity_to_idx = {ent: idx for idx, ent in enumerate(unique_entities)} - pickle.dump(entity_to_idx, open("entity_to_idx.p", "wb")) - - print(f"took {time.time() - start_time}") - - print("Relation index mapping...") - start_time = time.time() - rel_to_idx = {rel: idx for idx, rel in enumerate(unique_relations)} - pickle.dump(rel_to_idx, open("relation_to_idx.p","wb")) - print(f"took {time.time() - start_time}") - print("Constructing training data...") - start_time = time.time() - data = data.with_columns(pl.col("subject").map_dict(entity_to_idx).alias("subject"), - pl.col("relation").map_dict(rel_to_idx).alias("relation"), - pl.col("object").map_dict(entity_to_idx).alias("object")).to_numpy() - print(f"took {time.time() - start_time}") - - num_entities = len(unique_entities) - num_relations = len(unique_relations) - # TODO: maybe save the data into some designated folter - with open("data.npy", 'wb') as f: - np.save(f, data) - - return data, num_entities, num_relations - - elif args.path_idx_kg: - print("Loading the index numpy KG..\n") - #data=np.load(args.path_idx_kg, mmap_mode='r') - with open(args.path_idx_kg, 'rb') as f: - data = np.load(f) - num_entities = 1 + max(max(data[:, 0]), max(data[:, 2])) - num_relations = 1 + max(data[:, 1]) - return data, num_entities, num_relations - else: - raise RuntimeError - - -def init_model(args, num_entities, num_relations): - start_time = time.time() - print('Initializing models...') - model1 = Keci( - args={"optim": "Adam", "p": 0, "q": 1, "num_entities": num_entities, "num_relations": num_relations, - "embedding_dim": args.embedding_dim, 'learning_rate': args.lr}) - model2 = Keci( - args={"optim": "Adam", "p": 0, "q": 1, "num_entities": num_entities, "num_relations": num_relations, - "embedding_dim": args.embedding_dim, 'learning_rate': args.lr}) - print(f"took {time.time() - start_time}") - return (model1, model2), (model1.configure_optimizers(), model2.configure_optimizers()) - - -def get_model(args, 
num_entities: int, num_relations: int): - # Initialize |GPUs| models on a single node - models, optimizers = init_model(args, num_entities, num_relations) - if args.path_checkpoint: - """ Load the checkpoint""" - # update models - model1, model2 = models - opt1, opt2 = optimizers - - model1.load_state_dict(torch.load(args.path_checkpoint,map_location='cpu')) - model2.load_state_dict(torch.load(args.path_checkpoint,map_location='cpu')) - models = (model1, model2) - optimizers = (opt1, opt2) - return models, optimizers - - -def get_train_loader(args): - data: np.ndarray - data, num_ent, num_rel = get_data(args) - data: torch.utils.data.DataLoader - print('Creating dataset...') - data: NegSampleDataset - # TODO: neg_sample_ratio is not used at the moment - data = NegSampleDataset(train_set=data, - num_entities=num_ent, num_relations=num_rel, - neg_sample_ratio=1.0) - data: torch.utils.data.DataLoader - data = MultiEpochsDataLoader(dataset=data, - batch_size=args.batch_size, shuffle=True, - num_workers=32) - print('Number of triples', len(data.dataset)) - return data, num_ent, num_rel - -def run_epoch(loss_function,dataloader,model1,model2,opt1,opt2): - device1 = "cuda:0" - device2 = "cuda:1" - epoch_loss = 0.0 - for ith, (x, y) in enumerate(tqdm(dataloader)): - # (1) Shape the batch - x = x.flatten(start_dim=0, end_dim=1) - y = y.flatten(start_dim=0, end_dim=1) - - # (2) Empty the gradients - opt1.zero_grad(set_to_none=True) - opt2.zero_grad(set_to_none=True) - - # (3) Forward Backward and Parameter Update - start_time = time.time() - # (3.1) Select embeddings of triples - h1, r1, t1 = model1.get_triple_representation(x) - # (3.2) Move (3.1) into a single GPU - h1, r1, t1, y = h1.pin_memory().to(device1, non_blocking=True), r1.pin_memory().to(device1,non_blocking=True), t1.pin_memory().to(device1, non_blocking=True), y.pin_memory().to(device1, non_blocking=True) - # (3.3) Compute triple score (Forward Pass) - yhat1 = model1.score(h1, r1, t1) - - # (3.4) Select second part of the embeddings of triples - h2, r2, t2 = model2.get_triple_representation(x) - # (3.5) Move (3.4) into a single GPU - h2, r2, t2 = h2.pin_memory().to(device2, non_blocking=True), r2.pin_memory().to(device2, non_blocking=True), t2.pin_memory().to(device2, non_blocking=True) - # 3.6 Forward Pass - yhat2 = model2.score(h2, r2, t2).to(device1) - # (3.7) Composite Prediction - yhat = yhat1 + yhat2 - # (3.8) Compute Loss - batch_loss = loss_function(yhat, y) - # (3.9) Compute gradients (Backward Pass) - batch_loss.backward() - # (3.10) Update parameters - opt1.step() - opt2.step() - # (4) Update epoch loss - numpy_batch_loss = batch_loss.item() - epoch_loss += numpy_batch_loss - print(f"\tBatch Loss:{numpy_batch_loss}\tForward-Backward-Update: {time.time() - start_time}") - print(f"Epoch Loss:{epoch_loss}") - -def run(args): - # (1) Get training data - dataloader: torch.utils.data.DataLoader - dataloader, num_ent, num_rel = get_train_loader(args) - # (2) Get model - models, optimizers = get_model(args, num_ent, num_rel) - print("Compiling...") - # (3) Compile models - model1, model2 = models - print('####### Model 1 #######') - print(model1) - print(model1.summarize()) - model1 = torch.compile(model1) - print(model1) - print('######## Model2 #######') - print(model2) - print(model2.summarize()) - model2 = torch.compile(model2) - print(model2) - # (4) Get optim - opt1, opt2 = optimizers - print(opt1) - print(opt2) - # (5) Get loss func - loss_function = model1.loss_function - print("Training...") - - device1 = "cuda:0" - device2 
= "cuda:1" - # @TODO: Ensure the multi-node training - for e in range(args.num_epochs): - epoch_loss = 0 - if e==-1: - args.batch_size+=args.batch_size - print(f"Increase Batch size to {args.batch_size}") - args.batch_size+=args.batch_size - dataloader = MultiEpochsDataLoader(dataset=dataloader.dataset,batch_size=args.batch_size, shuffle=True,num_workers=32) - - - run_epoch(loss_function,dataloader,model1,model2,opt1,opt2) - """ - for ith, (x, y) in enumerate(tqdm(dataloader)): - # (1) Shape the batch - x = x.flatten(start_dim=0, end_dim=1) - y = y.flatten(start_dim=0, end_dim=1) - - # (2) Empty the gradients - opt1.zero_grad(set_to_none=True) - opt2.zero_grad(set_to_none=True) - - # (3) Forward Backward and Parameter Update - start_time = time.time() - # (3.1) Select embeddings of triples - h1, r1, t1 = model1.get_triple_representation(x) - # (3.2) Move (3.1) into a single GPU - h1, r1, t1, y = h1.pin_memory().to(device1, non_blocking=True), r1.pin_memory().to(device1,non_blocking=True), t1.pin_memory().to(device1, non_blocking=True), y.pin_memory().to(device1, non_blocking=True) - # (3.3) Compute triple score (Forward Pass) - yhat1 = model1.score(h1, r1, t1) - - # (3.4) Select second part of the embeddings of triples - h2, r2, t2 = model2.get_triple_representation(x) - # (3.5) Move (3.4) into a single GPU - h2, r2, t2 = h2.pin_memory().to(device2, non_blocking=True), r2.pin_memory().to(device2, non_blocking=True), t2.pin_memory().to(device2, non_blocking=True) - # 3.6 Forward Pass - yhat2 = model2.score(h2, r2, t2).to(device1) - # (3.7) Composite Prediction - yhat = yhat1 + yhat2 - # (3.8) Compute Loss - batch_loss = loss_function(yhat, y) - # (3.9) Compute gradients (Backward Pass) - batch_loss.backward() - # (3.10) Update parameters - opt1.step() - opt2.step() - # (4) Update epoch loss - numpy_batch_loss = batch_loss.item() - epoch_loss += numpy_batch_loss - if ith % 1 == 0: # init an argument - print(f"\tBatch Loss:{numpy_batch_loss}\tForward-Backward-Update: {time.time() - start_time}") - print(f"Epoch:{e}\tEpoch Loss:{epoch_loss}") - """ - print("Saving....") - start_time=time.time() - model1.to("cpu") - model2.to("cpu") - print(model1._orig_mod.state_dict()) - torch.save(model1._orig_mod.state_dict(),f"{model1._orig_mod.name}_1_{e}.torch") - print(model2._orig_mod.state_dict()) - torch.save(model2._orig_mod.state_dict(),f"{model1._orig_mod.name}_2_{e}.torch") - print('DONE') - print(f"took {time.time() - start_time}") - - -if __name__ == '__main__': - run(input_arguments()) - -# @TODO Post Processing -# Note mode1 and model2 keci with p=0, q=1 -# model1 real_m1:[] complex_m1[] -# model2 real_m2:[] complex_m2[] -# y1 y2 => Final model = real_m1[],real_m2[], complex_m1[] complex_m2[] diff --git a/examples/ppe.py b/examples/ppe.py deleted file mode 100644 index a39a6c5d..00000000 --- a/examples/ppe.py +++ /dev/null @@ -1,44 +0,0 @@ -from dicee.executer import Execute -from dicee.config import Namespace - -args = Namespace() -args.model = 'Keci' -args.p = 0 -args.q = 1 -args.scoring_technique = "KvsAll" -args.dataset_dir = "KGs/UMLS" -args.num_epochs = 200 -args.lr = 0.1 -args.embedding_dim = 32 -args.batch_size = 1024 -reports = Execute(args).start() -""" -Evaluate Keci on Train set: Evaluate Keci on Train set -{'H@1': 0.9966449386503068, 'H@3': 1.0, 'H@10': 1.0, 'MRR': 0.9983064928425357} -Evaluate Keci on Validation set: Evaluate Keci on Validation set -{'H@1': 0.6134969325153374, 'H@3': 0.8098159509202454, 'H@10': 0.9424846625766872, 'MRR': 0.7293869361804316} -Evaluate Keci on 
Test set: Evaluate Keci on Test set -{'H@1': 0.6437216338880484, 'H@3': 0.8275340393343419, 'H@10': 0.959909228441755, 'MRR': 0.751216359363361} -Total Runtime: 13.259 seconds -""" -args = Namespace() -args.model = 'Keci' -args.p = 0 -args.q = 1 -args.scoring_technique = "KvsAll" -args.dataset_dir = "KGs/UMLS" -args.num_epochs = 200 -args.lr = 0.1 -args.embedding_dim = 32 -args.batch_size = 1024 -args.callbacks = {"PPE": {"epoch_to_start": 100}} -reports = Execute(args).start() -""" -Evaluate Keci on Train set: Evaluate Keci on Train set -{'H@1': 0.9934815950920245, 'H@3': 1.0, 'H@10': 1.0, 'MRR': 0.9966609151329243} -Evaluate Keci on Validation set: Evaluate Keci on Validation set -{'H@1': 0.7001533742331288, 'H@3': 0.8696319018404908, 'H@10': 0.9585889570552147, 'MRR': 0.7946759330503159} -Evaluate Keci on Test set: Evaluate Keci on Test set -{'H@1': 0.710287443267776, 'H@3': 0.8789712556732224, 'H@10': 0.9780635400907716, 'MRR': 0.8082179592109334} -Total Runtime: 12.497 seconds -""" \ No newline at end of file
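For reference, the end-to-end workflow this PR documents (train a model, then aggregate the run folders with `analyse_experiments.py`) can also be driven from Python. The following is a minimal sketch assembled from the `Execute`/`Namespace` usage in the removed `examples/ppe.py` above; it assumes runs are written under `./Experiments`, the directory the new docstring passes via `--dir Experiments`.

```python
from dicee.executer import Execute
from dicee.config import Namespace

# Configure a small UMLS run; all field names appear in examples/ppe.py above.
args = Namespace()
args.model = 'Keci'
args.p = 0
args.q = 1
args.scoring_technique = "KvsAll"
args.dataset_dir = "KGs/UMLS"
args.num_epochs = 200
args.lr = 0.1
args.embedding_dim = 32
args.batch_size = 1024
# Optional PPE callback, started at epoch 100 as in the second ppe.py run.
args.callbacks = {"PPE": {"epoch_to_start": 100}}

# Train and evaluate; reports holds the metrics shown in the ppe.py transcript.
reports = Execute(args).start()

# Afterwards, summarise all runs (assumes they landed under ./Experiments):
#   python dicee/analyse_experiments.py --dir Experiments --features "model" "trainMRR" "testMRR"
```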