Skip to content

Commit

Permalink
update automation addition
Browse files Browse the repository at this point in the history
  • Loading branch information
TheRazorace committed Jan 11, 2024
1 parent 5ea8749 commit 0484393
Show file tree
Hide file tree
Showing 11 changed files with 913 additions and 705 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,4 +85,6 @@ View and explore the [RDF mappings](https://github.com/dtai-kg/MLSea-Discover/tr

Generate the RDF dumps of MLSea-KG by running:

python data_integration.py
python data_integration_openml.py
python data_integration_kaggle.py
python data_integration_pwc.py
31 changes: 22 additions & 9 deletions resource_code/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,27 @@
PWC_INPUT = "/Users/ioannisdasoulas/Desktop/ML-Discovery/ML-KG/Data/PwC-Data/"
OUTPUT_PATH = "/Users/ioannisdasoulas/Desktop/ML-Discovery/ML-KG/RDF_Dumps/"
ORIGINAL_DATA_FOLDER = "Original-Data/"
#UPDATE_MONTH_FOLDER = "December2023/"
UPDATE_MONTH_FOLDER = "10-01-2024/"

# OpenML API Checkpoints
OPENML_RUN_CHECKPOINT = 4037070
OPENML_RUN_CHECKPOINT = 4037082
OPENML_RUN_CURRENT_OFFSET = 6000000
OPENML_DATASET_CHECKPOINT = 5399
OPENML_FLOW_CHECKPOINT = 47250
OPENML_TASK_CHECKPOINT = 16736
OPENML_DATASET_CHECKPOINT = 5402
OPENML_FLOW_CHECKPOINT = 16751
OPENML_TASK_CHECKPOINT = 47250

# Current part number of each dump file
OPENML_TASK_DUMP_PART = 1
OPENML_FLOW_DUMP_PART = 1
OPENML_DATASET_DUMP_PART = 1
OPENML_RUN_DUMP_PART = 29
KAGGLE_DUMP_PART = 1
PWC_DUMP_PART = 1

# Triples limit per dump
OPENML_DUMP_LIMIT = 50000000
KAGGLE_DUMP_LIMIT = 30000000
PWC_DUMP_LIMIT = 20000000

def update_openml_checkpoints(run_cp, dataset_cp, task_cp, flow_cp):

Expand All @@ -21,10 +34,10 @@ def update_openml_checkpoints(run_cp, dataset_cp, task_cp, flow_cp):
content = file.read()

# Update the values in memory
content = content.replace('OPENML_RUN_CHECKPOINT = 4037070', 'OPENML_RUN_CHECKPOINT = ' + str(run_cp))
content = content.replace('OPENML_DATASET_CHECKPOINT = 5399', 'OPENML_DATASET_CHECKPOINT = ' + str(dataset_cp))
content = content.replace('OPENML_FLOW_CHECKPOINT = 47250', 'OPENML_FLOW_CHECKPOINT = ' + str(task_cp))
content = content.replace('OPENML_TASK_CHECKPOINT = 16736', 'OPENML_TASK_CHECKPOINT = ' + str(flow_cp))
content = content.replace('OPENML_RUN_CHECKPOINT = 4037082', 'OPENML_RUN_CHECKPOINT = ' + str(run_cp))
content = content.replace('OPENML_DATASET_CHECKPOINT = 5402', 'OPENML_DATASET_CHECKPOINT = ' + str(dataset_cp))
content = content.replace('OPENML_FLOW_CHECKPOINT = 16751', 'OPENML_FLOW_CHECKPOINT = ' + str(flow_cp))
content = content.replace('OPENML_TASK_CHECKPOINT = 47250', 'OPENML_TASK_CHECKPOINT = ' + str(task_cp))

# Write the changes back to the config.py file
with open('config.py', 'w') as file:
Expand Down
Loading

0 comments on commit 0484393

Please sign in to comment.