Skip to content

Commit

Permalink
Merge pull request Reed-CompBio#132 from jhiemstrawisc/make-registry-…
Browse files Browse the repository at this point in the history
…configurable

Make registry configurable
  • Loading branch information
agitter authored Jan 16, 2024
2 parents acea45e + e9c7be6 commit 879f1b5
Show file tree
Hide file tree
Showing 24 changed files with 820 additions and 562 deletions.
51 changes: 26 additions & 25 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ from spras import runner
import shutil
import yaml
from spras.dataset import Dataset
from spras.util import process_config
from spras.analysis import ml, summary, graphspace, cytoscape
import spras.config as _config

# Snakemake updated the behavior in the 6.5.0 release https://github.com/snakemake/snakemake/pull/1037
# and using the wrong separator prevents Snakemake from matching filenames to the rules that can produce them
Expand All @@ -13,23 +13,27 @@ SEP = '/'
wildcard_constraints:
params="params-\w+"

config, datasets, out_dir, algorithm_params, algorithm_directed, pca_params, hac_params = process_config(config)
# Elsewhere spras.config is imported as config, but in the Snakefile the variable config is already
# populated with the parsed config.yaml. Snakemake injects that variable into this file's namespace
# without any explicit declaration, so the module is imported here as _config to avoid the name clash.
_config.init_global(config)

# TODO consider the best way to pass global configuration information to the run functions
SINGULARITY = "singularity" in config and config["singularity"]
if SINGULARITY:
print('Running Singularity containers')
else:
print('Running Docker containers')
out_dir = _config.config.out_dir
algorithm_params = _config.config.algorithm_params
algorithm_directed = _config.config.algorithm_directed
pca_params = _config.config.pca_params
hac_params = _config.config.hac_params

FRAMEWORK = _config.config.container_framework
print(f"Running {FRAMEWORK} containers")

# Return the dataset dictionary from the config file given the label
def get_dataset(datasets, label):
return datasets[label]
def get_dataset(_datasets, label):
return _datasets[label]

algorithms = list(algorithm_params)
algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()]

dataset_labels = list(datasets.keys())
dataset_labels = list(_config.config.datasets.keys())

# Get the parameter dictionary for the specified
# algorithm and parameter combination hash
Expand All @@ -46,7 +50,7 @@ def write_parameter_log(algorithm, param_label, logfile):

# Log the dataset contents specified in the config file in a yaml file
def write_dataset_log(dataset, logfile):
dataset_contents = get_dataset(datasets,dataset)
dataset_contents = get_dataset(_config.config.datasets,dataset)

# safe_dump gives RepresenterError for an OrderedDict
# config file has to convert the dataset from OrderedDict to dict to avoid this
Expand All @@ -57,23 +61,22 @@ def write_dataset_log(dataset, logfile):
def make_final_input(wildcards):
final_input = []

# TODO analysis could be parsed in the parse_config() function.
if config["analysis"]["summary"]["include"]:
if _config.config.analysis_include_summary:
# add summary output file for each pathway
# TODO: reuse in the future once we make summary work for mixed graphs. See https://github.com/Reed-CompBio/spras/issues/128
# final_input.extend(expand('{out_dir}{sep}{dataset}-{algorithm_params}{sep}summary.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
# add table summarizing all pathways for each dataset
final_input.extend(expand('{out_dir}{sep}{dataset}-pathway-summary.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels))

if config["analysis"]["graphspace"]["include"]:
if _config.config.analysis_include_graphspace:
# add graph and style JSON files.
final_input.extend(expand('{out_dir}{sep}{dataset}-{algorithm_params}{sep}gs.json',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-{algorithm_params}{sep}gsstyle.json',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))

if config["analysis"]["cytoscape"]["include"]:
if _config.config.analysis_include_cytoscape:
final_input.extend(expand('{out_dir}{sep}{dataset}-cytoscape.cys',out_dir=out_dir,sep=SEP,dataset=dataset_labels))

if config["analysis"]["ml"]["include"]:
if _config.config.analysis_include_ml:
final_input.extend(expand('{out_dir}{sep}{dataset}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
Expand Down Expand Up @@ -117,7 +120,7 @@ rule log_datasets:
# Return all files used in the dataset
# Input preparation needs to be rerun if these files are modified
def get_dataset_dependencies(wildcards):
dataset = datasets[wildcards.dataset]
dataset = _config.config.datasets[wildcards.dataset]
all_files = dataset["node_files"] + dataset["edge_files"] + dataset["other_files"]
# Add the relative file path
all_files = [dataset["data_dir"] + SEP + data_file for data_file in all_files]
Expand All @@ -131,7 +134,7 @@ rule merge_input:
output: dataset_file = SEP.join([out_dir, '{dataset}-merged.pickle'])
run:
# Pass the dataset to the runner, where the files will be merged and written to disk (i.e. pickled)
dataset_dict = get_dataset(datasets, wildcards.dataset)
dataset_dict = get_dataset(_config.config.datasets, wildcards.dataset)
runner.merge_input(dataset_dict, output.dataset_file)

# The checkpoint is like a rule but can be used in dynamic workflows
Expand Down Expand Up @@ -207,9 +210,7 @@ rule reconstruct:
# Remove the default placeholder parameter added for algorithms that have no parameters
if 'spras_placeholder' in params:
params.pop('spras_placeholder')
# TODO consider the best way to pass global configuration information to the run functions
# This approach requires that all run functions support a singularity option
params['singularity'] = SINGULARITY
params['container_framework'] = FRAMEWORK
runner.run(wildcards.algorithm, params)

# Original pathway reconstruction output to universal output
Expand Down Expand Up @@ -246,7 +247,7 @@ rule viz_cytoscape:
output:
session = SEP.join([out_dir, '{dataset}-cytoscape.cys'])
run:
cytoscape.run_cytoscape(input.pathways, output.session, SINGULARITY)
cytoscape.run_cytoscape(input.pathways, output.session, FRAMEWORK)


# Write a single summary table for all pathways for each dataset
Expand Down
Loading

0 comments on commit 879f1b5

Please sign in to comment.