From 4e4229567a1426d224f62a03012e42cfddc17320 Mon Sep 17 00:00:00 2001 From: ctrlaltaf Date: Mon, 2 Dec 2024 11:21:04 -0800 Subject: [PATCH] tested on various methods --- Snakefile | 17 ++++++++++------- config/synthetic.yaml | 10 +++++----- spras/evaluation.py | 11 +++-------- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/Snakefile b/Snakefile index 547075ae..619c05a4 100644 --- a/Snakefile +++ b/Snakefile @@ -389,8 +389,8 @@ rule evaluation: ensemble_file=lambda wildcards: f"{out_dir}{SEP}{get_dataset_label(wildcards)}-ml{SEP}ensemble-pathway.txt", pca_coordinates_file =lambda wildcards: f"{out_dir}{SEP}{get_dataset_label(wildcards)}-ml{SEP}pca-coordinates.txt" output: - pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-recall-per-pathway.txt"]), - pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'precision-recall-per-pathway.png']), + pr_node_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-recall-per-pathway.txt"]), + pr_node_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'precision-recall-per-pathway.png']), pr_edge_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-recall-per-pathway_edge.txt"]), pr_edge_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'precision-recall-per-pathway_edge.png']), pr_curve_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'precision-recall-curve-ensemble-nodes.png']), @@ -398,13 +398,12 @@ rule evaluation: run: node_table = Evaluation.from_file(input.gold_standard_file).node_table edge_table = Evaluation.from_file(input.gold_standard_file).edge_table - Evaluation.precision_and_recall(input.pathways, node_table, algorithms, output.pr_file, output.pr_png) + Evaluation.precision_and_recall_node(input.pathways, node_table, algorithms, output.pr_node_file, output.pr_node_png) Evaluation.precision_and_recall_edge(input.pathways, edge_table, algorithms, output.pr_edge_file, output.pr_edge_png) node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file) Evaluation.precision_recall_curve_node_ensemble(node_ensemble, node_table, output.pr_curve_png) pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, out_dir) - Evaluation.precision_and_recall(pca_chosen_pathway, node_table, algorithms, output.pca_chosen_pr_file) - # Evaluation.precision_and_recall_edge(pca_chosen_pathway, edge_table, algorithms, output.pca_chosen_pr_file) + Evaluation.precision_and_recall_node(pca_chosen_pathway, node_table, algorithms, output.pca_chosen_pr_file) # Returns all pathways for a specific algorithm and dataset @@ -431,9 +430,13 @@ rule evaluation_per_algo_pathways: output: pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-recall-per-pathway.txt"]), pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-precision-recall-per-pathway.png']), + pr_edge_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-recall-per-pathway_edge.txt"]), + pr_edge_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-precision-recall-per-pathway_edge.png']), run: node_table = Evaluation.from_file(input.gold_standard_file).node_table - Evaluation.precision_and_recall(input.pathways, node_table, algorithms, output.pr_file, output.pr_png) + Evaluation.precision_and_recall_node(input.pathways, node_table, algorithms, output.pr_file, output.pr_png) + edge_table = Evaluation.from_file(input.gold_standard_file).edge_table + Evaluation.precision_and_recall_edge(input.pathways, edge_table, algorithms, output.pr_edge_file, output.pr_edge_png) rule evaluation_per_algo_ensemble_pr_curve: input: @@ -455,7 +458,7 @@ rule evaluation_per_algo_pca_chosen: run: node_table = Evaluation.from_file(input.gold_standard_file).node_table pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, out_dir) - Evaluation.precision_and_recall(pca_chosen_pathway, node_table, algorithms, output.pca_chosen_pr_file) + Evaluation.precision_and_recall_node(pca_chosen_pathway, node_table, algorithms, output.pca_chosen_pr_file) # Remove the output directory rule clean: diff --git a/config/synthetic.yaml b/config/synthetic.yaml index 2b9f1a23..2c785642 100644 --- a/config/synthetic.yaml +++ b/config/synthetic.yaml @@ -45,7 +45,7 @@ container_registry: algorithms: - name: "pathlinker" params: - include: false + include: true run1: k: range(100,201,100) @@ -69,7 +69,7 @@ algorithms: - name: "meo" params: - include: false + include: true run1: max_path_length: [3] local_search: ["Yes"] @@ -77,18 +77,18 @@ algorithms: - name: "mincostflow" params: - include: false + include: true run1: flow: [1] # The flow must be an int capacity: [1] - name: "allpairs" params: - include: false + include: true - name: "domino" params: - include: false + include: true run1: slice_threshold: [0.3] module_threshold: [0.05] diff --git a/spras/evaluation.py b/spras/evaluation.py index c60645ab..71641865 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -84,11 +84,11 @@ def load_files_from_dict(self, gold_standard_dict: Dict): @staticmethod def precision_and_recall_edge(file_paths: Iterable[Path], edge_table: pd.DataFrame, algorithms: list, output_file: str, output_png:str=None): """ - Takes in file paths for a specific dataset and an associated gold standard node table. + Takes in file paths for a specific dataset and an associated gold standard edge table. Calculates precision and recall for each pathway file Returns output back to output_file @param file_paths: file paths of pathway reconstruction algorithm outputs - @param node_table: the gold standard nodes + @param edge_table: the gold standard edges @param algorithms: list of algorithms used in current run of SPRAS @param output_file: the filename to save the precision and recall of each pathway @param output_png (optional): the filename to plot the precision and recall of each pathway (not a PRC) @@ -99,17 +99,12 @@ def precision_and_recall_edge(file_paths: Iterable[Path], edge_table: pd.DataFra results = [] for file in file_paths: df = pd.read_table(file, sep="\t", header=0, usecols=["Node1", "Node2"]) - print(file) - print(df) y_pred = set() for row in df.itertuples(): y_pred.add((row[1], row[2])) all_edges = set(gs_edges.union(y_pred)) y_true_binary = [1 if (edge[0], edge[1]) in gs_edges or (edge[1], edge[0]) in gs_edges else 0 for edge in all_edges] y_pred_binary = [1 if (edge[0], edge[1]) in y_pred or (edge[1], edge[0]) in y_pred else 0 for edge in all_edges] - - # # default to 0.0 if there is a divide by 0 error - # # not using precision_recall_curve because thresholds are binary (0 or 1); rather we are directly calculating precision and recall per pathway precision = precision_score(y_true_binary, y_pred_binary, zero_division=0.0) recall = recall_score(y_true_binary, y_pred_binary, zero_division=0.0) results.append({"Pathway": file, "Precision": precision, "Recall": recall}) @@ -152,7 +147,7 @@ def precision_and_recall_edge(file_paths: Iterable[Path], edge_table: pd.DataFra plt.savefig(output_png) @staticmethod - def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, algorithms: list, output_file: str, output_png:str=None): + def precision_and_recall_node(file_paths: Iterable[Path], node_table: pd.DataFrame, algorithms: list, output_file: str, output_png:str=None): """ Takes in file paths for a specific dataset and an associated gold standard node table. Calculates precision and recall for each pathway file