
Commit

update ans
xingzhongyu committed Jan 19, 2025
1 parent 1f242f4 commit 0c2b37d
Showing 17 changed files with 890 additions and 659 deletions.
2 changes: 1 addition & 1 deletion dance/atlas/sc_similarity/anndata_similarity.py
@@ -288,7 +288,7 @@ def wasserstein_dist(self) -> float:
b = np.ones((Y.shape[0], )) / Y.shape[0]
M = ot.dist(X, Y, metric='euclidean')
wasserstein_dist = ot.emd2(a, b, M)
return 1 / 1 + wasserstein_dist
return 1 / (1 + wasserstein_dist)

def get_Hausdorff(self):
X = self.X
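The one-line fix above corrects an operator-precedence bug: 1 / 1 + wasserstein_dist evaluates as (1 / 1) + wasserstein_dist, i.e. 1 + wasserstein_dist, which grows with distance, whereas the intended score 1 / (1 + wasserstein_dist) lies in (0, 1] and approaches 0 as the distributions drift apart. A minimal sketch of the corrected computation using the POT library (not part of the commit; the random arrays stand in for self.X and self.Y):

import numpy as np
import ot  # POT: Python Optimal Transport

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 10))           # stand-in for self.X
Y = rng.normal(loc=0.5, size=(60, 10))  # stand-in for self.Y

a = np.ones((X.shape[0], )) / X.shape[0]  # uniform weights over X's rows
b = np.ones((Y.shape[0], )) / Y.shape[0]  # uniform weights over Y's rows
M = ot.dist(X, Y, metric='euclidean')     # pairwise ground-cost matrix
d = ot.emd2(a, b, M)                      # exact Wasserstein distance

buggy = 1 / 1 + d    # == 1 + d: unbounded, grows with distance
fixed = 1 / (1 + d)  # in (0, 1]: 1 when identical, shrinks as d grows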
Empty file added examples/atlas/__init__.py
Empty file.
165 changes: 0 additions & 165 deletions examples/atlas/sc_similarity_examples/cal_w1_w2.py

This file was deleted.

18 changes: 18 additions & 0 deletions examples/atlas/sc_similarity_examples/data_processing/merge_result_metadata.py
@@ -0,0 +1,18 @@
tissues = ["blood", "brain", "heart", "intestine", "kidney", "lung", "pancreas"]
import pandas as pd

from dance.settings import ATLASDIR, SIMILARITYDIR

if __name__ == "__main__":
for tissue in tissues:
metadata_df = pd.read_csv(ATLASDIR / f"metadatas/{tissue}_metadata.csv")
sweep_result_df = pd.read_csv(ATLASDIR / f"sweep_results/{tissue.capitalize()}_ans.csv")
sweep_result_df = sweep_result_df.rename(columns={"Dataset_id": "dataset_id"})
sweep_result_df["dataset_id"] = sweep_result_df["dataset_id"].str.split('(').str[0]
result_df = metadata_df.merge(sweep_result_df, how="outer", on="dataset_id")
# result_df.to_csv(SIMILARITYDIR / f"data/results/{tissue}_result.csv")
# for tissue in tissues:
# df=pd.read_csv(SIMILARITYDIR / f"data/results/{tissue}_result.csv")
with pd.ExcelWriter(SIMILARITYDIR / "data/Cell Type Annotation Atlas.xlsx", mode='a',
if_sheet_exists='replace') as writer:
result_df.to_excel(writer, sheet_name=tissue)
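A detail worth knowing about the writer above: pandas accepts mode='a' only with the openpyxl engine, and the target workbook must already exist on disk; if_sheet_exists='replace' then overwrites the per-tissue sheet in place on repeated runs. A minimal standalone sketch of the same pattern (the DataFrame contents here are made up):

import pandas as pd

df = pd.DataFrame({"dataset_id": ["d1", "d2"], "acc": [0.91, 0.87]})
# Append to an existing workbook, replacing the "blood" sheet if present.
with pd.ExcelWriter("Cell Type Annotation Atlas.xlsx", mode='a',
                    if_sheet_exists='replace', engine="openpyxl") as writer:
    df.to_excel(writer, sheet_name="blood")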
79 changes: 0 additions & 79 deletions examples/atlas/sc_similarity_examples/process_similarity.py

This file was deleted.

8 changes: 8 additions & 0 deletions examples/atlas/sc_similarity_examples/readme.md
@@ -0,0 +1,8 @@
[scGPT->metadatas]+[get_result_web->sweep_results]+[data_processing/merge_result_metadata.py]->[data/cell_type_annotation_atlas.xlsx]
[data/cell_type_annotation_atlas.xlsx]+[similarity/analyze_atlas_accuracy.py]->[data/in_atlas_datas]
[similarity/example_usage_anndata.py]+[data/in_atlas_datas]+[data/cell_type_annotation_atlas.xlsx]->[data/dataset_similarity]
[data/dataset_similarity]+[similarity/process_tissue_similarity_matrices.py]->[data/new_sim]

# run_similarity_optimization.sh
[data/new_sim]+[similarity/optimize_similarity_weights.py]+[cache/sweep_cache.json]->[data/similarity_weights_results]
[data/similarity_weights_results]+[similarity/visualize_atlas_performance.py]+[cache/sweep_cache.json]->[data/imgs]
23 changes: 23 additions & 0 deletions examples/atlas/sc_similarity_examples/run_similarity_optimization.sh
@@ -0,0 +1,23 @@
#!/bin/bash

# Define the tissue array
array=("blood" "brain" "heart" "intestine" "kidney" "lung" "pancreas")
# Loop over the array and run the Python scripts in the background
for tissue in "${array[@]}"
do
python similarity/example_usage_anndata.py --tissue "$tissue"
# python similarity/optimize_similarity_weights.py --tissue "$tissue"
# python visualization/visualize_atlas_performance.py --tissue "$tissue"
# python similarity/optimize_similarity_weights.py --tissue "$tissue" --in_query
# python visualization/visualize_atlas_performance.py --tissue "$tissue" --in_query
# python similarity/optimize_similarity_weights.py --tissue "$tissue" --reduce_error
# python visualization/visualize_atlas_performance.py --tissue "$tissue" --reduce_error
# python similarity/optimize_similarity_weights.py --tissue "$tissue" --in_query --reduce_error
# python visualization/visualize_atlas_performance.py --tissue "$tissue" --in_query --reduce_error
echo "启动处理 tissue: $tissue"
done

# Wait for all background processes to finish
wait

echo "所有 Python 脚本已执行完成"
examples/atlas/sc_similarity_examples/similarity/analyze_atlas_accuracy.py
@@ -7,13 +7,17 @@
import numpy as np
import pandas as pd
import yaml
from tqdm import tqdm

sys.path.append("..")
from dance.settings import DANCEDIR, SIMILARITYDIR

sys.path.append(str(DANCEDIR))
import ast

from get_result_web import get_sweep_url, spilt_web

from dance import logger
from dance.settings import entity, project
from dance.utils import try_import

file_root = str(Path(__file__).resolve().parent.parent)
@@ -70,8 +74,6 @@ def is_match(config_str):


wandb = try_import("wandb")
entity = "xzy11632"
project = "dance-dev"


def is_matching_dict(yaml_str, target_dict):
@@ -156,18 +158,20 @@ def get_ans_from_cache(query_dataset, method):
# Get best method from step2 of atlas datasets
# Search accuracy according to best method (all values should exist)
ans = pd.DataFrame(index=[method], columns=[f"{atlas_dataset}_from_cache" for atlas_dataset in atlas_datasets])

sweep_url = re.search(r"step2:([^|]+)",
conf_data[conf_data["dataset_id"] == query_dataset][method].iloc[0]).group(1)
step_str = conf_data[conf_data["dataset_id"] == query_dataset][method].iloc[0]
if pd.isna(step_str):
logger.warning(f"{query_dataset} is nan in {method}")
return ans
sweep_url = re.search(r"step2:([^|]+)", step_str).group(1)
_, _, sweep_id = spilt_web(sweep_url)
sweep = wandb.Api().sweep(f"{entity}/{project}/{sweep_id}")

for atlas_dataset in atlas_datasets:
best_yaml = conf_data[conf_data["dataset_id"] == atlas_dataset][f"{method}_best_yaml"].iloc[0]
runs = sweep.runs
for atlas_dataset in tqdm(atlas_datasets):
best_yaml = conf_data[conf_data["dataset_id"] == atlas_dataset][f"{method}_step2_best_yaml"].iloc[0]
match_run = None

# Find matching run configuration
for run in sweep.runs:
for run in tqdm(runs, leave=False):
if isinstance(best_yaml, float) and np.isnan(best_yaml):
continue
if is_matching_dict(best_yaml, run.config):
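is_matching_dict is called above but its body falls outside this diff. A hypothetical sketch of its contract, assuming it parses the stored best_yaml text and checks that every key/value pair it records also appears in the wandb run's config:

import yaml

def is_matching_dict_sketch(yaml_str: str, target_dict: dict) -> bool:
    # Parse the best-run configuration stored as YAML text in the sheet.
    expected = yaml.safe_load(yaml_str)
    if not isinstance(expected, dict):
        return False
    # Match when every recorded key/value also appears in the run config.
    return all(target_dict.get(k) == v for k, v in expected.items())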
@@ -188,7 +192,7 @@ def get_ans_from_cache(query_dataset, method):
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--methods", default=["cta_actinn", "cta_celltypist", "cta_scdeepsort", "cta_singlecellnet"],
nargs="+")
parser.add_argument("--tissue", type=str, default="blood")
parser.add_argument("--tissue", type=str, default="pancreas")
args = parser.parse_args()
methods = args.methods
tissue = args.tissue
@@ -208,7 +212,7 @@ def get_ans_from_cache(query_dataset, method):
# "738942eb-ac72-44ff-a64b-8943b5ecd8d9", "a5d95a42-0137-496f-8a60-101e17f263c8",
# "71be997d-ff75-41b9-8a9f-1288c865f921"
# ]
conf_data = pd.read_excel("Cell Type Annotation Atlas.xlsx", sheet_name=tissue)
conf_data = pd.read_excel(SIMILARITYDIR / "data/Cell Type Annotation Atlas.xlsx", sheet_name=tissue)
# conf_data = pd.read_csv(f"results/{tissue}_result.csv", index_col=0)
atlas_datasets = list(conf_data[conf_data["queryed"] == False]["dataset_id"])
query_datasets = list(conf_data[conf_data["queryed"] == True]["dataset_id"])
@@ -219,8 +223,9 @@ def get_ans_from_cache(query_dataset, method):
ans.append(get_ans_from_cache(query_dataset, method))
ans = pd.concat(ans)
ans_all[query_dataset] = ans
for k, v in ans_all.items():
file_path = f"in_atlas_datas/{tissue}/{str(methods)}_{k}_in_atlas.csv"
print(query_dataset)
# for k, v in ans_all.items():
file_path = SIMILARITYDIR / f"data/in_atlas_datas/{tissue}/{str(methods)}_{query_dataset}_in_atlas.csv"
folder_path = Path(file_path).parent
folder_path.mkdir(parents=True, exist_ok=True)
v.to_csv(file_path)
ans.to_csv(file_path)
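spilt_web, imported from get_result_web and unpacked above as _, _, sweep_id = spilt_web(sweep_url), is also not shown in this diff. A hypothetical sketch, assuming it splits a standard wandb sweep URL of the form https://wandb.ai/<entity>/<project>/sweeps/<sweep_id>:

from urllib.parse import urlparse

def spilt_web_sketch(sweep_url: str):
    # e.g. "https://wandb.ai/xzy11632/dance-dev/sweeps/<sweep_id>"
    parts = urlparse(sweep_url.strip()).path.strip("/").split("/")
    entity, project, _, sweep_id = parts[:4]  # path is entity/project/sweeps/<id>
    return entity, project, sweep_id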