Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Comparing mean vs. median in computing summary #22

Merged
merged 7 commits into from
Feb 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 21 additions & 22 deletions 0.download-data/scripts/nbconverted/4.reprocess-cell-health.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
# coding: utf-8

# ## Reprocess Cell Health profiles
#
#
# Use a whole-plate normalization scheme instead of normalization by controls only.
#
#
# We will use the control normalization in downstream analyses, but we are interested in comparing the impact of normalization strategy on grit calculations.

# In[1]:
Expand All @@ -19,22 +19,26 @@
# In[2]:


def normalize_profile(plate, output_dir, commit="cd91bd0daacef2b5ea25dcceb62482bb664d9de1"):
def normalize_profile(
plate, output_dir, commit="cd91bd0daacef2b5ea25dcceb62482bb664d9de1"
):
link = f"https://github.com/broadinstitute/cell-health/raw/{commit}/1.generate-profiles/data/profiles/{plate}/{plate}_augmented.csv.gz"

annotate_df = pd.read_csv(link)

norm_file = pathlib.Path(f"{output_dir}/{plate}_wholeplate_normalized.csv.gz")
feat_select_file = pathlib.Path(f"{output_dir}/{plate}_wholeplate_normalized_feature_selected.csv.gz")

feat_select_file = pathlib.Path(
f"{output_dir}/{plate}_wholeplate_normalized_feature_selected.csv.gz"
)

normalize(
profiles=annotate_df,
features="infer",
meta_features=meta_features,
samples="all",
method="mad_robustize",
output_file=norm_file,
compression_options={"method": "gzip", "mtime": 1}
compression_options={"method": "gzip", "mtime": 1},
)


Expand All @@ -51,7 +55,7 @@ def normalize_profile(plate, output_dir, commit="cd91bd0daacef2b5ea25dcceb62482b
"SQ00014615",
"SQ00014616",
"SQ00014617",
"SQ00014618"
"SQ00014618",
]

# Define metadata features
Expand Down Expand Up @@ -92,16 +96,14 @@ def normalize_profile(plate, output_dir, commit="cd91bd0daacef2b5ea25dcceb62482b

# Concatenate all plates
x_df = (
pd.concat(
[pd.read_csv(x) for x in plate_files],
sort=True
)
pd.concat([pd.read_csv(x) for x in plate_files], sort=True)
.rename(
{
"Image_Metadata_Plate": "Metadata_Plate",
"Image_Metadata_Well": "Metadata_Well"
"Image_Metadata_Well": "Metadata_Well",
},
axis="columns")
axis="columns",
)
.drop(["Metadata_broad_sample"], axis="columns")
)

Expand All @@ -128,11 +130,7 @@ def normalize_profile(plate, output_dir, commit="cd91bd0daacef2b5ea25dcceb62482b
"drop_outliers",
]

x_df = feature_select(
profiles=x_df,
operation=feature_select_ops,
na_cutoff=0
)
x_df = feature_select(profiles=x_df, operation=feature_select_ops, na_cutoff=0)

print(x_df.shape)
x_df.head(2)
Expand All @@ -154,6 +152,7 @@ def normalize_profile(plate, output_dir, commit="cd91bd0daacef2b5ea25dcceb62482b


# Output
profile_file = pathlib.Path(f"{output_dir}/cell_health_profiles_merged_wholeplate_normalized_featureselected.tsv.gz")
profile_file = pathlib.Path(
f"{output_dir}/cell_health_profiles_merged_wholeplate_normalized_featureselected.tsv.gz"
)
x_df.to_csv(profile_file, index=False, sep="\t")

119 changes: 79 additions & 40 deletions 1.calculate-metrics/cell-health/0.calculate-grit.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,21 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Calculate Grit for Bulk Cell Health profiles"
"## Calculate Grit for Bulk Cell Health profiles\n",
"\n",
"Here, we calculate grit in several permutations\n",
"\n",
"1. Across the three different cell lines (A549, ES2, HCC44)\n",
"2. Using two different kinds of controls (cutting and permutation)\n",
"3. Using two different correlation metrics (Pearson and Spearman)\n",
"4. Using two different metrics to summarize control-based z-scored replicate correlations (mean and median)\n",
"\n",
"We also calculate mp-value for the same perturbations."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -469,9 +478,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
"(1428, 6)\n",
"CPU times: user 47.6 s, sys: 3.68 s, total: 51.2 s\n",
"Wall time: 51.3 s\n"
"(2856, 7)\n",
"CPU times: user 1min 33s, sys: 8.1 s, total: 1min 41s\n",
"Wall time: 1min 41s\n"
]
},
{
Expand Down Expand Up @@ -501,6 +510,7 @@
" <th>cell_line</th>\n",
" <th>barcode_control</th>\n",
" <th>cor_method</th>\n",
" <th>grit_replicate_summary_method</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
Expand All @@ -512,6 +522,7 @@
" <td>HCC44</td>\n",
" <td>cutting_control</td>\n",
" <td>pearson</td>\n",
" <td>mean</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
Expand All @@ -521,6 +532,7 @@
" <td>HCC44</td>\n",
" <td>cutting_control</td>\n",
" <td>pearson</td>\n",
" <td>mean</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
Expand All @@ -530,6 +542,7 @@
" <td>HCC44</td>\n",
" <td>cutting_control</td>\n",
" <td>pearson</td>\n",
" <td>mean</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
Expand All @@ -539,6 +552,7 @@
" <td>HCC44</td>\n",
" <td>cutting_control</td>\n",
" <td>pearson</td>\n",
" <td>mean</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
Expand All @@ -548,18 +562,26 @@
" <td>HCC44</td>\n",
" <td>cutting_control</td>\n",
" <td>pearson</td>\n",
" <td>mean</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" perturbation group grit cell_line barcode_control cor_method\n",
"0 AKT1-1 AKT1 0.793952 HCC44 cutting_control pearson\n",
"1 AKT1-2 AKT1 0.775840 HCC44 cutting_control pearson\n",
"2 ARID1B-1 ARID1B 0.448876 HCC44 cutting_control pearson\n",
"3 ARID1B-2 ARID1B 0.323462 HCC44 cutting_control pearson\n",
"4 ATF4-1 ATF4 0.214374 HCC44 cutting_control pearson"
" perturbation group grit cell_line barcode_control cor_method \\\n",
"0 AKT1-1 AKT1 0.793952 HCC44 cutting_control pearson \n",
"1 AKT1-2 AKT1 0.775840 HCC44 cutting_control pearson \n",
"2 ARID1B-1 ARID1B 0.448876 HCC44 cutting_control pearson \n",
"3 ARID1B-2 ARID1B 0.323462 HCC44 cutting_control pearson \n",
"4 ATF4-1 ATF4 0.214374 HCC44 cutting_control pearson \n",
"\n",
" grit_replicate_summary_method \n",
"0 mean \n",
"1 mean \n",
"2 mean \n",
"3 mean \n",
"4 mean "
]
},
"execution_count": 6,
Expand All @@ -573,21 +595,25 @@
"for cell_line in df.Metadata_cell_line.unique():\n",
" for control_barcode in control_barcodes:\n",
" for cor_method in [\"pearson\", \"spearman\"]:\n",
" result = evaluate(\n",
" profiles=df.query(\"Metadata_cell_line == @cell_line\"),\n",
" features=features,\n",
" meta_features=[barcode_col, gene_col],\n",
" replicate_groups=replicate_group_grit,\n",
" operation=\"grit\",\n",
" similarity_metric=cor_method,\n",
" grit_control_perts=control_barcodes[control_barcode]\n",
" ).assign(\n",
" cell_line=cell_line,\n",
" barcode_control=control_barcode,\n",
" cor_method=cor_method\n",
" )\n",
" for replicate_summary_method in [\"mean\", \"median\"]:\n",
" \n",
" result = evaluate(\n",
" profiles=df.query(\"Metadata_cell_line == @cell_line\"),\n",
" features=features,\n",
" meta_features=[barcode_col, gene_col],\n",
" replicate_groups=replicate_group_grit,\n",
" operation=\"grit\",\n",
" similarity_metric=cor_method,\n",
" grit_control_perts=control_barcodes[control_barcode],\n",
" grit_replicate_summary_method=replicate_summary_method\n",
" ).assign(\n",
" cell_line=cell_line,\n",
" barcode_control=control_barcode,\n",
" cor_method=cor_method,\n",
" grit_replicate_summary_method=replicate_summary_method\n",
" )\n",
"\n",
" grit_results.append(result)\n",
" grit_results.append(result)\n",
" \n",
"grit_results = pd.concat(grit_results).reset_index(drop=True)\n",
"\n",
Expand All @@ -604,7 +630,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"72\n"
"144\n"
]
},
{
Expand Down Expand Up @@ -634,6 +660,7 @@
" <th>cell_line</th>\n",
" <th>barcode_control</th>\n",
" <th>cor_method</th>\n",
" <th>grit_replicate_summary_method</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
Expand All @@ -645,6 +672,7 @@
" <td>HCC44</td>\n",
" <td>cutting_control</td>\n",
" <td>pearson</td>\n",
" <td>mean</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
Expand All @@ -654,6 +682,7 @@
" <td>HCC44</td>\n",
" <td>cutting_control</td>\n",
" <td>pearson</td>\n",
" <td>mean</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
Expand All @@ -663,6 +692,7 @@
" <td>HCC44</td>\n",
" <td>cutting_control</td>\n",
" <td>pearson</td>\n",
" <td>mean</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
Expand All @@ -672,6 +702,7 @@
" <td>HCC44</td>\n",
" <td>cutting_control</td>\n",
" <td>pearson</td>\n",
" <td>mean</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
Expand All @@ -681,18 +712,26 @@
" <td>HCC44</td>\n",
" <td>cutting_control</td>\n",
" <td>pearson</td>\n",
" <td>mean</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" perturbation group grit cell_line barcode_control cor_method\n",
"0 AURKB-2 AURKB NaN HCC44 cutting_control pearson\n",
"1 BRAF-2 BRAF NaN HCC44 cutting_control pearson\n",
"2 BRAF1-1 BRAF1 NaN HCC44 cutting_control pearson\n",
"3 EMPTY EMPTY NaN HCC44 cutting_control pearson\n",
"4 SLC2A1-1 SLC2A1 NaN HCC44 cutting_control pearson"
" perturbation group grit cell_line barcode_control cor_method \\\n",
"0 AURKB-2 AURKB NaN HCC44 cutting_control pearson \n",
"1 BRAF-2 BRAF NaN HCC44 cutting_control pearson \n",
"2 BRAF1-1 BRAF1 NaN HCC44 cutting_control pearson \n",
"3 EMPTY EMPTY NaN HCC44 cutting_control pearson \n",
"4 SLC2A1-1 SLC2A1 NaN HCC44 cutting_control pearson \n",
"\n",
" grit_replicate_summary_method \n",
"0 mean \n",
"1 mean \n",
"2 mean \n",
"3 mean \n",
"4 mean "
]
},
"execution_count": 7,
Expand Down Expand Up @@ -736,8 +775,8 @@
"output_type": "stream",
"text": [
"(1428, 5)\n",
"CPU times: user 3h 15min, sys: 1min 53s, total: 3h 16min 54s\n",
"Wall time: 29min 48s\n"
"CPU times: user 3h 33min 56s, sys: 5min 9s, total: 3h 39min 6s\n",
"Wall time: 34min 45s\n"
]
},
{
Expand Down Expand Up @@ -780,23 +819,23 @@
" <tr>\n",
" <th>1</th>\n",
" <td>AKT1-2</td>\n",
" <td>0.1</td>\n",
" <td>0.0</td>\n",
" <td>HCC44</td>\n",
" <td>cutting_control</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>ARID1B-1</td>\n",
" <td>0.1</td>\n",
" <td>0.3</td>\n",
" <td>HCC44</td>\n",
" <td>cutting_control</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ARID1B-2</td>\n",
" <td>0.0</td>\n",
" <td>0.1</td>\n",
" <td>HCC44</td>\n",
" <td>cutting_control</td>\n",
" <td>10</td>\n",
Expand All @@ -816,9 +855,9 @@
"text/plain": [
" Metadata_pert_name mp_value cell_line barcode_control num_permutations\n",
"0 AKT1-1 0.2 HCC44 cutting_control 10\n",
"1 AKT1-2 0.1 HCC44 cutting_control 10\n",
"2 ARID1B-1 0.1 HCC44 cutting_control 10\n",
"3 ARID1B-2 0.0 HCC44 cutting_control 10\n",
"1 AKT1-2 0.0 HCC44 cutting_control 10\n",
"2 ARID1B-1 0.3 HCC44 cutting_control 10\n",
"3 ARID1B-2 0.1 HCC44 cutting_control 10\n",
"4 ATF4-1 0.0 HCC44 cutting_control 10"
]
},
Expand Down
Loading