broadinstitute · gwaybio · Feb 18, 2021 · Feb 12, 2021 · Feb 12, 2021 · Feb 12, 2021
diff --git a/0.download-data/scripts/nbconverted/4.reprocess-cell-health.py b/0.download-data/scripts/nbconverted/4.reprocess-cell-health.py
@@ -2,9 +2,9 @@
 # coding: utf-8
 
 # ## Reprocess Cell Health profiles
-# 
+#
 # Use a whole-plate normalization scheme instead of normalization by controls only.
-# 
+#
 # We will use the control normalization in downstream analyses, but we are interested in comparing the impact of normalization strategy on grit calculations.
 
 # In[1]:
@@ -19,22 +19,26 @@
 # In[2]:
 
 
-def normalize_profile(plate, output_dir, commit="cd91bd0daacef2b5ea25dcceb62482bb664d9de1"):
+def normalize_profile(
+    plate, output_dir, commit="cd91bd0daacef2b5ea25dcceb62482bb664d9de1"
+):
     link = f"https://github.com/broadinstitute/cell-health/raw/{commit}/1.generate-profiles/data/profiles/{plate}/{plate}_augmented.csv.gz"
-    
+
     annotate_df = pd.read_csv(link)
-    
+
     norm_file = pathlib.Path(f"{output_dir}/{plate}_wholeplate_normalized.csv.gz")
-    feat_select_file = pathlib.Path(f"{output_dir}/{plate}_wholeplate_normalized_feature_selected.csv.gz")
-
+    feat_select_file = pathlib.Path(
+        f"{output_dir}/{plate}_wholeplate_normalized_feature_selected.csv.gz"
+    )
+
     normalize(
         profiles=annotate_df,
         features="infer",
         meta_features=meta_features,
         samples="all",
         method="mad_robustize",
         output_file=norm_file,
-        compression_options={"method": "gzip", "mtime": 1}
+        compression_options={"method": "gzip", "mtime": 1},
     )
 
 
@@ -51,7 +55,7 @@ def normalize_profile(plate, output_dir, commit="cd91bd0daacef2b5ea25dcceb62482b
     "SQ00014615",
     "SQ00014616",
     "SQ00014617",
-    "SQ00014618"
+    "SQ00014618",
 ]
 
 # Define metadata features
@@ -92,16 +96,14 @@ def normalize_profile(plate, output_dir, commit="cd91bd0daacef2b5ea25dcceb62482b
 
 # Concatentate all plates
 x_df = (
-    pd.concat(
-        [pd.read_csv(x) for x in plate_files],
-        sort=True
-    )
+    pd.concat([pd.read_csv(x) for x in plate_files], sort=True)
     .rename(
         {
             "Image_Metadata_Plate": "Metadata_Plate",
-            "Image_Metadata_Well": "Metadata_Well"
+            "Image_Metadata_Well": "Metadata_Well",
         },
-        axis="columns")
+        axis="columns",
+    )
     .drop(["Metadata_broad_sample"], axis="columns")
 )
 
@@ -128,11 +130,7 @@ def normalize_profile(plate, output_dir, commit="cd91bd0daacef2b5ea25dcceb62482b
     "drop_outliers",
 ]
 
-x_df = feature_select(
-    profiles=x_df,
-    operation=feature_select_ops,
-    na_cutoff=0
-)
+x_df = feature_select(profiles=x_df, operation=feature_select_ops, na_cutoff=0)
 
 print(x_df.shape)
 x_df.head(2)
@@ -154,6 +152,7 @@ def normalize_profile(plate, output_dir, commit="cd91bd0daacef2b5ea25dcceb62482b
 
 
 # Output
-profile_file = pathlib.Path(f"{output_dir}/cell_health_profiles_merged_wholeplate_normalized_featureselected.tsv.gz")
+profile_file = pathlib.Path(
+    f"{output_dir}/cell_health_profiles_merged_wholeplate_normalized_featureselected.tsv.gz"
+)
 x_df.to_csv(profile_file, index=False, sep="\t")
-
diff --git a/1.calculate-metrics/cell-health/0.calculate-grit.ipynb b/1.calculate-metrics/cell-health/0.calculate-grit.ipynb
@@ -4,12 +4,21 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Calculate Grit for Bulk Cell Health profiles"
+    "## Calculate Grit for Bulk Cell Health profiles\n",
+    "\n",
+    "Here, we calculate grit in several permutations:\n",
+    "\n",
+    "1. Across the three different cell lines (A549, ES2, HCC44)\n",
+    "2. Using two different kinds of controls (cutting and permutation)\n",
+    "3. Using two different correlation metrics (Pearson and Spearman)\n",
+    "4. Using two different metrics to summarize control-based z-scored replicate correlations (mean and median)\n",
+    "\n",
+    "We also calculate the mp-value for the same perturbations."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -469,9 +478,9 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "(1428, 6)\n",
-      "CPU times: user 47.6 s, sys: 3.68 s, total: 51.2 s\n",
-      "Wall time: 51.3 s\n"
+      "(2856, 7)\n",
+      "CPU times: user 1min 33s, sys: 8.1 s, total: 1min 41s\n",
+      "Wall time: 1min 41s\n"
      ]
     },
     {
@@ -501,6 +510,7 @@
        "      <th>cell_line</th>\n",
        "      <th>barcode_control</th>\n",
        "      <th>cor_method</th>\n",
+       "      <th>grit_replicate_summary_method</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
@@ -512,6 +522,7 @@
        "      <td>HCC44</td>\n",
        "      <td>cutting_control</td>\n",
        "      <td>pearson</td>\n",
+       "      <td>mean</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
@@ -521,6 +532,7 @@
        "      <td>HCC44</td>\n",
        "      <td>cutting_control</td>\n",
        "      <td>pearson</td>\n",
+       "      <td>mean</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
@@ -530,6 +542,7 @@
        "      <td>HCC44</td>\n",
        "      <td>cutting_control</td>\n",
        "      <td>pearson</td>\n",
+       "      <td>mean</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
@@ -539,6 +552,7 @@
        "      <td>HCC44</td>\n",
        "      <td>cutting_control</td>\n",
        "      <td>pearson</td>\n",
+       "      <td>mean</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
@@ -548,18 +562,26 @@
        "      <td>HCC44</td>\n",
        "      <td>cutting_control</td>\n",
        "      <td>pearson</td>\n",
+       "      <td>mean</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "  perturbation   group      grit cell_line  barcode_control cor_method\n",
-       "0       AKT1-1    AKT1  0.793952     HCC44  cutting_control    pearson\n",
-       "1       AKT1-2    AKT1  0.775840     HCC44  cutting_control    pearson\n",
-       "2     ARID1B-1  ARID1B  0.448876     HCC44  cutting_control    pearson\n",
-       "3     ARID1B-2  ARID1B  0.323462     HCC44  cutting_control    pearson\n",
-       "4       ATF4-1    ATF4  0.214374     HCC44  cutting_control    pearson"
+       "  perturbation   group      grit cell_line  barcode_control cor_method  \\\n",
+       "0       AKT1-1    AKT1  0.793952     HCC44  cutting_control    pearson   \n",
+       "1       AKT1-2    AKT1  0.775840     HCC44  cutting_control    pearson   \n",
+       "2     ARID1B-1  ARID1B  0.448876     HCC44  cutting_control    pearson   \n",
+       "3     ARID1B-2  ARID1B  0.323462     HCC44  cutting_control    pearson   \n",
+       "4       ATF4-1    ATF4  0.214374     HCC44  cutting_control    pearson   \n",
+       "\n",
+       "  grit_replicate_summary_method  \n",
+       "0                          mean  \n",
+       "1                          mean  \n",
+       "2                          mean  \n",
+       "3                          mean  \n",
+       "4                          mean  "
       ]
      },
      "execution_count": 6,
@@ -573,21 +595,25 @@
     "for cell_line in df.Metadata_cell_line.unique():\n",
     "    for control_barcode in control_barcodes:\n",
     "        for cor_method in [\"pearson\", \"spearman\"]:\n",
-    "            result = evaluate(\n",
-    "                profiles=df.query(\"Metadata_cell_line == @cell_line\"),\n",
-    "                features=features,\n",
-    "                meta_features=[barcode_col, gene_col],\n",
-    "                replicate_groups=replicate_group_grit,\n",
-    "                operation=\"grit\",\n",
-    "                similarity_metric=cor_method,\n",
-    "                grit_control_perts=control_barcodes[control_barcode]\n",
-    "            ).assign(\n",
-    "                cell_line=cell_line,\n",
-    "                barcode_control=control_barcode,\n",
-    "                cor_method=cor_method\n",
-    "            )\n",
+    "            for replicate_summary_method in [\"mean\", \"median\"]:\n",
+    "                \n",
+    "                result = evaluate(\n",
+    "                    profiles=df.query(\"Metadata_cell_line == @cell_line\"),\n",
+    "                    features=features,\n",
+    "                    meta_features=[barcode_col, gene_col],\n",
+    "                    replicate_groups=replicate_group_grit,\n",
+    "                    operation=\"grit\",\n",
+    "                    similarity_metric=cor_method,\n",
+    "                    grit_control_perts=control_barcodes[control_barcode],\n",
+    "                    grit_replicate_summary_method=replicate_summary_method\n",
+    "                ).assign(\n",
+    "                    cell_line=cell_line,\n",
+    "                    barcode_control=control_barcode,\n",
+    "                    cor_method=cor_method,\n",
+    "                    grit_replicate_summary_method=replicate_summary_method\n",
+    "                )\n",
     "\n",
-    "            grit_results.append(result)\n",
+    "                grit_results.append(result)\n",
     "    \n",
     "grit_results = pd.concat(grit_results).reset_index(drop=True)\n",
     "\n",
@@ -604,7 +630,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "72\n"
+      "144\n"
      ]
     },
     {
@@ -634,6 +660,7 @@
        "      <th>cell_line</th>\n",
        "      <th>barcode_control</th>\n",
        "      <th>cor_method</th>\n",
+       "      <th>grit_replicate_summary_method</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
@@ -645,6 +672,7 @@
        "      <td>HCC44</td>\n",
        "      <td>cutting_control</td>\n",
        "      <td>pearson</td>\n",
+       "      <td>mean</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
@@ -654,6 +682,7 @@
        "      <td>HCC44</td>\n",
        "      <td>cutting_control</td>\n",
        "      <td>pearson</td>\n",
+       "      <td>mean</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
@@ -663,6 +692,7 @@
        "      <td>HCC44</td>\n",
        "      <td>cutting_control</td>\n",
        "      <td>pearson</td>\n",
+       "      <td>mean</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
@@ -672,6 +702,7 @@
        "      <td>HCC44</td>\n",
        "      <td>cutting_control</td>\n",
        "      <td>pearson</td>\n",
+       "      <td>mean</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
@@ -681,18 +712,26 @@
        "      <td>HCC44</td>\n",
        "      <td>cutting_control</td>\n",
        "      <td>pearson</td>\n",
+       "      <td>mean</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "  perturbation   group  grit cell_line  barcode_control cor_method\n",
-       "0      AURKB-2   AURKB   NaN     HCC44  cutting_control    pearson\n",
-       "1       BRAF-2    BRAF   NaN     HCC44  cutting_control    pearson\n",
-       "2      BRAF1-1   BRAF1   NaN     HCC44  cutting_control    pearson\n",
-       "3        EMPTY   EMPTY   NaN     HCC44  cutting_control    pearson\n",
-       "4     SLC2A1-1  SLC2A1   NaN     HCC44  cutting_control    pearson"
+       "  perturbation   group  grit cell_line  barcode_control cor_method  \\\n",
+       "0      AURKB-2   AURKB   NaN     HCC44  cutting_control    pearson   \n",
+       "1       BRAF-2    BRAF   NaN     HCC44  cutting_control    pearson   \n",
+       "2      BRAF1-1   BRAF1   NaN     HCC44  cutting_control    pearson   \n",
+       "3        EMPTY   EMPTY   NaN     HCC44  cutting_control    pearson   \n",
+       "4     SLC2A1-1  SLC2A1   NaN     HCC44  cutting_control    pearson   \n",
+       "\n",
+       "  grit_replicate_summary_method  \n",
+       "0                          mean  \n",
+       "1                          mean  \n",
+       "2                          mean  \n",
+       "3                          mean  \n",
+       "4                          mean  "
       ]
      },
      "execution_count": 7,
@@ -736,8 +775,8 @@
      "output_type": "stream",
      "text": [
       "(1428, 5)\n",
-      "CPU times: user 3h 15min, sys: 1min 53s, total: 3h 16min 54s\n",
-      "Wall time: 29min 48s\n"
+      "CPU times: user 3h 33min 56s, sys: 5min 9s, total: 3h 39min 6s\n",
+      "Wall time: 34min 45s\n"
      ]
     },
     {
@@ -780,23 +819,23 @@
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>AKT1-2</td>\n",
-       "      <td>0.1</td>\n",
+       "      <td>0.0</td>\n",
        "      <td>HCC44</td>\n",
        "      <td>cutting_control</td>\n",
        "      <td>10</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>ARID1B-1</td>\n",
-       "      <td>0.1</td>\n",
+       "      <td>0.3</td>\n",
        "      <td>HCC44</td>\n",
        "      <td>cutting_control</td>\n",
        "      <td>10</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>ARID1B-2</td>\n",
-       "      <td>0.0</td>\n",
+       "      <td>0.1</td>\n",
        "      <td>HCC44</td>\n",
        "      <td>cutting_control</td>\n",
        "      <td>10</td>\n",
@@ -816,9 +855,9 @@
       "text/plain": [
        "  Metadata_pert_name  mp_value cell_line  barcode_control  num_permutations\n",
        "0             AKT1-1       0.2     HCC44  cutting_control                10\n",
-       "1             AKT1-2       0.1     HCC44  cutting_control                10\n",
-       "2           ARID1B-1       0.1     HCC44  cutting_control                10\n",
-       "3           ARID1B-2       0.0     HCC44  cutting_control                10\n",
+       "1             AKT1-2       0.0     HCC44  cutting_control                10\n",
+       "2           ARID1B-1       0.3     HCC44  cutting_control                10\n",
+       "3           ARID1B-2       0.1     HCC44  cutting_control                10\n",
        "4             ATF4-1       0.0     HCC44  cutting_control                10"
       ]
      },