From faf1d21e86cd988d5c69c55eee973b781775c122 Mon Sep 17 00:00:00 2001
From: rsenft1 <28116530+rsenft1@users.noreply.github.com>
Date: Fri, 3 Mar 2023 10:07:50 -0500
Subject: [PATCH 1/3] add analysis notebook
---
.../01_jumpORF_create_collapsed_df.ipynb | 1133 +++++++++++++++++
.../BR00126734/BR00126734.csv.gz | 3 +
.../BR00126734/BR00126734_augmented.csv.gz | 3 +
.../BR00126734/BR00126734_normalized.csv.gz | 3 +
...734_normalized_feature_select_batch.csv.gz | 3 +
...malized_feature_select_negcon_batch.csv.gz | 3 +
.../BR00126735/BR00126735.csv.gz | 3 +
.../BR00126735/BR00126735_augmented.csv.gz | 3 +
.../BR00126735/BR00126735_normalized.csv.gz | 3 +
...735_normalized_feature_select_batch.csv.gz | 3 +
...malized_feature_select_negcon_batch.csv.gz | 3 +
.../BR00126634/BR00126634.csv.gz | 3 +
.../BR00126634/BR00126634_augmented.csv.gz | 3 +
.../BR00126634/BR00126634_normalized.csv.gz | 3 +
...634_normalized_feature_select_batch.csv.gz | 3 +
...malized_feature_select_negcon_batch.csv.gz | 3 +
.../BR00126641/BR00126641.csv.gz | 3 +
.../BR00126641/BR00126641_augmented.csv.gz | 3 +
.../BR00126641/BR00126641_normalized.csv.gz | 3 +
...641_normalized_feature_select_batch.csv.gz | 3 +
...malized_feature_select_negcon_batch.csv.gz | 3 +
21 files changed, 1193 insertions(+)
create mode 100644 analysis_notebook/01_jumpORF_create_collapsed_df.ipynb
create mode 100644 profiles/2021_08_23_Batch12/BR00126734/BR00126734.csv.gz
create mode 100644 profiles/2021_08_23_Batch12/BR00126734/BR00126734_augmented.csv.gz
create mode 100644 profiles/2021_08_23_Batch12/BR00126734/BR00126734_normalized.csv.gz
create mode 100644 profiles/2021_08_23_Batch12/BR00126734/BR00126734_normalized_feature_select_batch.csv.gz
create mode 100644 profiles/2021_08_23_Batch12/BR00126734/BR00126734_normalized_feature_select_negcon_batch.csv.gz
create mode 100644 profiles/2021_08_23_Batch12/BR00126735/BR00126735.csv.gz
create mode 100644 profiles/2021_08_23_Batch12/BR00126735/BR00126735_augmented.csv.gz
create mode 100644 profiles/2021_08_23_Batch12/BR00126735/BR00126735_normalized.csv.gz
create mode 100644 profiles/2021_08_23_Batch12/BR00126735/BR00126735_normalized_feature_select_batch.csv.gz
create mode 100644 profiles/2021_08_23_Batch12/BR00126735/BR00126735_normalized_feature_select_negcon_batch.csv.gz
create mode 100644 profiles/2021_08_30_Batch13/BR00126634/BR00126634.csv.gz
create mode 100644 profiles/2021_08_30_Batch13/BR00126634/BR00126634_augmented.csv.gz
create mode 100644 profiles/2021_08_30_Batch13/BR00126634/BR00126634_normalized.csv.gz
create mode 100644 profiles/2021_08_30_Batch13/BR00126634/BR00126634_normalized_feature_select_batch.csv.gz
create mode 100644 profiles/2021_08_30_Batch13/BR00126634/BR00126634_normalized_feature_select_negcon_batch.csv.gz
create mode 100644 profiles/2021_08_30_Batch13/BR00126641/BR00126641.csv.gz
create mode 100644 profiles/2021_08_30_Batch13/BR00126641/BR00126641_augmented.csv.gz
create mode 100644 profiles/2021_08_30_Batch13/BR00126641/BR00126641_normalized.csv.gz
create mode 100644 profiles/2021_08_30_Batch13/BR00126641/BR00126641_normalized_feature_select_batch.csv.gz
create mode 100644 profiles/2021_08_30_Batch13/BR00126641/BR00126641_normalized_feature_select_negcon_batch.csv.gz
diff --git a/analysis_notebook/01_jumpORF_create_collapsed_df.ipynb b/analysis_notebook/01_jumpORF_create_collapsed_df.ipynb
new file mode 100644
index 0000000..18c689a
--- /dev/null
+++ b/analysis_notebook/01_jumpORF_create_collapsed_df.ipynb
@@ -0,0 +1,1133 @@
+{
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# JUMP ORF data analysis notebook\n",
+ "\n",
+ "## Set up environment:\n",
+ " \n",
+ "1. Create a new environment for this project: `mamba create --name jumpORF python=3.9`\n",
+ "\n",
+ "2. Activate that environment: `conda activate jumpORF` (note: for reasons I have not figured out, `mamba activate` does not work for me even after `mamba init`, but `conda activate` does)\n",
+ "\n",
+ "3. Install dependencies: \n",
+ "* `mamba install -c conda-forge dvc-s3` (more instructions here: https://dvc.org/doc/install/macos ; you specifically need DVC with AWS S3 support, since S3 is where the profiles are stored)\n",
+ "\n",
+ "## Get data on your local machine:\n",
+ "\n",
+ "1. Download the data repo: `git clone https://github.com/jump-cellpainting/jump-orf-data.git` (I use GitHub Desktop for this!)\n",
+ "\n",
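+ "2. Download the metadata repo into a folder named `jump-datasets` next to `jump-orf-data` (this notebook reads `../../jump-datasets/metadata/plate.csv`): `git clone https://github.com/jump-cellpainting/datasets.git jump-datasets`\n",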
+ "\n",
+ "3. Pull the DVC-tracked files down to your local computer: in a terminal, from the folder where you cloned `jump-orf-data`, run `dvc pull` _(note this step can take a while)_\n",
+ "\n",
+ "4. Select the `jumpORF` environment as the kernel for this notebook (upper right of the notebook in VS Code), or otherwise ensure the `jumpORF` environment is activated \n",
+ " * _note that if you had the notebook open while you created the environment, you may need to restart VS Code to see the updated list of environments_\n",
+ "\n",
+ "\n",
+ "## to do...\n",
+ "\n",
+ "1. Use the new metadata here as the source for finding plates/batches/etc.: https://github.com/jump-cellpainting/datasets/tree/main/metadata \n",
+ "2. Controls include BFP, HcRed, Luciferase, and LacZ (we have excluded eGFP, although it still shows as a control in the metadata sheet)\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Read in all JUMP ORF data\n",
+ "\n",
+ "* Grab the paths to all the profiles from the different batches. \n",
+ "* Read them into one dataframe (~13,000 genes x ~5 replicates; the number of features depends on whether the feature-selected (~1,300 features) or the full (~5,900 features) profiles are used)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* What do we want to provide? Just the collapsed data? Or the non-collapsed version as well? \n",
+ "* Is there enough of a reason that we want people to have access to the precollapsed version? \n",
+ "* Perhaps do not save out these large csvs! Get through cleaning to collapsed data then save that out. \n",
+ "* Implement concat, collapse, and clean as separate functions "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# read data\n",
+ "import os\n",
+ "import pandas as pd\n",
+ "\n",
+ "\n",
+ "#get paths to files using the most truthful metadata\n",
+ "topfolder = \"../profiles\"\n",
+ "metadata_table = pd.read_csv(\"../../jump-datasets/metadata/plate.csv\",index_col=False)\n",
+ "metadata_table_ORF = metadata_table[(metadata_table.Metadata_PlateType == \"ORF\")]\n",
+ "\n",
+ "batch_list = metadata_table_ORF[(metadata_table_ORF.Metadata_PlateType == \"ORF\")].Metadata_Batch.unique()\n",
+ "\n",
+ "batch_list_2 = metadata_table_ORF.loc[metadata_table_ORF[\"Metadata_PlateType\"] == \"ORF\"][\"Metadata_Batch\"].unique()\n",
+ "\n",
+ "\n",
+ "filesuffix=\"_normalized_feature_select_negcon_all.csv.gz\"\n",
+ "filepaths = [os.path.join(topfolder, metadata_table_ORF.Metadata_Batch.values[row], metadata_table_ORF.Metadata_Plate.values[row],metadata_table_ORF.Metadata_Plate.values[row]+filesuffix) for row in range(len(metadata_table_ORF))]\n",
+ "\n",
+ "#only look at files that exist\n",
+ "filepaths = [f for f in filepaths if os.path.exists(f)]\n",
+ "\n",
+ "#read in\n",
+ "df = pd.concat(map(lambda file: pd.read_csv(file, index_col=False,), filepaths))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get target 2 plates (normalized) and filter the features down to those that are in the df we already have\n",
+ "\n",
+ "metadata_table_target2 = metadata_table.loc[(metadata_table[\"Metadata_Batch\"].isin(batch_list)) & (metadata_table[\"Metadata_PlateType\"]==\"TARGET2\")]\n",
+ "\n",
+ "filesuffix=\"_normalized_negcon.csv\" # can also do .csv.gz files\n",
+ "filepaths = [os.path.join(topfolder, metadata_table_target2.Metadata_Batch.values[row], metadata_table_target2.Metadata_Plate.values[row],metadata_table_target2.Metadata_Plate.values[row]+filesuffix) for row in range(len(metadata_table_target2))]\n",
+ "\n",
+ "#only look at files that exist\n",
+ "filepaths = [f for f in filepaths if os.path.exists(f)]\n",
+ "\n",
+ "#read in\n",
+ "df_t2 = pd.concat(map(lambda file: pd.read_csv(file, index_col=False,), filepaths))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Filter the target2 df to only columns that exist in the df\n",
+ "\n",
+ "t2_metadata_col = [x for x in df_t2.columns if \"Metadata\" in x]\n",
+ "df_col = list(df.columns)\n",
+ "\n",
+ "\n",
+ "cols2Keep = list(set(df_col+t2_metadata_col))\n",
+ "cols2Keep = [x for x in cols2Keep if x in list(df_t2.columns)]\n",
+ "\n",
+ "df_t2 = df_t2[cols2Keep]"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Filter out ORFs that don't replicate\n",
+ "From Alex: \n",
+ "\n",
+ "Calculate mAP for replicability for each perturbation and filter out those below the random baseline.\n",
+ "The long answer involves the fact that we’ve recently changed what “below random baseline” means. Before, we suggested to subtract mAP of randomly ranked profiles (as suggested in “3.4.2 Computation of the exact random AP value” of my draft on mAP). But recently, we decided that we will consider not mean of random baseline APs, but 95th percentile, such that we can construct a significance test and report p-value instead. Let me know if you want to know more about this, I will also talk about it in special topics on Thursday!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Metadata_plate_map_name | \n",
+ " Metadata_broad_sample | \n",
+ " Metadata_Plate | \n",
+ " Metadata_Well | \n",
+ " Metadata_Site_Count | \n",
+ " Metadata_Count_Cells | \n",
+ " Metadata_Count_CellsIncludingEdges | \n",
+ " Metadata_Count_Cytoplasm | \n",
+ " Metadata_Count_Nuclei | \n",
+ " Metadata_Count_NucleiIncludingEdges | \n",
+ " ... | \n",
+ " Nuclei_Texture_InfoMeas2_Brightfield_3_02_256 | \n",
+ " Nuclei_Texture_InfoMeas2_Mito_3_02_256 | \n",
+ " Nuclei_Texture_InverseDifferenceMoment_AGP_3_02_256 | \n",
+ " Nuclei_Texture_InverseDifferenceMoment_DNA_3_02_256 | \n",
+ " Nuclei_Texture_InverseDifferenceMoment_Mito_10_03_256 | \n",
+ " Nuclei_Texture_SumVariance_AGP_10_03_256 | \n",
+ " Nuclei_Texture_SumVariance_BFHigh_3_03_256 | \n",
+ " Nuclei_Texture_SumVariance_BFLow_3_00_256 | \n",
+ " Nuclei_Texture_SumVariance_Brightfield_3_03_256 | \n",
+ " Nuclei_Texture_SumVariance_ER_10_01_256 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " OAA01.02.03.04.A | \n",
+ " ccsbBroad304_05979 | \n",
+ " BR00117035 | \n",
+ " A01 | \n",
+ " 9 | \n",
+ " 845 | \n",
+ " 970 | \n",
+ " 845 | \n",
+ " 845 | \n",
+ " 970 | \n",
+ " ... | \n",
+ " -0.087364 | \n",
+ " -0.97526 | \n",
+ " 2.72760 | \n",
+ " 6.7332 | \n",
+ " 3.3709 | \n",
+ " -1.63900 | \n",
+ " 1.53450 | \n",
+ " 0.690930 | \n",
+ " 1.1301 | \n",
+ " -2.7376 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " OAA01.02.03.04.A | \n",
+ " ccsbBroad304_13129 | \n",
+ " BR00117035 | \n",
+ " A02 | \n",
+ " 9 | \n",
+ " 873 | \n",
+ " 988 | \n",
+ " 873 | \n",
+ " 873 | \n",
+ " 988 | \n",
+ " ... | \n",
+ " 0.198090 | \n",
+ " -1.85970 | \n",
+ " 1.19280 | \n",
+ " 5.3221 | \n",
+ " 2.9869 | \n",
+ " -1.13510 | \n",
+ " 6.72200 | \n",
+ " 2.194400 | \n",
+ " 2.5605 | \n",
+ " -2.2424 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " OAA01.02.03.04.A | \n",
+ " ccsbBroad304_00289 | \n",
+ " BR00117035 | \n",
+ " A03 | \n",
+ " 9 | \n",
+ " 889 | \n",
+ " 989 | \n",
+ " 889 | \n",
+ " 889 | \n",
+ " 989 | \n",
+ " ... | \n",
+ " 0.619550 | \n",
+ " -2.28570 | \n",
+ " 0.54443 | \n",
+ " 3.2157 | \n",
+ " 2.8953 | \n",
+ " 0.21932 | \n",
+ " 4.92190 | \n",
+ " 2.838000 | \n",
+ " 4.3082 | \n",
+ " -1.5694 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " OAA01.02.03.04.A | \n",
+ " ccsbBroad304_99988 | \n",
+ " BR00117035 | \n",
+ " A04 | \n",
+ " 9 | \n",
+ " 898 | \n",
+ " 995 | \n",
+ " 898 | \n",
+ " 898 | \n",
+ " 995 | \n",
+ " ... | \n",
+ " -0.230090 | \n",
+ " -0.87500 | \n",
+ " 0.46420 | \n",
+ " 2.5104 | \n",
+ " 2.3601 | \n",
+ " -0.57261 | \n",
+ " 0.37809 | \n",
+ " 0.672830 | \n",
+ " 1.5367 | \n",
+ " -1.3152 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " OAA01.02.03.04.A | \n",
+ " ccsbBroad304_07679 | \n",
+ " BR00117035 | \n",
+ " A05 | \n",
+ " 9 | \n",
+ " 876 | \n",
+ " 982 | \n",
+ " 876 | \n",
+ " 876 | \n",
+ " 982 | \n",
+ " ... | \n",
+ " -1.605600 | \n",
+ " -1.79160 | \n",
+ " 0.64160 | \n",
+ " 2.5965 | \n",
+ " 3.3969 | \n",
+ " -0.80713 | \n",
+ " -0.97239 | \n",
+ " 0.032586 | \n",
+ " 1.3970 | \n",
+ " -2.0045 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 1478 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Metadata_plate_map_name Metadata_broad_sample Metadata_Plate Metadata_Well \\\n",
+ "0 OAA01.02.03.04.A ccsbBroad304_05979 BR00117035 A01 \n",
+ "1 OAA01.02.03.04.A ccsbBroad304_13129 BR00117035 A02 \n",
+ "2 OAA01.02.03.04.A ccsbBroad304_00289 BR00117035 A03 \n",
+ "3 OAA01.02.03.04.A ccsbBroad304_99988 BR00117035 A04 \n",
+ "4 OAA01.02.03.04.A ccsbBroad304_07679 BR00117035 A05 \n",
+ "\n",
+ " Metadata_Site_Count Metadata_Count_Cells \\\n",
+ "0 9 845 \n",
+ "1 9 873 \n",
+ "2 9 889 \n",
+ "3 9 898 \n",
+ "4 9 876 \n",
+ "\n",
+ " Metadata_Count_CellsIncludingEdges Metadata_Count_Cytoplasm \\\n",
+ "0 970 845 \n",
+ "1 988 873 \n",
+ "2 989 889 \n",
+ "3 995 898 \n",
+ "4 982 876 \n",
+ "\n",
+ " Metadata_Count_Nuclei Metadata_Count_NucleiIncludingEdges ... \\\n",
+ "0 845 970 ... \n",
+ "1 873 988 ... \n",
+ "2 889 989 ... \n",
+ "3 898 995 ... \n",
+ "4 876 982 ... \n",
+ "\n",
+ " Nuclei_Texture_InfoMeas2_Brightfield_3_02_256 \\\n",
+ "0 -0.087364 \n",
+ "1 0.198090 \n",
+ "2 0.619550 \n",
+ "3 -0.230090 \n",
+ "4 -1.605600 \n",
+ "\n",
+ " Nuclei_Texture_InfoMeas2_Mito_3_02_256 \\\n",
+ "0 -0.97526 \n",
+ "1 -1.85970 \n",
+ "2 -2.28570 \n",
+ "3 -0.87500 \n",
+ "4 -1.79160 \n",
+ "\n",
+ " Nuclei_Texture_InverseDifferenceMoment_AGP_3_02_256 \\\n",
+ "0 2.72760 \n",
+ "1 1.19280 \n",
+ "2 0.54443 \n",
+ "3 0.46420 \n",
+ "4 0.64160 \n",
+ "\n",
+ " Nuclei_Texture_InverseDifferenceMoment_DNA_3_02_256 \\\n",
+ "0 6.7332 \n",
+ "1 5.3221 \n",
+ "2 3.2157 \n",
+ "3 2.5104 \n",
+ "4 2.5965 \n",
+ "\n",
+ " Nuclei_Texture_InverseDifferenceMoment_Mito_10_03_256 \\\n",
+ "0 3.3709 \n",
+ "1 2.9869 \n",
+ "2 2.8953 \n",
+ "3 2.3601 \n",
+ "4 3.3969 \n",
+ "\n",
+ " Nuclei_Texture_SumVariance_AGP_10_03_256 \\\n",
+ "0 -1.63900 \n",
+ "1 -1.13510 \n",
+ "2 0.21932 \n",
+ "3 -0.57261 \n",
+ "4 -0.80713 \n",
+ "\n",
+ " Nuclei_Texture_SumVariance_BFHigh_3_03_256 \\\n",
+ "0 1.53450 \n",
+ "1 6.72200 \n",
+ "2 4.92190 \n",
+ "3 0.37809 \n",
+ "4 -0.97239 \n",
+ "\n",
+ " Nuclei_Texture_SumVariance_BFLow_3_00_256 \\\n",
+ "0 0.690930 \n",
+ "1 2.194400 \n",
+ "2 2.838000 \n",
+ "3 0.672830 \n",
+ "4 0.032586 \n",
+ "\n",
+ " Nuclei_Texture_SumVariance_Brightfield_3_03_256 \\\n",
+ "0 1.1301 \n",
+ "1 2.5605 \n",
+ "2 4.3082 \n",
+ "3 1.5367 \n",
+ "4 1.3970 \n",
+ "\n",
+ " Nuclei_Texture_SumVariance_ER_10_01_256 \n",
+ "0 -2.7376 \n",
+ "1 -2.2424 \n",
+ "2 -1.5694 \n",
+ "3 -1.3152 \n",
+ "4 -2.0045 \n",
+ "\n",
+ "[5 rows x 1478 columns]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "How many columns were dropped? 19\n"
+ ]
+ }
+ ],
+ "source": [
+ "# feature-select the data\n",
+ "\n",
+ "import pycytominer\n",
+ "df_selected = pycytominer.feature_select(df, operation = ['correlation_threshold', 'variance_threshold', 'drop_na_columns', 'blocklist','drop_outliers'], outlier_cutoff = 500)\n",
+ "print('How many columns were dropped?',df.shape[1] - df_selected.shape[1])\n",
+ "df_final = df_selected.loc[:,~df_selected.columns.duplicated()].copy()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_final.to_parquet(f\"JUMP_ORF_all.parquet\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_parquet = pd.read_parquet(f\"JUMP_ORF_all.parquet\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Metadata_broad_sample',\n",
+ " 'Metadata_Name',\n",
+ " 'Metadata_Vector',\n",
+ " 'Metadata_Transcript',\n",
+ " 'Metadata_Symbol',\n",
+ " 'Metadata_NCBI Gene ID',\n",
+ " 'Metadata_Taxon ID',\n",
+ " 'Metadata_Gene Description',\n",
+ " 'Metadata_Annot. Gene Symbol',\n",
+ " 'Metadata_Annot. Gene ID',\n",
+ " 'Metadata_Prot Match %',\n",
+ " 'Metadata_MOI',\n",
+ " 'Metadata_Virus / ml',\n",
+ " 'Metadata_Insert Length',\n",
+ " 'Metadata_pert_type',\n",
+ " 'Metadata_control_type',\n",
+ " 'Cells_AreaShape_FormFactor']"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# identify columns with NaN values\n",
+ "[col for col in df_selected.columns if df[col].isnull().values.any()]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# remove measurement column with NaNs\n",
+ "df_selected.drop(columns='Cells_AreaShape_FormFactor', inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Computing compound plate replicability...\n"
+ ]
+ },
+ {
+ "ename": "ValueError",
+ "evalue": "Input contains NaN.",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[5], line 24\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[39m# Calculate replicability mAP\u001b[39;00m\n\u001b[1;32m 23\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39mf\u001b[39m\u001b[39m'\u001b[39m\u001b[39mComputing \u001b[39m\u001b[39m{\u001b[39;00mdescription\u001b[39m}\u001b[39;00m\u001b[39m replicability...\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[0;32m---> 24\u001b[0m precision \u001b[39m=\u001b[39m utilitary\u001b[39m.\u001b[39;49mPrecisionScores(all_plates_df, all_plates_df, feature_to_group_by, \u001b[39m\"\u001b[39;49m\u001b[39mreplicability\u001b[39;49m\u001b[39m\"\u001b[39;49m, feature_to_group_by, within\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m, against_negcon\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\n\u001b[1;32m 26\u001b[0m replicability_ap_df \u001b[39m=\u001b[39m precision\u001b[39m.\u001b[39map_group\n\u001b[1;32m 27\u001b[0m replicability_map \u001b[39m=\u001b[39m precision\u001b[39m.\u001b[39mmap\n",
+ "File \u001b[0;32m~/Documents/GitHub/jump-orf-data/analysis_notebook/utilitary.py:248\u001b[0m, in \u001b[0;36mPrecisionScores.__init__\u001b[0;34m(self, profile1, profile2, group_by_feature, mode, identify_perturbation_feature, within, anti_correlation, against_negcon)\u001b[0m\n\u001b[1;32m 245\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmap1 \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprofile1[[\u001b[39mself\u001b[39m\u001b[39m.\u001b[39midentify_perturbation_feature, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfeature, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msample_id_feature, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcontrol_type_feature]]\u001b[39m.\u001b[39mcopy()\n\u001b[1;32m 246\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmap2 \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprofile2[[\u001b[39mself\u001b[39m\u001b[39m.\u001b[39midentify_perturbation_feature, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfeature, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msample_id_feature, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcontrol_type_feature]]\u001b[39m.\u001b[39mcopy()\n\u001b[0;32m--> 248\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcorr \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mcompute_correlation()\n\u001b[1;32m 249\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtruth_matrix \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcreate_truth_matrix()\n\u001b[1;32m 250\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcleanup()\n",
+ "File \u001b[0;32m~/Documents/GitHub/jump-orf-data/analysis_notebook/utilitary.py:292\u001b[0m, in \u001b[0;36mPrecisionScores.compute_correlation\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 290\u001b[0m _sample_names_1 \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprofile1[\u001b[39mself\u001b[39m\u001b[39m.\u001b[39msample_id_feature])\n\u001b[1;32m 291\u001b[0m _sample_names_2 \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprofile2[\u001b[39mself\u001b[39m\u001b[39m.\u001b[39msample_id_feature])\n\u001b[0;32m--> 292\u001b[0m _corr \u001b[39m=\u001b[39m cosine_similarity(_profile1, _profile2)\n\u001b[1;32m 293\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39manti_correlation:\n\u001b[1;32m 294\u001b[0m _corr \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39mabs(_corr)\n",
+ "File \u001b[0;32m~/mambaforge/envs/jumpORF/lib/python3.9/site-packages/sklearn/metrics/pairwise.py:1393\u001b[0m, in \u001b[0;36mcosine_similarity\u001b[0;34m(X, Y, dense_output)\u001b[0m\n\u001b[1;32m 1358\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"Compute cosine similarity between samples in X and Y.\u001b[39;00m\n\u001b[1;32m 1359\u001b[0m \n\u001b[1;32m 1360\u001b[0m \u001b[39mCosine similarity, or the cosine kernel, computes similarity as the\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1389\u001b[0m \u001b[39m Returns the cosine similarity between samples in X and Y.\u001b[39;00m\n\u001b[1;32m 1390\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 1391\u001b[0m \u001b[39m# to avoid recursive import\u001b[39;00m\n\u001b[0;32m-> 1393\u001b[0m X, Y \u001b[39m=\u001b[39m check_pairwise_arrays(X, Y)\n\u001b[1;32m 1395\u001b[0m X_normalized \u001b[39m=\u001b[39m normalize(X, copy\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[1;32m 1396\u001b[0m \u001b[39mif\u001b[39;00m X \u001b[39mis\u001b[39;00m Y:\n",
+ "File \u001b[0;32m~/mambaforge/envs/jumpORF/lib/python3.9/site-packages/sklearn/metrics/pairwise.py:155\u001b[0m, in \u001b[0;36mcheck_pairwise_arrays\u001b[0;34m(X, Y, precomputed, dtype, accept_sparse, force_all_finite, copy)\u001b[0m\n\u001b[1;32m 146\u001b[0m X \u001b[39m=\u001b[39m Y \u001b[39m=\u001b[39m check_array(\n\u001b[1;32m 147\u001b[0m X,\n\u001b[1;32m 148\u001b[0m accept_sparse\u001b[39m=\u001b[39maccept_sparse,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 152\u001b[0m estimator\u001b[39m=\u001b[39mestimator,\n\u001b[1;32m 153\u001b[0m )\n\u001b[1;32m 154\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 155\u001b[0m X \u001b[39m=\u001b[39m check_array(\n\u001b[1;32m 156\u001b[0m X,\n\u001b[1;32m 157\u001b[0m accept_sparse\u001b[39m=\u001b[39;49maccept_sparse,\n\u001b[1;32m 158\u001b[0m dtype\u001b[39m=\u001b[39;49mdtype,\n\u001b[1;32m 159\u001b[0m copy\u001b[39m=\u001b[39;49mcopy,\n\u001b[1;32m 160\u001b[0m force_all_finite\u001b[39m=\u001b[39;49mforce_all_finite,\n\u001b[1;32m 161\u001b[0m estimator\u001b[39m=\u001b[39;49mestimator,\n\u001b[1;32m 162\u001b[0m )\n\u001b[1;32m 163\u001b[0m Y \u001b[39m=\u001b[39m check_array(\n\u001b[1;32m 164\u001b[0m Y,\n\u001b[1;32m 165\u001b[0m accept_sparse\u001b[39m=\u001b[39maccept_sparse,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 169\u001b[0m estimator\u001b[39m=\u001b[39mestimator,\n\u001b[1;32m 170\u001b[0m )\n\u001b[1;32m 172\u001b[0m \u001b[39mif\u001b[39;00m precomputed:\n",
+ "File \u001b[0;32m~/mambaforge/envs/jumpORF/lib/python3.9/site-packages/sklearn/utils/validation.py:921\u001b[0m, in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[1;32m 915\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 916\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mFound array with dim \u001b[39m\u001b[39m%d\u001b[39;00m\u001b[39m. \u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m expected <= 2.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 917\u001b[0m \u001b[39m%\u001b[39m (array\u001b[39m.\u001b[39mndim, estimator_name)\n\u001b[1;32m 918\u001b[0m )\n\u001b[1;32m 920\u001b[0m \u001b[39mif\u001b[39;00m force_all_finite:\n\u001b[0;32m--> 921\u001b[0m _assert_all_finite(\n\u001b[1;32m 922\u001b[0m array,\n\u001b[1;32m 923\u001b[0m input_name\u001b[39m=\u001b[39;49minput_name,\n\u001b[1;32m 924\u001b[0m estimator_name\u001b[39m=\u001b[39;49mestimator_name,\n\u001b[1;32m 925\u001b[0m allow_nan\u001b[39m=\u001b[39;49mforce_all_finite \u001b[39m==\u001b[39;49m \u001b[39m\"\u001b[39;49m\u001b[39mallow-nan\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m 926\u001b[0m )\n\u001b[1;32m 928\u001b[0m \u001b[39mif\u001b[39;00m ensure_min_samples \u001b[39m>\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[1;32m 929\u001b[0m n_samples \u001b[39m=\u001b[39m _num_samples(array)\n",
+ "File \u001b[0;32m~/mambaforge/envs/jumpORF/lib/python3.9/site-packages/sklearn/utils/validation.py:161\u001b[0m, in \u001b[0;36m_assert_all_finite\u001b[0;34m(X, allow_nan, msg_dtype, estimator_name, input_name)\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[39mif\u001b[39;00m estimator_name \u001b[39mand\u001b[39;00m input_name \u001b[39m==\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mX\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mand\u001b[39;00m has_nan_error:\n\u001b[1;32m 145\u001b[0m \u001b[39m# Improve the error message on how to handle missing values in\u001b[39;00m\n\u001b[1;32m 146\u001b[0m \u001b[39m# scikit-learn.\u001b[39;00m\n\u001b[1;32m 147\u001b[0m msg_err \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m (\n\u001b[1;32m 148\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m{\u001b[39;00mestimator_name\u001b[39m}\u001b[39;00m\u001b[39m does not accept missing values\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 149\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m encoded as NaN natively. For supervised learning, you might want\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m#estimators-that-handle-nan-values\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 160\u001b[0m )\n\u001b[0;32m--> 161\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(msg_err)\n",
+ "\u001b[0;31mValueError\u001b[0m: Input contains NaN."
+ ]
+ }
+ ],
+ "source": [
+ "# this one does not quite work yet!\n",
+ "import utilitary\n",
+ "\n",
+ "# get replicability - setup\n",
+ "replicability_ap_df = pd.DataFrame()\n",
+ "matching_ap_df = pd.DataFrame()\n",
+ "\n",
+ "#add metadata_control_type column\n",
+ "all_plates_df = df_selected.copy()\n",
+ "all_plates_df['Metadata_control_type'] = all_plates_df['Metadata_control_type'].fillna('')\n",
+ "# all_plates_df['Metadata_control_type'] = ''\n",
+ "# cmpd = all_plates_df['Metadata_Compound'].values\n",
+ "# ctrl = all_plates_df['Metadata_control_type'].values\n",
+ "# for vals in range(len(cmpd)):\n",
+ "# if cmpd[vals] == \"DMSO\":\n",
+ "# ctrl[vals] = 'negcon'\n",
+ "\n",
+ "\n",
+ "feature_to_group_by = 'Metadata_Symbol'\n",
+ "# Description\n",
+ "description = f'compound plate'\n",
+ "\n",
+ "# Calculate replicability mAP\n",
+ "print(f'Computing {description} replicability...')\n",
+ "precision = utilitary.PrecisionScores(all_plates_df, all_plates_df, feature_to_group_by, \"replicability\", feature_to_group_by, within=True, against_negcon=True)\n",
+ "\n",
+ "replicability_ap_df = precision.ap_group\n",
+ "replicability_map = precision.map\n",
+ "\n",
+ "replicability_ap_df.head()\n",
+ "# Construct a random baseline\n",
+ "\n",
+ "# Filter the dataframe to only include ORFs that have > 95 percentile of the random baseline (aka, <5% chance of seeing that mAP or something more extreme under the null hypothesis that replicability is random)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Collapse the dataframe within genes\n",
+ "\n",
+ "* Median collapse into 1 row per gene (most genes have 5 replicate ORFs) --> data goes down to ~12600 rows\n",
+ "* Metadata_Symbol is the gene name\n",
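+ "* Note that the controls include BFP, HcRed, Luciferase, and LacZ (eGFP has been excluded, although the metadata sheet still lists it as a control)"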
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Parameters (to be moved to the top of the notebook)\n",
+ "aggregation_type =\"median\"\n",
+ "\n",
+ "\n",
+ "#which control types do you want to include? \n",
+ "controltypes_orf = ['negcon', 'poscon']\n",
+ "controltypes = ['negcon', 'poscon_cp', 'poscon_orf', 'poscon_diverse']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#filter to gene of interest\n",
+ "df_subset_orf = df.loc[df['Metadata_Symbol'].isin(gene_list)].reset_index(drop=True)\n",
+ "\n",
+ "# get controls \n",
+ "df_subset_orf_con = df.loc[df['Metadata_control_type'].isin(controltypes_orf)].reset_index(drop=True)\n",
+ "\n",
+ "#get target 2 data\n",
+ "df_subset_t2 = df_t2.loc[df_t2['Metadata_control_type'].isin(controltypes)].reset_index(drop=True)\n",
+ "df_subset_t2['Metadata_broad_sample'] = df_subset_t2['Metadata_broad_sample'].fillna('empty')\n",
+ "\n",
+ "# aggregate\n",
+ "if aggregation_type == \"mean\":\n",
+ " df_subset_orf = df_subset_orf.groupby('Metadata_Symbol').mean(numeric_only=True).reset_index(drop=True)\n",
+ " df_subset_orf_con = df_subset_orf_con.groupby(['Metadata_control_type','Metadata_broad_sample']).mean(numeric_only=True).reset_index(drop=True)\n",
+ " df_subset_t2 = df_subset_t2.groupby(['Metadata_broad_sample','Metadata_control_type']).mean(numeric_only=True).reset_index(drop=True)\n",
+ "\n",
+ "elif aggregation_type == \"median\":\n",
+ " df_subset_orf = df_subset_orf.groupby('Metadata_Symbol').median().reset_index(drop=True)\n",
+ " df_subset_orf_con = df_subset_orf_con.groupby(['Metadata_control_type','Metadata_broad_sample']).median().reset_index(drop=True)\n",
+ " df_subset_t2 = df_subset_t2.groupby(['Metadata_broad_sample','Metadata_control_type']).median().reset_index(drop=True)\n",
+ "\n",
+ "df_subset_orf['Metadata_data_source'] = 'ORF'\n",
+ "df_subset_orf_con['Metadata_data_source'] = 'ORF'\n",
+ "df_subset_t2['Metadata_data_source'] = 'T2'\n",
+ "\n",
+ "\n",
+ "#merge the separate subsets together\n",
+ "df_subset = pd.concat([df_subset_orf,df_subset_orf_con,df_subset_t2], ignore_index=True)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## for all genes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/0y/5d1shp9n6kq856jtm9t10mcw0000gq/T/ipykernel_42906/2054350032.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.median is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.\n",
+ " df_subset_orf = df.groupby('Metadata_Symbol').median().reset_index(drop=True)\n",
+ "/var/folders/0y/5d1shp9n6kq856jtm9t10mcw0000gq/T/ipykernel_42906/2054350032.py:12: FutureWarning: The default value of numeric_only in DataFrameGroupBy.median is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.\n",
+ " df_subset_orf_con = df.groupby(['Metadata_control_type','Metadata_broad_sample']).median().reset_index(drop=True)\n",
+ "/var/folders/0y/5d1shp9n6kq856jtm9t10mcw0000gq/T/ipykernel_42906/2054350032.py:13: FutureWarning: The default value of numeric_only in DataFrameGroupBy.median is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.\n",
+ " df_subset_t2 = df_t2.groupby(['Metadata_broad_sample','Metadata_control_type']).median().reset_index(drop=True)\n"
+ ]
+ }
+ ],
+ "source": [
+ "#fill nas in Metadata_broad_sample column to keep untreated negcons\n",
+ "df_t2['Metadata_broad_sample'] = df_t2['Metadata_broad_sample'].fillna('empty')\n",
+ "\n",
+ "# aggregate\n",
+ "if aggregation_type == \"mean\":\n",
+ " df_subset_orf = df.groupby('Metadata_Symbol').mean(numeric_only=True).reset_index(drop=True)\n",
+ " df_subset_orf_con = df.groupby(['Metadata_control_type','Metadata_broad_sample']).mean(numeric_only=True).reset_index(drop=True)\n",
+ " df_subset_t2 = df_t2.groupby(['Metadata_broad_sample','Metadata_control_type']).mean(numeric_only=True).reset_index(drop=True)\n",
+ "\n",
+ "elif aggregation_type == \"median\":\n",
+ " df_subset_orf = df.groupby('Metadata_Symbol').median(numeric_only=True).reset_index(drop=True)\n",
+ " df_subset_orf_con = df.groupby(['Metadata_control_type','Metadata_broad_sample']).median(numeric_only=True).reset_index(drop=True)\n",
+ " df_subset_t2 = df_t2.groupby(['Metadata_broad_sample','Metadata_control_type']).median(numeric_only=True).reset_index(drop=True)\n",
+ "\n",
+ "df_subset_orf['Metadata_data_source'] = 'ORF'\n",
+ "df_subset_orf_con['Metadata_data_source'] = 'ORF'\n",
+ "df_subset_t2['Metadata_data_source'] = 'T2'\n",
+ "\n",
+ "\n",
+ "#merge the separate subsets together\n",
+ "df_collapsed = pd.concat([df_subset_orf,df_subset_orf_con,df_subset_t2], ignore_index=True)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Put metadata back in the dataframe"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pycytominer\n",
+ "\n",
+ "metadata_column_list = ['Metadata_Symbol',\n",
+ " 'Metadata_control_type', \n",
+ " 'Metadata_broad_sample',\n",
+ " 'Metadata_plate_map_name',\n",
+ " 'Metadata_Plate', \n",
+ " 'Metadata_Name', \n",
+ " 'Metadata_Vector',\n",
+ " 'Metadata_Transcript', \n",
+ " 'Metadata_NCBI Gene ID', \n",
+ " 'Metadata_Taxon ID',\n",
+ " 'Metadata_Gene Description',\n",
+ " 'Metadata_Annot. Gene Symbol',\n",
+ " 'Metadata_Annot. Gene ID',\n",
+ " 'Metadata_Prot Match %',\n",
+ " 'Metadata_MOI',\n",
+ " 'Metadata_Virus / ml',\n",
+ " 'Metadata_Insert Length',\n",
+ " 'Metadata_pert_type',]\n",
+ "#aggregate ORF\n",
+ "df_ORF_aggregated = pycytominer.aggregate(df, \n",
+ " strata=metadata_column_list,\n",
+ " features=\"infer\",\n",
+ " operation=\"mean\",\n",
+ " output_file=\"none\",\n",
+ " compute_object_count=False,\n",
+ " object_feature=\"Metadata_ObjectNumber\",\n",
+ " subset_data_df=\"none\",\n",
+ " compression_options=None,\n",
+ " float_format=None,)\n",
+ "\n",
+ "#aggregate t2 plates from ORF batches\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Metadata_Well',\n",
+ " 'Metadata_Site_Count',\n",
+ " 'Metadata_Count_Cells',\n",
+ " 'Metadata_Count_CellsIncludingEdges',\n",
+ " 'Metadata_Count_Cytoplasm',\n",
+ " 'Metadata_Count_Nuclei',\n",
+ " 'Metadata_Count_NucleiIncludingEdges',\n",
+ " 'Metadata_Object_Count',\n",
+ " 'Image_Granularity_10_AGP',\n",
+ " 'Image_Granularity_10_BFHigh',\n",
+ " 'Image_Granularity_10_BFLow',\n",
+ " 'Image_Granularity_10_Brightfield',\n",
+ " 'Image_Granularity_10_DNA',\n",
+ " 'Image_Granularity_10_ER',\n",
+ " 'Image_Granularity_10_Mito',\n",
+ " 'Image_Granularity_10_RNA',\n",
+ " 'Image_Granularity_11_AGP',\n",
+ " 'Image_Granularity_11_BFHigh',\n",
+ " 'Image_Granularity_11_BFLow',\n",
+ " 'Image_Granularity_11_Brightfield',\n",
+ " 'Image_Granularity_11_DNA',\n",
+ " 'Image_Granularity_11_ER',\n",
+ " 'Image_Granularity_11_Mito',\n",
+ " 'Image_Granularity_11_RNA',\n",
+ " 'Image_Granularity_12_AGP',\n",
+ " 'Image_Granularity_12_BFHigh',\n",
+ " 'Image_Granularity_12_BFLow',\n",
+ " 'Image_Granularity_12_Brightfield',\n",
+ " 'Image_Granularity_12_ER',\n",
+ " 'Image_Granularity_12_Mito',\n",
+ " 'Image_Granularity_12_RNA',\n",
+ " 'Image_Granularity_13_AGP',\n",
+ " 'Image_Granularity_13_BFHigh',\n",
+ " 'Image_Granularity_13_BFLow',\n",
+ " 'Image_Granularity_13_Brightfield',\n",
+ " 'Image_Granularity_13_ER',\n",
+ " 'Image_Granularity_13_Mito',\n",
+ " 'Image_Granularity_13_RNA',\n",
+ " 'Image_Granularity_14_AGP',\n",
+ " 'Image_Granularity_14_BFHigh',\n",
+ " 'Image_Granularity_14_BFLow',\n",
+ " 'Image_Granularity_14_Brightfield',\n",
+ " 'Image_Granularity_14_ER',\n",
+ " 'Image_Granularity_14_Mito',\n",
+ " 'Image_Granularity_14_RNA',\n",
+ " 'Image_Granularity_15_AGP',\n",
+ " 'Image_Granularity_15_BFHigh',\n",
+ " 'Image_Granularity_15_BFLow',\n",
+ " 'Image_Granularity_15_Brightfield',\n",
+ " 'Image_Granularity_15_DNA',\n",
+ " 'Image_Granularity_15_ER',\n",
+ " 'Image_Granularity_15_Mito',\n",
+ " 'Image_Granularity_15_RNA',\n",
+ " 'Image_Granularity_16_AGP',\n",
+ " 'Image_Granularity_16_BFHigh',\n",
+ " 'Image_Granularity_16_BFLow',\n",
+ " 'Image_Granularity_16_Brightfield',\n",
+ " 'Image_Granularity_16_DNA',\n",
+ " 'Image_Granularity_16_ER',\n",
+ " 'Image_Granularity_16_Mito',\n",
+ " 'Image_Granularity_16_RNA',\n",
+ " 'Image_Granularity_1_BFHigh',\n",
+ " 'Image_Granularity_1_BFLow',\n",
+ " 'Image_Granularity_1_Brightfield',\n",
+ " 'Image_Granularity_1_DNA',\n",
+ " 'Image_Granularity_1_Mito',\n",
+ " 'Image_Granularity_1_RNA',\n",
+ " 'Image_Granularity_2_AGP',\n",
+ " 'Image_Granularity_2_BFHigh',\n",
+ " 'Image_Granularity_2_BFLow',\n",
+ " 'Image_Granularity_2_Brightfield',\n",
+ " 'Image_Granularity_2_DNA',\n",
+ " 'Image_Granularity_2_ER',\n",
+ " 'Image_Granularity_2_Mito',\n",
+ " 'Image_Granularity_2_RNA',\n",
+ " 'Image_Granularity_3_AGP',\n",
+ " 'Image_Granularity_3_BFHigh',\n",
+ " 'Image_Granularity_3_BFLow',\n",
+ " 'Image_Granularity_3_Brightfield',\n",
+ " 'Image_Granularity_3_ER',\n",
+ " 'Image_Granularity_3_Mito',\n",
+ " 'Image_Granularity_3_RNA',\n",
+ " 'Image_Granularity_4_AGP',\n",
+ " 'Image_Granularity_4_BFHigh',\n",
+ " 'Image_Granularity_4_BFLow',\n",
+ " 'Image_Granularity_4_Brightfield',\n",
+ " 'Image_Granularity_4_DNA',\n",
+ " 'Image_Granularity_4_ER',\n",
+ " 'Image_Granularity_4_Mito',\n",
+ " 'Image_Granularity_4_RNA',\n",
+ " 'Image_Granularity_5_AGP',\n",
+ " 'Image_Granularity_5_BFHigh',\n",
+ " 'Image_Granularity_5_BFLow',\n",
+ " 'Image_Granularity_5_Brightfield',\n",
+ " 'Image_Granularity_5_DNA',\n",
+ " 'Image_Granularity_5_ER',\n",
+ " 'Image_Granularity_5_Mito',\n",
+ " 'Image_Granularity_5_RNA',\n",
+ " 'Image_Granularity_6_AGP',\n",
+ " 'Image_Granularity_6_BFHigh',\n",
+ " 'Image_Granularity_6_BFLow',\n",
+ " 'Image_Granularity_6_Brightfield',\n",
+ " 'Image_Granularity_6_DNA',\n",
+ " 'Image_Granularity_6_ER',\n",
+ " 'Image_Granularity_6_Mito',\n",
+ " 'Image_Granularity_6_RNA',\n",
+ " 'Image_Granularity_7_AGP',\n",
+ " 'Image_Granularity_7_BFHigh',\n",
+ " 'Image_Granularity_7_BFLow',\n",
+ " 'Image_Granularity_7_Brightfield',\n",
+ " 'Image_Granularity_7_DNA',\n",
+ " 'Image_Granularity_7_ER',\n",
+ " 'Image_Granularity_7_Mito',\n",
+ " 'Image_Granularity_7_RNA',\n",
+ " 'Image_Granularity_8_AGP',\n",
+ " 'Image_Granularity_8_BFHigh',\n",
+ " 'Image_Granularity_8_BFLow',\n",
+ " 'Image_Granularity_8_Brightfield',\n",
+ " 'Image_Granularity_8_DNA',\n",
+ " 'Image_Granularity_8_ER',\n",
+ " 'Image_Granularity_8_Mito',\n",
+ " 'Image_Granularity_8_RNA',\n",
+ " 'Image_Granularity_9_AGP',\n",
+ " 'Image_Granularity_9_BFHigh',\n",
+ " 'Image_Granularity_9_BFLow',\n",
+ " 'Image_Granularity_9_Brightfield',\n",
+ " 'Image_Granularity_9_DNA',\n",
+ " 'Image_Granularity_9_ER',\n",
+ " 'Image_Granularity_9_Mito',\n",
+ " 'Image_Granularity_9_RNA',\n",
+ " 'Image_ImageQuality_Correlation_OrigAGP_5',\n",
+ " 'Image_ImageQuality_Correlation_OrigAGP_50',\n",
+ " 'Image_ImageQuality_Correlation_OrigBrightfield_50',\n",
+ " 'Image_ImageQuality_Correlation_OrigBrightfield_H_50',\n",
+ " 'Image_ImageQuality_Correlation_OrigBrightfield_L_50',\n",
+ " 'Image_ImageQuality_Correlation_OrigDNA_20',\n",
+ " 'Image_ImageQuality_Correlation_OrigDNA_5',\n",
+ " 'Image_ImageQuality_Correlation_OrigDNA_50',\n",
+ " 'Image_ImageQuality_Correlation_OrigER_10',\n",
+ " 'Image_ImageQuality_Correlation_OrigER_5',\n",
+ " 'Image_ImageQuality_Correlation_OrigER_50',\n",
+ " 'Image_ImageQuality_Correlation_OrigMito_5',\n",
+ " 'Image_ImageQuality_Correlation_OrigMito_50',\n",
+ " 'Image_ImageQuality_Correlation_OrigRNA_10',\n",
+ " 'Image_ImageQuality_Correlation_OrigRNA_20',\n",
+ " 'Image_ImageQuality_Correlation_OrigRNA_5',\n",
+ " 'Image_ImageQuality_FocusScore_OrigAGP',\n",
+ " 'Image_ImageQuality_FocusScore_OrigBrightfield',\n",
+ " 'Image_ImageQuality_FocusScore_OrigDNA',\n",
+ " 'Image_ImageQuality_FocusScore_OrigER',\n",
+ " 'Image_ImageQuality_FocusScore_OrigMito',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigAGP_10',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigAGP_20',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigAGP_5',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigAGP_50',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_20',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_50',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_H_10',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_H_20',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_H_5',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_H_50',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_L_10',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_L_20',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_L_5',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_L_50',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigDNA_10',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigDNA_20',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigDNA_5',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigDNA_50',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigER_20',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigMito_10',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigMito_20',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigMito_5',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigMito_50',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigRNA_50',\n",
+ " 'Image_ImageQuality_MADIntensity_OrigDNA',\n",
+ " 'Image_ImageQuality_MaxIntensity_OrigAGP',\n",
+ " 'Image_ImageQuality_MaxIntensity_OrigBrightfield_H',\n",
+ " 'Image_ImageQuality_MaxIntensity_OrigDNA',\n",
+ " 'Image_ImageQuality_MaxIntensity_OrigER',\n",
+ " 'Image_ImageQuality_MaxIntensity_OrigMito',\n",
+ " 'Image_ImageQuality_MaxIntensity_OrigRNA',\n",
+ " 'Image_ImageQuality_MinIntensity_OrigBrightfield',\n",
+ " 'Image_ImageQuality_MinIntensity_OrigBrightfield_H',\n",
+ " 'Image_ImageQuality_MinIntensity_OrigBrightfield_L',\n",
+ " 'Image_ImageQuality_MinIntensity_OrigER',\n",
+ " 'Image_ImageQuality_MinIntensity_OrigMito',\n",
+ " 'Image_ImageQuality_MinIntensity_OrigRNA',\n",
+ " 'Image_ImageQuality_PowerLogLogSlope_OrigAGP',\n",
+ " 'Image_ImageQuality_PowerLogLogSlope_OrigBrightfield',\n",
+ " 'Image_ImageQuality_PowerLogLogSlope_OrigBrightfield_H',\n",
+ " 'Image_ImageQuality_PowerLogLogSlope_OrigBrightfield_L',\n",
+ " 'Image_ImageQuality_PowerLogLogSlope_OrigDNA',\n",
+ " 'Image_ImageQuality_PowerLogLogSlope_OrigER',\n",
+ " 'Image_ImageQuality_PowerLogLogSlope_OrigMito',\n",
+ " 'Image_ImageQuality_PowerLogLogSlope_OrigRNA',\n",
+ " 'Image_ImageQuality_StdIntensity_OrigAGP',\n",
+ " 'Image_ImageQuality_StdIntensity_OrigBrightfield',\n",
+ " 'Image_ImageQuality_ThresholdOtsu_OrigDNA_2W',\n",
+ " 'Image_ImageQuality_TotalIntensity_OrigDNA',\n",
+ " 'Image_Intensity_LowerQuartileIntensity_AGP',\n",
+ " 'Image_Intensity_LowerQuartileIntensity_AGP__BackgroundOnly',\n",
+ " 'Image_Intensity_MADIntensity_AGP',\n",
+ " 'Image_Intensity_MADIntensity_AGP__BackgroundOnly',\n",
+ " 'Image_Intensity_MADIntensity_Brightfield_BackgroundOnly',\n",
+ " 'Image_Intensity_MADIntensity_DNA',\n",
+ " 'Image_Intensity_MADIntensity_DNA_BackgroundOnly',\n",
+ " 'Image_Intensity_MADIntensity_ER__BackgroundOnly',\n",
+ " 'Image_Intensity_MADIntensity_Mito',\n",
+ " 'Image_Intensity_MADIntensity_RNA',\n",
+ " 'Image_Intensity_MaxIntensity_AGP',\n",
+ " 'Image_Intensity_MaxIntensity_AGP__BackgroundOnly',\n",
+ " 'Image_Intensity_MaxIntensity_BFHigh__BackgroundOnly',\n",
+ " 'Image_Intensity_MaxIntensity_BFLow',\n",
+ " 'Image_Intensity_MaxIntensity_BFLow_BackgroundOnly',\n",
+ " 'Image_Intensity_MaxIntensity_Brightfield',\n",
+ " 'Image_Intensity_MaxIntensity_Brightfield_BackgroundOnly',\n",
+ " 'Image_Intensity_MaxIntensity_DNA',\n",
+ " 'Image_Intensity_MaxIntensity_DNA_BackgroundOnly',\n",
+ " 'Image_Intensity_MaxIntensity_ER',\n",
+ " 'Image_Intensity_MaxIntensity_ER__BackgroundOnly',\n",
+ " 'Image_Intensity_MaxIntensity_Mito',\n",
+ " 'Image_Intensity_MaxIntensity_Mito_BackgroundOnly',\n",
+ " 'Image_Intensity_MaxIntensity_RNA',\n",
+ " 'Image_Intensity_MaxIntensity_RNA_BackgroundOnly',\n",
+ " 'Image_Intensity_MedianIntensity_AGP',\n",
+ " 'Image_Intensity_MedianIntensity_AGP__BackgroundOnly',\n",
+ " 'Image_Intensity_MedianIntensity_DNA_BackgroundOnly',\n",
+ " 'Image_Intensity_MedianIntensity_ER__BackgroundOnly',\n",
+ " 'Image_Intensity_MinIntensity_AGP',\n",
+ " 'Image_Intensity_MinIntensity_BFHigh__BackgroundOnly',\n",
+ " 'Image_Intensity_MinIntensity_BFLow_BackgroundOnly',\n",
+ " 'Image_Intensity_MinIntensity_Brightfield_BackgroundOnly',\n",
+ " 'Image_Intensity_MinIntensity_DNA',\n",
+ " 'Image_Intensity_MinIntensity_Mito_BackgroundOnly',\n",
+ " 'Image_Intensity_PercentMaximal_AGP__BackgroundOnly',\n",
+ " 'Image_Intensity_PercentMaximal_DNA_BackgroundOnly',\n",
+ " 'Image_Intensity_PercentMaximal_ER__BackgroundOnly',\n",
+ " 'Image_Intensity_PercentMaximal_Mito_BackgroundOnly',\n",
+ " 'Image_Intensity_PercentMaximal_RNA_BackgroundOnly',\n",
+ " 'Image_Intensity_StdIntensity_AGP__BackgroundOnly',\n",
+ " 'Image_Intensity_StdIntensity_Brightfield_BackgroundOnly',\n",
+ " 'Image_Intensity_StdIntensity_DNA_BackgroundOnly',\n",
+ " 'Image_Intensity_StdIntensity_ER__BackgroundOnly',\n",
+ " 'Image_Intensity_StdIntensity_RNA_BackgroundOnly',\n",
+ " 'Image_Intensity_TotalIntensity_AGP__BackgroundOnly',\n",
+ " 'Image_Intensity_TotalIntensity_BFLow_BackgroundOnly',\n",
+ " 'Image_Intensity_TotalIntensity_DNA_BackgroundOnly',\n",
+ " 'Image_Intensity_TotalIntensity_RNA_BackgroundOnly',\n",
+ " 'Image_Intensity_UpperQuartileIntensity_AGP__BackgroundOnly',\n",
+ " 'Image_Intensity_UpperQuartileIntensity_DNA',\n",
+ " 'Image_Intensity_UpperQuartileIntensity_DNA_BackgroundOnly',\n",
+ " 'Image_Texture_AngularSecondMoment_AGP_3_02_256',\n",
+ " 'Image_Texture_AngularSecondMoment_BFLow_10_01_256',\n",
+ " 'Image_Texture_AngularSecondMoment_Brightfield_3_02_256',\n",
+ " 'Image_Texture_AngularSecondMoment_DNA_3_00_256',\n",
+ " 'Image_Texture_AngularSecondMoment_ER_5_02_256',\n",
+ " 'Image_Texture_AngularSecondMoment_Mito_10_01_256',\n",
+ " 'Image_Texture_AngularSecondMoment_RNA_10_01_256',\n",
+ " 'Image_Texture_Contrast_AGP_10_02_256',\n",
+ " 'Image_Texture_Contrast_BFHigh_10_01_256',\n",
+ " 'Image_Texture_Contrast_BFLow_10_01_256',\n",
+ " 'Image_Texture_Contrast_Brightfield_10_00_256',\n",
+ " 'Image_Texture_Contrast_DNA_3_02_256',\n",
+ " 'Image_Texture_Contrast_ER_3_03_256',\n",
+ " 'Image_Texture_Contrast_Mito_3_00_256',\n",
+ " 'Image_Texture_Correlation_AGP_10_01_256',\n",
+ " 'Image_Texture_Correlation_BFLow_10_02_256',\n",
+ " 'Image_Texture_Correlation_Brightfield_10_01_256',\n",
+ " 'Image_Texture_Correlation_DNA_10_00_256',\n",
+ " 'Image_Texture_Correlation_DNA_10_01_256',\n",
+ " 'Image_Texture_Correlation_DNA_10_02_256',\n",
+ " 'Image_Texture_Correlation_DNA_10_03_256',\n",
+ " 'Image_Texture_Correlation_DNA_5_00_256',\n",
+ " 'Image_Texture_Correlation_DNA_5_01_256',\n",
+ " 'Image_Texture_Correlation_DNA_5_03_256',\n",
+ " 'Image_Texture_Correlation_ER_10_02_256',\n",
+ " 'Image_Texture_Correlation_ER_5_00_256',\n",
+ " 'Image_Texture_Correlation_Mito_3_02_256',\n",
+ " 'Image_Texture_Correlation_RNA_10_03_256',\n",
+ " 'Image_Texture_Correlation_RNA_5_03_256',\n",
+ " 'Image_Texture_DifferenceEntropy_AGP_3_02_256',\n",
+ " 'Image_Texture_DifferenceEntropy_BFHigh_3_00_256',\n",
+ " 'Image_Texture_DifferenceEntropy_BFLow_3_02_256',\n",
+ " 'Image_Texture_DifferenceEntropy_Brightfield_10_03_256',\n",
+ " 'Image_Texture_DifferenceEntropy_DNA_10_01_256',\n",
+ " 'Image_Texture_DifferenceEntropy_Mito_10_01_256',\n",
+ " 'Image_Texture_DifferenceVariance_AGP_3_01_256',\n",
+ " 'Image_Texture_DifferenceVariance_BFHigh_3_00_256',\n",
+ " 'Image_Texture_DifferenceVariance_BFLow_3_02_256',\n",
+ " 'Image_Texture_DifferenceVariance_Brightfield_3_00_256',\n",
+ " 'Image_Texture_DifferenceVariance_DNA_3_02_256',\n",
+ " 'Image_Texture_DifferenceVariance_Mito_10_03_256',\n",
+ " 'Image_Texture_Entropy_BFLow_3_00_256',\n",
+ " 'Image_Texture_InfoMeas1_AGP_10_03_256',\n",
+ " 'Image_Texture_InfoMeas1_BFLow_10_03_256',\n",
+ " 'Image_Texture_InfoMeas1_Brightfield_10_03_256',\n",
+ " 'Image_Texture_InfoMeas1_DNA_10_03_256',\n",
+ " 'Image_Texture_InfoMeas1_DNA_3_01_256',\n",
+ " 'Image_Texture_InfoMeas1_DNA_5_01_256',\n",
+ " 'Image_Texture_InfoMeas1_ER_5_00_256',\n",
+ " 'Image_Texture_InfoMeas1_Mito_10_03_256',\n",
+ " 'Image_Texture_InfoMeas1_Mito_3_02_256',\n",
+ " 'Image_Texture_InfoMeas2_AGP_10_03_256',\n",
+ " 'Image_Texture_InfoMeas2_DNA_10_01_256',\n",
+ " 'Image_Texture_InfoMeas2_DNA_10_02_256',\n",
+ " 'Image_Texture_InfoMeas2_ER_3_03_256',\n",
+ " 'Image_Texture_InfoMeas2_ER_5_01_256',\n",
+ " 'Image_Texture_InfoMeas2_Mito_10_01_256',\n",
+ " 'Image_Texture_InfoMeas2_Mito_10_02_256',\n",
+ " 'Image_Texture_InfoMeas2_Mito_10_03_256',\n",
+ " 'Image_Texture_InfoMeas2_Mito_3_02_256',\n",
+ " 'Image_Texture_InfoMeas2_Mito_5_01_256',\n",
+ " 'Image_Texture_InverseDifferenceMoment_AGP_3_02_256',\n",
+ " 'Image_Texture_InverseDifferenceMoment_BFHigh_3_00_256',\n",
+ " 'Image_Texture_InverseDifferenceMoment_BFLow_3_00_256',\n",
+ " 'Image_Texture_InverseDifferenceMoment_Brightfield_3_02_256',\n",
+ " 'Image_Texture_InverseDifferenceMoment_DNA_10_03_256',\n",
+ " 'Image_Texture_InverseDifferenceMoment_ER_10_01_256',\n",
+ " 'Image_Texture_InverseDifferenceMoment_Mito_10_01_256',\n",
+ " 'Image_Texture_InverseDifferenceMoment_RNA_3_00_256',\n",
+ " 'Image_Texture_SumEntropy_DNA_3_02_256',\n",
+ " 'Image_Texture_SumVariance_AGP_10_03_256',\n",
+ " 'Image_Texture_SumVariance_Brightfield_5_02_256',\n",
+ " 'Image_Texture_SumVariance_DNA_10_03_256',\n",
+ " 'Image_Texture_SumVariance_ER_10_03_256',\n",
+ " 'Image_Texture_SumVariance_Mito_10_01_256',\n",
+ " 'Image_Texture_SumVariance_RNA_10_01_256',\n",
+ " 'Image_Threshold_FinalThreshold_NucleiIncludingEdges',\n",
+ " 'Image_Threshold_FinalThreshold_mito_bw',\n",
+ " 'Image_Threshold_SumOfEntropies_CellsIncludingEdges',\n",
+ " 'Image_Threshold_SumOfEntropies_NucleiIncludingEdges',\n",
+ " 'Image_Threshold_SumOfEntropies_mito_bw',\n",
+ " 'Image_Threshold_WeightedVariance_CellsIncludingEdges',\n",
+ " 'Image_Threshold_WeightedVariance_NucleiIncludingEdges',\n",
+ " 'Image_Threshold_WeightedVariance_mito_bw']"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "[c for c in df.columns if c not in df_ORF_aggregated.columns]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_collapsed.to_csv(f\"JUMP_ORF_{aggregation_type}_collapsed.csv\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "jumpORF",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.15"
+ },
+ "orig_nbformat": 4,
+ "vscode": {
+ "interpreter": {
+ "hash": "2f3fec36f6be95d788e5f03a928d042624f9ad08087c2484cba824ceb7727375"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/profiles/2021_08_23_Batch12/BR00126734/BR00126734.csv.gz b/profiles/2021_08_23_Batch12/BR00126734/BR00126734.csv.gz
new file mode 100644
index 0000000..ff2e511
--- /dev/null
+++ b/profiles/2021_08_23_Batch12/BR00126734/BR00126734.csv.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2709e051d13811dd9ee0e5b45439ffb77320f3947468334c5e2908a718478e7d
+size 9166733
diff --git a/profiles/2021_08_23_Batch12/BR00126734/BR00126734_augmented.csv.gz b/profiles/2021_08_23_Batch12/BR00126734/BR00126734_augmented.csv.gz
new file mode 100644
index 0000000..91aae60
--- /dev/null
+++ b/profiles/2021_08_23_Batch12/BR00126734/BR00126734_augmented.csv.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f542a3db7824bb8b91f85c6b24994e287de789404874092ef8ba474cfec94e7d
+size 9190214
diff --git a/profiles/2021_08_23_Batch12/BR00126734/BR00126734_normalized.csv.gz b/profiles/2021_08_23_Batch12/BR00126734/BR00126734_normalized.csv.gz
new file mode 100644
index 0000000..cf8fb33
--- /dev/null
+++ b/profiles/2021_08_23_Batch12/BR00126734/BR00126734_normalized.csv.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2eb0867ec9cc7ab69ee884bc147417330a7c6d8a3fdf7e860a7eacd814127df5
+size 9172996
diff --git a/profiles/2021_08_23_Batch12/BR00126734/BR00126734_normalized_feature_select_batch.csv.gz b/profiles/2021_08_23_Batch12/BR00126734/BR00126734_normalized_feature_select_batch.csv.gz
new file mode 100644
index 0000000..66cf263
--- /dev/null
+++ b/profiles/2021_08_23_Batch12/BR00126734/BR00126734_normalized_feature_select_batch.csv.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:242b6133b468abef12384a7b011ee4724313d7c581b332d4ac57c1abfa1ae337
+size 1341789
diff --git a/profiles/2021_08_23_Batch12/BR00126734/BR00126734_normalized_feature_select_negcon_batch.csv.gz b/profiles/2021_08_23_Batch12/BR00126734/BR00126734_normalized_feature_select_negcon_batch.csv.gz
new file mode 100644
index 0000000..7a069ed
--- /dev/null
+++ b/profiles/2021_08_23_Batch12/BR00126734/BR00126734_normalized_feature_select_negcon_batch.csv.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4672b909ca417854ea4ce2da336d21ec053bf3d65d65bcd5f5f31e569c7f270c
+size 1393809
diff --git a/profiles/2021_08_23_Batch12/BR00126735/BR00126735.csv.gz b/profiles/2021_08_23_Batch12/BR00126735/BR00126735.csv.gz
new file mode 100644
index 0000000..816ef0a
--- /dev/null
+++ b/profiles/2021_08_23_Batch12/BR00126735/BR00126735.csv.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9009d701c18e6514578dca973d4c1088985e84b54a77e27ccc537b910044cad0
+size 9105873
diff --git a/profiles/2021_08_23_Batch12/BR00126735/BR00126735_augmented.csv.gz b/profiles/2021_08_23_Batch12/BR00126735/BR00126735_augmented.csv.gz
new file mode 100644
index 0000000..9f714e5
--- /dev/null
+++ b/profiles/2021_08_23_Batch12/BR00126735/BR00126735_augmented.csv.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3d7dab958ae5ba793549cda0338fa93b862fe881c9b6d18ddae6e9b4c9d09aef
+size 9129375
diff --git a/profiles/2021_08_23_Batch12/BR00126735/BR00126735_normalized.csv.gz b/profiles/2021_08_23_Batch12/BR00126735/BR00126735_normalized.csv.gz
new file mode 100644
index 0000000..51f2760
--- /dev/null
+++ b/profiles/2021_08_23_Batch12/BR00126735/BR00126735_normalized.csv.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:68e0eab8f9afb68df9c4d8851673f62a0ed535a7f27dc16134da6651fe4fa812
+size 9160469
diff --git a/profiles/2021_08_23_Batch12/BR00126735/BR00126735_normalized_feature_select_batch.csv.gz b/profiles/2021_08_23_Batch12/BR00126735/BR00126735_normalized_feature_select_batch.csv.gz
new file mode 100644
index 0000000..5d7f401
--- /dev/null
+++ b/profiles/2021_08_23_Batch12/BR00126735/BR00126735_normalized_feature_select_batch.csv.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae107a9cd5631a0a3c955d678d769466e22b3c6b8088ff584bb5d81b3a45a655
+size 1342048
diff --git a/profiles/2021_08_23_Batch12/BR00126735/BR00126735_normalized_feature_select_negcon_batch.csv.gz b/profiles/2021_08_23_Batch12/BR00126735/BR00126735_normalized_feature_select_negcon_batch.csv.gz
new file mode 100644
index 0000000..f012d5a
--- /dev/null
+++ b/profiles/2021_08_23_Batch12/BR00126735/BR00126735_normalized_feature_select_negcon_batch.csv.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e4fcc20ff0782d8d463182a99cccc232a4abb744c4bd710e857803859debaeea
+size 1393066
diff --git a/profiles/2021_08_30_Batch13/BR00126634/BR00126634.csv.gz b/profiles/2021_08_30_Batch13/BR00126634/BR00126634.csv.gz
new file mode 100644
index 0000000..87114ad
--- /dev/null
+++ b/profiles/2021_08_30_Batch13/BR00126634/BR00126634.csv.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a765ee64c2e41059962b3f9075c2c9d3159bd486d6313b2a89a11c5ef0f4fdb
+size 9378298
diff --git a/profiles/2021_08_30_Batch13/BR00126634/BR00126634_augmented.csv.gz b/profiles/2021_08_30_Batch13/BR00126634/BR00126634_augmented.csv.gz
new file mode 100644
index 0000000..c3c3e46
--- /dev/null
+++ b/profiles/2021_08_30_Batch13/BR00126634/BR00126634_augmented.csv.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:600d7c1ae26ce2f4e3848f8fd20a3f79c55e760c9cc9e765fe85f340a7b6fab6
+size 9401762
diff --git a/profiles/2021_08_30_Batch13/BR00126634/BR00126634_normalized.csv.gz b/profiles/2021_08_30_Batch13/BR00126634/BR00126634_normalized.csv.gz
new file mode 100644
index 0000000..f258e69
--- /dev/null
+++ b/profiles/2021_08_30_Batch13/BR00126634/BR00126634_normalized.csv.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:91553357b047baf0c69da3e64c17cc241787e11bb57b13b7af6a343638301924
+size 9154704
diff --git a/profiles/2021_08_30_Batch13/BR00126634/BR00126634_normalized_feature_select_batch.csv.gz b/profiles/2021_08_30_Batch13/BR00126634/BR00126634_normalized_feature_select_batch.csv.gz
new file mode 100644
index 0000000..196e5bd
--- /dev/null
+++ b/profiles/2021_08_30_Batch13/BR00126634/BR00126634_normalized_feature_select_batch.csv.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e9bd63683e6a14cc254263067c5224af05e6481106cb3ba451a60e1e0357459d
+size 1259547
diff --git a/profiles/2021_08_30_Batch13/BR00126634/BR00126634_normalized_feature_select_negcon_batch.csv.gz b/profiles/2021_08_30_Batch13/BR00126634/BR00126634_normalized_feature_select_negcon_batch.csv.gz
new file mode 100644
index 0000000..78931c6
--- /dev/null
+++ b/profiles/2021_08_30_Batch13/BR00126634/BR00126634_normalized_feature_select_negcon_batch.csv.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:58e1dcc6bcba5d00bcd2bd956bd94b1f3fc3ed85208e15e0f4f8d6976dfe82d0
+size 1324069
diff --git a/profiles/2021_08_30_Batch13/BR00126641/BR00126641.csv.gz b/profiles/2021_08_30_Batch13/BR00126641/BR00126641.csv.gz
new file mode 100644
index 0000000..f39259b
--- /dev/null
+++ b/profiles/2021_08_30_Batch13/BR00126641/BR00126641.csv.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d48470709e7561a57e4580e19669f3f5eaa7f4321eaa9e991a530f3a00289da
+size 9366855
diff --git a/profiles/2021_08_30_Batch13/BR00126641/BR00126641_augmented.csv.gz b/profiles/2021_08_30_Batch13/BR00126641/BR00126641_augmented.csv.gz
new file mode 100644
index 0000000..d9f6ba6
--- /dev/null
+++ b/profiles/2021_08_30_Batch13/BR00126641/BR00126641_augmented.csv.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0436ea8ab1ae84b49798a9bcb25e68f40d9981dc6ffbecce680aa754b3b3a0c
+size 9390463
diff --git a/profiles/2021_08_30_Batch13/BR00126641/BR00126641_normalized.csv.gz b/profiles/2021_08_30_Batch13/BR00126641/BR00126641_normalized.csv.gz
new file mode 100644
index 0000000..d502c3b
--- /dev/null
+++ b/profiles/2021_08_30_Batch13/BR00126641/BR00126641_normalized.csv.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d804371c17d17c272a838ac2a48c5c29c6e1029a28f1c843dbdc07e29d8e9b4
+size 9160620
diff --git a/profiles/2021_08_30_Batch13/BR00126641/BR00126641_normalized_feature_select_batch.csv.gz b/profiles/2021_08_30_Batch13/BR00126641/BR00126641_normalized_feature_select_batch.csv.gz
new file mode 100644
index 0000000..07e5ec9
--- /dev/null
+++ b/profiles/2021_08_30_Batch13/BR00126641/BR00126641_normalized_feature_select_batch.csv.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:43300808d91fcdebb52606f502b7479ea6bd3a7258140e731b7cad9feeb2e526
+size 1259936
diff --git a/profiles/2021_08_30_Batch13/BR00126641/BR00126641_normalized_feature_select_negcon_batch.csv.gz b/profiles/2021_08_30_Batch13/BR00126641/BR00126641_normalized_feature_select_negcon_batch.csv.gz
new file mode 100644
index 0000000..aab491e
--- /dev/null
+++ b/profiles/2021_08_30_Batch13/BR00126641/BR00126641_normalized_feature_select_negcon_batch.csv.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:62b05872e0bdeb03c7e3184f953af9226fb5e79d33c3bb5cead1ed06e14a01b4
+size 1326259
From c2e3b3469ffa909e46b87a2a49e83f31cfcbddd3 Mon Sep 17 00:00:00 2001
From: rsenft1 <28116530+rsenft1@users.noreply.github.com>
Date: Fri, 3 Mar 2023 10:11:02 -0500
Subject: [PATCH 2/3] Revert "add analysis notebook"
This reverts commit faf1d21e86cd988d5c69c55eee973b781775c122.
---
.../01_jumpORF_create_collapsed_df.ipynb | 1133 -----------------
.../BR00126734/BR00126734.csv.gz | 3 -
.../BR00126734/BR00126734_augmented.csv.gz | 3 -
.../BR00126734/BR00126734_normalized.csv.gz | 3 -
...734_normalized_feature_select_batch.csv.gz | 3 -
...malized_feature_select_negcon_batch.csv.gz | 3 -
.../BR00126735/BR00126735.csv.gz | 3 -
.../BR00126735/BR00126735_augmented.csv.gz | 3 -
.../BR00126735/BR00126735_normalized.csv.gz | 3 -
...735_normalized_feature_select_batch.csv.gz | 3 -
...malized_feature_select_negcon_batch.csv.gz | 3 -
.../BR00126634/BR00126634.csv.gz | 3 -
.../BR00126634/BR00126634_augmented.csv.gz | 3 -
.../BR00126634/BR00126634_normalized.csv.gz | 3 -
...634_normalized_feature_select_batch.csv.gz | 3 -
...malized_feature_select_negcon_batch.csv.gz | 3 -
.../BR00126641/BR00126641.csv.gz | 3 -
.../BR00126641/BR00126641_augmented.csv.gz | 3 -
.../BR00126641/BR00126641_normalized.csv.gz | 3 -
...641_normalized_feature_select_batch.csv.gz | 3 -
...malized_feature_select_negcon_batch.csv.gz | 3 -
21 files changed, 1193 deletions(-)
delete mode 100644 analysis_notebook/01_jumpORF_create_collapsed_df.ipynb
delete mode 100644 profiles/2021_08_23_Batch12/BR00126734/BR00126734.csv.gz
delete mode 100644 profiles/2021_08_23_Batch12/BR00126734/BR00126734_augmented.csv.gz
delete mode 100644 profiles/2021_08_23_Batch12/BR00126734/BR00126734_normalized.csv.gz
delete mode 100644 profiles/2021_08_23_Batch12/BR00126734/BR00126734_normalized_feature_select_batch.csv.gz
delete mode 100644 profiles/2021_08_23_Batch12/BR00126734/BR00126734_normalized_feature_select_negcon_batch.csv.gz
delete mode 100644 profiles/2021_08_23_Batch12/BR00126735/BR00126735.csv.gz
delete mode 100644 profiles/2021_08_23_Batch12/BR00126735/BR00126735_augmented.csv.gz
delete mode 100644 profiles/2021_08_23_Batch12/BR00126735/BR00126735_normalized.csv.gz
delete mode 100644 profiles/2021_08_23_Batch12/BR00126735/BR00126735_normalized_feature_select_batch.csv.gz
delete mode 100644 profiles/2021_08_23_Batch12/BR00126735/BR00126735_normalized_feature_select_negcon_batch.csv.gz
delete mode 100644 profiles/2021_08_30_Batch13/BR00126634/BR00126634.csv.gz
delete mode 100644 profiles/2021_08_30_Batch13/BR00126634/BR00126634_augmented.csv.gz
delete mode 100644 profiles/2021_08_30_Batch13/BR00126634/BR00126634_normalized.csv.gz
delete mode 100644 profiles/2021_08_30_Batch13/BR00126634/BR00126634_normalized_feature_select_batch.csv.gz
delete mode 100644 profiles/2021_08_30_Batch13/BR00126634/BR00126634_normalized_feature_select_negcon_batch.csv.gz
delete mode 100644 profiles/2021_08_30_Batch13/BR00126641/BR00126641.csv.gz
delete mode 100644 profiles/2021_08_30_Batch13/BR00126641/BR00126641_augmented.csv.gz
delete mode 100644 profiles/2021_08_30_Batch13/BR00126641/BR00126641_normalized.csv.gz
delete mode 100644 profiles/2021_08_30_Batch13/BR00126641/BR00126641_normalized_feature_select_batch.csv.gz
delete mode 100644 profiles/2021_08_30_Batch13/BR00126641/BR00126641_normalized_feature_select_negcon_batch.csv.gz
diff --git a/analysis_notebook/01_jumpORF_create_collapsed_df.ipynb b/analysis_notebook/01_jumpORF_create_collapsed_df.ipynb
deleted file mode 100644
index 18c689a..0000000
--- a/analysis_notebook/01_jumpORF_create_collapsed_df.ipynb
+++ /dev/null
@@ -1,1133 +0,0 @@
-{
- "cells": [
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# JUMP ORF data analysis notebook\n",
- "\n",
- "## Set up environment:\n",
- " \n",
- "1. Create a new environment for this project: `mamba create --name jumpORF python=3.9`\n",
- "\n",
- "2. Activate that environment `conda activate jumpORF` (note, I'm not sure why but I can't `mamba activate` even after `mamba init` but this seems to work)\n",
- "\n",
- "3. Install dependencies: \n",
- "* `mamba install -c conda-forge dvc-s3` (more instructions here: https://dvc.org/doc/install/macos, specifically you need dvc with aws s3 since this is where the profiles are stored)\n",
- "\n",
- "## Get data on your local machine:\n",
- "\n",
- "1. Download the data repo: `git clone https://github.com/jump-cellpainting/jump-orf-data.git` (I use GitHub Desktop for this!)\n",
- "\n",
- "2. Download the metadata repo: `git clone https://github.com/jump-cellpainting/datasets.git`\n",
- "\n",
- "3. Pull the files in dvc down to your local computer. In terminal, in the folder where you've cloned `jump-orf-data`: `dvc pull` _(note this step can take a while)_\n",
- "\n",
- "6. Select the `jumpORF` environment for the kernel for this notebook (upper right of notebook in VScode) or otherwise ensure the jumpORF environment is activated \n",
- " * _note that if you have the notebook open while you make the environment, you may need to restart VScode to see the updated list of environments_\n",
- "\n",
- "\n",
- "## to do...\n",
- "\n",
- "2. Use new metadata here as source for finding plates/batches/etc: https://github.com/jump-cellpainting/datasets/tree/main/metadata \n",
- "3. Controls include BFP, HcRed, Luciferase, LacZ (but we have excluded eGFP, though it is still showing as a control in the metadata sheet)\n"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Read in all JUMP ORF data\n",
- "\n",
- "* Grab the paths to all the profiles from the different batches. \n",
- "* Read them into one dataframe (~13000 genes x ~ 5 replicates and a varying # of features depending on whether feature-selected (~1300) or the full data (~5900) profiles are used)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "* What do we want to provide? Just the collapsed data? Or the non-collapsed version as well? \n",
- "* Is there enough of a reason that we want people to have access to the precollapsed version? \n",
- "* Perhaps do not save out these large csvs! Get through cleaning to collapsed data then save that out. \n",
- "* concat, collapse, clean as separate function "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "# read data\n",
- "import os\n",
- "import pandas as pd\n",
- "\n",
- "\n",
- "#get paths to files using the most truthful metadata\n",
- "topfolder = \"../profiles\"\n",
- "metadata_table = pd.read_csv(\"../../jump-datasets/metadata/plate.csv\",index_col=False)\n",
- "metadata_table_ORF = metadata_table[(metadata_table.Metadata_PlateType == \"ORF\")]\n",
- "\n",
- "batch_list = metadata_table_ORF[(metadata_table_ORF.Metadata_PlateType == \"ORF\")].Metadata_Batch.unique()\n",
- "\n",
- "batch_list_2 = metadata_table_ORF.loc[metadata_table_ORF[\"Metadata_PlateType\"] == \"ORF\"][\"Metadata_Batch\"].unique()\n",
- "\n",
- "\n",
- "filesuffix=\"_normalized_feature_select_negcon_all.csv.gz\"\n",
- "filepaths = [os.path.join(topfolder, metadata_table_ORF.Metadata_Batch.values[row], metadata_table_ORF.Metadata_Plate.values[row],metadata_table_ORF.Metadata_Plate.values[row]+filesuffix) for row in range(len(metadata_table_ORF))]\n",
- "\n",
- "#only look at files that exist\n",
- "filepaths = [f for f in filepaths if os.path.exists(f)]\n",
- "\n",
- "#read in\n",
- "df = pd.concat(map(lambda file: pd.read_csv(file, index_col=False,), filepaths))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Get target 2 plates (normalized) and filter the features down to those that are in the df we already have\n",
- "\n",
- "metadata_table_target2 = metadata_table.loc[(metadata_table[\"Metadata_Batch\"].isin(batch_list)) & (metadata_table[\"Metadata_PlateType\"]==\"TARGET2\")]\n",
- "\n",
- "filesuffix=\"_normalized_negcon.csv\" # can also do .csv.gz files\n",
- "filepaths = [os.path.join(topfolder, metadata_table_target2.Metadata_Batch.values[row], metadata_table_target2.Metadata_Plate.values[row],metadata_table_target2.Metadata_Plate.values[row]+filesuffix) for row in range(len(metadata_table_target2))]\n",
- "\n",
- "#only look at files that exist\n",
- "filepaths = [f for f in filepaths if os.path.exists(f)]\n",
- "\n",
- "#read in\n",
- "df_t2 = pd.concat(map(lambda file: pd.read_csv(file, index_col=False,), filepaths))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Filter the target2 df to only columns that exist in the df\n",
- "\n",
- "t2_metadata_col = [x for x in df_t2.columns if \"Metadata\" in x]\n",
- "df_col = list(df.columns)\n",
- "\n",
- "\n",
- "cols2Keep = list(set(df_col+t2_metadata_col))\n",
- "cols2Keep = [x for x in cols2Keep if x in list(df_t2.columns)]\n",
- "\n",
- "df_t2 = df_t2[cols2Keep]"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Filter out ORFs that don't replicate\n",
- "From Alex: \n",
- "\n",
- "Calculate mAP for replicability for each perturbation and filter out those below the random baseline.\n",
- "The long answer involves the fact that we’ve recently changed what “below random baseline” means. Before, we suggested to subtract mAP of randomly ranked profiles (as suggested in “3.4.2 Computation of the exact random AP value” of my draft on mAP). But recently, we decided that we will consider not mean of random baseline APs, but 95th percentile, such that we can construct a significance test and report p-value instead. Let me know if you want to know more about this, I will also talk about it in special topics on Thursday!"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Metadata_plate_map_name | \n",
- " Metadata_broad_sample | \n",
- " Metadata_Plate | \n",
- " Metadata_Well | \n",
- " Metadata_Site_Count | \n",
- " Metadata_Count_Cells | \n",
- " Metadata_Count_CellsIncludingEdges | \n",
- " Metadata_Count_Cytoplasm | \n",
- " Metadata_Count_Nuclei | \n",
- " Metadata_Count_NucleiIncludingEdges | \n",
- " ... | \n",
- " Nuclei_Texture_InfoMeas2_Brightfield_3_02_256 | \n",
- " Nuclei_Texture_InfoMeas2_Mito_3_02_256 | \n",
- " Nuclei_Texture_InverseDifferenceMoment_AGP_3_02_256 | \n",
- " Nuclei_Texture_InverseDifferenceMoment_DNA_3_02_256 | \n",
- " Nuclei_Texture_InverseDifferenceMoment_Mito_10_03_256 | \n",
- " Nuclei_Texture_SumVariance_AGP_10_03_256 | \n",
- " Nuclei_Texture_SumVariance_BFHigh_3_03_256 | \n",
- " Nuclei_Texture_SumVariance_BFLow_3_00_256 | \n",
- " Nuclei_Texture_SumVariance_Brightfield_3_03_256 | \n",
- " Nuclei_Texture_SumVariance_ER_10_01_256 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " OAA01.02.03.04.A | \n",
- " ccsbBroad304_05979 | \n",
- " BR00117035 | \n",
- " A01 | \n",
- " 9 | \n",
- " 845 | \n",
- " 970 | \n",
- " 845 | \n",
- " 845 | \n",
- " 970 | \n",
- " ... | \n",
- " -0.087364 | \n",
- " -0.97526 | \n",
- " 2.72760 | \n",
- " 6.7332 | \n",
- " 3.3709 | \n",
- " -1.63900 | \n",
- " 1.53450 | \n",
- " 0.690930 | \n",
- " 1.1301 | \n",
- " -2.7376 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " OAA01.02.03.04.A | \n",
- " ccsbBroad304_13129 | \n",
- " BR00117035 | \n",
- " A02 | \n",
- " 9 | \n",
- " 873 | \n",
- " 988 | \n",
- " 873 | \n",
- " 873 | \n",
- " 988 | \n",
- " ... | \n",
- " 0.198090 | \n",
- " -1.85970 | \n",
- " 1.19280 | \n",
- " 5.3221 | \n",
- " 2.9869 | \n",
- " -1.13510 | \n",
- " 6.72200 | \n",
- " 2.194400 | \n",
- " 2.5605 | \n",
- " -2.2424 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " OAA01.02.03.04.A | \n",
- " ccsbBroad304_00289 | \n",
- " BR00117035 | \n",
- " A03 | \n",
- " 9 | \n",
- " 889 | \n",
- " 989 | \n",
- " 889 | \n",
- " 889 | \n",
- " 989 | \n",
- " ... | \n",
- " 0.619550 | \n",
- " -2.28570 | \n",
- " 0.54443 | \n",
- " 3.2157 | \n",
- " 2.8953 | \n",
- " 0.21932 | \n",
- " 4.92190 | \n",
- " 2.838000 | \n",
- " 4.3082 | \n",
- " -1.5694 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " OAA01.02.03.04.A | \n",
- " ccsbBroad304_99988 | \n",
- " BR00117035 | \n",
- " A04 | \n",
- " 9 | \n",
- " 898 | \n",
- " 995 | \n",
- " 898 | \n",
- " 898 | \n",
- " 995 | \n",
- " ... | \n",
- " -0.230090 | \n",
- " -0.87500 | \n",
- " 0.46420 | \n",
- " 2.5104 | \n",
- " 2.3601 | \n",
- " -0.57261 | \n",
- " 0.37809 | \n",
- " 0.672830 | \n",
- " 1.5367 | \n",
- " -1.3152 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " OAA01.02.03.04.A | \n",
- " ccsbBroad304_07679 | \n",
- " BR00117035 | \n",
- " A05 | \n",
- " 9 | \n",
- " 876 | \n",
- " 982 | \n",
- " 876 | \n",
- " 876 | \n",
- " 982 | \n",
- " ... | \n",
- " -1.605600 | \n",
- " -1.79160 | \n",
- " 0.64160 | \n",
- " 2.5965 | \n",
- " 3.3969 | \n",
- " -0.80713 | \n",
- " -0.97239 | \n",
- " 0.032586 | \n",
- " 1.3970 | \n",
- " -2.0045 | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 1478 columns
\n",
- "
"
- ],
- "text/plain": [
- " Metadata_plate_map_name Metadata_broad_sample Metadata_Plate Metadata_Well \\\n",
- "0 OAA01.02.03.04.A ccsbBroad304_05979 BR00117035 A01 \n",
- "1 OAA01.02.03.04.A ccsbBroad304_13129 BR00117035 A02 \n",
- "2 OAA01.02.03.04.A ccsbBroad304_00289 BR00117035 A03 \n",
- "3 OAA01.02.03.04.A ccsbBroad304_99988 BR00117035 A04 \n",
- "4 OAA01.02.03.04.A ccsbBroad304_07679 BR00117035 A05 \n",
- "\n",
- " Metadata_Site_Count Metadata_Count_Cells \\\n",
- "0 9 845 \n",
- "1 9 873 \n",
- "2 9 889 \n",
- "3 9 898 \n",
- "4 9 876 \n",
- "\n",
- " Metadata_Count_CellsIncludingEdges Metadata_Count_Cytoplasm \\\n",
- "0 970 845 \n",
- "1 988 873 \n",
- "2 989 889 \n",
- "3 995 898 \n",
- "4 982 876 \n",
- "\n",
- " Metadata_Count_Nuclei Metadata_Count_NucleiIncludingEdges ... \\\n",
- "0 845 970 ... \n",
- "1 873 988 ... \n",
- "2 889 989 ... \n",
- "3 898 995 ... \n",
- "4 876 982 ... \n",
- "\n",
- " Nuclei_Texture_InfoMeas2_Brightfield_3_02_256 \\\n",
- "0 -0.087364 \n",
- "1 0.198090 \n",
- "2 0.619550 \n",
- "3 -0.230090 \n",
- "4 -1.605600 \n",
- "\n",
- " Nuclei_Texture_InfoMeas2_Mito_3_02_256 \\\n",
- "0 -0.97526 \n",
- "1 -1.85970 \n",
- "2 -2.28570 \n",
- "3 -0.87500 \n",
- "4 -1.79160 \n",
- "\n",
- " Nuclei_Texture_InverseDifferenceMoment_AGP_3_02_256 \\\n",
- "0 2.72760 \n",
- "1 1.19280 \n",
- "2 0.54443 \n",
- "3 0.46420 \n",
- "4 0.64160 \n",
- "\n",
- " Nuclei_Texture_InverseDifferenceMoment_DNA_3_02_256 \\\n",
- "0 6.7332 \n",
- "1 5.3221 \n",
- "2 3.2157 \n",
- "3 2.5104 \n",
- "4 2.5965 \n",
- "\n",
- " Nuclei_Texture_InverseDifferenceMoment_Mito_10_03_256 \\\n",
- "0 3.3709 \n",
- "1 2.9869 \n",
- "2 2.8953 \n",
- "3 2.3601 \n",
- "4 3.3969 \n",
- "\n",
- " Nuclei_Texture_SumVariance_AGP_10_03_256 \\\n",
- "0 -1.63900 \n",
- "1 -1.13510 \n",
- "2 0.21932 \n",
- "3 -0.57261 \n",
- "4 -0.80713 \n",
- "\n",
- " Nuclei_Texture_SumVariance_BFHigh_3_03_256 \\\n",
- "0 1.53450 \n",
- "1 6.72200 \n",
- "2 4.92190 \n",
- "3 0.37809 \n",
- "4 -0.97239 \n",
- "\n",
- " Nuclei_Texture_SumVariance_BFLow_3_00_256 \\\n",
- "0 0.690930 \n",
- "1 2.194400 \n",
- "2 2.838000 \n",
- "3 0.672830 \n",
- "4 0.032586 \n",
- "\n",
- " Nuclei_Texture_SumVariance_Brightfield_3_03_256 \\\n",
- "0 1.1301 \n",
- "1 2.5605 \n",
- "2 4.3082 \n",
- "3 1.5367 \n",
- "4 1.3970 \n",
- "\n",
- " Nuclei_Texture_SumVariance_ER_10_01_256 \n",
- "0 -2.7376 \n",
- "1 -2.2424 \n",
- "2 -1.5694 \n",
- "3 -1.3152 \n",
- "4 -2.0045 \n",
- "\n",
- "[5 rows x 1478 columns]"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "How many columns were dropped? 19\n"
- ]
- }
- ],
- "source": [
- "# feature-select the data\n",
- "\n",
- "import pycytominer\n",
- "df_selected = pycytominer.feature_select(df, operation = ['correlation_threshold', 'variance_threshold', 'drop_na_columns', 'blocklist','drop_outliers'], outlier_cutoff = 500)\n",
- "print('How many columns were dropped?',df.shape[1] - df_selected.shape[1])\n",
- "df_final = df_selected.loc[:,~df_selected.columns.duplicated()].copy()\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "df_final.to_parquet(f\"JUMP_ORF_all.parquet\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [],
- "source": [
- "df_parquet = pd.read_parquet(f\"JUMP_ORF_all.parquet\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['Metadata_broad_sample',\n",
- " 'Metadata_Name',\n",
- " 'Metadata_Vector',\n",
- " 'Metadata_Transcript',\n",
- " 'Metadata_Symbol',\n",
- " 'Metadata_NCBI Gene ID',\n",
- " 'Metadata_Taxon ID',\n",
- " 'Metadata_Gene Description',\n",
- " 'Metadata_Annot. Gene Symbol',\n",
- " 'Metadata_Annot. Gene ID',\n",
- " 'Metadata_Prot Match %',\n",
- " 'Metadata_MOI',\n",
- " 'Metadata_Virus / ml',\n",
- " 'Metadata_Insert Length',\n",
- " 'Metadata_pert_type',\n",
- " 'Metadata_control_type',\n",
- " 'Cells_AreaShape_FormFactor']"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# identify columns with NaN values\n",
- "[col for col in df_selected.columns if df[col].isnull().values.any()]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [],
- "source": [
- "# remove measurement column with NaNs\n",
- "df_selected.drop(columns='Cells_AreaShape_FormFactor', inplace=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Computing compound plate replicability...\n"
- ]
- },
- {
- "ename": "ValueError",
- "evalue": "Input contains NaN.",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
- "Cell \u001b[0;32mIn[5], line 24\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[39m# Calculate replicability mAP\u001b[39;00m\n\u001b[1;32m 23\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39mf\u001b[39m\u001b[39m'\u001b[39m\u001b[39mComputing \u001b[39m\u001b[39m{\u001b[39;00mdescription\u001b[39m}\u001b[39;00m\u001b[39m replicability...\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[0;32m---> 24\u001b[0m precision \u001b[39m=\u001b[39m utilitary\u001b[39m.\u001b[39;49mPrecisionScores(all_plates_df, all_plates_df, feature_to_group_by, \u001b[39m\"\u001b[39;49m\u001b[39mreplicability\u001b[39;49m\u001b[39m\"\u001b[39;49m, feature_to_group_by, within\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m, against_negcon\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\n\u001b[1;32m 26\u001b[0m replicability_ap_df \u001b[39m=\u001b[39m precision\u001b[39m.\u001b[39map_group\n\u001b[1;32m 27\u001b[0m replicability_map \u001b[39m=\u001b[39m precision\u001b[39m.\u001b[39mmap\n",
- "File \u001b[0;32m~/Documents/GitHub/jump-orf-data/analysis_notebook/utilitary.py:248\u001b[0m, in \u001b[0;36mPrecisionScores.__init__\u001b[0;34m(self, profile1, profile2, group_by_feature, mode, identify_perturbation_feature, within, anti_correlation, against_negcon)\u001b[0m\n\u001b[1;32m 245\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmap1 \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprofile1[[\u001b[39mself\u001b[39m\u001b[39m.\u001b[39midentify_perturbation_feature, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfeature, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msample_id_feature, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcontrol_type_feature]]\u001b[39m.\u001b[39mcopy()\n\u001b[1;32m 246\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmap2 \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprofile2[[\u001b[39mself\u001b[39m\u001b[39m.\u001b[39midentify_perturbation_feature, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfeature, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msample_id_feature, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcontrol_type_feature]]\u001b[39m.\u001b[39mcopy()\n\u001b[0;32m--> 248\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcorr \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mcompute_correlation()\n\u001b[1;32m 249\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtruth_matrix \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcreate_truth_matrix()\n\u001b[1;32m 250\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcleanup()\n",
- "File \u001b[0;32m~/Documents/GitHub/jump-orf-data/analysis_notebook/utilitary.py:292\u001b[0m, in \u001b[0;36mPrecisionScores.compute_correlation\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 290\u001b[0m _sample_names_1 \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprofile1[\u001b[39mself\u001b[39m\u001b[39m.\u001b[39msample_id_feature])\n\u001b[1;32m 291\u001b[0m _sample_names_2 \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprofile2[\u001b[39mself\u001b[39m\u001b[39m.\u001b[39msample_id_feature])\n\u001b[0;32m--> 292\u001b[0m _corr \u001b[39m=\u001b[39m cosine_similarity(_profile1, _profile2)\n\u001b[1;32m 293\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39manti_correlation:\n\u001b[1;32m 294\u001b[0m _corr \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39mabs(_corr)\n",
- "File \u001b[0;32m~/mambaforge/envs/jumpORF/lib/python3.9/site-packages/sklearn/metrics/pairwise.py:1393\u001b[0m, in \u001b[0;36mcosine_similarity\u001b[0;34m(X, Y, dense_output)\u001b[0m\n\u001b[1;32m 1358\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"Compute cosine similarity between samples in X and Y.\u001b[39;00m\n\u001b[1;32m 1359\u001b[0m \n\u001b[1;32m 1360\u001b[0m \u001b[39mCosine similarity, or the cosine kernel, computes similarity as the\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1389\u001b[0m \u001b[39m Returns the cosine similarity between samples in X and Y.\u001b[39;00m\n\u001b[1;32m 1390\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 1391\u001b[0m \u001b[39m# to avoid recursive import\u001b[39;00m\n\u001b[0;32m-> 1393\u001b[0m X, Y \u001b[39m=\u001b[39m check_pairwise_arrays(X, Y)\n\u001b[1;32m 1395\u001b[0m X_normalized \u001b[39m=\u001b[39m normalize(X, copy\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[1;32m 1396\u001b[0m \u001b[39mif\u001b[39;00m X \u001b[39mis\u001b[39;00m Y:\n",
- "File \u001b[0;32m~/mambaforge/envs/jumpORF/lib/python3.9/site-packages/sklearn/metrics/pairwise.py:155\u001b[0m, in \u001b[0;36mcheck_pairwise_arrays\u001b[0;34m(X, Y, precomputed, dtype, accept_sparse, force_all_finite, copy)\u001b[0m\n\u001b[1;32m 146\u001b[0m X \u001b[39m=\u001b[39m Y \u001b[39m=\u001b[39m check_array(\n\u001b[1;32m 147\u001b[0m X,\n\u001b[1;32m 148\u001b[0m accept_sparse\u001b[39m=\u001b[39maccept_sparse,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 152\u001b[0m estimator\u001b[39m=\u001b[39mestimator,\n\u001b[1;32m 153\u001b[0m )\n\u001b[1;32m 154\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 155\u001b[0m X \u001b[39m=\u001b[39m check_array(\n\u001b[1;32m 156\u001b[0m X,\n\u001b[1;32m 157\u001b[0m accept_sparse\u001b[39m=\u001b[39;49maccept_sparse,\n\u001b[1;32m 158\u001b[0m dtype\u001b[39m=\u001b[39;49mdtype,\n\u001b[1;32m 159\u001b[0m copy\u001b[39m=\u001b[39;49mcopy,\n\u001b[1;32m 160\u001b[0m force_all_finite\u001b[39m=\u001b[39;49mforce_all_finite,\n\u001b[1;32m 161\u001b[0m estimator\u001b[39m=\u001b[39;49mestimator,\n\u001b[1;32m 162\u001b[0m )\n\u001b[1;32m 163\u001b[0m Y \u001b[39m=\u001b[39m check_array(\n\u001b[1;32m 164\u001b[0m Y,\n\u001b[1;32m 165\u001b[0m accept_sparse\u001b[39m=\u001b[39maccept_sparse,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 169\u001b[0m estimator\u001b[39m=\u001b[39mestimator,\n\u001b[1;32m 170\u001b[0m )\n\u001b[1;32m 172\u001b[0m \u001b[39mif\u001b[39;00m precomputed:\n",
- "File \u001b[0;32m~/mambaforge/envs/jumpORF/lib/python3.9/site-packages/sklearn/utils/validation.py:921\u001b[0m, in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[1;32m 915\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 916\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mFound array with dim \u001b[39m\u001b[39m%d\u001b[39;00m\u001b[39m. \u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m expected <= 2.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 917\u001b[0m \u001b[39m%\u001b[39m (array\u001b[39m.\u001b[39mndim, estimator_name)\n\u001b[1;32m 918\u001b[0m )\n\u001b[1;32m 920\u001b[0m \u001b[39mif\u001b[39;00m force_all_finite:\n\u001b[0;32m--> 921\u001b[0m _assert_all_finite(\n\u001b[1;32m 922\u001b[0m array,\n\u001b[1;32m 923\u001b[0m input_name\u001b[39m=\u001b[39;49minput_name,\n\u001b[1;32m 924\u001b[0m estimator_name\u001b[39m=\u001b[39;49mestimator_name,\n\u001b[1;32m 925\u001b[0m allow_nan\u001b[39m=\u001b[39;49mforce_all_finite \u001b[39m==\u001b[39;49m \u001b[39m\"\u001b[39;49m\u001b[39mallow-nan\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m 926\u001b[0m )\n\u001b[1;32m 928\u001b[0m \u001b[39mif\u001b[39;00m ensure_min_samples \u001b[39m>\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[1;32m 929\u001b[0m n_samples \u001b[39m=\u001b[39m _num_samples(array)\n",
- "File \u001b[0;32m~/mambaforge/envs/jumpORF/lib/python3.9/site-packages/sklearn/utils/validation.py:161\u001b[0m, in \u001b[0;36m_assert_all_finite\u001b[0;34m(X, allow_nan, msg_dtype, estimator_name, input_name)\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[39mif\u001b[39;00m estimator_name \u001b[39mand\u001b[39;00m input_name \u001b[39m==\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mX\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mand\u001b[39;00m has_nan_error:\n\u001b[1;32m 145\u001b[0m \u001b[39m# Improve the error message on how to handle missing values in\u001b[39;00m\n\u001b[1;32m 146\u001b[0m \u001b[39m# scikit-learn.\u001b[39;00m\n\u001b[1;32m 147\u001b[0m msg_err \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m (\n\u001b[1;32m 148\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m{\u001b[39;00mestimator_name\u001b[39m}\u001b[39;00m\u001b[39m does not accept missing values\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 149\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m encoded as NaN natively. For supervised learning, you might want\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m#estimators-that-handle-nan-values\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 160\u001b[0m )\n\u001b[0;32m--> 161\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(msg_err)\n",
- "\u001b[0;31mValueError\u001b[0m: Input contains NaN."
- ]
- }
- ],
- "source": [
- "# this one does not quite work yet!\n",
- "import utilitary\n",
- "\n",
- "# get replicability - setup\n",
- "replicability_ap_df = pd.DataFrame()\n",
- "matching_ap_df = pd.DataFrame()\n",
- "\n",
- "#add metadata_control_type column\n",
- "all_plates_df = df_selected.copy()\n",
- "all_plates_df['Metadata_control_type'] = all_plates_df['Metadata_control_type'].fillna('')\n",
- "# all_plates_df['Metadata_control_type'] = ''\n",
- "# cmpd = all_plates_df['Metadata_Compound'].values\n",
- "# ctrl = all_plates_df['Metadata_control_type'].values\n",
- "# for vals in range(len(cmpd)):\n",
- "# if cmpd[vals] == \"DMSO\":\n",
- "# ctrl[vals] = 'negcon'\n",
- "\n",
- "\n",
- "feature_to_group_by = 'Metadata_Symbol'\n",
- "# Description\n",
- "description = f'compound plate'\n",
- "\n",
- "# Calculate replicability mAP\n",
- "print(f'Computing {description} replicability...')\n",
- "precision = utilitary.PrecisionScores(all_plates_df, all_plates_df, feature_to_group_by, \"replicability\", feature_to_group_by, within=True, against_negcon=True)\n",
- "\n",
- "replicability_ap_df = precision.ap_group\n",
- "replicability_map = precision.map\n",
- "\n",
- "replicability_ap_df.head()\n",
- "# Construct a random baseline\n",
- "\n",
- "# Filter the dataframe to only include ORFs that have > 95 percentile of the random baseline (aka, <5% chance of seeing that mAP or something more extreme under the null hypothesis that replicability is random)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Collapse the dataframe within genes\n",
- "\n",
- "* Median collapse into 1 row per gene (most genes have 5 replicate ORFs) --> data goes down to ~12600 rows\n",
- "* Metadata_Symbol is the gene name\n",
- "* Note that the controls include "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Parameters (to be moved to the top of the notebook)\n",
- "aggregation_type =\"median\"\n",
- "\n",
- "\n",
- "#which control types do you want to include? \n",
- "controltypes_orf = ['negcon', 'poscon']\n",
- "controltypes = ['negcon', 'poscon_cp', 'poscon_orf', 'poscon_diverse']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [],
- "source": [
- "#filter to gene of interest\n",
- "df_subset_orf = df.loc[df['Metadata_Symbol'].isin(gene_list)].reset_index(drop=True)\n",
- "\n",
- "# get controls \n",
- "df_subset_orf_con = df.loc[df['Metadata_control_type'].isin(controltypes_orf)].reset_index(drop=True)\n",
- "\n",
- "#get target 2 data\n",
- "df_subset_t2 = df_t2.loc[df_t2['Metadata_control_type'].isin(controltypes)].reset_index(drop=True)\n",
- "df_subset_t2['Metadata_broad_sample'] = df_subset_t2['Metadata_broad_sample'].fillna('empty')\n",
- "\n",
- "# aggregate\n",
- "if aggregation_type == \"mean\":\n",
- " df_subset_orf = df_subset_orf.groupby('Metadata_Symbol').mean(numeric_only=True).reset_index(drop=True)\n",
- " df_subset_orf_con = df_subset_orf_con.groupby(['Metadata_control_type','Metadata_broad_sample']).mean(numeric_only=True).reset_index(drop=True)\n",
- " df_subset_t2 = df_subset_t2.groupby(['Metadata_broad_sample','Metadata_control_type']).mean(numeric_only=True).reset_index(drop=True)\n",
- "\n",
- "elif aggregation_type == \"median\":\n",
- " df_subset_orf = df_subset_orf.groupby('Metadata_Symbol').median().reset_index(drop=True)\n",
- " df_subset_orf_con = df_subset_orf_con.groupby(['Metadata_control_type','Metadata_broad_sample']).median().reset_index(drop=True)\n",
- " df_subset_t2 = df_subset_t2.groupby(['Metadata_broad_sample','Metadata_control_type']).median().reset_index(drop=True)\n",
- "\n",
- "df_subset_orf['Metadata_data_source'] = 'ORF'\n",
- "df_subset_orf_con['Metadata_data_source'] = 'ORF'\n",
- "df_subset_t2['Metadata_data_source'] = 'T2'\n",
- "\n",
- "\n",
- "#merge the separate subsets together\n",
- "df_subset = pd.concat([df_subset_orf,df_subset_orf_con,df_subset_t2], ignore_index=True)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## for all genes"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/var/folders/0y/5d1shp9n6kq856jtm9t10mcw0000gq/T/ipykernel_42906/2054350032.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.median is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.\n",
- " df_subset_orf = df.groupby('Metadata_Symbol').median().reset_index(drop=True)\n",
- "/var/folders/0y/5d1shp9n6kq856jtm9t10mcw0000gq/T/ipykernel_42906/2054350032.py:12: FutureWarning: The default value of numeric_only in DataFrameGroupBy.median is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.\n",
- " df_subset_orf_con = df.groupby(['Metadata_control_type','Metadata_broad_sample']).median().reset_index(drop=True)\n",
- "/var/folders/0y/5d1shp9n6kq856jtm9t10mcw0000gq/T/ipykernel_42906/2054350032.py:13: FutureWarning: The default value of numeric_only in DataFrameGroupBy.median is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.\n",
- " df_subset_t2 = df_t2.groupby(['Metadata_broad_sample','Metadata_control_type']).median().reset_index(drop=True)\n"
- ]
- }
- ],
- "source": [
- "#fill nas in Metadata_broad_sample column to keep untreated negcons\n",
- "df_t2['Metadata_broad_sample'] = df_t2['Metadata_broad_sample'].fillna('empty')\n",
- "\n",
- "# aggregate\n",
- "if aggregation_type == \"mean\":\n",
- " df_subset_orf = df.groupby('Metadata_Symbol').mean(numeric_only=True).reset_index(drop=True)\n",
- " df_subset_orf_con = df.groupby(['Metadata_control_type','Metadata_broad_sample']).mean(numeric_only=True).reset_index(drop=True)\n",
- " df_subset_t2 = df_t2.groupby(['Metadata_broad_sample','Metadata_control_type']).mean(numeric_only=True).reset_index(drop=True)\n",
- "\n",
- "elif aggregation_type == \"median\":\n",
- " df_subset_orf = df.groupby('Metadata_Symbol').median(numeric_only=True).reset_index(drop=True)\n",
- " df_subset_orf_con = df.groupby(['Metadata_control_type','Metadata_broad_sample']).median(numeric_only=True).reset_index(drop=True)\n",
- " df_subset_t2 = df_t2.groupby(['Metadata_broad_sample','Metadata_control_type']).median(numeric_only=True).reset_index(drop=True)\n",
- "\n",
- "df_subset_orf['Metadata_data_source'] = 'ORF'\n",
- "df_subset_orf_con['Metadata_data_source'] = 'ORF'\n",
- "df_subset_t2['Metadata_data_source'] = 'T2'\n",
- "\n",
- "\n",
- "#merge the separate subsets together\n",
- "df_collapsed = pd.concat([df_subset_orf,df_subset_orf_con,df_subset_t2], ignore_index=True)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Put metadata back in the dataframe"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pycytominer\n",
- "\n",
- "metadata_column_list = ['Metadata_Symbol',\n",
- " 'Metadata_control_type', \n",
- " 'Metadata_broad_sample',\n",
- " 'Metadata_plate_map_name',\n",
- " 'Metadata_Plate', \n",
- " 'Metadata_Name', \n",
- " 'Metadata_Vector',\n",
- " 'Metadata_Transcript', \n",
- " 'Metadata_NCBI Gene ID', \n",
- " 'Metadata_Taxon ID',\n",
- " 'Metadata_Gene Description',\n",
- " 'Metadata_Annot. Gene Symbol',\n",
- " 'Metadata_Annot. Gene ID',\n",
- " 'Metadata_Prot Match %',\n",
- " 'Metadata_MOI',\n",
- " 'Metadata_Virus / ml',\n",
- " 'Metadata_Insert Length',\n",
- " 'Metadata_pert_type',]\n",
- "#aggregate ORF\n",
- "df_ORF_aggregated = pycytominer.aggregate(df, \n",
- " strata=metadata_column_list,\n",
- " features=\"infer\",\n",
- " operation=\"mean\",\n",
- " output_file=\"none\",\n",
- " compute_object_count=False,\n",
- " object_feature=\"Metadata_ObjectNumber\",\n",
- " subset_data_df=\"none\",\n",
- " compression_options=None,\n",
- " float_format=None,)\n",
- "\n",
- "#aggregate t2 plates from ORF batches\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['Metadata_Well',\n",
- " 'Metadata_Site_Count',\n",
- " 'Metadata_Count_Cells',\n",
- " 'Metadata_Count_CellsIncludingEdges',\n",
- " 'Metadata_Count_Cytoplasm',\n",
- " 'Metadata_Count_Nuclei',\n",
- " 'Metadata_Count_NucleiIncludingEdges',\n",
- " 'Metadata_Object_Count',\n",
- " 'Image_Granularity_10_AGP',\n",
- " 'Image_Granularity_10_BFHigh',\n",
- " 'Image_Granularity_10_BFLow',\n",
- " 'Image_Granularity_10_Brightfield',\n",
- " 'Image_Granularity_10_DNA',\n",
- " 'Image_Granularity_10_ER',\n",
- " 'Image_Granularity_10_Mito',\n",
- " 'Image_Granularity_10_RNA',\n",
- " 'Image_Granularity_11_AGP',\n",
- " 'Image_Granularity_11_BFHigh',\n",
- " 'Image_Granularity_11_BFLow',\n",
- " 'Image_Granularity_11_Brightfield',\n",
- " 'Image_Granularity_11_DNA',\n",
- " 'Image_Granularity_11_ER',\n",
- " 'Image_Granularity_11_Mito',\n",
- " 'Image_Granularity_11_RNA',\n",
- " 'Image_Granularity_12_AGP',\n",
- " 'Image_Granularity_12_BFHigh',\n",
- " 'Image_Granularity_12_BFLow',\n",
- " 'Image_Granularity_12_Brightfield',\n",
- " 'Image_Granularity_12_ER',\n",
- " 'Image_Granularity_12_Mito',\n",
- " 'Image_Granularity_12_RNA',\n",
- " 'Image_Granularity_13_AGP',\n",
- " 'Image_Granularity_13_BFHigh',\n",
- " 'Image_Granularity_13_BFLow',\n",
- " 'Image_Granularity_13_Brightfield',\n",
- " 'Image_Granularity_13_ER',\n",
- " 'Image_Granularity_13_Mito',\n",
- " 'Image_Granularity_13_RNA',\n",
- " 'Image_Granularity_14_AGP',\n",
- " 'Image_Granularity_14_BFHigh',\n",
- " 'Image_Granularity_14_BFLow',\n",
- " 'Image_Granularity_14_Brightfield',\n",
- " 'Image_Granularity_14_ER',\n",
- " 'Image_Granularity_14_Mito',\n",
- " 'Image_Granularity_14_RNA',\n",
- " 'Image_Granularity_15_AGP',\n",
- " 'Image_Granularity_15_BFHigh',\n",
- " 'Image_Granularity_15_BFLow',\n",
- " 'Image_Granularity_15_Brightfield',\n",
- " 'Image_Granularity_15_DNA',\n",
- " 'Image_Granularity_15_ER',\n",
- " 'Image_Granularity_15_Mito',\n",
- " 'Image_Granularity_15_RNA',\n",
- " 'Image_Granularity_16_AGP',\n",
- " 'Image_Granularity_16_BFHigh',\n",
- " 'Image_Granularity_16_BFLow',\n",
- " 'Image_Granularity_16_Brightfield',\n",
- " 'Image_Granularity_16_DNA',\n",
- " 'Image_Granularity_16_ER',\n",
- " 'Image_Granularity_16_Mito',\n",
- " 'Image_Granularity_16_RNA',\n",
- " 'Image_Granularity_1_BFHigh',\n",
- " 'Image_Granularity_1_BFLow',\n",
- " 'Image_Granularity_1_Brightfield',\n",
- " 'Image_Granularity_1_DNA',\n",
- " 'Image_Granularity_1_Mito',\n",
- " 'Image_Granularity_1_RNA',\n",
- " 'Image_Granularity_2_AGP',\n",
- " 'Image_Granularity_2_BFHigh',\n",
- " 'Image_Granularity_2_BFLow',\n",
- " 'Image_Granularity_2_Brightfield',\n",
- " 'Image_Granularity_2_DNA',\n",
- " 'Image_Granularity_2_ER',\n",
- " 'Image_Granularity_2_Mito',\n",
- " 'Image_Granularity_2_RNA',\n",
- " 'Image_Granularity_3_AGP',\n",
- " 'Image_Granularity_3_BFHigh',\n",
- " 'Image_Granularity_3_BFLow',\n",
- " 'Image_Granularity_3_Brightfield',\n",
- " 'Image_Granularity_3_ER',\n",
- " 'Image_Granularity_3_Mito',\n",
- " 'Image_Granularity_3_RNA',\n",
- " 'Image_Granularity_4_AGP',\n",
- " 'Image_Granularity_4_BFHigh',\n",
- " 'Image_Granularity_4_BFLow',\n",
- " 'Image_Granularity_4_Brightfield',\n",
- " 'Image_Granularity_4_DNA',\n",
- " 'Image_Granularity_4_ER',\n",
- " 'Image_Granularity_4_Mito',\n",
- " 'Image_Granularity_4_RNA',\n",
- " 'Image_Granularity_5_AGP',\n",
- " 'Image_Granularity_5_BFHigh',\n",
- " 'Image_Granularity_5_BFLow',\n",
- " 'Image_Granularity_5_Brightfield',\n",
- " 'Image_Granularity_5_DNA',\n",
- " 'Image_Granularity_5_ER',\n",
- " 'Image_Granularity_5_Mito',\n",
- " 'Image_Granularity_5_RNA',\n",
- " 'Image_Granularity_6_AGP',\n",
- " 'Image_Granularity_6_BFHigh',\n",
- " 'Image_Granularity_6_BFLow',\n",
- " 'Image_Granularity_6_Brightfield',\n",
- " 'Image_Granularity_6_DNA',\n",
- " 'Image_Granularity_6_ER',\n",
- " 'Image_Granularity_6_Mito',\n",
- " 'Image_Granularity_6_RNA',\n",
- " 'Image_Granularity_7_AGP',\n",
- " 'Image_Granularity_7_BFHigh',\n",
- " 'Image_Granularity_7_BFLow',\n",
- " 'Image_Granularity_7_Brightfield',\n",
- " 'Image_Granularity_7_DNA',\n",
- " 'Image_Granularity_7_ER',\n",
- " 'Image_Granularity_7_Mito',\n",
- " 'Image_Granularity_7_RNA',\n",
- " 'Image_Granularity_8_AGP',\n",
- " 'Image_Granularity_8_BFHigh',\n",
- " 'Image_Granularity_8_BFLow',\n",
- " 'Image_Granularity_8_Brightfield',\n",
- " 'Image_Granularity_8_DNA',\n",
- " 'Image_Granularity_8_ER',\n",
- " 'Image_Granularity_8_Mito',\n",
- " 'Image_Granularity_8_RNA',\n",
- " 'Image_Granularity_9_AGP',\n",
- " 'Image_Granularity_9_BFHigh',\n",
- " 'Image_Granularity_9_BFLow',\n",
- " 'Image_Granularity_9_Brightfield',\n",
- " 'Image_Granularity_9_DNA',\n",
- " 'Image_Granularity_9_ER',\n",
- " 'Image_Granularity_9_Mito',\n",
- " 'Image_Granularity_9_RNA',\n",
- " 'Image_ImageQuality_Correlation_OrigAGP_5',\n",
- " 'Image_ImageQuality_Correlation_OrigAGP_50',\n",
- " 'Image_ImageQuality_Correlation_OrigBrightfield_50',\n",
- " 'Image_ImageQuality_Correlation_OrigBrightfield_H_50',\n",
- " 'Image_ImageQuality_Correlation_OrigBrightfield_L_50',\n",
- " 'Image_ImageQuality_Correlation_OrigDNA_20',\n",
- " 'Image_ImageQuality_Correlation_OrigDNA_5',\n",
- " 'Image_ImageQuality_Correlation_OrigDNA_50',\n",
- " 'Image_ImageQuality_Correlation_OrigER_10',\n",
- " 'Image_ImageQuality_Correlation_OrigER_5',\n",
- " 'Image_ImageQuality_Correlation_OrigER_50',\n",
- " 'Image_ImageQuality_Correlation_OrigMito_5',\n",
- " 'Image_ImageQuality_Correlation_OrigMito_50',\n",
- " 'Image_ImageQuality_Correlation_OrigRNA_10',\n",
- " 'Image_ImageQuality_Correlation_OrigRNA_20',\n",
- " 'Image_ImageQuality_Correlation_OrigRNA_5',\n",
- " 'Image_ImageQuality_FocusScore_OrigAGP',\n",
- " 'Image_ImageQuality_FocusScore_OrigBrightfield',\n",
- " 'Image_ImageQuality_FocusScore_OrigDNA',\n",
- " 'Image_ImageQuality_FocusScore_OrigER',\n",
- " 'Image_ImageQuality_FocusScore_OrigMito',\n",
- " 'Image_ImageQuality_LocalFocusScore_OrigAGP_10',\n",
- " 'Image_ImageQuality_LocalFocusScore_OrigAGP_20',\n",
- " 'Image_ImageQuality_LocalFocusScore_OrigAGP_5',\n",
- " 'Image_ImageQuality_LocalFocusScore_OrigAGP_50',\n",
- " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_20',\n",
- " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_50',\n",
- " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_H_10',\n",
- " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_H_20',\n",
- " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_H_5',\n",
- " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_H_50',\n",
- " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_L_10',\n",
- " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_L_20',\n",
- " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_L_5',\n",
- " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_L_50',\n",
- " 'Image_ImageQuality_LocalFocusScore_OrigDNA_10',\n",
- " 'Image_ImageQuality_LocalFocusScore_OrigDNA_20',\n",
- " 'Image_ImageQuality_LocalFocusScore_OrigDNA_5',\n",
- " 'Image_ImageQuality_LocalFocusScore_OrigDNA_50',\n",
- " 'Image_ImageQuality_LocalFocusScore_OrigER_20',\n",
- " 'Image_ImageQuality_LocalFocusScore_OrigMito_10',\n",
- " 'Image_ImageQuality_LocalFocusScore_OrigMito_20',\n",
- " 'Image_ImageQuality_LocalFocusScore_OrigMito_5',\n",
- " 'Image_ImageQuality_LocalFocusScore_OrigMito_50',\n",
- " 'Image_ImageQuality_LocalFocusScore_OrigRNA_50',\n",
- " 'Image_ImageQuality_MADIntensity_OrigDNA',\n",
- " 'Image_ImageQuality_MaxIntensity_OrigAGP',\n",
- " 'Image_ImageQuality_MaxIntensity_OrigBrightfield_H',\n",
- " 'Image_ImageQuality_MaxIntensity_OrigDNA',\n",
- " 'Image_ImageQuality_MaxIntensity_OrigER',\n",
- " 'Image_ImageQuality_MaxIntensity_OrigMito',\n",
- " 'Image_ImageQuality_MaxIntensity_OrigRNA',\n",
- " 'Image_ImageQuality_MinIntensity_OrigBrightfield',\n",
- " 'Image_ImageQuality_MinIntensity_OrigBrightfield_H',\n",
- " 'Image_ImageQuality_MinIntensity_OrigBrightfield_L',\n",
- " 'Image_ImageQuality_MinIntensity_OrigER',\n",
- " 'Image_ImageQuality_MinIntensity_OrigMito',\n",
- " 'Image_ImageQuality_MinIntensity_OrigRNA',\n",
- " 'Image_ImageQuality_PowerLogLogSlope_OrigAGP',\n",
- " 'Image_ImageQuality_PowerLogLogSlope_OrigBrightfield',\n",
- " 'Image_ImageQuality_PowerLogLogSlope_OrigBrightfield_H',\n",
- " 'Image_ImageQuality_PowerLogLogSlope_OrigBrightfield_L',\n",
- " 'Image_ImageQuality_PowerLogLogSlope_OrigDNA',\n",
- " 'Image_ImageQuality_PowerLogLogSlope_OrigER',\n",
- " 'Image_ImageQuality_PowerLogLogSlope_OrigMito',\n",
- " 'Image_ImageQuality_PowerLogLogSlope_OrigRNA',\n",
- " 'Image_ImageQuality_StdIntensity_OrigAGP',\n",
- " 'Image_ImageQuality_StdIntensity_OrigBrightfield',\n",
- " 'Image_ImageQuality_ThresholdOtsu_OrigDNA_2W',\n",
- " 'Image_ImageQuality_TotalIntensity_OrigDNA',\n",
- " 'Image_Intensity_LowerQuartileIntensity_AGP',\n",
- " 'Image_Intensity_LowerQuartileIntensity_AGP__BackgroundOnly',\n",
- " 'Image_Intensity_MADIntensity_AGP',\n",
- " 'Image_Intensity_MADIntensity_AGP__BackgroundOnly',\n",
- " 'Image_Intensity_MADIntensity_Brightfield_BackgroundOnly',\n",
- " 'Image_Intensity_MADIntensity_DNA',\n",
- " 'Image_Intensity_MADIntensity_DNA_BackgroundOnly',\n",
- " 'Image_Intensity_MADIntensity_ER__BackgroundOnly',\n",
- " 'Image_Intensity_MADIntensity_Mito',\n",
- " 'Image_Intensity_MADIntensity_RNA',\n",
- " 'Image_Intensity_MaxIntensity_AGP',\n",
- " 'Image_Intensity_MaxIntensity_AGP__BackgroundOnly',\n",
- " 'Image_Intensity_MaxIntensity_BFHigh__BackgroundOnly',\n",
- " 'Image_Intensity_MaxIntensity_BFLow',\n",
- " 'Image_Intensity_MaxIntensity_BFLow_BackgroundOnly',\n",
- " 'Image_Intensity_MaxIntensity_Brightfield',\n",
- " 'Image_Intensity_MaxIntensity_Brightfield_BackgroundOnly',\n",
- " 'Image_Intensity_MaxIntensity_DNA',\n",
- " 'Image_Intensity_MaxIntensity_DNA_BackgroundOnly',\n",
- " 'Image_Intensity_MaxIntensity_ER',\n",
- " 'Image_Intensity_MaxIntensity_ER__BackgroundOnly',\n",
- " 'Image_Intensity_MaxIntensity_Mito',\n",
- " 'Image_Intensity_MaxIntensity_Mito_BackgroundOnly',\n",
- " 'Image_Intensity_MaxIntensity_RNA',\n",
- " 'Image_Intensity_MaxIntensity_RNA_BackgroundOnly',\n",
- " 'Image_Intensity_MedianIntensity_AGP',\n",
- " 'Image_Intensity_MedianIntensity_AGP__BackgroundOnly',\n",
- " 'Image_Intensity_MedianIntensity_DNA_BackgroundOnly',\n",
- " 'Image_Intensity_MedianIntensity_ER__BackgroundOnly',\n",
- " 'Image_Intensity_MinIntensity_AGP',\n",
- " 'Image_Intensity_MinIntensity_BFHigh__BackgroundOnly',\n",
- " 'Image_Intensity_MinIntensity_BFLow_BackgroundOnly',\n",
- " 'Image_Intensity_MinIntensity_Brightfield_BackgroundOnly',\n",
- " 'Image_Intensity_MinIntensity_DNA',\n",
- " 'Image_Intensity_MinIntensity_Mito_BackgroundOnly',\n",
- " 'Image_Intensity_PercentMaximal_AGP__BackgroundOnly',\n",
- " 'Image_Intensity_PercentMaximal_DNA_BackgroundOnly',\n",
- " 'Image_Intensity_PercentMaximal_ER__BackgroundOnly',\n",
- " 'Image_Intensity_PercentMaximal_Mito_BackgroundOnly',\n",
- " 'Image_Intensity_PercentMaximal_RNA_BackgroundOnly',\n",
- " 'Image_Intensity_StdIntensity_AGP__BackgroundOnly',\n",
- " 'Image_Intensity_StdIntensity_Brightfield_BackgroundOnly',\n",
- " 'Image_Intensity_StdIntensity_DNA_BackgroundOnly',\n",
- " 'Image_Intensity_StdIntensity_ER__BackgroundOnly',\n",
- " 'Image_Intensity_StdIntensity_RNA_BackgroundOnly',\n",
- " 'Image_Intensity_TotalIntensity_AGP__BackgroundOnly',\n",
- " 'Image_Intensity_TotalIntensity_BFLow_BackgroundOnly',\n",
- " 'Image_Intensity_TotalIntensity_DNA_BackgroundOnly',\n",
- " 'Image_Intensity_TotalIntensity_RNA_BackgroundOnly',\n",
- " 'Image_Intensity_UpperQuartileIntensity_AGP__BackgroundOnly',\n",
- " 'Image_Intensity_UpperQuartileIntensity_DNA',\n",
- " 'Image_Intensity_UpperQuartileIntensity_DNA_BackgroundOnly',\n",
- " 'Image_Texture_AngularSecondMoment_AGP_3_02_256',\n",
- " 'Image_Texture_AngularSecondMoment_BFLow_10_01_256',\n",
- " 'Image_Texture_AngularSecondMoment_Brightfield_3_02_256',\n",
- " 'Image_Texture_AngularSecondMoment_DNA_3_00_256',\n",
- " 'Image_Texture_AngularSecondMoment_ER_5_02_256',\n",
- " 'Image_Texture_AngularSecondMoment_Mito_10_01_256',\n",
- " 'Image_Texture_AngularSecondMoment_RNA_10_01_256',\n",
- " 'Image_Texture_Contrast_AGP_10_02_256',\n",
- " 'Image_Texture_Contrast_BFHigh_10_01_256',\n",
- " 'Image_Texture_Contrast_BFLow_10_01_256',\n",
- " 'Image_Texture_Contrast_Brightfield_10_00_256',\n",
- " 'Image_Texture_Contrast_DNA_3_02_256',\n",
- " 'Image_Texture_Contrast_ER_3_03_256',\n",
- " 'Image_Texture_Contrast_Mito_3_00_256',\n",
- " 'Image_Texture_Correlation_AGP_10_01_256',\n",
- " 'Image_Texture_Correlation_BFLow_10_02_256',\n",
- " 'Image_Texture_Correlation_Brightfield_10_01_256',\n",
- " 'Image_Texture_Correlation_DNA_10_00_256',\n",
- " 'Image_Texture_Correlation_DNA_10_01_256',\n",
- " 'Image_Texture_Correlation_DNA_10_02_256',\n",
- " 'Image_Texture_Correlation_DNA_10_03_256',\n",
- " 'Image_Texture_Correlation_DNA_5_00_256',\n",
- " 'Image_Texture_Correlation_DNA_5_01_256',\n",
- " 'Image_Texture_Correlation_DNA_5_03_256',\n",
- " 'Image_Texture_Correlation_ER_10_02_256',\n",
- " 'Image_Texture_Correlation_ER_5_00_256',\n",
- " 'Image_Texture_Correlation_Mito_3_02_256',\n",
- " 'Image_Texture_Correlation_RNA_10_03_256',\n",
- " 'Image_Texture_Correlation_RNA_5_03_256',\n",
- " 'Image_Texture_DifferenceEntropy_AGP_3_02_256',\n",
- " 'Image_Texture_DifferenceEntropy_BFHigh_3_00_256',\n",
- " 'Image_Texture_DifferenceEntropy_BFLow_3_02_256',\n",
- " 'Image_Texture_DifferenceEntropy_Brightfield_10_03_256',\n",
- " 'Image_Texture_DifferenceEntropy_DNA_10_01_256',\n",
- " 'Image_Texture_DifferenceEntropy_Mito_10_01_256',\n",
- " 'Image_Texture_DifferenceVariance_AGP_3_01_256',\n",
- " 'Image_Texture_DifferenceVariance_BFHigh_3_00_256',\n",
- " 'Image_Texture_DifferenceVariance_BFLow_3_02_256',\n",
- " 'Image_Texture_DifferenceVariance_Brightfield_3_00_256',\n",
- " 'Image_Texture_DifferenceVariance_DNA_3_02_256',\n",
- " 'Image_Texture_DifferenceVariance_Mito_10_03_256',\n",
- " 'Image_Texture_Entropy_BFLow_3_00_256',\n",
- " 'Image_Texture_InfoMeas1_AGP_10_03_256',\n",
- " 'Image_Texture_InfoMeas1_BFLow_10_03_256',\n",
- " 'Image_Texture_InfoMeas1_Brightfield_10_03_256',\n",
- " 'Image_Texture_InfoMeas1_DNA_10_03_256',\n",
- " 'Image_Texture_InfoMeas1_DNA_3_01_256',\n",
- " 'Image_Texture_InfoMeas1_DNA_5_01_256',\n",
- " 'Image_Texture_InfoMeas1_ER_5_00_256',\n",
- " 'Image_Texture_InfoMeas1_Mito_10_03_256',\n",
- " 'Image_Texture_InfoMeas1_Mito_3_02_256',\n",
- " 'Image_Texture_InfoMeas2_AGP_10_03_256',\n",
- " 'Image_Texture_InfoMeas2_DNA_10_01_256',\n",
- " 'Image_Texture_InfoMeas2_DNA_10_02_256',\n",
- " 'Image_Texture_InfoMeas2_ER_3_03_256',\n",
- " 'Image_Texture_InfoMeas2_ER_5_01_256',\n",
- " 'Image_Texture_InfoMeas2_Mito_10_01_256',\n",
- " 'Image_Texture_InfoMeas2_Mito_10_02_256',\n",
- " 'Image_Texture_InfoMeas2_Mito_10_03_256',\n",
- " 'Image_Texture_InfoMeas2_Mito_3_02_256',\n",
- " 'Image_Texture_InfoMeas2_Mito_5_01_256',\n",
- " 'Image_Texture_InverseDifferenceMoment_AGP_3_02_256',\n",
- " 'Image_Texture_InverseDifferenceMoment_BFHigh_3_00_256',\n",
- " 'Image_Texture_InverseDifferenceMoment_BFLow_3_00_256',\n",
- " 'Image_Texture_InverseDifferenceMoment_Brightfield_3_02_256',\n",
- " 'Image_Texture_InverseDifferenceMoment_DNA_10_03_256',\n",
- " 'Image_Texture_InverseDifferenceMoment_ER_10_01_256',\n",
- " 'Image_Texture_InverseDifferenceMoment_Mito_10_01_256',\n",
- " 'Image_Texture_InverseDifferenceMoment_RNA_3_00_256',\n",
- " 'Image_Texture_SumEntropy_DNA_3_02_256',\n",
- " 'Image_Texture_SumVariance_AGP_10_03_256',\n",
- " 'Image_Texture_SumVariance_Brightfield_5_02_256',\n",
- " 'Image_Texture_SumVariance_DNA_10_03_256',\n",
- " 'Image_Texture_SumVariance_ER_10_03_256',\n",
- " 'Image_Texture_SumVariance_Mito_10_01_256',\n",
- " 'Image_Texture_SumVariance_RNA_10_01_256',\n",
- " 'Image_Threshold_FinalThreshold_NucleiIncludingEdges',\n",
- " 'Image_Threshold_FinalThreshold_mito_bw',\n",
- " 'Image_Threshold_SumOfEntropies_CellsIncludingEdges',\n",
- " 'Image_Threshold_SumOfEntropies_NucleiIncludingEdges',\n",
- " 'Image_Threshold_SumOfEntropies_mito_bw',\n",
- " 'Image_Threshold_WeightedVariance_CellsIncludingEdges',\n",
- " 'Image_Threshold_WeightedVariance_NucleiIncludingEdges',\n",
- " 'Image_Threshold_WeightedVariance_mito_bw']"
- ]
- },
- "execution_count": 28,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "[c for c in df.columns if c not in df_ORF_aggregated.columns]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "df_collapsed.to_csv(f\"JUMP_ORF_{aggregation_type}_collapsed.csv\")"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "jumpORF",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.15"
- },
- "orig_nbformat": 4,
- "vscode": {
- "interpreter": {
- "hash": "2f3fec36f6be95d788e5f03a928d042624f9ad08087c2484cba824ceb7727375"
- }
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/profiles/2021_08_23_Batch12/BR00126734/BR00126734.csv.gz b/profiles/2021_08_23_Batch12/BR00126734/BR00126734.csv.gz
deleted file mode 100644
index ff2e511..0000000
--- a/profiles/2021_08_23_Batch12/BR00126734/BR00126734.csv.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2709e051d13811dd9ee0e5b45439ffb77320f3947468334c5e2908a718478e7d
-size 9166733
diff --git a/profiles/2021_08_23_Batch12/BR00126734/BR00126734_augmented.csv.gz b/profiles/2021_08_23_Batch12/BR00126734/BR00126734_augmented.csv.gz
deleted file mode 100644
index 91aae60..0000000
--- a/profiles/2021_08_23_Batch12/BR00126734/BR00126734_augmented.csv.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f542a3db7824bb8b91f85c6b24994e287de789404874092ef8ba474cfec94e7d
-size 9190214
diff --git a/profiles/2021_08_23_Batch12/BR00126734/BR00126734_normalized.csv.gz b/profiles/2021_08_23_Batch12/BR00126734/BR00126734_normalized.csv.gz
deleted file mode 100644
index cf8fb33..0000000
--- a/profiles/2021_08_23_Batch12/BR00126734/BR00126734_normalized.csv.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2eb0867ec9cc7ab69ee884bc147417330a7c6d8a3fdf7e860a7eacd814127df5
-size 9172996
diff --git a/profiles/2021_08_23_Batch12/BR00126734/BR00126734_normalized_feature_select_batch.csv.gz b/profiles/2021_08_23_Batch12/BR00126734/BR00126734_normalized_feature_select_batch.csv.gz
deleted file mode 100644
index 66cf263..0000000
--- a/profiles/2021_08_23_Batch12/BR00126734/BR00126734_normalized_feature_select_batch.csv.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:242b6133b468abef12384a7b011ee4724313d7c581b332d4ac57c1abfa1ae337
-size 1341789
diff --git a/profiles/2021_08_23_Batch12/BR00126734/BR00126734_normalized_feature_select_negcon_batch.csv.gz b/profiles/2021_08_23_Batch12/BR00126734/BR00126734_normalized_feature_select_negcon_batch.csv.gz
deleted file mode 100644
index 7a069ed..0000000
--- a/profiles/2021_08_23_Batch12/BR00126734/BR00126734_normalized_feature_select_negcon_batch.csv.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4672b909ca417854ea4ce2da336d21ec053bf3d65d65bcd5f5f31e569c7f270c
-size 1393809
diff --git a/profiles/2021_08_23_Batch12/BR00126735/BR00126735.csv.gz b/profiles/2021_08_23_Batch12/BR00126735/BR00126735.csv.gz
deleted file mode 100644
index 816ef0a..0000000
--- a/profiles/2021_08_23_Batch12/BR00126735/BR00126735.csv.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9009d701c18e6514578dca973d4c1088985e84b54a77e27ccc537b910044cad0
-size 9105873
diff --git a/profiles/2021_08_23_Batch12/BR00126735/BR00126735_augmented.csv.gz b/profiles/2021_08_23_Batch12/BR00126735/BR00126735_augmented.csv.gz
deleted file mode 100644
index 9f714e5..0000000
--- a/profiles/2021_08_23_Batch12/BR00126735/BR00126735_augmented.csv.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3d7dab958ae5ba793549cda0338fa93b862fe881c9b6d18ddae6e9b4c9d09aef
-size 9129375
diff --git a/profiles/2021_08_23_Batch12/BR00126735/BR00126735_normalized.csv.gz b/profiles/2021_08_23_Batch12/BR00126735/BR00126735_normalized.csv.gz
deleted file mode 100644
index 51f2760..0000000
--- a/profiles/2021_08_23_Batch12/BR00126735/BR00126735_normalized.csv.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:68e0eab8f9afb68df9c4d8851673f62a0ed535a7f27dc16134da6651fe4fa812
-size 9160469
diff --git a/profiles/2021_08_23_Batch12/BR00126735/BR00126735_normalized_feature_select_batch.csv.gz b/profiles/2021_08_23_Batch12/BR00126735/BR00126735_normalized_feature_select_batch.csv.gz
deleted file mode 100644
index 5d7f401..0000000
--- a/profiles/2021_08_23_Batch12/BR00126735/BR00126735_normalized_feature_select_batch.csv.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ae107a9cd5631a0a3c955d678d769466e22b3c6b8088ff584bb5d81b3a45a655
-size 1342048
diff --git a/profiles/2021_08_23_Batch12/BR00126735/BR00126735_normalized_feature_select_negcon_batch.csv.gz b/profiles/2021_08_23_Batch12/BR00126735/BR00126735_normalized_feature_select_negcon_batch.csv.gz
deleted file mode 100644
index f012d5a..0000000
--- a/profiles/2021_08_23_Batch12/BR00126735/BR00126735_normalized_feature_select_negcon_batch.csv.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e4fcc20ff0782d8d463182a99cccc232a4abb744c4bd710e857803859debaeea
-size 1393066
diff --git a/profiles/2021_08_30_Batch13/BR00126634/BR00126634.csv.gz b/profiles/2021_08_30_Batch13/BR00126634/BR00126634.csv.gz
deleted file mode 100644
index 87114ad..0000000
--- a/profiles/2021_08_30_Batch13/BR00126634/BR00126634.csv.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7a765ee64c2e41059962b3f9075c2c9d3159bd486d6313b2a89a11c5ef0f4fdb
-size 9378298
diff --git a/profiles/2021_08_30_Batch13/BR00126634/BR00126634_augmented.csv.gz b/profiles/2021_08_30_Batch13/BR00126634/BR00126634_augmented.csv.gz
deleted file mode 100644
index c3c3e46..0000000
--- a/profiles/2021_08_30_Batch13/BR00126634/BR00126634_augmented.csv.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:600d7c1ae26ce2f4e3848f8fd20a3f79c55e760c9cc9e765fe85f340a7b6fab6
-size 9401762
diff --git a/profiles/2021_08_30_Batch13/BR00126634/BR00126634_normalized.csv.gz b/profiles/2021_08_30_Batch13/BR00126634/BR00126634_normalized.csv.gz
deleted file mode 100644
index f258e69..0000000
--- a/profiles/2021_08_30_Batch13/BR00126634/BR00126634_normalized.csv.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:91553357b047baf0c69da3e64c17cc241787e11bb57b13b7af6a343638301924
-size 9154704
diff --git a/profiles/2021_08_30_Batch13/BR00126634/BR00126634_normalized_feature_select_batch.csv.gz b/profiles/2021_08_30_Batch13/BR00126634/BR00126634_normalized_feature_select_batch.csv.gz
deleted file mode 100644
index 196e5bd..0000000
--- a/profiles/2021_08_30_Batch13/BR00126634/BR00126634_normalized_feature_select_batch.csv.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e9bd63683e6a14cc254263067c5224af05e6481106cb3ba451a60e1e0357459d
-size 1259547
diff --git a/profiles/2021_08_30_Batch13/BR00126634/BR00126634_normalized_feature_select_negcon_batch.csv.gz b/profiles/2021_08_30_Batch13/BR00126634/BR00126634_normalized_feature_select_negcon_batch.csv.gz
deleted file mode 100644
index 78931c6..0000000
--- a/profiles/2021_08_30_Batch13/BR00126634/BR00126634_normalized_feature_select_negcon_batch.csv.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:58e1dcc6bcba5d00bcd2bd956bd94b1f3fc3ed85208e15e0f4f8d6976dfe82d0
-size 1324069
diff --git a/profiles/2021_08_30_Batch13/BR00126641/BR00126641.csv.gz b/profiles/2021_08_30_Batch13/BR00126641/BR00126641.csv.gz
deleted file mode 100644
index f39259b..0000000
--- a/profiles/2021_08_30_Batch13/BR00126641/BR00126641.csv.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2d48470709e7561a57e4580e19669f3f5eaa7f4321eaa9e991a530f3a00289da
-size 9366855
diff --git a/profiles/2021_08_30_Batch13/BR00126641/BR00126641_augmented.csv.gz b/profiles/2021_08_30_Batch13/BR00126641/BR00126641_augmented.csv.gz
deleted file mode 100644
index d9f6ba6..0000000
--- a/profiles/2021_08_30_Batch13/BR00126641/BR00126641_augmented.csv.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c0436ea8ab1ae84b49798a9bcb25e68f40d9981dc6ffbecce680aa754b3b3a0c
-size 9390463
diff --git a/profiles/2021_08_30_Batch13/BR00126641/BR00126641_normalized.csv.gz b/profiles/2021_08_30_Batch13/BR00126641/BR00126641_normalized.csv.gz
deleted file mode 100644
index d502c3b..0000000
--- a/profiles/2021_08_30_Batch13/BR00126641/BR00126641_normalized.csv.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0d804371c17d17c272a838ac2a48c5c29c6e1029a28f1c843dbdc07e29d8e9b4
-size 9160620
diff --git a/profiles/2021_08_30_Batch13/BR00126641/BR00126641_normalized_feature_select_batch.csv.gz b/profiles/2021_08_30_Batch13/BR00126641/BR00126641_normalized_feature_select_batch.csv.gz
deleted file mode 100644
index 07e5ec9..0000000
--- a/profiles/2021_08_30_Batch13/BR00126641/BR00126641_normalized_feature_select_batch.csv.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:43300808d91fcdebb52606f502b7479ea6bd3a7258140e731b7cad9feeb2e526
-size 1259936
diff --git a/profiles/2021_08_30_Batch13/BR00126641/BR00126641_normalized_feature_select_negcon_batch.csv.gz b/profiles/2021_08_30_Batch13/BR00126641/BR00126641_normalized_feature_select_negcon_batch.csv.gz
deleted file mode 100644
index aab491e..0000000
--- a/profiles/2021_08_30_Batch13/BR00126641/BR00126641_normalized_feature_select_negcon_batch.csv.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:62b05872e0bdeb03c7e3184f953af9226fb5e79d33c3bb5cead1ed06e14a01b4
-size 1326259
From 357d8bc7ba293140bbc05146ee8bec231624658d Mon Sep 17 00:00:00 2001
From: rsenft1 <28116530+rsenft1@users.noreply.github.com>
Date: Fri, 3 Mar 2023 10:13:00 -0500
Subject: [PATCH 3/3] Create 01_jumpORF_create_collapsed_df.ipynb
---
.../01_jumpORF_create_collapsed_df.ipynb | 1133 +++++++++++++++++
1 file changed, 1133 insertions(+)
create mode 100644 analysis_notebook/01_jumpORF_create_collapsed_df.ipynb
diff --git a/analysis_notebook/01_jumpORF_create_collapsed_df.ipynb b/analysis_notebook/01_jumpORF_create_collapsed_df.ipynb
new file mode 100644
index 0000000..18c689a
--- /dev/null
+++ b/analysis_notebook/01_jumpORF_create_collapsed_df.ipynb
@@ -0,0 +1,1133 @@
+{
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# JUMP ORF data analysis notebook\n",
+ "\n",
+ "## Set up environment:\n",
+ " \n",
+ "1. Create a new environment for this project: `mamba create --name jumpORF python=3.9`\n",
+ "\n",
+ "2. Activate that environment: `conda activate jumpORF` (note: for reasons I have not pinned down, `mamba activate` fails for me even after `mamba init`, but `conda activate` works)\n",
+ "\n",
+ "3. Install dependencies: \n",
+ "* `mamba install -c conda-forge dvc-s3` (more instructions here: https://dvc.org/doc/install/macos, specifically you need dvc with aws s3 since this is where the profiles are stored)\n",
+ "\n",
+ "## Get data on your local machine:\n",
+ "\n",
+ "1. Download the data repo: `git clone https://github.com/jump-cellpainting/jump-orf-data.git` (I use GitHub Desktop for this!)\n",
+ "\n",
+ "2. Download the metadata repo: `git clone https://github.com/jump-cellpainting/datasets.git`\n",
+ "\n",
+ "3. Pull the dvc-tracked files down to your local computer: in a terminal, from the folder where you cloned `jump-orf-data`, run `dvc pull` _(note this step can take a while)_\n",
+ "\n",
+ "4. Select the `jumpORF` environment as the kernel for this notebook (upper right of the notebook in VS Code), or otherwise ensure the `jumpORF` environment is activated\n",
+ " * _note that if you have the notebook open while you create the environment, you may need to restart VS Code to see the updated list of environments_\n",
+ "\n",
+ "\n",
+ "## to do...\n",
+ "\n",
+ "1. Use the new metadata here as the source for finding plates/batches/etc.: https://github.com/jump-cellpainting/datasets/tree/main/metadata \n",
+ "2. Controls include BFP, HcRed, Luciferase, and LacZ (eGFP has been excluded, although the metadata sheet still lists it as a control)\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Read in all JUMP ORF data\n",
+ "\n",
+ "* Grab the paths to all the profiles from the different batches. \n",
+ "* Read them into one dataframe (~13,000 genes x ~5 replicates; the number of features depends on whether the feature-selected (~1,300) or the full (~5,900) profiles are used)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* What do we want to provide? Just the collapsed data? Or the non-collapsed version as well? \n",
+ "* Is there enough of a reason that we want people to have access to the precollapsed version? \n",
+ "* Perhaps do not save out these large csvs! Get through cleaning to collapsed data then save that out. \n",
+ "* concat, collapse, clean as separate function "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# read data\n",
+ "import os\n",
+ "import pandas as pd\n",
+ "\n",
+ "\n",
+ "#get paths to files using the most truthful metadata\n",
+ "topfolder = \"../profiles\"\n",
+ "metadata_table = pd.read_csv(\"../../jump-datasets/metadata/plate.csv\",index_col=False)\n",
+ "metadata_table_ORF = metadata_table[(metadata_table.Metadata_PlateType == \"ORF\")]\n",
+ "\n",
+ "batch_list = metadata_table_ORF[(metadata_table_ORF.Metadata_PlateType == \"ORF\")].Metadata_Batch.unique()\n",
+ "\n",
+ "batch_list_2 = metadata_table_ORF.loc[metadata_table_ORF[\"Metadata_PlateType\"] == \"ORF\"][\"Metadata_Batch\"].unique()\n",
+ "\n",
+ "\n",
+ "filesuffix=\"_normalized_feature_select_negcon_all.csv.gz\"\n",
+ "filepaths = [os.path.join(topfolder, metadata_table_ORF.Metadata_Batch.values[row], metadata_table_ORF.Metadata_Plate.values[row],metadata_table_ORF.Metadata_Plate.values[row]+filesuffix) for row in range(len(metadata_table_ORF))]\n",
+ "\n",
+ "#only look at files that exist\n",
+ "filepaths = [f for f in filepaths if os.path.exists(f)]\n",
+ "\n",
+ "#read in\n",
+ "df = pd.concat(map(lambda file: pd.read_csv(file, index_col=False,), filepaths))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get target 2 plates (normalized) and filter the features down to those that are in the df we already have\n",
+ "\n",
+ "metadata_table_target2 = metadata_table.loc[(metadata_table[\"Metadata_Batch\"].isin(batch_list)) & (metadata_table[\"Metadata_PlateType\"]==\"TARGET2\")]\n",
+ "\n",
+ "filesuffix=\"_normalized_negcon.csv\" # can also do .csv.gz files\n",
+ "filepaths = [os.path.join(topfolder, metadata_table_target2.Metadata_Batch.values[row], metadata_table_target2.Metadata_Plate.values[row],metadata_table_target2.Metadata_Plate.values[row]+filesuffix) for row in range(len(metadata_table_target2))]\n",
+ "\n",
+ "#only look at files that exist\n",
+ "filepaths = [f for f in filepaths if os.path.exists(f)]\n",
+ "\n",
+ "#read in\n",
+ "df_t2 = pd.concat(map(lambda file: pd.read_csv(file, index_col=False,), filepaths))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Filter the target2 df to only columns that exist in the df\n",
+ "\n",
+ "t2_metadata_col = [x for x in df_t2.columns if \"Metadata\" in x]\n",
+ "df_col = list(df.columns)\n",
+ "\n",
+ "\n",
+ "cols2Keep = list(set(df_col+t2_metadata_col))\n",
+ "cols2Keep = [x for x in cols2Keep if x in list(df_t2.columns)]\n",
+ "\n",
+ "df_t2 = df_t2[cols2Keep]"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Filter out ORFs that don't replicate\n",
+ "From Alex: \n",
+ "\n",
+ "Calculate mAP for replicability for each perturbation and filter out those below the random baseline.\n",
+ "The long answer involves the fact that we’ve recently changed what “below random baseline” means. Before, we suggested to subtract mAP of randomly ranked profiles (as suggested in “3.4.2 Computation of the exact random AP value” of my draft on mAP). But recently, we decided that we will consider not mean of random baseline APs, but 95th percentile, such that we can construct a significance test and report p-value instead. Let me know if you want to know more about this, I will also talk about it in special topics on Thursday!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Metadata_plate_map_name | \n",
+ " Metadata_broad_sample | \n",
+ " Metadata_Plate | \n",
+ " Metadata_Well | \n",
+ " Metadata_Site_Count | \n",
+ " Metadata_Count_Cells | \n",
+ " Metadata_Count_CellsIncludingEdges | \n",
+ " Metadata_Count_Cytoplasm | \n",
+ " Metadata_Count_Nuclei | \n",
+ " Metadata_Count_NucleiIncludingEdges | \n",
+ " ... | \n",
+ " Nuclei_Texture_InfoMeas2_Brightfield_3_02_256 | \n",
+ " Nuclei_Texture_InfoMeas2_Mito_3_02_256 | \n",
+ " Nuclei_Texture_InverseDifferenceMoment_AGP_3_02_256 | \n",
+ " Nuclei_Texture_InverseDifferenceMoment_DNA_3_02_256 | \n",
+ " Nuclei_Texture_InverseDifferenceMoment_Mito_10_03_256 | \n",
+ " Nuclei_Texture_SumVariance_AGP_10_03_256 | \n",
+ " Nuclei_Texture_SumVariance_BFHigh_3_03_256 | \n",
+ " Nuclei_Texture_SumVariance_BFLow_3_00_256 | \n",
+ " Nuclei_Texture_SumVariance_Brightfield_3_03_256 | \n",
+ " Nuclei_Texture_SumVariance_ER_10_01_256 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " OAA01.02.03.04.A | \n",
+ " ccsbBroad304_05979 | \n",
+ " BR00117035 | \n",
+ " A01 | \n",
+ " 9 | \n",
+ " 845 | \n",
+ " 970 | \n",
+ " 845 | \n",
+ " 845 | \n",
+ " 970 | \n",
+ " ... | \n",
+ " -0.087364 | \n",
+ " -0.97526 | \n",
+ " 2.72760 | \n",
+ " 6.7332 | \n",
+ " 3.3709 | \n",
+ " -1.63900 | \n",
+ " 1.53450 | \n",
+ " 0.690930 | \n",
+ " 1.1301 | \n",
+ " -2.7376 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " OAA01.02.03.04.A | \n",
+ " ccsbBroad304_13129 | \n",
+ " BR00117035 | \n",
+ " A02 | \n",
+ " 9 | \n",
+ " 873 | \n",
+ " 988 | \n",
+ " 873 | \n",
+ " 873 | \n",
+ " 988 | \n",
+ " ... | \n",
+ " 0.198090 | \n",
+ " -1.85970 | \n",
+ " 1.19280 | \n",
+ " 5.3221 | \n",
+ " 2.9869 | \n",
+ " -1.13510 | \n",
+ " 6.72200 | \n",
+ " 2.194400 | \n",
+ " 2.5605 | \n",
+ " -2.2424 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " OAA01.02.03.04.A | \n",
+ " ccsbBroad304_00289 | \n",
+ " BR00117035 | \n",
+ " A03 | \n",
+ " 9 | \n",
+ " 889 | \n",
+ " 989 | \n",
+ " 889 | \n",
+ " 889 | \n",
+ " 989 | \n",
+ " ... | \n",
+ " 0.619550 | \n",
+ " -2.28570 | \n",
+ " 0.54443 | \n",
+ " 3.2157 | \n",
+ " 2.8953 | \n",
+ " 0.21932 | \n",
+ " 4.92190 | \n",
+ " 2.838000 | \n",
+ " 4.3082 | \n",
+ " -1.5694 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " OAA01.02.03.04.A | \n",
+ " ccsbBroad304_99988 | \n",
+ " BR00117035 | \n",
+ " A04 | \n",
+ " 9 | \n",
+ " 898 | \n",
+ " 995 | \n",
+ " 898 | \n",
+ " 898 | \n",
+ " 995 | \n",
+ " ... | \n",
+ " -0.230090 | \n",
+ " -0.87500 | \n",
+ " 0.46420 | \n",
+ " 2.5104 | \n",
+ " 2.3601 | \n",
+ " -0.57261 | \n",
+ " 0.37809 | \n",
+ " 0.672830 | \n",
+ " 1.5367 | \n",
+ " -1.3152 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " OAA01.02.03.04.A | \n",
+ " ccsbBroad304_07679 | \n",
+ " BR00117035 | \n",
+ " A05 | \n",
+ " 9 | \n",
+ " 876 | \n",
+ " 982 | \n",
+ " 876 | \n",
+ " 876 | \n",
+ " 982 | \n",
+ " ... | \n",
+ " -1.605600 | \n",
+ " -1.79160 | \n",
+ " 0.64160 | \n",
+ " 2.5965 | \n",
+ " 3.3969 | \n",
+ " -0.80713 | \n",
+ " -0.97239 | \n",
+ " 0.032586 | \n",
+ " 1.3970 | \n",
+ " -2.0045 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 1478 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Metadata_plate_map_name Metadata_broad_sample Metadata_Plate Metadata_Well \\\n",
+ "0 OAA01.02.03.04.A ccsbBroad304_05979 BR00117035 A01 \n",
+ "1 OAA01.02.03.04.A ccsbBroad304_13129 BR00117035 A02 \n",
+ "2 OAA01.02.03.04.A ccsbBroad304_00289 BR00117035 A03 \n",
+ "3 OAA01.02.03.04.A ccsbBroad304_99988 BR00117035 A04 \n",
+ "4 OAA01.02.03.04.A ccsbBroad304_07679 BR00117035 A05 \n",
+ "\n",
+ " Metadata_Site_Count Metadata_Count_Cells \\\n",
+ "0 9 845 \n",
+ "1 9 873 \n",
+ "2 9 889 \n",
+ "3 9 898 \n",
+ "4 9 876 \n",
+ "\n",
+ " Metadata_Count_CellsIncludingEdges Metadata_Count_Cytoplasm \\\n",
+ "0 970 845 \n",
+ "1 988 873 \n",
+ "2 989 889 \n",
+ "3 995 898 \n",
+ "4 982 876 \n",
+ "\n",
+ " Metadata_Count_Nuclei Metadata_Count_NucleiIncludingEdges ... \\\n",
+ "0 845 970 ... \n",
+ "1 873 988 ... \n",
+ "2 889 989 ... \n",
+ "3 898 995 ... \n",
+ "4 876 982 ... \n",
+ "\n",
+ " Nuclei_Texture_InfoMeas2_Brightfield_3_02_256 \\\n",
+ "0 -0.087364 \n",
+ "1 0.198090 \n",
+ "2 0.619550 \n",
+ "3 -0.230090 \n",
+ "4 -1.605600 \n",
+ "\n",
+ " Nuclei_Texture_InfoMeas2_Mito_3_02_256 \\\n",
+ "0 -0.97526 \n",
+ "1 -1.85970 \n",
+ "2 -2.28570 \n",
+ "3 -0.87500 \n",
+ "4 -1.79160 \n",
+ "\n",
+ " Nuclei_Texture_InverseDifferenceMoment_AGP_3_02_256 \\\n",
+ "0 2.72760 \n",
+ "1 1.19280 \n",
+ "2 0.54443 \n",
+ "3 0.46420 \n",
+ "4 0.64160 \n",
+ "\n",
+ " Nuclei_Texture_InverseDifferenceMoment_DNA_3_02_256 \\\n",
+ "0 6.7332 \n",
+ "1 5.3221 \n",
+ "2 3.2157 \n",
+ "3 2.5104 \n",
+ "4 2.5965 \n",
+ "\n",
+ " Nuclei_Texture_InverseDifferenceMoment_Mito_10_03_256 \\\n",
+ "0 3.3709 \n",
+ "1 2.9869 \n",
+ "2 2.8953 \n",
+ "3 2.3601 \n",
+ "4 3.3969 \n",
+ "\n",
+ " Nuclei_Texture_SumVariance_AGP_10_03_256 \\\n",
+ "0 -1.63900 \n",
+ "1 -1.13510 \n",
+ "2 0.21932 \n",
+ "3 -0.57261 \n",
+ "4 -0.80713 \n",
+ "\n",
+ " Nuclei_Texture_SumVariance_BFHigh_3_03_256 \\\n",
+ "0 1.53450 \n",
+ "1 6.72200 \n",
+ "2 4.92190 \n",
+ "3 0.37809 \n",
+ "4 -0.97239 \n",
+ "\n",
+ " Nuclei_Texture_SumVariance_BFLow_3_00_256 \\\n",
+ "0 0.690930 \n",
+ "1 2.194400 \n",
+ "2 2.838000 \n",
+ "3 0.672830 \n",
+ "4 0.032586 \n",
+ "\n",
+ " Nuclei_Texture_SumVariance_Brightfield_3_03_256 \\\n",
+ "0 1.1301 \n",
+ "1 2.5605 \n",
+ "2 4.3082 \n",
+ "3 1.5367 \n",
+ "4 1.3970 \n",
+ "\n",
+ " Nuclei_Texture_SumVariance_ER_10_01_256 \n",
+ "0 -2.7376 \n",
+ "1 -2.2424 \n",
+ "2 -1.5694 \n",
+ "3 -1.3152 \n",
+ "4 -2.0045 \n",
+ "\n",
+ "[5 rows x 1478 columns]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "How many columns were dropped? 19\n"
+ ]
+ }
+ ],
+ "source": [
+ "# feature-select the data\n",
+ "\n",
+ "import pycytominer\n",
+ "df_selected = pycytominer.feature_select(df, operation = ['correlation_threshold', 'variance_threshold', 'drop_na_columns', 'blocklist','drop_outliers'], outlier_cutoff = 500)\n",
+ "print('How many columns were dropped?',df.shape[1] - df_selected.shape[1])\n",
+ "df_final = df_selected.loc[:,~df_selected.columns.duplicated()].copy()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_final.to_parquet(f\"JUMP_ORF_all.parquet\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_parquet = pd.read_parquet(f\"JUMP_ORF_all.parquet\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Metadata_broad_sample',\n",
+ " 'Metadata_Name',\n",
+ " 'Metadata_Vector',\n",
+ " 'Metadata_Transcript',\n",
+ " 'Metadata_Symbol',\n",
+ " 'Metadata_NCBI Gene ID',\n",
+ " 'Metadata_Taxon ID',\n",
+ " 'Metadata_Gene Description',\n",
+ " 'Metadata_Annot. Gene Symbol',\n",
+ " 'Metadata_Annot. Gene ID',\n",
+ " 'Metadata_Prot Match %',\n",
+ " 'Metadata_MOI',\n",
+ " 'Metadata_Virus / ml',\n",
+ " 'Metadata_Insert Length',\n",
+ " 'Metadata_pert_type',\n",
+ " 'Metadata_control_type',\n",
+ " 'Cells_AreaShape_FormFactor']"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# identify columns with NaN values\n",
+ "[col for col in df_selected.columns if df[col].isnull().values.any()]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# remove measurement column with NaNs\n",
+ "df_selected.drop(columns='Cells_AreaShape_FormFactor', inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Computing compound plate replicability...\n"
+ ]
+ },
+ {
+ "ename": "ValueError",
+ "evalue": "Input contains NaN.",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[5], line 24\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[39m# Calculate replicability mAP\u001b[39;00m\n\u001b[1;32m 23\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39mf\u001b[39m\u001b[39m'\u001b[39m\u001b[39mComputing \u001b[39m\u001b[39m{\u001b[39;00mdescription\u001b[39m}\u001b[39;00m\u001b[39m replicability...\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[0;32m---> 24\u001b[0m precision \u001b[39m=\u001b[39m utilitary\u001b[39m.\u001b[39;49mPrecisionScores(all_plates_df, all_plates_df, feature_to_group_by, \u001b[39m\"\u001b[39;49m\u001b[39mreplicability\u001b[39;49m\u001b[39m\"\u001b[39;49m, feature_to_group_by, within\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m, against_negcon\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\n\u001b[1;32m 26\u001b[0m replicability_ap_df \u001b[39m=\u001b[39m precision\u001b[39m.\u001b[39map_group\n\u001b[1;32m 27\u001b[0m replicability_map \u001b[39m=\u001b[39m precision\u001b[39m.\u001b[39mmap\n",
+ "File \u001b[0;32m~/Documents/GitHub/jump-orf-data/analysis_notebook/utilitary.py:248\u001b[0m, in \u001b[0;36mPrecisionScores.__init__\u001b[0;34m(self, profile1, profile2, group_by_feature, mode, identify_perturbation_feature, within, anti_correlation, against_negcon)\u001b[0m\n\u001b[1;32m 245\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmap1 \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprofile1[[\u001b[39mself\u001b[39m\u001b[39m.\u001b[39midentify_perturbation_feature, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfeature, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msample_id_feature, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcontrol_type_feature]]\u001b[39m.\u001b[39mcopy()\n\u001b[1;32m 246\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmap2 \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprofile2[[\u001b[39mself\u001b[39m\u001b[39m.\u001b[39midentify_perturbation_feature, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfeature, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msample_id_feature, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcontrol_type_feature]]\u001b[39m.\u001b[39mcopy()\n\u001b[0;32m--> 248\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcorr \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mcompute_correlation()\n\u001b[1;32m 249\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtruth_matrix \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcreate_truth_matrix()\n\u001b[1;32m 250\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcleanup()\n",
+ "File \u001b[0;32m~/Documents/GitHub/jump-orf-data/analysis_notebook/utilitary.py:292\u001b[0m, in \u001b[0;36mPrecisionScores.compute_correlation\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 290\u001b[0m _sample_names_1 \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprofile1[\u001b[39mself\u001b[39m\u001b[39m.\u001b[39msample_id_feature])\n\u001b[1;32m 291\u001b[0m _sample_names_2 \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprofile2[\u001b[39mself\u001b[39m\u001b[39m.\u001b[39msample_id_feature])\n\u001b[0;32m--> 292\u001b[0m _corr \u001b[39m=\u001b[39m cosine_similarity(_profile1, _profile2)\n\u001b[1;32m 293\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39manti_correlation:\n\u001b[1;32m 294\u001b[0m _corr \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39mabs(_corr)\n",
+ "File \u001b[0;32m~/mambaforge/envs/jumpORF/lib/python3.9/site-packages/sklearn/metrics/pairwise.py:1393\u001b[0m, in \u001b[0;36mcosine_similarity\u001b[0;34m(X, Y, dense_output)\u001b[0m\n\u001b[1;32m 1358\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"Compute cosine similarity between samples in X and Y.\u001b[39;00m\n\u001b[1;32m 1359\u001b[0m \n\u001b[1;32m 1360\u001b[0m \u001b[39mCosine similarity, or the cosine kernel, computes similarity as the\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1389\u001b[0m \u001b[39m Returns the cosine similarity between samples in X and Y.\u001b[39;00m\n\u001b[1;32m 1390\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 1391\u001b[0m \u001b[39m# to avoid recursive import\u001b[39;00m\n\u001b[0;32m-> 1393\u001b[0m X, Y \u001b[39m=\u001b[39m check_pairwise_arrays(X, Y)\n\u001b[1;32m 1395\u001b[0m X_normalized \u001b[39m=\u001b[39m normalize(X, copy\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[1;32m 1396\u001b[0m \u001b[39mif\u001b[39;00m X \u001b[39mis\u001b[39;00m Y:\n",
+ "File \u001b[0;32m~/mambaforge/envs/jumpORF/lib/python3.9/site-packages/sklearn/metrics/pairwise.py:155\u001b[0m, in \u001b[0;36mcheck_pairwise_arrays\u001b[0;34m(X, Y, precomputed, dtype, accept_sparse, force_all_finite, copy)\u001b[0m\n\u001b[1;32m 146\u001b[0m X \u001b[39m=\u001b[39m Y \u001b[39m=\u001b[39m check_array(\n\u001b[1;32m 147\u001b[0m X,\n\u001b[1;32m 148\u001b[0m accept_sparse\u001b[39m=\u001b[39maccept_sparse,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 152\u001b[0m estimator\u001b[39m=\u001b[39mestimator,\n\u001b[1;32m 153\u001b[0m )\n\u001b[1;32m 154\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 155\u001b[0m X \u001b[39m=\u001b[39m check_array(\n\u001b[1;32m 156\u001b[0m X,\n\u001b[1;32m 157\u001b[0m accept_sparse\u001b[39m=\u001b[39;49maccept_sparse,\n\u001b[1;32m 158\u001b[0m dtype\u001b[39m=\u001b[39;49mdtype,\n\u001b[1;32m 159\u001b[0m copy\u001b[39m=\u001b[39;49mcopy,\n\u001b[1;32m 160\u001b[0m force_all_finite\u001b[39m=\u001b[39;49mforce_all_finite,\n\u001b[1;32m 161\u001b[0m estimator\u001b[39m=\u001b[39;49mestimator,\n\u001b[1;32m 162\u001b[0m )\n\u001b[1;32m 163\u001b[0m Y \u001b[39m=\u001b[39m check_array(\n\u001b[1;32m 164\u001b[0m Y,\n\u001b[1;32m 165\u001b[0m accept_sparse\u001b[39m=\u001b[39maccept_sparse,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 169\u001b[0m estimator\u001b[39m=\u001b[39mestimator,\n\u001b[1;32m 170\u001b[0m )\n\u001b[1;32m 172\u001b[0m \u001b[39mif\u001b[39;00m precomputed:\n",
+ "File \u001b[0;32m~/mambaforge/envs/jumpORF/lib/python3.9/site-packages/sklearn/utils/validation.py:921\u001b[0m, in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[1;32m 915\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 916\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mFound array with dim \u001b[39m\u001b[39m%d\u001b[39;00m\u001b[39m. \u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m expected <= 2.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 917\u001b[0m \u001b[39m%\u001b[39m (array\u001b[39m.\u001b[39mndim, estimator_name)\n\u001b[1;32m 918\u001b[0m )\n\u001b[1;32m 920\u001b[0m \u001b[39mif\u001b[39;00m force_all_finite:\n\u001b[0;32m--> 921\u001b[0m _assert_all_finite(\n\u001b[1;32m 922\u001b[0m array,\n\u001b[1;32m 923\u001b[0m input_name\u001b[39m=\u001b[39;49minput_name,\n\u001b[1;32m 924\u001b[0m estimator_name\u001b[39m=\u001b[39;49mestimator_name,\n\u001b[1;32m 925\u001b[0m allow_nan\u001b[39m=\u001b[39;49mforce_all_finite \u001b[39m==\u001b[39;49m \u001b[39m\"\u001b[39;49m\u001b[39mallow-nan\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m 926\u001b[0m )\n\u001b[1;32m 928\u001b[0m \u001b[39mif\u001b[39;00m ensure_min_samples \u001b[39m>\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[1;32m 929\u001b[0m n_samples \u001b[39m=\u001b[39m _num_samples(array)\n",
+ "File \u001b[0;32m~/mambaforge/envs/jumpORF/lib/python3.9/site-packages/sklearn/utils/validation.py:161\u001b[0m, in \u001b[0;36m_assert_all_finite\u001b[0;34m(X, allow_nan, msg_dtype, estimator_name, input_name)\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[39mif\u001b[39;00m estimator_name \u001b[39mand\u001b[39;00m input_name \u001b[39m==\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mX\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mand\u001b[39;00m has_nan_error:\n\u001b[1;32m 145\u001b[0m \u001b[39m# Improve the error message on how to handle missing values in\u001b[39;00m\n\u001b[1;32m 146\u001b[0m \u001b[39m# scikit-learn.\u001b[39;00m\n\u001b[1;32m 147\u001b[0m msg_err \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m (\n\u001b[1;32m 148\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m{\u001b[39;00mestimator_name\u001b[39m}\u001b[39;00m\u001b[39m does not accept missing values\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 149\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m encoded as NaN natively. For supervised learning, you might want\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m#estimators-that-handle-nan-values\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 160\u001b[0m )\n\u001b[0;32m--> 161\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(msg_err)\n",
+ "\u001b[0;31mValueError\u001b[0m: Input contains NaN."
+ ]
+ }
+ ],
+ "source": [
+ "# this one does not quite work yet!\n",
+ "import utilitary\n",
+ "\n",
+ "# get replicability - setup\n",
+ "replicability_ap_df = pd.DataFrame()\n",
+ "matching_ap_df = pd.DataFrame()\n",
+ "\n",
+ "#add metadata_control_type column\n",
+ "all_plates_df = df_selected.copy()\n",
+ "all_plates_df['Metadata_control_type'] = all_plates_df['Metadata_control_type'].fillna('')\n",
+ "# all_plates_df['Metadata_control_type'] = ''\n",
+ "# cmpd = all_plates_df['Metadata_Compound'].values\n",
+ "# ctrl = all_plates_df['Metadata_control_type'].values\n",
+ "# for vals in range(len(cmpd)):\n",
+ "# if cmpd[vals] == \"DMSO\":\n",
+ "# ctrl[vals] = 'negcon'\n",
+ "\n",
+ "\n",
+ "feature_to_group_by = 'Metadata_Symbol'\n",
+ "# Description\n",
+ "description = f'compound plate'\n",
+ "\n",
+ "# Calculate replicability mAP\n",
+ "print(f'Computing {description} replicability...')\n",
+ "precision = utilitary.PrecisionScores(all_plates_df, all_plates_df, feature_to_group_by, \"replicability\", feature_to_group_by, within=True, against_negcon=True)\n",
+ "\n",
+ "replicability_ap_df = precision.ap_group\n",
+ "replicability_map = precision.map\n",
+ "\n",
+ "replicability_ap_df.head()\n",
+ "# Construct a random baseline\n",
+ "\n",
+ "# Filter the dataframe to only include ORFs that have > 95 percentile of the random baseline (aka, <5% chance of seeing that mAP or something more extreme under the null hypothesis that replicability is random)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Collapse the dataframe within genes\n",
+ "\n",
+ "* Median collapse into 1 row per gene (most genes have 5 replicate ORFs) --> data goes down to ~12600 rows\n",
+ "* Metadata_Symbol is the gene name\n",
+ "* Note that the controls include BFP, HcRed, Luciferase, and LacZ (eGFP is excluded)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Parameters (to be moved to the top of the notebook)\n",
+ "aggregation_type =\"median\"\n",
+ "\n",
+ "\n",
+ "#which control types do you want to include? \n",
+ "controltypes_orf = ['negcon', 'poscon']\n",
+ "controltypes = ['negcon', 'poscon_cp', 'poscon_orf', 'poscon_diverse']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#filter to gene of interest\n",
+ "df_subset_orf = df.loc[df['Metadata_Symbol'].isin(gene_list)].reset_index(drop=True)\n",
+ "\n",
+ "# get controls \n",
+ "df_subset_orf_con = df.loc[df['Metadata_control_type'].isin(controltypes_orf)].reset_index(drop=True)\n",
+ "\n",
+ "#get target 2 data\n",
+ "df_subset_t2 = df_t2.loc[df_t2['Metadata_control_type'].isin(controltypes)].reset_index(drop=True)\n",
+ "df_subset_t2['Metadata_broad_sample'] = df_subset_t2['Metadata_broad_sample'].fillna('empty')\n",
+ "\n",
+ "# aggregate\n",
+ "if aggregation_type == \"mean\":\n",
+ " df_subset_orf = df_subset_orf.groupby('Metadata_Symbol').mean(numeric_only=True).reset_index(drop=True)\n",
+ " df_subset_orf_con = df_subset_orf_con.groupby(['Metadata_control_type','Metadata_broad_sample']).mean(numeric_only=True).reset_index(drop=True)\n",
+ " df_subset_t2 = df_subset_t2.groupby(['Metadata_broad_sample','Metadata_control_type']).mean(numeric_only=True).reset_index(drop=True)\n",
+ "\n",
+ "elif aggregation_type == \"median\":\n",
+ " df_subset_orf = df_subset_orf.groupby('Metadata_Symbol').median().reset_index(drop=True)\n",
+ " df_subset_orf_con = df_subset_orf_con.groupby(['Metadata_control_type','Metadata_broad_sample']).median().reset_index(drop=True)\n",
+ " df_subset_t2 = df_subset_t2.groupby(['Metadata_broad_sample','Metadata_control_type']).median().reset_index(drop=True)\n",
+ "\n",
+ "df_subset_orf['Metadata_data_source'] = 'ORF'\n",
+ "df_subset_orf_con['Metadata_data_source'] = 'ORF'\n",
+ "df_subset_t2['Metadata_data_source'] = 'T2'\n",
+ "\n",
+ "\n",
+ "#merge the separate subsets together\n",
+ "df_subset = pd.concat([df_subset_orf,df_subset_orf_con,df_subset_t2], ignore_index=True)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## for all genes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/0y/5d1shp9n6kq856jtm9t10mcw0000gq/T/ipykernel_42906/2054350032.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.median is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.\n",
+ " df_subset_orf = df.groupby('Metadata_Symbol').median().reset_index(drop=True)\n",
+ "/var/folders/0y/5d1shp9n6kq856jtm9t10mcw0000gq/T/ipykernel_42906/2054350032.py:12: FutureWarning: The default value of numeric_only in DataFrameGroupBy.median is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.\n",
+ " df_subset_orf_con = df.groupby(['Metadata_control_type','Metadata_broad_sample']).median().reset_index(drop=True)\n",
+ "/var/folders/0y/5d1shp9n6kq856jtm9t10mcw0000gq/T/ipykernel_42906/2054350032.py:13: FutureWarning: The default value of numeric_only in DataFrameGroupBy.median is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.\n",
+ " df_subset_t2 = df_t2.groupby(['Metadata_broad_sample','Metadata_control_type']).median().reset_index(drop=True)\n"
+ ]
+ }
+ ],
+ "source": [
+ "#fill nas in Metadata_broad_sample column to keep untreated negcons\n",
+ "df_t2['Metadata_broad_sample'] = df_t2['Metadata_broad_sample'].fillna('empty')\n",
+ "\n",
+ "# aggregate\n",
+ "if aggregation_type == \"mean\":\n",
+ " df_subset_orf = df.groupby('Metadata_Symbol').mean(numeric_only=True).reset_index(drop=True)\n",
+ " df_subset_orf_con = df.groupby(['Metadata_control_type','Metadata_broad_sample']).mean(numeric_only=True).reset_index(drop=True)\n",
+ " df_subset_t2 = df_t2.groupby(['Metadata_broad_sample','Metadata_control_type']).mean(numeric_only=True).reset_index(drop=True)\n",
+ "\n",
+ "elif aggregation_type == \"median\":\n",
+ " df_subset_orf = df.groupby('Metadata_Symbol').median(numeric_only=True).reset_index(drop=True)\n",
+ " df_subset_orf_con = df.groupby(['Metadata_control_type','Metadata_broad_sample']).median(numeric_only=True).reset_index(drop=True)\n",
+ " df_subset_t2 = df_t2.groupby(['Metadata_broad_sample','Metadata_control_type']).median(numeric_only=True).reset_index(drop=True)\n",
+ "\n",
+ "df_subset_orf['Metadata_data_source'] = 'ORF'\n",
+ "df_subset_orf_con['Metadata_data_source'] = 'ORF'\n",
+ "df_subset_t2['Metadata_data_source'] = 'T2'\n",
+ "\n",
+ "\n",
+ "#merge the separate subsets together\n",
+ "df_collapsed = pd.concat([df_subset_orf,df_subset_orf_con,df_subset_t2], ignore_index=True)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Put metadata back in the dataframe"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pycytominer\n",
+ "\n",
+ "metadata_column_list = ['Metadata_Symbol',\n",
+ " 'Metadata_control_type', \n",
+ " 'Metadata_broad_sample',\n",
+ " 'Metadata_plate_map_name',\n",
+ " 'Metadata_Plate', \n",
+ " 'Metadata_Name', \n",
+ " 'Metadata_Vector',\n",
+ " 'Metadata_Transcript', \n",
+ " 'Metadata_NCBI Gene ID', \n",
+ " 'Metadata_Taxon ID',\n",
+ " 'Metadata_Gene Description',\n",
+ " 'Metadata_Annot. Gene Symbol',\n",
+ " 'Metadata_Annot. Gene ID',\n",
+ " 'Metadata_Prot Match %',\n",
+ " 'Metadata_MOI',\n",
+ " 'Metadata_Virus / ml',\n",
+ " 'Metadata_Insert Length',\n",
+ " 'Metadata_pert_type',]\n",
+ "#aggregate ORF\n",
+ "df_ORF_aggregated = pycytominer.aggregate(df, \n",
+ " strata=metadata_column_list,\n",
+ " features=\"infer\",\n",
+ " operation=\"mean\",\n",
+ " output_file=\"none\",\n",
+ " compute_object_count=False,\n",
+ " object_feature=\"Metadata_ObjectNumber\",\n",
+ " subset_data_df=\"none\",\n",
+ " compression_options=None,\n",
+ " float_format=None,)\n",
+ "\n",
+ "#aggregate t2 plates from ORF batches\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Metadata_Well',\n",
+ " 'Metadata_Site_Count',\n",
+ " 'Metadata_Count_Cells',\n",
+ " 'Metadata_Count_CellsIncludingEdges',\n",
+ " 'Metadata_Count_Cytoplasm',\n",
+ " 'Metadata_Count_Nuclei',\n",
+ " 'Metadata_Count_NucleiIncludingEdges',\n",
+ " 'Metadata_Object_Count',\n",
+ " 'Image_Granularity_10_AGP',\n",
+ " 'Image_Granularity_10_BFHigh',\n",
+ " 'Image_Granularity_10_BFLow',\n",
+ " 'Image_Granularity_10_Brightfield',\n",
+ " 'Image_Granularity_10_DNA',\n",
+ " 'Image_Granularity_10_ER',\n",
+ " 'Image_Granularity_10_Mito',\n",
+ " 'Image_Granularity_10_RNA',\n",
+ " 'Image_Granularity_11_AGP',\n",
+ " 'Image_Granularity_11_BFHigh',\n",
+ " 'Image_Granularity_11_BFLow',\n",
+ " 'Image_Granularity_11_Brightfield',\n",
+ " 'Image_Granularity_11_DNA',\n",
+ " 'Image_Granularity_11_ER',\n",
+ " 'Image_Granularity_11_Mito',\n",
+ " 'Image_Granularity_11_RNA',\n",
+ " 'Image_Granularity_12_AGP',\n",
+ " 'Image_Granularity_12_BFHigh',\n",
+ " 'Image_Granularity_12_BFLow',\n",
+ " 'Image_Granularity_12_Brightfield',\n",
+ " 'Image_Granularity_12_ER',\n",
+ " 'Image_Granularity_12_Mito',\n",
+ " 'Image_Granularity_12_RNA',\n",
+ " 'Image_Granularity_13_AGP',\n",
+ " 'Image_Granularity_13_BFHigh',\n",
+ " 'Image_Granularity_13_BFLow',\n",
+ " 'Image_Granularity_13_Brightfield',\n",
+ " 'Image_Granularity_13_ER',\n",
+ " 'Image_Granularity_13_Mito',\n",
+ " 'Image_Granularity_13_RNA',\n",
+ " 'Image_Granularity_14_AGP',\n",
+ " 'Image_Granularity_14_BFHigh',\n",
+ " 'Image_Granularity_14_BFLow',\n",
+ " 'Image_Granularity_14_Brightfield',\n",
+ " 'Image_Granularity_14_ER',\n",
+ " 'Image_Granularity_14_Mito',\n",
+ " 'Image_Granularity_14_RNA',\n",
+ " 'Image_Granularity_15_AGP',\n",
+ " 'Image_Granularity_15_BFHigh',\n",
+ " 'Image_Granularity_15_BFLow',\n",
+ " 'Image_Granularity_15_Brightfield',\n",
+ " 'Image_Granularity_15_DNA',\n",
+ " 'Image_Granularity_15_ER',\n",
+ " 'Image_Granularity_15_Mito',\n",
+ " 'Image_Granularity_15_RNA',\n",
+ " 'Image_Granularity_16_AGP',\n",
+ " 'Image_Granularity_16_BFHigh',\n",
+ " 'Image_Granularity_16_BFLow',\n",
+ " 'Image_Granularity_16_Brightfield',\n",
+ " 'Image_Granularity_16_DNA',\n",
+ " 'Image_Granularity_16_ER',\n",
+ " 'Image_Granularity_16_Mito',\n",
+ " 'Image_Granularity_16_RNA',\n",
+ " 'Image_Granularity_1_BFHigh',\n",
+ " 'Image_Granularity_1_BFLow',\n",
+ " 'Image_Granularity_1_Brightfield',\n",
+ " 'Image_Granularity_1_DNA',\n",
+ " 'Image_Granularity_1_Mito',\n",
+ " 'Image_Granularity_1_RNA',\n",
+ " 'Image_Granularity_2_AGP',\n",
+ " 'Image_Granularity_2_BFHigh',\n",
+ " 'Image_Granularity_2_BFLow',\n",
+ " 'Image_Granularity_2_Brightfield',\n",
+ " 'Image_Granularity_2_DNA',\n",
+ " 'Image_Granularity_2_ER',\n",
+ " 'Image_Granularity_2_Mito',\n",
+ " 'Image_Granularity_2_RNA',\n",
+ " 'Image_Granularity_3_AGP',\n",
+ " 'Image_Granularity_3_BFHigh',\n",
+ " 'Image_Granularity_3_BFLow',\n",
+ " 'Image_Granularity_3_Brightfield',\n",
+ " 'Image_Granularity_3_ER',\n",
+ " 'Image_Granularity_3_Mito',\n",
+ " 'Image_Granularity_3_RNA',\n",
+ " 'Image_Granularity_4_AGP',\n",
+ " 'Image_Granularity_4_BFHigh',\n",
+ " 'Image_Granularity_4_BFLow',\n",
+ " 'Image_Granularity_4_Brightfield',\n",
+ " 'Image_Granularity_4_DNA',\n",
+ " 'Image_Granularity_4_ER',\n",
+ " 'Image_Granularity_4_Mito',\n",
+ " 'Image_Granularity_4_RNA',\n",
+ " 'Image_Granularity_5_AGP',\n",
+ " 'Image_Granularity_5_BFHigh',\n",
+ " 'Image_Granularity_5_BFLow',\n",
+ " 'Image_Granularity_5_Brightfield',\n",
+ " 'Image_Granularity_5_DNA',\n",
+ " 'Image_Granularity_5_ER',\n",
+ " 'Image_Granularity_5_Mito',\n",
+ " 'Image_Granularity_5_RNA',\n",
+ " 'Image_Granularity_6_AGP',\n",
+ " 'Image_Granularity_6_BFHigh',\n",
+ " 'Image_Granularity_6_BFLow',\n",
+ " 'Image_Granularity_6_Brightfield',\n",
+ " 'Image_Granularity_6_DNA',\n",
+ " 'Image_Granularity_6_ER',\n",
+ " 'Image_Granularity_6_Mito',\n",
+ " 'Image_Granularity_6_RNA',\n",
+ " 'Image_Granularity_7_AGP',\n",
+ " 'Image_Granularity_7_BFHigh',\n",
+ " 'Image_Granularity_7_BFLow',\n",
+ " 'Image_Granularity_7_Brightfield',\n",
+ " 'Image_Granularity_7_DNA',\n",
+ " 'Image_Granularity_7_ER',\n",
+ " 'Image_Granularity_7_Mito',\n",
+ " 'Image_Granularity_7_RNA',\n",
+ " 'Image_Granularity_8_AGP',\n",
+ " 'Image_Granularity_8_BFHigh',\n",
+ " 'Image_Granularity_8_BFLow',\n",
+ " 'Image_Granularity_8_Brightfield',\n",
+ " 'Image_Granularity_8_DNA',\n",
+ " 'Image_Granularity_8_ER',\n",
+ " 'Image_Granularity_8_Mito',\n",
+ " 'Image_Granularity_8_RNA',\n",
+ " 'Image_Granularity_9_AGP',\n",
+ " 'Image_Granularity_9_BFHigh',\n",
+ " 'Image_Granularity_9_BFLow',\n",
+ " 'Image_Granularity_9_Brightfield',\n",
+ " 'Image_Granularity_9_DNA',\n",
+ " 'Image_Granularity_9_ER',\n",
+ " 'Image_Granularity_9_Mito',\n",
+ " 'Image_Granularity_9_RNA',\n",
+ " 'Image_ImageQuality_Correlation_OrigAGP_5',\n",
+ " 'Image_ImageQuality_Correlation_OrigAGP_50',\n",
+ " 'Image_ImageQuality_Correlation_OrigBrightfield_50',\n",
+ " 'Image_ImageQuality_Correlation_OrigBrightfield_H_50',\n",
+ " 'Image_ImageQuality_Correlation_OrigBrightfield_L_50',\n",
+ " 'Image_ImageQuality_Correlation_OrigDNA_20',\n",
+ " 'Image_ImageQuality_Correlation_OrigDNA_5',\n",
+ " 'Image_ImageQuality_Correlation_OrigDNA_50',\n",
+ " 'Image_ImageQuality_Correlation_OrigER_10',\n",
+ " 'Image_ImageQuality_Correlation_OrigER_5',\n",
+ " 'Image_ImageQuality_Correlation_OrigER_50',\n",
+ " 'Image_ImageQuality_Correlation_OrigMito_5',\n",
+ " 'Image_ImageQuality_Correlation_OrigMito_50',\n",
+ " 'Image_ImageQuality_Correlation_OrigRNA_10',\n",
+ " 'Image_ImageQuality_Correlation_OrigRNA_20',\n",
+ " 'Image_ImageQuality_Correlation_OrigRNA_5',\n",
+ " 'Image_ImageQuality_FocusScore_OrigAGP',\n",
+ " 'Image_ImageQuality_FocusScore_OrigBrightfield',\n",
+ " 'Image_ImageQuality_FocusScore_OrigDNA',\n",
+ " 'Image_ImageQuality_FocusScore_OrigER',\n",
+ " 'Image_ImageQuality_FocusScore_OrigMito',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigAGP_10',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigAGP_20',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigAGP_5',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigAGP_50',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_20',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_50',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_H_10',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_H_20',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_H_5',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_H_50',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_L_10',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_L_20',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_L_5',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigBrightfield_L_50',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigDNA_10',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigDNA_20',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigDNA_5',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigDNA_50',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigER_20',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigMito_10',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigMito_20',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigMito_5',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigMito_50',\n",
+ " 'Image_ImageQuality_LocalFocusScore_OrigRNA_50',\n",
+ " 'Image_ImageQuality_MADIntensity_OrigDNA',\n",
+ " 'Image_ImageQuality_MaxIntensity_OrigAGP',\n",
+ " 'Image_ImageQuality_MaxIntensity_OrigBrightfield_H',\n",
+ " 'Image_ImageQuality_MaxIntensity_OrigDNA',\n",
+ " 'Image_ImageQuality_MaxIntensity_OrigER',\n",
+ " 'Image_ImageQuality_MaxIntensity_OrigMito',\n",
+ " 'Image_ImageQuality_MaxIntensity_OrigRNA',\n",
+ " 'Image_ImageQuality_MinIntensity_OrigBrightfield',\n",
+ " 'Image_ImageQuality_MinIntensity_OrigBrightfield_H',\n",
+ " 'Image_ImageQuality_MinIntensity_OrigBrightfield_L',\n",
+ " 'Image_ImageQuality_MinIntensity_OrigER',\n",
+ " 'Image_ImageQuality_MinIntensity_OrigMito',\n",
+ " 'Image_ImageQuality_MinIntensity_OrigRNA',\n",
+ " 'Image_ImageQuality_PowerLogLogSlope_OrigAGP',\n",
+ " 'Image_ImageQuality_PowerLogLogSlope_OrigBrightfield',\n",
+ " 'Image_ImageQuality_PowerLogLogSlope_OrigBrightfield_H',\n",
+ " 'Image_ImageQuality_PowerLogLogSlope_OrigBrightfield_L',\n",
+ " 'Image_ImageQuality_PowerLogLogSlope_OrigDNA',\n",
+ " 'Image_ImageQuality_PowerLogLogSlope_OrigER',\n",
+ " 'Image_ImageQuality_PowerLogLogSlope_OrigMito',\n",
+ " 'Image_ImageQuality_PowerLogLogSlope_OrigRNA',\n",
+ " 'Image_ImageQuality_StdIntensity_OrigAGP',\n",
+ " 'Image_ImageQuality_StdIntensity_OrigBrightfield',\n",
+ " 'Image_ImageQuality_ThresholdOtsu_OrigDNA_2W',\n",
+ " 'Image_ImageQuality_TotalIntensity_OrigDNA',\n",
+ " 'Image_Intensity_LowerQuartileIntensity_AGP',\n",
+ " 'Image_Intensity_LowerQuartileIntensity_AGP__BackgroundOnly',\n",
+ " 'Image_Intensity_MADIntensity_AGP',\n",
+ " 'Image_Intensity_MADIntensity_AGP__BackgroundOnly',\n",
+ " 'Image_Intensity_MADIntensity_Brightfield_BackgroundOnly',\n",
+ " 'Image_Intensity_MADIntensity_DNA',\n",
+ " 'Image_Intensity_MADIntensity_DNA_BackgroundOnly',\n",
+ " 'Image_Intensity_MADIntensity_ER__BackgroundOnly',\n",
+ " 'Image_Intensity_MADIntensity_Mito',\n",
+ " 'Image_Intensity_MADIntensity_RNA',\n",
+ " 'Image_Intensity_MaxIntensity_AGP',\n",
+ " 'Image_Intensity_MaxIntensity_AGP__BackgroundOnly',\n",
+ " 'Image_Intensity_MaxIntensity_BFHigh__BackgroundOnly',\n",
+ " 'Image_Intensity_MaxIntensity_BFLow',\n",
+ " 'Image_Intensity_MaxIntensity_BFLow_BackgroundOnly',\n",
+ " 'Image_Intensity_MaxIntensity_Brightfield',\n",
+ " 'Image_Intensity_MaxIntensity_Brightfield_BackgroundOnly',\n",
+ " 'Image_Intensity_MaxIntensity_DNA',\n",
+ " 'Image_Intensity_MaxIntensity_DNA_BackgroundOnly',\n",
+ " 'Image_Intensity_MaxIntensity_ER',\n",
+ " 'Image_Intensity_MaxIntensity_ER__BackgroundOnly',\n",
+ " 'Image_Intensity_MaxIntensity_Mito',\n",
+ " 'Image_Intensity_MaxIntensity_Mito_BackgroundOnly',\n",
+ " 'Image_Intensity_MaxIntensity_RNA',\n",
+ " 'Image_Intensity_MaxIntensity_RNA_BackgroundOnly',\n",
+ " 'Image_Intensity_MedianIntensity_AGP',\n",
+ " 'Image_Intensity_MedianIntensity_AGP__BackgroundOnly',\n",
+ " 'Image_Intensity_MedianIntensity_DNA_BackgroundOnly',\n",
+ " 'Image_Intensity_MedianIntensity_ER__BackgroundOnly',\n",
+ " 'Image_Intensity_MinIntensity_AGP',\n",
+ " 'Image_Intensity_MinIntensity_BFHigh__BackgroundOnly',\n",
+ " 'Image_Intensity_MinIntensity_BFLow_BackgroundOnly',\n",
+ " 'Image_Intensity_MinIntensity_Brightfield_BackgroundOnly',\n",
+ " 'Image_Intensity_MinIntensity_DNA',\n",
+ " 'Image_Intensity_MinIntensity_Mito_BackgroundOnly',\n",
+ " 'Image_Intensity_PercentMaximal_AGP__BackgroundOnly',\n",
+ " 'Image_Intensity_PercentMaximal_DNA_BackgroundOnly',\n",
+ " 'Image_Intensity_PercentMaximal_ER__BackgroundOnly',\n",
+ " 'Image_Intensity_PercentMaximal_Mito_BackgroundOnly',\n",
+ " 'Image_Intensity_PercentMaximal_RNA_BackgroundOnly',\n",
+ " 'Image_Intensity_StdIntensity_AGP__BackgroundOnly',\n",
+ " 'Image_Intensity_StdIntensity_Brightfield_BackgroundOnly',\n",
+ " 'Image_Intensity_StdIntensity_DNA_BackgroundOnly',\n",
+ " 'Image_Intensity_StdIntensity_ER__BackgroundOnly',\n",
+ " 'Image_Intensity_StdIntensity_RNA_BackgroundOnly',\n",
+ " 'Image_Intensity_TotalIntensity_AGP__BackgroundOnly',\n",
+ " 'Image_Intensity_TotalIntensity_BFLow_BackgroundOnly',\n",
+ " 'Image_Intensity_TotalIntensity_DNA_BackgroundOnly',\n",
+ " 'Image_Intensity_TotalIntensity_RNA_BackgroundOnly',\n",
+ " 'Image_Intensity_UpperQuartileIntensity_AGP__BackgroundOnly',\n",
+ " 'Image_Intensity_UpperQuartileIntensity_DNA',\n",
+ " 'Image_Intensity_UpperQuartileIntensity_DNA_BackgroundOnly',\n",
+ " 'Image_Texture_AngularSecondMoment_AGP_3_02_256',\n",
+ " 'Image_Texture_AngularSecondMoment_BFLow_10_01_256',\n",
+ " 'Image_Texture_AngularSecondMoment_Brightfield_3_02_256',\n",
+ " 'Image_Texture_AngularSecondMoment_DNA_3_00_256',\n",
+ " 'Image_Texture_AngularSecondMoment_ER_5_02_256',\n",
+ " 'Image_Texture_AngularSecondMoment_Mito_10_01_256',\n",
+ " 'Image_Texture_AngularSecondMoment_RNA_10_01_256',\n",
+ " 'Image_Texture_Contrast_AGP_10_02_256',\n",
+ " 'Image_Texture_Contrast_BFHigh_10_01_256',\n",
+ " 'Image_Texture_Contrast_BFLow_10_01_256',\n",
+ " 'Image_Texture_Contrast_Brightfield_10_00_256',\n",
+ " 'Image_Texture_Contrast_DNA_3_02_256',\n",
+ " 'Image_Texture_Contrast_ER_3_03_256',\n",
+ " 'Image_Texture_Contrast_Mito_3_00_256',\n",
+ " 'Image_Texture_Correlation_AGP_10_01_256',\n",
+ " 'Image_Texture_Correlation_BFLow_10_02_256',\n",
+ " 'Image_Texture_Correlation_Brightfield_10_01_256',\n",
+ " 'Image_Texture_Correlation_DNA_10_00_256',\n",
+ " 'Image_Texture_Correlation_DNA_10_01_256',\n",
+ " 'Image_Texture_Correlation_DNA_10_02_256',\n",
+ " 'Image_Texture_Correlation_DNA_10_03_256',\n",
+ " 'Image_Texture_Correlation_DNA_5_00_256',\n",
+ " 'Image_Texture_Correlation_DNA_5_01_256',\n",
+ " 'Image_Texture_Correlation_DNA_5_03_256',\n",
+ " 'Image_Texture_Correlation_ER_10_02_256',\n",
+ " 'Image_Texture_Correlation_ER_5_00_256',\n",
+ " 'Image_Texture_Correlation_Mito_3_02_256',\n",
+ " 'Image_Texture_Correlation_RNA_10_03_256',\n",
+ " 'Image_Texture_Correlation_RNA_5_03_256',\n",
+ " 'Image_Texture_DifferenceEntropy_AGP_3_02_256',\n",
+ " 'Image_Texture_DifferenceEntropy_BFHigh_3_00_256',\n",
+ " 'Image_Texture_DifferenceEntropy_BFLow_3_02_256',\n",
+ " 'Image_Texture_DifferenceEntropy_Brightfield_10_03_256',\n",
+ " 'Image_Texture_DifferenceEntropy_DNA_10_01_256',\n",
+ " 'Image_Texture_DifferenceEntropy_Mito_10_01_256',\n",
+ " 'Image_Texture_DifferenceVariance_AGP_3_01_256',\n",
+ " 'Image_Texture_DifferenceVariance_BFHigh_3_00_256',\n",
+ " 'Image_Texture_DifferenceVariance_BFLow_3_02_256',\n",
+ " 'Image_Texture_DifferenceVariance_Brightfield_3_00_256',\n",
+ " 'Image_Texture_DifferenceVariance_DNA_3_02_256',\n",
+ " 'Image_Texture_DifferenceVariance_Mito_10_03_256',\n",
+ " 'Image_Texture_Entropy_BFLow_3_00_256',\n",
+ " 'Image_Texture_InfoMeas1_AGP_10_03_256',\n",
+ " 'Image_Texture_InfoMeas1_BFLow_10_03_256',\n",
+ " 'Image_Texture_InfoMeas1_Brightfield_10_03_256',\n",
+ " 'Image_Texture_InfoMeas1_DNA_10_03_256',\n",
+ " 'Image_Texture_InfoMeas1_DNA_3_01_256',\n",
+ " 'Image_Texture_InfoMeas1_DNA_5_01_256',\n",
+ " 'Image_Texture_InfoMeas1_ER_5_00_256',\n",
+ " 'Image_Texture_InfoMeas1_Mito_10_03_256',\n",
+ " 'Image_Texture_InfoMeas1_Mito_3_02_256',\n",
+ " 'Image_Texture_InfoMeas2_AGP_10_03_256',\n",
+ " 'Image_Texture_InfoMeas2_DNA_10_01_256',\n",
+ " 'Image_Texture_InfoMeas2_DNA_10_02_256',\n",
+ " 'Image_Texture_InfoMeas2_ER_3_03_256',\n",
+ " 'Image_Texture_InfoMeas2_ER_5_01_256',\n",
+ " 'Image_Texture_InfoMeas2_Mito_10_01_256',\n",
+ " 'Image_Texture_InfoMeas2_Mito_10_02_256',\n",
+ " 'Image_Texture_InfoMeas2_Mito_10_03_256',\n",
+ " 'Image_Texture_InfoMeas2_Mito_3_02_256',\n",
+ " 'Image_Texture_InfoMeas2_Mito_5_01_256',\n",
+ " 'Image_Texture_InverseDifferenceMoment_AGP_3_02_256',\n",
+ " 'Image_Texture_InverseDifferenceMoment_BFHigh_3_00_256',\n",
+ " 'Image_Texture_InverseDifferenceMoment_BFLow_3_00_256',\n",
+ " 'Image_Texture_InverseDifferenceMoment_Brightfield_3_02_256',\n",
+ " 'Image_Texture_InverseDifferenceMoment_DNA_10_03_256',\n",
+ " 'Image_Texture_InverseDifferenceMoment_ER_10_01_256',\n",
+ " 'Image_Texture_InverseDifferenceMoment_Mito_10_01_256',\n",
+ " 'Image_Texture_InverseDifferenceMoment_RNA_3_00_256',\n",
+ " 'Image_Texture_SumEntropy_DNA_3_02_256',\n",
+ " 'Image_Texture_SumVariance_AGP_10_03_256',\n",
+ " 'Image_Texture_SumVariance_Brightfield_5_02_256',\n",
+ " 'Image_Texture_SumVariance_DNA_10_03_256',\n",
+ " 'Image_Texture_SumVariance_ER_10_03_256',\n",
+ " 'Image_Texture_SumVariance_Mito_10_01_256',\n",
+ " 'Image_Texture_SumVariance_RNA_10_01_256',\n",
+ " 'Image_Threshold_FinalThreshold_NucleiIncludingEdges',\n",
+ " 'Image_Threshold_FinalThreshold_mito_bw',\n",
+ " 'Image_Threshold_SumOfEntropies_CellsIncludingEdges',\n",
+ " 'Image_Threshold_SumOfEntropies_NucleiIncludingEdges',\n",
+ " 'Image_Threshold_SumOfEntropies_mito_bw',\n",
+ " 'Image_Threshold_WeightedVariance_CellsIncludingEdges',\n",
+ " 'Image_Threshold_WeightedVariance_NucleiIncludingEdges',\n",
+ " 'Image_Threshold_WeightedVariance_mito_bw']"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "[c for c in df.columns if c not in df_ORF_aggregated.columns]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_collapsed.to_csv(f\"JUMP_ORF_{aggregation_type}_collapsed.csv\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "jumpORF",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.15"
+ },
+ "orig_nbformat": 4,
+ "vscode": {
+ "interpreter": {
+ "hash": "2f3fec36f6be95d788e5f03a928d042624f9ad08087c2484cba824ceb7727375"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}