Skip to content

Commit

Permalink
sometimes cellbender lets through a lot of empty droplets which in ca…
Browse files Browse the repository at this point in the history
…se of a single donor pool results in downstream failures in the QC metrics, since 0 counts result in NaN percentage calculations for mito transcripts
Matiss Ozols committed Jan 7, 2025
1 parent 93f8955 commit c6c8fab
Showing 3 changed files with 44 additions and 1 deletion.
3 changes: 2 additions & 1 deletion bin/0026-filter_outlier_cells.py
Original file line number Diff line number Diff line change
@@ -88,7 +88,7 @@ def perform_adaptiveQC_Filtering(clf,adata,method,metadata_columns):
predicted_scores = clf.negative_outlier_factor_
elif method == 'IsolationForest':
f = clf.fit_predict(
adata.obs[metadata_columns].values
adata2.obs['pct_counts_gene_group__mito_transcript'].values
) == 1
predicted_scores = clf.decision_function(adata.obs[metadata_columns].values)
elif method == 'MAD':
@@ -312,6 +312,7 @@ def main():

# Load the AnnData file.
adata = sc.read_h5ad(filename=options.h5)
adata2 = sc.read_h5ad(filename='/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/ania/analysis_trego_2025/results_cb3/handover/merged_h5ad/1.pre_QC_adata.h5ad')
adata.obs['cell_id'] = adata.obs.index

# Here we add an adaptive QC per Column
4 changes: 4 additions & 0 deletions bin/scanpy_merge_from_h5ad.py
Original file line number Diff line number Diff line change
@@ -679,6 +679,10 @@ def scanpy_merge(
],
inplace=True
)
adata_merged.obs['pct_counts_gene_group__mito_transcript'].fillna(0, inplace=True)
adata_merged.obs['pct_counts_gene_group__mito_protein'].fillna(0, inplace=True)
adata_merged.obs['pct_counts_gene_group__ribo_rna'].fillna(0, inplace=True)
adata_merged.obs['pct_counts_gene_group__ribo_protein'].fillna(0, inplace=True)
except:
_='most likely different data format such as ATAC which doesnt have gene IDs'
# adata_merged.obs = obs_prior
38 changes: 38 additions & 0 deletions bin/strip_citeseq.py
Original file line number Diff line number Diff line change
@@ -23,6 +23,42 @@
import logging
import os
import re
import pandas as pd

# Function to check if the index contains ENSG values
def contains_ensg(index):
    """Return True if any value in ``index`` looks like an ENSG identifier.

    A value matches when it is a string that starts with ``ENSG`` followed
    by a digit (e.g. ``ENSG00000141510``).

    Args:
        index: Any iterable of labels, such as a pandas Index, a pandas
            Series or a plain list.

    Returns:
        bool: True if at least one value matches, otherwise False. An empty
        iterable yields False.
    """
    ensg_pattern = re.compile(r"^ENSG[0-9]")
    # Plain-Python scan instead of the pandas ``.str`` accessor: the accessor
    # raises AttributeError for all-numeric (and, on older pandas, empty)
    # input. Non-string values (None, NaN, numbers) simply never match, which
    # mirrors the previous ``na=False`` behaviour.
    return any(
        isinstance(value, str) and ensg_pattern.search(value) is not None
        for value in index
    )

# Function to ensure ENSG values are in the index
def ensure_ensg_index(adata):
    """Swap ENSG identifiers into ``adata.var.index`` when they sit in a column.

    If the index already contains ENSG values nothing is changed. Otherwise
    the ``gene_symbols`` and ``gene_ids`` columns of ``adata.var`` are checked
    in that order, and the first one containing ENSG values is swapped with
    the index: the column values become the new index and the old index
    values are stored in that column.

    Args:
        adata: Object exposing a pandas DataFrame as ``adata.var``; it is
            modified in place.

    Returns:
        The same ``adata`` object, whether or not a swap happened.

    Raises:
        ValueError: If ``adata.var`` has neither a ``gene_symbols`` nor a
            ``gene_ids`` column.
    """
    # Step 1: Collect every fallback column that is present, in preference
    # order. All of them are checked later, not just the first one, so that
    # ENSG values held in 'gene_ids' are still found when 'gene_symbols'
    # also exists.
    candidate_columns = [
        column
        for column in ('gene_symbols', 'gene_ids')
        if column in adata.var.columns
    ]
    if not candidate_columns:
        raise ValueError("Neither 'gene_symbols' nor 'gene_ids' are available in adata.var.")

    # Step 2: Check if the current index contains ENSG values
    if contains_ensg(adata.var.index):
        print("Index already contains ENSG values. No changes made.")
        return adata  # ENSG values already present, nothing to change

    # Step 3: Swap with the first fallback column that contains ENSG values
    for fallback_var in candidate_columns:
        if not contains_ensg(adata.var[fallback_var]):
            continue
        print(f"Swapping index with ENSG values from '{fallback_var}'.")
        # Copy both sides before assigning so neither is overwritten
        # halfway through the swap.
        original_index = adata.var.index.copy()
        original_fallback_val = adata.var[fallback_var].values.copy()
        adata.var.index = original_fallback_val
        adata.var[fallback_var] = original_index
        return adata

    checked_columns = "', '".join(candidate_columns)
    print(f"No ENSG values found in index or '{checked_columns}'. No changes made.")
    return adata

# Compression scheme handed to scanpy readers as ``cache_compression``.
compression_opts = 'gzip'
# Module-level switch, off by default.
# NOTE(review): presumably controls dropping of zero-count cells — confirm
# against the code that reads it.
filter_0_count_cells = False

@@ -298,6 +334,8 @@ def main():
adata_cellranger_filtered = sc.read_10x_mtx(
options.raw_data, var_names='gene_symbols', make_unique=True,
cache=False, cache_compression=compression_opts,gex_only=False)

adata_cellranger_filtered = ensure_ensg_index(adata_cellranger_filtered)
all_feature_types = set(adata_cellranger_filtered.var['feature_types'])
hashtags = set(options.hastag_labels.split(","))
hashtags = ['Hashtag_.*']

0 comments on commit c6c8fab

Please sign in to comment.