Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat: Feature gene annotation #132

Merged
merged 30 commits into from
Jun 26, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
1138286
annotate (in .uns['cnv']['var']) which window contains which genes
redst4r Sep 25, 2022
e6f0020
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 15, 2022
dc15059
Merge branch 'main' into feature_geneAnnotation
grst Oct 17, 2022
0cacaa5
remove black magic comma
grst Oct 17, 2022
28990a7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 17, 2022
54f0657
Merge branch 'main' into feature_geneAnnotation
grst Oct 20, 2022
7fecb8b
first pass at gene convolution annotation
grantn5 Jun 10, 2024
b6b660f
Merge branch 'main' into feature_geneAnnotation
grantn5 Jun 10, 2024
d8e7933
sorting some conflicts
grantn5 Jun 10, 2024
c071fa3
feature annotation working
grantn5 Jun 11, 2024
e4bed8a
adding test to calcualte gene average
grantn5 Jun 11, 2024
175bba7
Converting gene matrix into sparse matrix
grantn5 Jun 12, 2024
c71cf28
updating gene position to using pandas
grantn5 Jun 12, 2024
4216b11
linting
grantn5 Jun 12, 2024
b8aa050
adding more tests
grantn5 Jun 12, 2024
611e9e6
Delete .vscode directory
grantn5 Jun 12, 2024
45d41be
adding scanpy to conf.py
grantn5 Jun 12, 2024
8dd606a
changing scanpy.tl.pca to sc.tl.pca to get build docs to work
grantn5 Jun 12, 2024
b034b07
Saving CNV matrix as sparse matrix
grantn5 Jun 14, 2024
7b0709d
Merge branch 'main' into feature_geneAnnotation
grantn5 Jun 14, 2024
d5737b9
adding back infercnv.py
grantn5 Jun 14, 2024
79017bf
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 14, 2024
91abbb7
fixing linting error
grantn5 Jun 14, 2024
a1abd33
editing code to appease ruff
grantn5 Jun 14, 2024
38e6504
Merge branch 'main' into feature_geneAnnotation
grst Jun 20, 2024
155f520
Merge branch 'main' into feature_geneAnnotation
grantn5 Jun 24, 2024
f9984d0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 24, 2024
c548087
Converting gene conv to numpy array
grantn5 Jun 24, 2024
b2c481e
Making the calculation of per gene values optional
grantn5 Jun 24, 2024
8e821fb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 24, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,9 @@ __pycache__/
/.idea/
/.vscode/

infercnv_env/
# Notebooks
.ipynb_checkpoints

# Node.js
node_modules/
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ authors = [
maintainers = [
{name = "Gregor Sturm", email="[email protected]"}
]
contributors = [
{name = "Grant Neilson", email = "[email protected]"}
]
urls.Documentation = "https://infercnvpy.readthedocs.io/"
urls.Source = "https://github.com/icbi-lab/infercnvpy"
urls.Home-page = "https://github.com/icbi-lab/infercnvpy"
Expand Down
4 changes: 2 additions & 2 deletions src/infercnvpy/io/_genepos.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ def genomic_position_from_gtf(
If True, add the annotations directly to adata, otherwise return a dataframe.
"""
gtf = gtfparse.read_gtf(
gtf_file, usecols=["seqname", "feature", "start", "end", "gene_id", "gene_name"]
).to_pandas()
grantn5 marked this conversation as resolved.
Show resolved Hide resolved
gtf_file, usecols=["seqname", "feature", "start", "end", "gene_id", "gene_name"], result_type="pandas"
)
gtf = (
gtf.loc[
gtf["feature"] == "gene",
Expand Down
8 changes: 4 additions & 4 deletions src/infercnvpy/tl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,24 +41,24 @@ def pca(
) -> Union[np.ndarray, None]:
"""Compute the PCA on the result of :func:`infercnvpy.tl.infercnv`.

Thin wrapper around :func:`scanpy.tl.pca`.
grantn5 marked this conversation as resolved.
Show resolved Hide resolved
Thin wrapper around :func:`sc.tl.pca`.

Parameters
----------
adata
annotated data matrix
svd_solver
See :func:`scanpy.tl.pca`.
See :func:`sc.tl.pca`.
zero_center
See :func:`scanpy.tl.pca`.
See :func:`sc.tl.pca`.
inplace
If True, store the result in adata.obsm. Otherwise return the PCA matrix.
use_rep
Key under which the result of infercnv is stored in adata
key_added
Key under which the result will be stored in adata.obsm if `inplace=True`.
**kwargs
Additional arguments passed to :func:`scanpy.tl.pca`.
Additional arguments passed to :func:`sc.tl.pca`.
"""
if f"X_{use_rep}" not in adata.obsm:
raise KeyError(f"X_{use_rep} is not in adata.obsm. Did you run `tl.infercnv`?")
Expand Down
138 changes: 108 additions & 30 deletions src/infercnvpy/tl/_infercnv.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing import Union

import numpy as np
import pandas as pd
import scipy.ndimage
import scipy.sparse
from anndata import AnnData
Expand Down Expand Up @@ -109,7 +110,7 @@ def infercnv(

var = tmp_adata.var.loc[:, ["chromosome", "start", "end"]] # type: ignore

chr_pos, chunks = zip(
chr_pos, chunks, convolved_dfs = zip(
*process_map(
_infercnv_chunk,
[expr[i : i + chunksize, :] for i in range(0, adata.shape[0], chunksize)],
Expand All @@ -123,14 +124,20 @@ def infercnv(
max_workers=cpu_count() if n_jobs is None else n_jobs,
)
)

res = scipy.sparse.vstack(chunks)
chr_pos = chr_pos[0]
per_gene_df = pd.concat(convolved_dfs, axis=0)
# Ensure the DataFrame has the correct index
per_gene_df.index = adata.obs.index

if inplace:
adata.obsm[f"X_{key_added}"] = res
adata.uns[key_added] = {"chr_pos": chr_pos[0]}
adata.obsm[f"gene_values_{key_added}"] = per_gene_df
adata.uns[key_added] = {"chr_pos": chr_pos}

else:
return chr_pos[0], res
return chr_pos, res, per_gene_df


def _natural_sort(l: Sequence):
Expand All @@ -148,8 +155,14 @@ def alphanum_key(key):
return sorted(l, key=alphanum_key)


def _running_mean(x: Union[np.ndarray, scipy.sparse.spmatrix], n: int = 50, step: int = 10) -> np.ndarray:
"""Compute a pyramidially weighted running mean.
def _running_mean(
x: Union[np.ndarray, scipy.sparse.spmatrix],
n: int = 50,
step: int = 10,
gene_list: list = None,
) -> np.ndarray:
"""
Compute a pyramidially weighted running mean.

Densifies the matrix. Use `step` and `chunksize` to save memory.

Expand All @@ -161,26 +174,83 @@ def _running_mean(x: Union[np.ndarray, scipy.sparse.spmatrix], n: int = 50, step
Length of the running window
step
only compute running windows ever `step` columns, e.g. if step is 10
grantn5 marked this conversation as resolved.
Show resolved Hide resolved
0:100, 10:110, 20:120 etc. Saves memory.
0:99, 10:109, 20:119 etc. Saves memory.
"""
if n < x.shape[1]: # regular convolution: the filter is smaller than the #genes
r = np.arange(1, n + 1)

pyramid = np.minimum(r, r[::-1])
smoothed_x = np.apply_along_axis(lambda row: np.convolve(row, pyramid, mode="same"), axis=1, arr=x) / np.sum(
pyramid
)
return smoothed_x[:, np.arange(0, smoothed_x.shape[1], step)]
smoothed_x = np.apply_along_axis(
lambda row: np.convolve(row, pyramid, mode="valid"),
axis=1,
arr=x,
) / np.sum(pyramid)

else: # there's less genes than the filtersize. just apply a single conv with a smaller filter (no sliding)
r = np.arange(1, x.shape[1] + 1)
pyramid = np.minimum(r, r[::-1])
smoothed_x = x @ pyramid.reshape(-1, 1)
smoothed_x = smoothed_x / pyramid.sum()
return smoothed_x
## get the indices of the genes used in the convolution
convolution_indices = get_convolution_indices(x, n)[np.arange(0, smoothed_x.shape[1], step)]
## Pull out the genes used in the convolution
convolved_gene_names = gene_list[convolution_indices]
smoothed_x = smoothed_x[:, np.arange(0, smoothed_x.shape[1], step)]

convolved_gene_values = _calculate_gene_averages(convolved_gene_names, smoothed_x)

def _running_mean_by_chromosome(expr, var, window_size, step) -> tuple[dict, np.ndarray]:
return smoothed_x, convolved_gene_values

else: # If there is less genes than the window size, set the window size to the number of genes and perform a single convolution
n = x.shape[1] # set the filter size to the number of genes
r = np.arange(1, n + 1)
## As we are only doing one convolution the values should be equal
pyramid = np.array([1] * n)
# Old code to make the pyramid
# pyramid = np.minimum(r, r[::-1])
smoothed_x = np.apply_along_axis(
lambda row: np.convolve(row, pyramid, mode="valid"),
axis=1,
arr=x,
) / np.sum(pyramid)

## As all genes are used the convolution the values are identical for all genes
convolved_gene_values = pd.DataFrame(np.repeat(smoothed_x, len(gene_list), axis=1), columns=gene_list)

return smoothed_x, convolved_gene_values
grantn5 marked this conversation as resolved.
Show resolved Hide resolved


def _calculate_gene_averages(convolved_gene_names, smoothed_x):
## create a dictionary to store the gene values per sample
gene_to_values = {}
# Calculate the number of genes in each convolution, will be same as the window size default=100
length = len(convolved_gene_names[0])
# Convert the flattened convolved gene names to a list
flatten_list = list(convolved_gene_names.flatten())

# For each sample in smoothed_x find the value for each gene and store it in a dictionary
for sample, row in enumerate(smoothed_x):
# Create sample level in the dictionary
if sample not in gene_to_values:
gene_to_values[sample] = {}
# For each gene in the flattened gene list find the value and store it in the dictionary
for i, gene in enumerate(flatten_list):
if gene not in gene_to_values[sample]:
gene_to_values[sample][gene] = []
# As the gene list has been flattend we can use the floor division of the index
# to get the correct position of the gene to get the value and store it in the dictionary
gene_to_values[sample][gene].append(row[i // length])

for sample in gene_to_values:
for gene in gene_to_values[sample]:
gene_to_values[sample][gene] = np.mean(gene_to_values[sample][gene])

convolved_gene_values = pd.DataFrame(gene_to_values).T
return convolved_gene_values


def get_convolution_indices(x, n):
indices = []
for i in range(x.shape[1] - n + 1):
indices.append(np.arange(i, i + n))
return np.array(indices)


def _running_mean_by_chromosome(expr, var, window_size, step) -> tuple[dict, np.ndarray, pd.DataFrame]:
"""Compute the running mean for each chromosome independently. Stack the resulting arrays ordered by chromosome.

Parameters
Expand All @@ -206,16 +276,24 @@ def _running_mean_by_chromosome(expr, var, window_size, step) -> tuple[dict, np.
"""
chromosomes = _natural_sort([x for x in var["chromosome"].unique() if x.startswith("chr") and x != "chrM"])

def _running_mean_for_chromosome(chr):
genes = var.loc[var["chromosome"] == chr].sort_values("start").index.values
tmp_x = expr[:, var.index.get_indexer(genes)]
return _running_mean(tmp_x, n=window_size, step=step)
running_means = [_running_mean_for_chromosome(chr, expr, var, window_size, step) for chr in chromosomes]

running_means, convolved_dfs = zip(*running_means)

chr_start_pos = {chr: i for chr, i in zip(chromosomes, np.cumsum([0] + [x.shape[1] for x in running_means]))} # noqa: C416

## Concatenate the gene dfs
convolved_dfs = pd.concat(convolved_dfs, axis=1)

return chr_start_pos, np.hstack(running_means), convolved_dfs

running_means = [_running_mean_for_chromosome(chr) for chr in chromosomes]

chr_start_pos = dict(zip(chromosomes, np.cumsum([0] + [x.shape[1] for x in running_means])))
def _running_mean_for_chromosome(chr, expr, var, window_size, step):
genes = var.loc[var["chromosome"] == chr].sort_values("start").index.values
tmp_x = expr[:, var.index.get_indexer(genes)]
x_conv, convolved_gene_values = _running_mean(tmp_x, n=window_size, step=step, gene_list=genes)

return chr_start_pos, np.hstack(running_means)
return x_conv, convolved_gene_values


def _get_reference(
Expand Down Expand Up @@ -291,17 +369,17 @@ def _infercnv_chunk(tmp_x, var, reference, lfc_cap, window_size, step, dynamic_t
# Step 2 - clip log fold changes
x_clipped = np.clip(x_centered, -lfc_cap, lfc_cap)
# Step 3 - smooth by genomic position
chr_pos, x_smoothed = _running_mean_by_chromosome(x_clipped, var, window_size=window_size, step=step)
chr_pos, x_smoothed, conv_df = _running_mean_by_chromosome(x_clipped, var, window_size=window_size, step=step)
# Step 4 - center by cell
x_cell_centered = x_smoothed - np.median(x_smoothed, axis=1)[:, np.newaxis]
x_res = x_smoothed - np.median(x_smoothed, axis=1)[:, np.newaxis]
gene_res = conv_df - np.median(conv_df, axis=1)[:, np.newaxis]

# Noise filtering
x_res = x_cell_centered
# step 5 - standard deviation based noise filtering
if dynamic_threshold is not None:
noise_thres = dynamic_threshold * np.std(x_res)
x_res[np.abs(x_res) < noise_thres] = 0
gene_res[np.abs(gene_res) < noise_thres] = 0

x_res = scipy.sparse.csr_matrix(x_res)

return chr_pos, x_res
return chr_pos, x_res, gene_res
50 changes: 50 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,56 @@ def adata_mock(request):
return adata


@pytest.fixture
def adata_full_mock():
np.random.seed(0) # Set the seed to a fixed value
obs = pd.DataFrame().assign(sample=["sample1", "sample2", "sample3", "sample4"]) # Added "sample4"
var = pd.DataFrame().assign(
gene=["gene1", "gene2", "gene3", "gene4", "gene5", "gene6", "gene7", "gene8", "gene9", "gene10"],
start=[100, 200, 300, 400, 500, 0, 100, 200, 300, 400],
end=[199, 299, 399, 499, 599, 99, 199, 299, 399, 499],
chromosome=["chr1", "chr1", "chr1", "chr1", "chr1", "chr2", "chr2", "chr2", "chr2", "chr2"],
)
var.index = var["gene"]
X = sp.csr_matrix(np.random.randint(low=0, high=50, size=(4, 10))) # Changed size to (4, 10)
adata = sc.AnnData(X=X, obs=obs, var=var)

return adata


@pytest.fixture
def gene_res_actual():
df = pd.DataFrame(
{
"gene1": [0.75, -1.00, 0.00, 0.00],
"gene2": [0.00, 0.00, 0.75, 0.00],
"gene3": [0.000000, 0.000000, 0.916667, 0.000000],
"gene4": [0.00, 0.00, 1.25, 0.00],
"gene5": [-0.75, 0.00, 1.25, 0.00],
"gene6": [0.000000, 0.000000, 0.000000, 0.921875],
"gene7": [0.000000, 0.000000, 0.000000, 0.703125],
"gene8": [0.0, 0.0, 0.0, 0.0],
"gene9": [0.0, 0.0, 0.0, 0.0],
"gene10": [0.75, 0.00, 0.00, 0.00],
}
)
df.index = df.index.astype(str)
return df


@pytest.fixture
def x_res_actual():
arr = np.array(
[
[1.00, 0.00, 0.00, 0.00, 0.00, 1.00],
[-1.00, 0.00, 0.00, 0.00, 0.00, 0.00],
[0.00, 1.25, 1.25, 0.00, 0.00, 0.00],
[0.00, 0.00, 0.00, 0.875, 0.00, 0.00],
]
)
return arr


@pytest.fixture(params=[np.array, sp.csr_matrix, sp.csc_matrix])
def adata_ithgex(request):
return sc.AnnData(
Expand Down
Loading
Loading