Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/refactor #85

Open
wants to merge 21 commits into
base: development
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions docs/modes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ Build mode
~~~~~~~~~~

.. argparse::
:module: immunopepper.get_parsers
:module: immunopepper.ip
:func: get_build_parser
:prog: immunopepper build
:passparser:
Expand All @@ -45,7 +45,7 @@ Build mode
Samplespecif mode
~~~~~~~~~~~~~~~~~
.. argparse::
:module: immunopepper.get_parsers
:module: immunopepper.ip
:func: get_samplespecif_parser
:prog: immunopepper samplespecif
:passparser:
Expand All @@ -55,7 +55,7 @@ Samplespecif mode
Cancerspecif mode
~~~~~~~~~~~~~~~~~
.. argparse::
:module: immunopepper.get_parsers
:module: immunopepper.ip
:func: get_cancerspecif_parser
:prog: immunopepper cancerspecif
:passparser:
Expand Down Expand Up @@ -143,7 +143,7 @@ Mhcbind mode
~~~~~~~~~~~~~~

.. argparse::
:module: immunopepper.get_parsers
:module: immunopepper.ip
:func: get_mhcbind_parser
:prog: immunopepper mhcbind
:passparser:
Expand All @@ -155,7 +155,7 @@ Pepquery mode
~~~~~~~~~~~~~~

.. argparse::
:module: immunopepper.get_parsers
:module: immunopepper.ip
:func: get_pepquery_parser
:prog: immunopepper pepquery
:passparser:
Expand Down
299 changes: 0 additions & 299 deletions immunopepper/get_parsers.py

This file was deleted.

353 changes: 214 additions & 139 deletions immunopepper/ip.py

Large diffs are not rendered by default.

336 changes: 142 additions & 194 deletions immunopepper/mode_cancerspecif.py

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions immunopepper/mode_pepQuery.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def mode_pepquery(arg):
output_peptides_file_idx = [i + 1 for i, j in enumerate(args_list) if j == '-o']
output_peptides_file = args_list[output_peptides_file_idx[0]]

if f'{output_peptides_file}psm_rank.txt' not in glob.glob(f'{output_peptides_file}*'):
if f'{output_peptides_file}/psm_rank.txt' not in glob.glob(f'{output_peptides_file}/*'):
logging.error(">>>>> PepQuery failed to generate the psm_rank.txt file. Please check the output directory. Maybe the provided spectra file does not contain any of the input peptides")
sys.exit(1)
else:
Expand Down Expand Up @@ -180,7 +180,7 @@ def mode_pepquery(arg):
ip_out["confident"] = pd.Categorical(ip_out["confident"], categories=confident_categories)
ip_out.sort_values(by=["confident", "score"], ascending=[True, False], inplace=True)
ip_out.to_csv(f'{arg.output_dir}/peptides_validated.tsv.gz', sep='\t', index=False, compression='gzip')
logging.info(">>>>> Processed output file saved to {}peptides_validated.tsv.gz \n".format(arg.output_dir))
logging.info(">>>>> Processed output file saved to {}/peptides_validated.tsv.gz \n".format(arg.output_dir))
logging.info(">>>>> Finished running immunopepper in pepquery mode \n")


Expand Down
200 changes: 200 additions & 0 deletions immunopepper/sdisk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
import logging
import numpy as np
import os
import pathlib

def save_spark(cancer_kmers, output_dir, path_final_fil, outpartitions=None):
    '''
    Writes a spark dataframe to disk as gzip-compressed, tab-separated text with a header line.
    The output directory (including missing parents) is created beforehand and the
    write is issued in 'overwrite' mode.
    :param cancer_kmers: spark dataframe matrix with expression counts for cancer
    :param output_dir: str path for output directory, created when it does not exist
    :param path_final_fil: str path to save the spark dataframe
    :param outpartitions: int number of partitions the dataframe is repartitioned into before saving.
                          None (default) saves the dataframe with its current partitioning
    '''
    logging.info(f'>>>> Save to {path_final_fil}')
    pathlib.Path(output_dir).mkdir(exist_ok=True, parents=True)

    # The two saving modes only differ by the optional repartitioning step
    to_save = cancer_kmers if outpartitions is None else cancer_kmers.repartition(outpartitions)

    writer = to_save.write.mode('overwrite')
    writer = writer.options(header="true", sep="\t", compression="gzip")
    # NOTE(review): the trailing .csv() call presumably supersedes the "tsv.gz" format name -- confirm
    writer.format("tsv.gz").csv(path_final_fil)


def output_count(perform_count, matrix, report_count, report_step, step_string):
    '''
    Optionally counts the kmers left in a spark dataframe after a filtering step and records the result.
    The count and the step name are appended in place to the two report lists and the count is logged.
    Nothing happens when counting is disabled.
    Note: This operation is expensive but useful if the user is interested in intermediate filtering steps
    :param perform_count: bool whether to perform a count operation
    :param matrix: spark dataframe with kmer expression counts
    :param report_count: list to store result of successive counting operations (modified in place)
    :param report_step: list to store name of successive counting operations (modified in place)
    :param step_string: str name of the counting operation
    '''
    if not perform_count:
        return

    n_kmers = matrix.count()
    report_count.append(n_kmers)
    report_step.append(step_string)
    logging.info(f'# {step_string} n = {n_kmers} kmers')


def save_output_count(output_count, report_count, report_steps, prefix, cancer_sample_ori, mutation_mode,
                      sample_expr_support_cancer, cohort_expr_support_cancer, n_samples_lim_cancer,
                      cohort_expr_support_normal, n_samples_lim_normal, id_normals):
    '''
    Appends one tab-separated result line with the filtering parameters and the number of kmers
    counted after each filtering step to a tabular file.
    A header line is written first when the file does not exist yet.
    Nothing is written when no count file path is provided.
    :param output_count: str path for count file of intermediate filtering steps
    :param report_count: list with the results of successive counting operations
    :param report_steps: list with the names of successive counting operations.
                         Needs at least as many entries as report_count
    :param prefix: str information to be added to the result line in an info column.
                   The info column is omitted when empty
    :param cancer_sample_ori: str id of target cancer sample which was filtered
    :param mutation_mode: str information about whether mutations were applied or not
    :param sample_expr_support_cancer: float normalized expression threshold for the cancer target sample
    :param cohort_expr_support_cancer: float normalized expression threshold for the cancer cohort
                                       excluding the target sample
                                       which should be met in n samples
    :param n_samples_lim_cancer: int number of cancer samples in which the cancer cohort expression threshold
                                 should be met
    :param cohort_expr_support_normal: float normalized expression threshold for the normal cohort
                                       required in any sample (>=1)
    :param n_samples_lim_normal: int number of normal samples in which any number of reads is required (>0)
    :param id_normals: str id of the normal cohort (example gtex)
    '''
    if not output_count:
        return

    # Column names and their values are kept in two parallel lists
    columns = ['sample', 'mutation_mode', 'min_sample_reads', '#_of_cohort_samples',
               'reads_per_cohort_sample', '#_normal_samples_allowed', 'normal_cohort_id',
               'reads_per_normal_sample']
    values = [cancer_sample_ori, mutation_mode, sample_expr_support_cancer, n_samples_lim_cancer,
              cohort_expr_support_cancer, n_samples_lim_normal, id_normals,
              cohort_expr_support_normal]

    # One extra column per counting operation, named after its filtering step
    for position, n_kmers in enumerate(report_count):
        columns.append(report_steps[position])
        values.append(n_kmers)

    if prefix:
        columns.append('info')
        values.append(prefix)

    header = '\t'.join(f'{column}' for column in columns) + "\n"
    line = '\t'.join(f'{value}' for value in values) + "\n"

    # The header is only needed when the count file is created by this call
    is_new_file = not os.path.exists(output_count)
    with open(output_count, "a") as handle:
        if is_new_file:
            handle.write(header)
        handle.write(line)
    logging.info(f'Save intermediate info to {output_count}')


def redirect_interm(interm_dir_norm, interm_dir_canc, output_dir):
    '''
    Chooses the directories in which the intermediate files are saved.
    A custom normal (resp. cancer) directory is used when provided,
    otherwise the output directory serves as fallback.
    :param interm_dir_norm: str custom scratch dir path to save intermediate normal files
    :param interm_dir_canc: str custom scratch dir path to save intermediate cancer files
    :param output_dir: str output directory for the filtered matrix
    :return: tuple (normal_out, cancer_out) with the directory retained for the normal
             and for the cancer intermediate files
    '''
    # An empty or missing custom directory falls back to the output directory
    normal_out = interm_dir_norm or output_dir
    cancer_out = interm_dir_canc or output_dir
    return normal_out, cancer_out


def check_interm_files(out_dir, expr_limit, n_samples_lim, target_sample='', tag='normals', batch_tag=''):
    '''
    Builds the paths of the intermediate filtering files and checks whether they can be re-used.
    Filtering steps for normal (resp. cancer) samples are saved as intermediate files because
    they are expensive. Depending on the presence of these files, the caller can either
    - run the full preprocessing + threshold filtering steps
    - or simply re-load the intermediate files
    Re-loading is only selected when expr_limit is set, n_samples_lim is not None and both
    intermediate matrices contain a '_SUCCESS' file.
    :param out_dir: str path for output directory
    :param expr_limit: float expression limit threshold to keep a kmer
    :param n_samples_lim: int number of samples that need to pass the expression limit
    :param target_sample: str name of the sample of interest.
                          To be excluded in the number of samples that pass the expression limit
    :param tag: str tag related to the type of samples. Example cancer or normal
    :param batch_tag: str batch mode, batch tag to be appended to intermediate file
    :returns:
        - launch_preprocess: bool, whether to perform the full preprocessing + threshold filtering steps
          or simply re-load the intermediate files
        - path_interm_matrix_for_express_threshold, path_interm_matrix_for_sample_threshold,
          path_interm_kmers_annotOnly are respectively the path (str) where
          the expression-filtered matrix, the sample-filtered matrix and
          the kmers derived solely from the annotation are saved.
          The last one is None unless tag is 'normals'
    '''
    base_n_samples = 1
    base_expr = 0.0
    format_tag = '.tsv.gz'

    # For cancer matrix intermediate file the recurrence filter is not applied to the target sample
    suffix = f'Except{target_sample}' if target_sample else ''

    def interm_matrix_path(threshold):
        # Both intermediate matrices share the same naming scheme and only differ by the threshold
        file_name = (f'interm_{tag}_combiExprCohortLim{threshold}Across{base_n_samples}'
                     f'{suffix}{batch_tag}{format_tag}')
        return os.path.join(out_dir, file_name)

    def is_complete(path):
        # NOTE(review): '_SUCCESS' is presumably the marker spark leaves after a finished write -- confirm
        return os.path.isfile(os.path.join(path, '_SUCCESS'))

    path_interm_matrix_for_express_threshold = interm_matrix_path(expr_limit)
    path_interm_matrix_for_sample_threshold = interm_matrix_path(base_expr)

    # For normal samples the expression threshold filtering is not applied to the kmers found only in the
    # annotation but not in the background samples. These kmers will be by default removed from the
    # foreground matrix.
    path_interm_kmers_annotOnly = None
    if tag == 'normals':
        path_interm_kmers_annotOnly = os.path.join(out_dir, f'kmers_derived_solely_from_annotation{format_tag}')

    reusable = (expr_limit
                and is_complete(path_interm_matrix_for_express_threshold)
                and n_samples_lim is not None
                and is_complete(path_interm_matrix_for_sample_threshold))

    if not reusable:
        logging.info(f'At least one intermediate {tag} filtering file is missing.')
        logging.info('Will compute full filtering steps according to user input parameters')
        return (True,
                path_interm_matrix_for_express_threshold,
                path_interm_matrix_for_sample_threshold,
                path_interm_kmers_annotOnly)

    logging.info(f'Intermediate {tag} filtering already performed in: {path_interm_matrix_for_express_threshold} '
                 f' and {path_interm_matrix_for_sample_threshold}. Re-loading {tag} intermediate data...')
    logging.info('Proceed with care! Using intermediate files means ignoring --filterNeojuncCoord, '
                 '--filterAnnotatedRF parameter.')
    return (False,
            path_interm_matrix_for_express_threshold,
            path_interm_matrix_for_sample_threshold,
            path_interm_kmers_annotOnly)



def filtered_path(arg, cancer_sample_ori, mutation_mode, normal_files, batch_tag, extension):
    '''
    Builds the saving paths of the final filtered matrix, with and without the uniprot filtering tag.
    The file name encodes the tag prefix, the cancer sample, the mutation mode and the cancer
    filtering thresholds. The normal filtering thresholds are added when normal files are used.
    :param arg: argument class from argparse. Reads output_dir, tag_prefix, sample_expr_support_cancer,
                cohort_expr_support_cancer, n_samples_lim_cancer and, only when normal files are used,
                tag_normals, cohort_expr_support_normal, n_samples_lim_normal
    :param cancer_sample_ori: str. name of the cancer sample
    :param mutation_mode: str. mutation mode flag
    :param normal_files: bool. whether the normal files are used as an input
    :param batch_tag: str. tag for the batch
    :param extension: str. saving format extension
    :return:
        path_filter_final: str. path for the filtered matrix
        path_filter_final_uniprot: str. same path with '_FiltUniprot' inserted before the batch tag
    '''
    cancer_part = (f'{arg.tag_prefix}{cancer_sample_ori}_{mutation_mode}_'
                   f'SampleLim{arg.sample_expr_support_cancer}'
                   f'CohortLim{arg.cohort_expr_support_cancer}'
                   f'Across{arg.n_samples_lim_cancer}')
    stem = os.path.join(arg.output_dir, cancer_part)

    # The normal cohort settings only appear in the name when normal files took part in the filtering
    if normal_files:
        normal_part = (f'_FiltNormals{arg.tag_normals}'
                       f'Cohortlim{arg.cohort_expr_support_normal}'
                       f'Across{arg.n_samples_lim_normal}')
        stem = stem + normal_part

    ending = batch_tag + extension
    return stem + ending, stem + '_FiltUniprot' + ending
Loading