Skip to content

Commit

Permalink
Merge pull request #2180 from broadinstitute/jb-pairwise-diff-exp
Browse files Browse the repository at this point in the history
Adding pairwise differential expression to `DifferentialExpressionService` (SCP-5848)
  • Loading branch information
bistline authored Dec 12, 2024
2 parents 867415d + 5180217 commit 6a22526
Show file tree
Hide file tree
Showing 9 changed files with 224 additions and 29 deletions.
48 changes: 41 additions & 7 deletions app/lib/differential_expression_service.rb
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ def self.run_differential_expression_on_all(study_accession, user: nil, machine_
# - +user+ (User) => User initiating parse action (for email delivery)
# - +annotation_name+ (String) => Name of requested annotation
# - +annotation_scope+ (String) => Scope of requested annotation ('study' or 'cluster')
# - +de_type+ (String) => Type of differential expression calculation: 'rest' (one-vs-rest) or 'pairwise'
# - +machine_type+ (String) => Override default VM machine type
# - +dry_run+ (Boolean) => Indication of whether or not this is a pre-flight check
#
Expand All @@ -123,18 +124,21 @@ def self.run_differential_expression_on_all(study_accession, user: nil, machine_
# * *raises*
# - (ArgumentError) => if requested parameters do not validate
def self.run_differential_expression_job(cluster_group, study, user, annotation_name:, annotation_scope:,
machine_type: nil, dry_run: nil)
de_type: 'rest', group1: nil, group2: nil, machine_type: nil, dry_run: nil)
validate_study(study)
validate_annotation(cluster_group, study, annotation_name, annotation_scope)
validate_annotation(cluster_group, study, annotation_name, annotation_scope, group1:, group2:)
cluster_url = cluster_file_url(cluster_group)
study_file = cluster_group.study_file
metadata_url = study_file.is_viz_anndata? ?
RequestUtils.data_fragment_url(study_file, 'metadata') :
study.metadata_file.gs_url
# begin assembling parameters
de_params = {
annotation_name: annotation_name,
annotation_scope: annotation_scope,
annotation_name:,
annotation_scope:,
de_type:,
group1:,
group2:,
annotation_file: annotation_scope == 'cluster' ? cluster_url : metadata_url,
cluster_file: cluster_url,
cluster_name: cluster_group.name
Expand Down Expand Up @@ -321,16 +325,26 @@ def self.study_eligible?(study, skip_existing: false)
# - +study+ (Study) => Study to which StudyFile belongs
# - +annotation_name+ (String) => Name of requested annotation
# - +annotation_scope+ (String) => Scope of requested annotation ('study' or 'cluster')
# - +group1+ (String) => first annotation label for pairwise
# - +group2+ (String) => second annotation label for pairwise
#
# * *raises*
# - (ArgumentError) => if requested parameters do not validate
def self.validate_annotation(cluster_group, study, annotation_name, annotation_scope)
def self.validate_annotation(cluster_group, study, annotation_name, annotation_scope, group1: nil, group2: nil)
pairwise = group1.present? || group2.present?
validate_pairwise(group1, group2) if pairwise

result = DifferentialExpressionResult.find_by(study:, cluster_group:, annotation_name:, annotation_scope:)
if result.present?
if result.present? && !pairwise
raise ArgumentError,
"#{annotation_name} already exists for #{study.accession}:#{cluster_group.name}, " \
"please delete result #{result.id} before retrying"
elsif pairwise && result.present? && result.has_pairwise_comparison?(group1, group2)
raise ArgumentError,
"#{group1} vs. #{group2} pairwise already exists for #{annotation_name} on " \
"#{study.accession}:#{cluster_group.name}, you must remove that entry from #{result.id} before retrying"
end

can_visualize = false
if annotation_scope == 'cluster'
annotation = cluster_group.cell_annotations&.detect do |annot|
Expand All @@ -348,8 +362,15 @@ def self.validate_annotation(cluster_group, study, annotation_name, annotation_s
# last, validate that the requested annotation & cluster will provide a valid intersection of annotation values
# specifically, discard any annotation/cluster combos that only result in one distinct label
cells_by_label = ClusterVizService.cells_by_annotation_label(cluster_group, annotation_name, annotation_scope)
if cells_by_label.keys.count < 2
if !pairwise && cells_by_label.keys.count < 2
raise ArgumentError, "#{identifier} does not have enough labels represented in #{cluster_group.name}"
elsif pairwise
missing = {
"#{group1}" => cells_by_label[group1].count,
"#{group2}" => cells_by_label[group2].count
}.keep_if { |_, c| c < 2 }
raise ArgumentError,
"#{missing.keys.join(', ')} does not have enough cells represented in #{identifier}" if missing.any?
end
end

Expand All @@ -365,6 +386,19 @@ def self.validate_study(study)
raise ArgumentError, "#{study.accession} cannot view cluster plots" unless study.can_visualize_clusters?
end

# ensure both group1 and group2 are provided for pairwise calculations
#
# * *params*
# - +group1+ (String) => first annotation label for pairwise
# - +group2+ (String) => second annotation label for pairwise
#
# * *raises*
# - (ArgumentError) => If requested study is not eligible for DE
def self.validate_pairwise(group1, group2)
missing = { group1:, group2: }.keep_if { |_, v| v.blank? }
raise ArgumentError, "must provide #{missing.keys.join(', ')} for pairwise calculation" unless missing.empty?
end

# determine if a study has author-uploaded DE results
# this will stop the automatic calculation of new DE results if they add more data
#
Expand Down
8 changes: 8 additions & 0 deletions app/models/differential_expression_parameters.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ class DifferentialExpressionParameters
# annotation_file: source file for above annotation
# cluster_file: clustering file with cells to use as control list for DE
# cluster_name: name of associated ClusterGroup object
# de_type: type of differential expression calculation: 'rest' (one-vs-rest) or 'pairwise'
# group1: first group for pairwise comparison
# group2: second group for pairwise comparison
# matrix_file_path: raw counts matrix with source expression data
# matrix_file_type: type of raw counts matrix (dense, sparse)
# gene_file (optional): genes/features file for sparse matrix
Expand All @@ -28,6 +31,9 @@ class DifferentialExpressionParameters
annotation_file: nil,
cluster_file: nil,
cluster_name: nil,
de_type: 'rest',
group1: nil,
group2: nil,
matrix_file_path: nil,
matrix_file_type: nil,
gene_file: nil,
Expand All @@ -46,6 +52,8 @@ class DifferentialExpressionParameters
validates :annotation_file, :cluster_file, :matrix_file_path,
format: { with: Parameterizable::GS_URL_REGEXP, message: 'is not a valid GS url' }
validates :annotation_scope, inclusion: %w[cluster study]
validates :de_type, inclusion: %w[rest pairwise]
validates :group1, :group2, presence: true, if: -> { de_type == 'pairwise' }
validates :matrix_file_type, inclusion: %w[dense mtx h5ad]
validates :machine_type, inclusion: Parameterizable::GCE_MACHINE_TYPES
validates :gene_file, :barcode_file,
Expand Down
37 changes: 21 additions & 16 deletions app/models/differential_expression_result.rb
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class DifferentialExpressionResult
validate :annotation_exists?

before_validation :set_cluster_name
before_validation :set_one_vs_rest_comparisons, unless: proc { study_file.present? }
before_validation :set_automated_comparisons, unless: proc { study_file.present? }
before_destroy :remove_output_files

## STUDY FILE GETTERS
Expand Down Expand Up @@ -86,12 +86,7 @@ def bucket_path_for(label, comparison_group: nil)
# will convert non-word characters to underscores "_", except plus signs "+" which are changed to "pos"
# this is to handle cases where + or - are the only difference in labels, such as CD4+ and CD4-
def filename_for(label, comparison_group: nil)
if comparison_group.present?
first_label, second_label = Naturally.sort([label, comparison_group]) # comparisons must be sorted
values = [cluster_name, annotation_name, first_label, second_label, annotation_scope, computational_method]
else
values = [cluster_name, annotation_name, label, annotation_scope, computational_method]
end
values = [cluster_name, annotation_name, label, comparison_group, annotation_scope, computational_method].compact
basename = DifferentialExpressionService.encode_filename(values)
"#{basename}.tsv"
end
Expand Down Expand Up @@ -168,6 +163,11 @@ def num_pairwise_comparisons
pairwise_comparisons.values.map(&:count).reduce(0, &:+)
end

# check if a particular a vs. b comparison exists
def has_pairwise_comparison?(group1, group2)
!!pairwise_comparisons&.[](group1)&.include?(group2)
end

# initialize one-vs-rest and pairwise comparisons from manifest contents
# will clobber any previous values and save in place once completed, so only use with new instances
#
Expand All @@ -186,17 +186,22 @@ def initialize_comparisons!(comparisons)
save!
end

private

# find the intersection of annotation values from the source, filtered for cells observed in cluster
def set_one_vs_rest_comparisons
cells_by_label = ClusterVizService.cells_by_annotation_label(cluster_group,
annotation_name,
annotation_scope)
observed = cells_by_label.keys.reject { |label| cells_by_label[label].count < MIN_OBSERVED_VALUES }
self.one_vs_rest_comparisons = observed
# set the one-vs-rest or pairwise comparisons from an automated DE run
def set_automated_comparisons(group1: nil, group2: nil)
if group1.present? && group2.present?
self.pairwise_comparisons[group1] ||= []
self.pairwise_comparisons[group1] << group2
else
cells_by_label = ClusterVizService.cells_by_annotation_label(cluster_group,
annotation_name,
annotation_scope)
observed = cells_by_label.keys.reject { |label| cells_by_label[label].count < MIN_OBSERVED_VALUES }
self.one_vs_rest_comparisons = observed
end
end

private

def set_cluster_name
self.cluster_name = cluster_group.name
end
Expand Down
18 changes: 15 additions & 3 deletions app/models/ingest_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,11 @@ def poll_for_completion(run_at: 1.minute.from_now)
set_study_state_after_ingest
study_file.invalidate_cache_by_file_type # clear visualization caches for file
log_to_mixpanel
subject = "#{study_file.file_type} file: '#{study_file.upload_file_name}' has completed parsing"
if action == :differential_expression
subject = "Differential expression analysis for #{study_file.file_type} file: '#{study_file.upload_file_name}' has completed processing"
else
subject = "#{study_file.file_type} file: '#{study_file.upload_file_name}' has completed parsing"
end
message = generate_success_email_array
if special_action?
# don't email users for 'special actions' like DE or image pipeline, instead notify admins
Expand All @@ -397,7 +401,11 @@ def poll_for_completion(run_at: 1.minute.from_now)
log_error_messages
log_to_mixpanel # log before queuing file for deletion to preserve properties
# don't delete files or notify users if this is a 'special action', like DE or image pipeline jobs
subject = "Error: #{study_file.file_type} file: '#{study_file.upload_file_name}' parse has failed"
if action == :differential_expression
subject = "Error: Differential expression analysis for #{study_file.file_type} file: '#{study_file.upload_file_name}' has failed processing"
else
subject = "Error: #{study_file.file_type} file: '#{study_file.upload_file_name}' parse has failed"
end
handle_ingest_failure(subject) unless (special_action? || should_retry?)

admin_email_content = generate_error_email_body(email_type: :dev)
Expand Down Expand Up @@ -722,11 +730,12 @@ def create_differential_expression_results
matrix_file = StudyFile.where(study: study, 'expression_file_info.is_raw_counts' => true).any_of(
{ upload_file_name: matrix_filename }, { remote_location: matrix_filename }
).first
de_result = DifferentialExpressionResult.new(
de_result = DifferentialExpressionResult.find_or_initialize_by(
study: study, cluster_group: cluster, cluster_name: cluster.name,
annotation_name: params_object.annotation_name, annotation_scope: params_object.annotation_scope,
matrix_file_id: matrix_file.id
)
de_result.set_automated_comparisons(group1: params_object.group1, group2: params_object.group2)
de_result.save
end

Expand Down Expand Up @@ -1188,6 +1197,9 @@ def generate_success_email_array
when :differential_expression
message << "Differential expression calculations for #{params_object.cluster_name} have completed"
message << "Selected annotation: #{params_object.annotation_name} (#{params_object.annotation_scope})"
if params_object.de_type == 'pairwise'
message << "Pairwise selections: #{params_object.group1} vs. #{params_object.group2}"
end
when :render_expression_arrays
matrix_name = params_object.matrix_file_path.split('/').last
matrix = study.expression_matrices.find_by(name: matrix_name)
Expand Down
2 changes: 1 addition & 1 deletion config/application.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class Application < Rails::Application
config.middleware.use Rack::Brotli

# Docker image for file parsing via scp-ingest-pipeline
config.ingest_docker_image = 'gcr.io/broad-singlecellportal-staging/scp-ingest-pipeline:1.37.1'
config.ingest_docker_image = 'gcr.io/broad-singlecellportal-staging/scp-ingest-pipeline:1.38.0'

# Docker image for image pipeline jobs
config.image_pipeline_docker_image = 'gcr.io/broad-singlecellportal-staging/image-pipeline:0.1.0_c2b090043'
Expand Down
34 changes: 32 additions & 2 deletions test/models/differential_expression_parameters_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,19 @@ class DifferentialExpressionParametersTest < ActiveSupport::TestCase
matrix_file_type: 'h5ad',
file_size: 10.gigabytes
}

@pairwise_options = {
annotation_name: 'Category',
annotation_scope: 'cluster',
de_type: 'pairwise',
group1: 'foo',
group2: 'bar',
annotation_file: 'gs://test_bucket/metadata.tsv',
cluster_file: 'gs://test_bucket/cluster.tsv',
cluster_name: 'UMAP',
matrix_file_path: 'gs://test_bucket/dense.tsv',
matrix_file_type: 'dense'
}
end

test 'should instantiate and validate parameters' do
Expand All @@ -44,6 +57,8 @@ class DifferentialExpressionParametersTest < ActiveSupport::TestCase
assert sparse_params.valid?
anndata_params = DifferentialExpressionParameters.new(@anndata_options)
assert anndata_params.valid?
pairwise_params = DifferentialExpressionParameters.new(@pairwise_options)
assert pairwise_params.valid?

# test conditional validations
dense_params.annotation_file = ''
Expand All @@ -56,13 +71,15 @@ class DifferentialExpressionParametersTest < ActiveSupport::TestCase
sparse_params.machine_type = 'foo'
assert_not sparse_params.valid?
assert_equal [:machine_type, :gene_file], sparse_params.errors.attribute_names
pairwise_params.group1 = ''
assert_not pairwise_params.valid?
end

test 'should format differential expression parameters for python cli' do
dense_params = DifferentialExpressionParameters.new(@dense_options)
options_array = dense_params.to_options_array
dense_params.attributes.each do |name, value|
next if value.blank? # gene_file and barcode_file will not be set
next if value.blank? # some values will not be set

expected_name = Parameterizable.to_cli_opt(name)
assert_includes options_array, expected_name
Expand All @@ -73,6 +90,8 @@ class DifferentialExpressionParametersTest < ActiveSupport::TestCase
sparse_params = DifferentialExpressionParameters.new(@sparse_options)
options_array = sparse_params.to_options_array
sparse_params.attributes.each do |name, value|
next if value.blank?

expected_name = Parameterizable.to_cli_opt(name)
assert_includes options_array, expected_name
assert_includes options_array, value
Expand All @@ -82,7 +101,18 @@ class DifferentialExpressionParametersTest < ActiveSupport::TestCase
anndata_params = DifferentialExpressionParameters.new(@anndata_options)
options_array = anndata_params.to_options_array
anndata_params.attributes.each do |name, value|
next if value.blank? # gene_file and barcode_file will not be set
next if value.blank?

expected_name = Parameterizable.to_cli_opt(name)
assert_includes options_array, expected_name
assert_includes options_array, value
end
assert_includes options_array, DifferentialExpressionParameters::PARAMETER_NAME

pairwise_params = DifferentialExpressionParameters.new(@pairwise_options)
options_array = pairwise_params.to_options_array
pairwise_params.attributes.each do |name, value|
next if value.blank?

expected_name = Parameterizable.to_cli_opt(name)
assert_includes options_array, expected_name
Expand Down
19 changes: 19 additions & 0 deletions test/models/differential_expression_result_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -290,4 +290,23 @@ class DifferentialExpressionResultTest < ActiveSupport::TestCase
assert_equal de_file.id, result.study_file_id
end
end

test 'should set observations on results' do
cell_type = DifferentialExpressionResult.new(
study: @study, cluster_group: @cluster_file.cluster_groups.first, annotation_name: 'cell_type__ontology_label',
annotation_scope: 'study', matrix_file_id: @raw_matrix.id
)
cell_type.set_automated_comparisons
assert_equal ['B cell', 'T cell'], cell_type.one_vs_rest_comparisons
assert_empty cell_type.pairwise_comparisons

custom = DifferentialExpressionResult.new(
study: @study, cluster_group: @cluster_file.cluster_groups.first, annotation_name: 'cell_type__custom',
annotation_scope: 'study', matrix_file_id: @raw_matrix.id
)
types = @custom_cell_types.uniq
custom.set_automated_comparisons(group1: types.first, group2: types.last)
assert_equal ['Custom 10'], custom.pairwise_comparisons['Custom 2']
assert_empty custom.one_vs_rest_comparisons
end
end
Loading

0 comments on commit 6a22526

Please sign in to comment.