From de9895087603a74ac8ab85096b7dbc35249e4e85 Mon Sep 17 00:00:00 2001 From: bistline Date: Mon, 9 Dec 2024 13:02:04 -0500 Subject: [PATCH 1/5] Adding support for pairwise automated DE --- app/lib/differential_expression_service.rb | 46 +++++++++++-- .../differential_expression_parameters.rb | 8 +++ app/models/differential_expression_result.rb | 30 +++++--- app/models/ingest_job.rb | 3 +- config/application.rb | 2 +- ...differential_expression_parameters_test.rb | 34 ++++++++- test/models/ingest_job_test.rb | 69 +++++++++++++++++++ .../differential_expression_service_test.rb | 18 +++++ 8 files changed, 190 insertions(+), 20 deletions(-) diff --git a/app/lib/differential_expression_service.rb b/app/lib/differential_expression_service.rb index 79600780c..f1d736be2 100644 --- a/app/lib/differential_expression_service.rb +++ b/app/lib/differential_expression_service.rb @@ -111,6 +111,7 @@ def self.run_differential_expression_on_all(study_accession, user: nil, machine_ # - +user+ (User) => User initiating parse action (for email delivery) # - +annotation_name+ (String) => Name of requested annotation # - +annotation_scope+ (String) => Scope of requested annotation ('study' or 'cluster') + # - +de_type+ (String) => Type of differential expression calculation: 'rest' (one-vs-rest) or 'pairwise' # - +machine_type+ (String) => Override default VM machine type # - +dry_run+ (Boolean) => Indication of whether or not this is a pre-flight check # @@ -123,7 +124,7 @@ def self.run_differential_expression_on_all(study_accession, user: nil, machine_ # * *raises* # - (ArgumentError) => if requested parameters do not validate def self.run_differential_expression_job(cluster_group, study, user, annotation_name:, annotation_scope:, - machine_type: nil, dry_run: nil) + de_type: 'rest', group1: nil, group2: nil, machine_type: nil, dry_run: nil) validate_study(study) validate_annotation(cluster_group, study, annotation_name, annotation_scope) cluster_url = cluster_file_url(cluster_group) @@ -133,8 +134,11 @@ def self.run_differential_expression_job(cluster_group, study, user, annotation_ study.metadata_file.gs_url # begin assembling parameters de_params = { - annotation_name: annotation_name, - annotation_scope: annotation_scope, + annotation_name:, + annotation_scope:, + de_type:, + group1:, + group2:, annotation_file: annotation_scope == 'cluster' ? cluster_url : metadata_url, cluster_file: cluster_url, cluster_name: cluster_group.name @@ -321,16 +325,26 @@ def self.study_eligible?(study, skip_existing: false) # - +study+ (Study) => Study to which StudyFile belongs # - +annotation_name+ (String) => Name of requested annotation # - +annotation_scope+ (String) => Scope of requested annotation ('study' or 'cluster') + # - +group1+ (String) => first annotation label for pairwise + # - +group2+ (String) => second annotation label for pairwise # # * *raises* # - (ArgumentError) => if requested parameters do not validate - def self.validate_annotation(cluster_group, study, annotation_name, annotation_scope) + def self.validate_annotation(cluster_group, study, annotation_name, annotation_scope, group1: nil, group2: nil) + pairwise = group1.present? || group2.present? + validate_pairwise(group1, group2) if pairwise + result = DifferentialExpressionResult.find_by(study:, cluster_group:, annotation_name:, annotation_scope:) - if result.present? + if result.present? && !pairwise raise ArgumentError, "#{annotation_name} already exists for #{study.accession}:#{cluster_group.name}, " \ "please delete result #{result.id} before retrying" + elsif pairwise && result.present? && result.has_pairwise_comparison?(group1, group2) + raise ArgumentError, + "#{group1} vs. #{group2} pairwise already exists for #{annotation_name} on " \ + "#{study.accession}:#{cluster_group.name}, you must remove that entry from #{result.id} before retrying" end + can_visualize = false if annotation_scope == 'cluster' annotation = cluster_group.cell_annotations&.detect do |annot| @@ -348,8 +362,15 @@ def self.validate_annotation(cluster_group, study, annotation_name, annotation_s # last, validate that the requested annotation & cluster will provide a valid intersection of annotation values # specifically, discard any annotation/cluster combos that only result in one distinct label cells_by_label = ClusterVizService.cells_by_annotation_label(cluster_group, annotation_name, annotation_scope) - if cells_by_label.keys.count < 2 + if !pairwise && cells_by_label.keys.count < 2 raise ArgumentError, "#{identifier} does not have enough labels represented in #{cluster_group.name}" + elsif pairwise + missing = { + "#{group1}" => cells_by_label[group1].count, + "#{group2}" => cells_by_label[group2].count + }.keep_if { |_, c| c < 2 } + raise ArgumentError, + "#{missing.keys.join(', ')} does not have enough cells represented in #{identifier}" if missing.any? end end @@ -365,6 +386,19 @@ def self.validate_study(study) raise ArgumentError, "#{study.accession} cannot view cluster plots" unless study.can_visualize_clusters? end + # ensure both group1 and group2 are provided for pairwise calculations + # + # * *params* + # - +group1+ (String) => first annotation label for pairwise + # - +group2+ (String) => second annotation label for pairwise + # + # * *raises* + # - (ArgumentError) => If requested study is not eligible for DE + def self.validate_pairwise(group1, group2) + missing = { group1:, group2: }.keep_if { |_, v| v.blank? } + raise ArgumentError, "must provide #{missing.keys.join(', ')} for pairwise calculation" unless missing.empty? + end + # determine if a study has author-uploaded DE results # this will stop the automatic calculation of new DE results if they add more data # diff --git a/app/models/differential_expression_parameters.rb b/app/models/differential_expression_parameters.rb index 177e13484..77b509027 100644 --- a/app/models/differential_expression_parameters.rb +++ b/app/models/differential_expression_parameters.rb @@ -15,6 +15,9 @@ class DifferentialExpressionParameters # annotation_file: source file for above annotation # cluster_file: clustering file with cells to use as control list for DE # cluster_name: name of associated ClusterGroup object + # de_type: type of differential expression calculation: 'rest' (one-vs-rest) or 'pairwise' + # group1: first group for pairwise comparison + # group2: second group for pairwise comparison # matrix_file_path: raw counts matrix with source expression data # matrix_file_type: type of raw counts matrix (dense, sparse) # gene_file (optional): genes/features file for sparse matrix @@ -28,6 +31,9 @@ class DifferentialExpressionParameters annotation_file: nil, cluster_file: nil, cluster_name: nil, + de_type: 'rest', + group1: nil, + group2: nil, matrix_file_path: nil, matrix_file_type: nil, gene_file: nil, @@ -46,6 +52,8 @@ class DifferentialExpressionParameters validates :annotation_file, :cluster_file, :matrix_file_path, format: { with: Parameterizable::GS_URL_REGEXP, message: 'is not a valid GS url' } validates :annotation_scope, inclusion: %w[cluster study] + validates :de_type, inclusion: %w[rest pairwise] + validates :group1, :group2, presence: true, if: -> { de_type == 'pairwise' } validates :matrix_file_type, inclusion: %w[dense mtx h5ad] validates :machine_type, inclusion: Parameterizable::GCE_MACHINE_TYPES validates :gene_file, :barcode_file, diff --git a/app/models/differential_expression_result.rb b/app/models/differential_expression_result.rb index 8810f6374..5c3d2b091 100644 --- a/app/models/differential_expression_result.rb +++ b/app/models/differential_expression_result.rb @@ -47,7 +47,7 @@ class DifferentialExpressionResult validate :annotation_exists? before_validation :set_cluster_name - before_validation :set_one_vs_rest_comparisons, unless: proc { study_file.present? } + before_validation :set_automated_comparisons, unless: proc { study_file.present? } before_destroy :remove_output_files ## STUDY FILE GETTERS @@ -168,6 +168,11 @@ def num_pairwise_comparisons pairwise_comparisons.values.map(&:count).reduce(0, &:+) end + # check if a particular a vs. b comparison exists + def has_pairwise_comparison?(group1, group2) + !!pairwise_comparisons&.[](group1)&.include?(group2) + end + # initialize one-vs-rest and pairwise comparisons from manifest contents # will clobber any previous values and save in place once completed, so only use with new instances # @@ -186,17 +191,22 @@ def initialize_comparisons!(comparisons) save! end - private - - # find the intersection of annotation values from the source, filtered for cells observed in cluster - def set_one_vs_rest_comparisons - cells_by_label = ClusterVizService.cells_by_annotation_label(cluster_group, - annotation_name, - annotation_scope) - observed = cells_by_label.keys.reject { |label| cells_by_label[label].count < MIN_OBSERVED_VALUES } - self.one_vs_rest_comparisons = observed + # set the one-vs-rest or pairwise comparisons from an automated DE run + def set_automated_comparisons(group1: nil, group2: nil) + if group1.present? && group2.present? + self.pairwise_comparisons[group1] ||= [] + self.pairwise_comparisons[group1] << group2 + else + cells_by_label = ClusterVizService.cells_by_annotation_label(cluster_group, + annotation_name, + annotation_scope) + observed = cells_by_label.keys.reject { |label| cells_by_label[label].count < MIN_OBSERVED_VALUES } + self.one_vs_rest_comparisons = observed + end end + private + def set_cluster_name self.cluster_name = cluster_group.name end diff --git a/app/models/ingest_job.rb b/app/models/ingest_job.rb index 483a43c99..f9c2e6aa2 100644 --- a/app/models/ingest_job.rb +++ b/app/models/ingest_job.rb @@ -722,11 +722,12 @@ def create_differential_expression_results matrix_file = StudyFile.where(study: study, 'expression_file_info.is_raw_counts' => true).any_of( { upload_file_name: matrix_filename }, { remote_location: matrix_filename } ).first - de_result = DifferentialExpressionResult.new( + de_result = DifferentialExpressionResult.find_or_initialize_by( study: study, cluster_group: cluster, cluster_name: cluster.name, annotation_name: params_object.annotation_name, annotation_scope: params_object.annotation_scope, matrix_file_id: matrix_file.id ) + de_result.set_automated_comparisons(group1: params_object.group1, group2: params_object.group2) de_result.save end diff --git a/config/application.rb b/config/application.rb index 1425f00af..8c99744de 100644 --- a/config/application.rb +++ b/config/application.rb @@ -29,7 +29,7 @@ class Application < Rails::Application config.middleware.use Rack::Brotli # Docker image for file parsing via scp-ingest-pipeline - config.ingest_docker_image = 'gcr.io/broad-singlecellportal-staging/scp-ingest-pipeline:1.37.1' + config.ingest_docker_image = 'gcr.io/broad-singlecellportal-staging/scp-ingest-pipeline:1.38.0' # Docker image for image pipeline jobs config.image_pipeline_docker_image = 'gcr.io/broad-singlecellportal-staging/image-pipeline:0.1.0_c2b090043' diff --git a/test/models/differential_expression_parameters_test.rb b/test/models/differential_expression_parameters_test.rb index bc400c790..eabcc2eab 100644 --- a/test/models/differential_expression_parameters_test.rb +++ b/test/models/differential_expression_parameters_test.rb @@ -35,6 +35,19 @@ class DifferentialExpressionParametersTest < ActiveSupport::TestCase matrix_file_type: 'h5ad', file_size: 10.gigabytes } + + @pairwise_options = { + annotation_name: 'Category', + annotation_scope: 'cluster', + de_type: 'pairwise', + group1: 'foo', + group2: 'bar', + annotation_file: 'gs://test_bucket/metadata.tsv', + cluster_file: 'gs://test_bucket/cluster.tsv', + cluster_name: 'UMAP', + matrix_file_path: 'gs://test_bucket/dense.tsv', + matrix_file_type: 'dense' + } end test 'should instantiate and validate parameters' do @@ -44,6 +57,8 @@ class DifferentialExpressionParametersTest < ActiveSupport::TestCase assert sparse_params.valid? anndata_params = DifferentialExpressionParameters.new(@anndata_options) assert anndata_params.valid? + pairwise_params = DifferentialExpressionParameters.new(@pairwise_options) + assert pairwise_params.valid? # test conditional validations dense_params.annotation_file = '' @@ -56,13 +71,15 @@ class DifferentialExpressionParametersTest < ActiveSupport::TestCase sparse_params.machine_type = 'foo' assert_not sparse_params.valid? assert_equal [:machine_type, :gene_file], sparse_params.errors.attribute_names + pairwise_params.group1 = '' + assert_not pairwise_params.valid? end test 'should format differential expression parameters for python cli' do dense_params = DifferentialExpressionParameters.new(@dense_options) options_array = dense_params.to_options_array dense_params.attributes.each do |name, value| - next if value.blank? # gene_file and barcode_file will not be set + next if value.blank? # some values will not be set expected_name = Parameterizable.to_cli_opt(name) assert_includes options_array, expected_name @@ -73,6 +90,8 @@ class DifferentialExpressionParametersTest < ActiveSupport::TestCase sparse_params = DifferentialExpressionParameters.new(@sparse_options) options_array = sparse_params.to_options_array sparse_params.attributes.each do |name, value| + next if value.blank? + expected_name = Parameterizable.to_cli_opt(name) assert_includes options_array, expected_name assert_includes options_array, value @@ -82,7 +101,18 @@ class DifferentialExpressionParametersTest < ActiveSupport::TestCase anndata_params = DifferentialExpressionParameters.new(@anndata_options) options_array = anndata_params.to_options_array anndata_params.attributes.each do |name, value| - next if value.blank? # gene_file and barcode_file will not be set + next if value.blank? + + expected_name = Parameterizable.to_cli_opt(name) + assert_includes options_array, expected_name + assert_includes options_array, value + end + assert_includes options_array, DifferentialExpressionParameters::PARAMETER_NAME + + pairwise_params = DifferentialExpressionParameters.new(@pairwise_options) + options_array = pairwise_params.to_options_array + pairwise_params.attributes.each do |name, value| + next if value.blank? expected_name = Parameterizable.to_cli_opt(name) assert_includes options_array, expected_name diff --git a/test/models/ingest_job_test.rb b/test/models/ingest_job_test.rb index 33612f5e3..cb720588b 100644 --- a/test/models/ingest_job_test.rb +++ b/test/models/ingest_job_test.rb @@ -999,4 +999,73 @@ class IngestJobTest < ActiveSupport::TestCase cluster.reload assert_not cluster.is_subsampling end + + test 'should create differential expression result on completion' do + study = FactoryBot.create(:detached_study, + name_prefix: 'DifferentialExpressionResult Test', + user: @user, + test_array: @@studies_to_clean) + + cells = %w[A B C D E] + matrix = FactoryBot.create(:expression_file, + name: 'raw.txt', + study:, + expression_file_info: { + is_raw_counts: true, + units: 'raw counts', + library_preparation_protocol: 'Drop-seq', + biosample_input_type: 'Whole cell', + modality: 'Proteomic' + }) + cluster_file = FactoryBot.create(:cluster_file, + name: 'cluster_diffexp.txt', + study:, + cell_input: { + x: [1, 4, 6, 8, 9], + y: [7, 5, 3, 2, 1], + cells: + } + ) + cluster_group = study.cluster_groups.by_name('cluster_diffexp.txt') + FactoryBot.create(:metadata_file, + name: 'metadata.txt', + study:, + cell_input: cells, + annotation_input: [ + { + name: 'cell_type__ontology_label', + type: 'group', + values: ['B cell', 'B cell', 'T cell', 'B cell', 'T cell'] + } + ] + ) + + # one vs rest test + one_vs_rest = DifferentialExpressionParameters.new( + annotation_name: 'cell_type__ontology_label', annotation_scope: 'study', cluster_name: 'cluster_diffexp.txt', + matrix_file_path: "gs://#{study.bucket_id}/raw.txt" + ) + job = IngestJob.new(study:, study_file: cluster_file, action: :differential_expression, params_object: one_vs_rest) + job.create_differential_expression_results + + result = DifferentialExpressionResult.find_by( + study:, annotation_name: 'cell_type__ontology_label', annotation_scope: 'study', matrix_file_id: matrix.id, + cluster_group: + ) + assert result.present? + assert_equal ['B cell', 'T cell'], result.one_vs_rest_comparisons + + # pairwise test + pairwise = DifferentialExpressionParameters.new( + annotation_name: 'cell_type__ontology_label', annotation_scope: 'study', cluster_name: 'cluster_diffexp.txt', + matrix_file_path: "gs://#{study.bucket_id}/raw.txt", de_type: 'pairwise', group1: 'B cell', group2: 'T cell' + ) + job = IngestJob.new(study:, study_file: cluster_file, action: :differential_expression, params_object: pairwise) + job.create_differential_expression_results + + # should be the same DE result with existing one vs. rest results + result.reload + assert_equal ['T cell'], result.pairwise_comparisons['B cell'] + assert_equal ['B cell', 'T cell'], result.one_vs_rest_comparisons + end end diff --git a/test/services/differential_expression_service_test.rb b/test/services/differential_expression_service_test.rb index 91d7aa4a5..719cf8481 100644 --- a/test/services/differential_expression_service_test.rb +++ b/test/services/differential_expression_service_test.rb @@ -108,6 +108,24 @@ class DifferentialExpressionServiceTest < ActiveSupport::TestCase mock.verify job_mock.verify end + + # test pairwise job + @job_params[:de_type] = 'pairwise' + @job_params[:group1] = 'dog' + @job_params[:group2] = 'cat' + + job_mock = Minitest::Mock.new + job_mock.expect(:push_remote_and_launch_ingest, Delayed::Job.new) + mock = Minitest::Mock.new + mock.expect(:delay, job_mock) + IngestJob.stub :new, mock do + job_launched = DifferentialExpressionService.run_differential_expression_job( + @cluster_group, @basic_study, @user, **@job_params + ) + assert job_launched + mock.verify + job_mock.verify + end end test 'should run differential expression job with sparse matrix' do From 2baccd7a6620fd0a08095c92648dcda551ab7cb0 Mon Sep 17 00:00:00 2001 From: bistline Date: Mon, 9 Dec 2024 14:54:00 -0500 Subject: [PATCH 2/5] Adding test coverage --- .../differential_expression_result_test.rb | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/test/models/differential_expression_result_test.rb b/test/models/differential_expression_result_test.rb index 0e488e49a..d3ff08264 100644 --- a/test/models/differential_expression_result_test.rb +++ b/test/models/differential_expression_result_test.rb @@ -290,4 +290,23 @@ class DifferentialExpressionResultTest < ActiveSupport::TestCase assert_equal de_file.id, result.study_file_id end end + + test 'should set observations on results' do + cell_type = DifferentialExpressionResult.new( + study: @study, cluster_group: @cluster_file.cluster_groups.first, annotation_name: 'cell_type__ontology_label', + annotation_scope: 'study', matrix_file_id: @raw_matrix.id + ) + cell_type.set_automated_comparisons + assert_equal ['B cell', 'T cell'], cell_type.one_vs_rest_comparisons + assert_empty cell_type.pairwise_comparisons + + custom = DifferentialExpressionResult.new( + study: @study, cluster_group: @cluster_file.cluster_groups.first, annotation_name: 'cell_type__custom', + annotation_scope: 'study', matrix_file_id: @raw_matrix.id + ) + types = @custom_cell_types.uniq + custom.set_automated_comparisons(group1: types.first, group2: types.last) + assert_equal ['Custom 10'], custom.pairwise_comparisons['Custom 2'] + assert_empty custom.one_vs_rest_comparisons + end end From 24a1730347dd2a98fc7ad1f1ea638fb44b912d33 Mon Sep 17 00:00:00 2001 From: bistline Date: Wed, 11 Dec 2024 10:03:02 -0500 Subject: [PATCH 3/5] Update annotation validation, Add message re: pairwise to email --- app/lib/differential_expression_service.rb | 2 +- app/models/ingest_job.rb | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/app/lib/differential_expression_service.rb b/app/lib/differential_expression_service.rb index f1d736be2..fd957f736 100644 --- a/app/lib/differential_expression_service.rb +++ b/app/lib/differential_expression_service.rb @@ -126,7 +126,7 @@ def self.run_differential_expression_on_all(study_accession, user: nil, machine_ def self.run_differential_expression_job(cluster_group, study, user, annotation_name:, annotation_scope:, de_type: 'rest', group1: nil, group2: nil, machine_type: nil, dry_run: nil) validate_study(study) - validate_annotation(cluster_group, study, annotation_name, annotation_scope) + validate_annotation(cluster_group, study, annotation_name, annotation_scope, group1:, group2:) cluster_url = cluster_file_url(cluster_group) study_file = cluster_group.study_file metadata_url = study_file.is_viz_anndata? ? diff --git a/app/models/ingest_job.rb b/app/models/ingest_job.rb index f9c2e6aa2..3d0321e93 100644 --- a/app/models/ingest_job.rb +++ b/app/models/ingest_job.rb @@ -1189,6 +1189,9 @@ def generate_success_email_array when :differential_expression message << "Differential expression calculations for #{params_object.cluster_name} have completed" message << "Selected annotation: #{params_object.annotation_name} (#{params_object.annotation_scope})" + if params_object.de_type == 'pairwise' + message << "Pairwise selections: #{params_object.group1} vs. #{params_object.group2}" + end when :render_expression_arrays matrix_name = params_object.matrix_file_path.split('/').last matrix = study.expression_matrices.find_by(name: matrix_name) From 83ac5a0163b58e254ebce03b311f62d57ee8c9ac Mon Sep 17 00:00:00 2001 From: bistline Date: Thu, 12 Dec 2024 13:31:04 -0500 Subject: [PATCH 4/5] disable lexical sorting of A vs. B labels --- app/models/differential_expression_result.rb | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/app/models/differential_expression_result.rb b/app/models/differential_expression_result.rb index 5c3d2b091..24cf12894 100644 --- a/app/models/differential_expression_result.rb +++ b/app/models/differential_expression_result.rb @@ -86,12 +86,7 @@ def bucket_path_for(label, comparison_group: nil) # will convert non-word characters to underscores "_", except plus signs "+" which are changed to "pos" # this is to handle cases where + or - are the only difference in labels, such as CD4+ and CD4- def filename_for(label, comparison_group: nil) - if comparison_group.present? - first_label, second_label = Naturally.sort([label, comparison_group]) # comparisons must be sorted - values = [cluster_name, annotation_name, first_label, second_label, annotation_scope, computational_method] - else - values = [cluster_name, annotation_name, label, annotation_scope, computational_method] - end + values = [cluster_name, annotation_name, label, comparison_group, annotation_scope, computational_method].compact basename = DifferentialExpressionService.encode_filename(values) "#{basename}.tsv" end From 5180217f337cf3e8af3bed737c305b882f2294c1 Mon Sep 17 00:00:00 2001 From: bistline Date: Thu, 12 Dec 2024 15:06:06 -0500 Subject: [PATCH 5/5] Adding special email subject for DE --- app/models/ingest_job.rb | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/app/models/ingest_job.rb b/app/models/ingest_job.rb index 3d0321e93..be4c79fa8 100644 --- a/app/models/ingest_job.rb +++ b/app/models/ingest_job.rb @@ -380,7 +380,11 @@ def poll_for_completion(run_at: 1.minute.from_now) set_study_state_after_ingest study_file.invalidate_cache_by_file_type # clear visualization caches for file log_to_mixpanel - subject = "#{study_file.file_type} file: '#{study_file.upload_file_name}' has completed parsing" + if action == :differential_expression + subject = "Differential expression analysis for #{study_file.file_type} file: '#{study_file.upload_file_name}' has completed processing" + else + subject = "#{study_file.file_type} file: '#{study_file.upload_file_name}' has completed parsing" + end message = generate_success_email_array if special_action? # don't email users for 'special actions' like DE or image pipeline, instead notify admins @@ -397,7 +401,11 @@ def poll_for_completion(run_at: 1.minute.from_now) log_error_messages log_to_mixpanel # log before queuing file for deletion to preserve properties # don't delete files or notify users if this is a 'special action', like DE or image pipeline jobs - subject = "Error: #{study_file.file_type} file: '#{study_file.upload_file_name}' parse has failed" + if action == :differential_expression + subject = "Error: Differential expression analysis for #{study_file.file_type} file: '#{study_file.upload_file_name}' has failed processing" + else + subject = "Error: #{study_file.file_type} file: '#{study_file.upload_file_name}' parse has failed" + end handle_ingest_failure(subject) unless (special_action? || should_retry?) admin_email_content = generate_error_email_body(email_type: :dev)