Merge branch 'devel' of github.com:PavlidisLab/gemma.R into devel

PavlidisLab · Feb 14, 2024 · ea3210e · ea3210e
2 parents 50fc3db + cb14520
commit ea3210e
Show file tree

Hide file tree

Showing 14 changed files with 191 additions and 75 deletions.
diff --git a/R/allEndpoints.R b/R/allEndpoints.R
@@ -20,6 +20,7 @@
 #' @keywords internal
 #'
 #' @examples
+#' gemma.R:::.getResultSets(523099)
 .getResultSets <- function(resultSet = NA_character_, raw = getOption(
         "gemma.raw",
         FALSE
@@ -106,6 +107,7 @@ mem.getResultSets <- function(resultSet = NA_character_, raw = getOption(
 #' @keywords internal
 #'
 #' @examples
+#' gemma.R:::.getResultSetFactors(523099)
 .getResultSetFactors <- function(resultSet = NA_character_, raw = getOption(
         "gemma.raw",
         FALSE

diff --git a/R/convenience.R b/R/convenience.R
@@ -226,6 +226,86 @@ make_design <- function(samples,metaType = "text"){
 }
 
 
+#' Get a subset of an array of factorValues
+#' @param factorValue unimplemented
+#' @param differential_expressions  
+#' @keywords internal
+#' @return a boolean vector, samples representing the resultSet and/or the contrast
+#' are set to TRUE
+subset_factorValues <- function(factorValues,
+                                factorValue = NULL,
+                                differential_expressions = NULL,
+                                resultSet = NULL,
+                                contrast = NULL){
+
+    out <- rep(TRUE,length(factorValues))
+
+
+    if(!is.null(factorValue)){
+        # unimplemented
+    }
+
+    if(!is.null(differential_expressions)){
+
+        # this should never trigger but just in case...
+        assertthat::assert_that(
+            factorValues %>% do.call(rbind,.) %>% dplyr::select(ID,factor.ID) %>% unique %>% .$ID %>% table %>% {all(.==1)},
+            msg = "ID's cannot be repeated across factors")
+
+
+
+        subset <- differential_expressions %>%   dplyr::filter(result.ID == resultSet) %>% .$subsetFactor %>% unique
+        # result set should have the same subset for all contrasts
+        assertthat::assert_that(length(subset)==1)
+
+        if(nrow(subset[[1]])!=0){
+
+            subset_ids <- subset %>% purrr::map('ID') %>% unlist
+
+            in_subset <- factorValues %>% purrr::map_lgl(function(x){
+                any(x$ID %in% subset_ids)
+            })
+
+
+            # subset_ids <- subset[[1]] %>%
+            #     dplyr::mutate(comb = paste0(ID,'.',factor.ID)) %>% {.$comb}
+            # 
+            # in_subset <- factorValues %>% purrr::map_lgl(function(x){
+            #     x %>%  dplyr::mutate(comb = paste0(ID,'.',factor.ID)) %>% {.$comb} %>%
+            #         {all(subset_ids %in% .)}
+            # })
+
+            out <- out & in_subset
+        }
+
+
+        if(!is.null(contrast)){
+            cn <- differential_expressions %>% dplyr::filter(result.ID == resultSet & contrast.ID == contrast)
+            baseline_id <- cn$baseline.factors %>% purrr::map('ID') %>% unlist%>% unique
+            baseline_factor_id <- cn$baseline.factors %>% purrr::map('factor.ID') %>% unlist%>% unique
+
+            contrast_id <- cn$experimental.factors %>% purrr::map('ID') %>% unlist %>% unique
+            contrast_factor_id <-  cn$experimental.factors %>%  purrr::map('factor.ID') %>% unlist %>% unique
+
+            contrast_id<- contrast_id[match(baseline_factor_id,contrast_factor_id)]
+
+            in_contrast <- factorValues %>% purrr::map_lgl(function(x){
+                all(contrast_id %in% x$ID) | 
+                    all(baseline_id %in% x$ID) |
+                    all(c(contrast_id[1],baseline_id[2]) %in% x$ID) |
+                    all(c(contrast_id[2],baseline_id[1]) %in% x$ID)
+            })
+
+            out <- out & in_contrast
+
+        }
+    }
+    return(out)
+}
+
+
+
+
 #' Compile gene expression data and metadata
 #'
 #' Return an annotated Bioconductor-compatible
@@ -236,7 +316,9 @@ make_design <- function(samples,metaType = "text"){
 #' SummarizedExperiments which are more recent. See the Summarized experiment
 #' \href{https://bioconductor.org/packages/release/bioc/vignettes/SummarizedExperiment/inst/doc/SummarizedExperiment.html}{vignette}
 #' or the ExpressionSet \href{https://bioconductor.org/packages/release/bioc/vignettes/Biobase/inst/doc/ExpressionSetIntroduction.pdf}{vignette}
-#' for more details.
+#' for more details. "tidy" for a long form data frame compatible with tidyverse functions.
+#' 'list' to return a list containing individual data frames containing expression values,
+#' design and the experiment.
 #' @inheritParams memoise
 #' @inheritParams get_dataset_expression_for_genes
 #' @param metaType How should the metadata information should be included. Can be "text", "uri" or "both". "text" and "uri" options
@@ -364,45 +446,19 @@ get_dataset_object <- function(datasets,
         data.table::setcolorder(packed_info$exp,c(gene_info,rownames(packed_info$design)))
 
         if(!is.null(resultSets)){
+
+
             diff <- get_dataset_differential_expression_analyses(dataset,memoised = memoised)
-            subset <- diff %>%
-                dplyr::filter(result.ID == resultSets[i]) %>%
-                .$subsetFactor %>% unique
-
-            assertthat::assert_that(length(subset)==1)
 
+            # passing the original samples is fine since expression data is
+            # reordered not design file
+            relevant <- subset_factorValues(packed_info$design$factorValues,
+                                            differential_expressions = diff,
+                                            resultSet = resultSets[i],
+                                            contrast = contrasts[i])
 
-            if(nrow(subset[[1]])!=0){
-                subset_ids <- subset %>% purrr::map('ID') %>% unlist
-
-                in_subset <- packed_info$design$factorValues %>% purrr::map_lgl(function(x){
-                    any(x$ID %in% subset_ids)
-                })
-            } else{
-                in_subset <- TRUE
-            }
-
-            if(!is.null(contrasts)){
-                contrast <- diff %>% dplyr::filter(result.ID == resultSets[i] & contrast.ID == contrasts[i])
-                baseline_id <- contrast$baseline.factors %>% purrr::map('ID') %>% unlist%>% unique
-                baseline_factor_id <- contrast$baseline.factors %>% purrr::map('factor.ID') %>% unlist%>% unique
-                contrast_id <- contrast$experimental.factors %>% purrr::map('ID') %>% unlist %>% unique
-                contrast_factor_id <-  contrast$experimental.factors %>%  purrr::map('factor.ID') %>% unlist %>% unique
-
-                contrast_id<- contrast_id[match(baseline_factor_id,contrast_factor_id)]
-
-
-                in_contrast <- packed_info$design$factorValues %>% purrr::map_lgl(function(x){
-                    all(contrast_id %in% x$ID) | 
-                        all(baseline_id %in% x$ID) |
-                        all(c(contrast_id[1],baseline_id[2]) %in% x$ID) |
-                        all(c(contrast_id[2],baseline_id[1]) %in% x$ID)
-                })
-            } else{
-                in_contrast <- TRUE
-            }
-            packed_info$exp <- packed_info$exp[,.SD,.SDcols = c(gene_info, rownames(packed_info$design)[in_subset & in_contrast])]
-            packed_info$design <- packed_info$design[in_subset & in_contrast,]
+            packed_info$exp <- packed_info$exp[,.SD,.SDcols = c(gene_info, rownames(packed_info$design)[relevant])]
+            packed_info$design <- packed_info$design[relevant,]
         }
 
         return(packed_info)
@@ -704,8 +760,7 @@ gemma_call <- function(call,...,json = TRUE){
 #' @param directory Directory to save the output from the individual calls to. If provided, each page
 #' is saved to separate files.
 #' @param file The name of a file to save the results to, or \code{NULL} to not write
-#' results to a file. If \code{raw == TRUE}, the output will be the raw endpoint from the
-#' API, likely a JSON or a gzip file. Otherwise, it will be a RDS file.
+#' results to a file. This function always saves the output as an RDS file. Otherwise, it will be a RDS file.
 #' @param overwrite Whether or not to overwrite if a file exists at the specified
 #' filename.
 #' @return A data.table or a list containing data from all pages.

diff --git a/R/processors.R b/R/processors.R
@@ -557,17 +557,18 @@ processElements <- function(d) {
 #' The fields of the output data.table are:
 #'
 #' \itemize{
-#'     \item \code{gene.Symbol}: Symbol for the gene
-#'     \item \code{gene.Ensembl}: Ensembl ID for the gene
+#'     \item \code{gene.symbol}: Symbol for the gene
+#'     \item \code{gene.ensembl}: Ensembl ID for the gene
 #'     \item \code{gene.NCBI}: NCBI id for the gene
-#'     \item \code{gene.Name}: Name of the gene
-#'     \item \code{gene.MFX.Rank}: Multifunctionality rank for the gene
-#'     \item \code{taxon.Name}: Name of the species
-#'     \item \code{taxon.Scientific}: Scientific name for the taxon
+#'     \item \code{gene.name}: Name of the gene
+#'     \item \code{gene.aliases}: Gene aliases. Each row includes a vector
+#'     \item \code{gene.MFX.rank}: Multifunctionality rank for the gene
+#'     \item \code{taxon.name}: Name of the species
+#'     \item \code{taxon.scientific}: Scientific name for the taxon
 #'     \item \code{taxon.ID}: Internal identifier given to the species by Gemma
 #'     \item \code{taxon.NCBI}: NCBI ID of the taxon
-#'     \item \code{taxon.Database.Name}: Underlying database used in Gemma for the taxon
-#'     \item \code{taxon.Database.ID}: ID of the underyling database used in Gemma for the taxon
+#'     \item \code{taxon.database.name}: Underlying database used in Gemma for the taxon
+#'     \item \code{taxon.database.ID}: ID of the underlying database used in Gemma for the taxon
 #' }
 #'
 #' @keywords internal
@@ -577,6 +578,9 @@ processGenes <- function(d) {
         gene.ensembl = accessField(d,'ensemblId',NA_character_),
         gene.NCBI = accessField(d,"ncbiId",NA_integer_),
         gene.name = accessField(d, "officialName",NA_character_),
+        gene.aliases = d %>% purrr::map(function(x){
+            x$aliases %>% unlist
+        }),
         # gene.Aliases = d[["aliases"]],
         # gene.GO = d[["numGoTerms"]],
         # gene.Homologues = d[["homologues"]],

diff --git a/inst/script/openapi.json b/inst/script/openapi.json
diff --git a/inst/script/overrides.R b/inst/script/overrides.R
@@ -315,3 +315,16 @@ NULL
 #' @examples
 #' get_dataset_expression_for_genes('GSE2018',genes=c(10225,2841))
 NULL
+
+#' .getResultSets
+#' 
+#' @examples
+#' gemma.R:::.getResultSets(523099)
+NULL
+
+
+#' .getResultSetFactors
+#' 
+#' @examples
+#' gemma.R:::.getResultSetFactors(523099)
+NULL
diff --git a/inst/script/registry.R b/inst/script/registry.R
@@ -85,7 +85,10 @@ registerEndpoint(
 
 # /resultSets, get_result_sets -----
 # unimplemented
-# this is not useful in gemma.R and can be replaced by get_dataset_differential_expression_analyses
+# this endpoint can be potentially useful if we can use 
+# make better use of its filter argument. talk to guillaume 
+# to see if there is a good way to do it
+# otherwise it can be replaced by get_dataset_differential_expression_analyses
 # the implementation below is also missing arguments
 
 # registerEndpoint(
@@ -506,6 +509,8 @@ registerEndpoint("platforms/{platform}/elements/{probe}/genes?offset={offset}&li
 
 # platforms -----
 # merged with platforms/{platform}
+# this endpoint has no unique parameters of its own unlike get_datasets
+# which is why it's not separated
 
 # platforms/{platform}, get_platforms_by_ids ---- 
 registerEndpoint("platforms/{platforms}?&offset={offset}&limit={limit}&sort={sort}&filter={filter}",

diff --git a/man/dot-getResultSetFactors.Rd b/man/dot-getResultSetFactors.Rd
diff --git a/man/dot-getResultSets.Rd b/man/dot-getResultSets.Rd
diff --git a/man/get_all_pages.Rd b/man/get_all_pages.Rd
diff --git a/man/get_dataset_object.Rd b/man/get_dataset_object.Rd
diff --git a/man/get_genes.Rd b/man/get_genes.Rd
diff --git a/man/get_platform_element_genes.Rd b/man/get_platform_element_genes.Rd
diff --git a/man/processGenes.Rd b/man/processGenes.Rd