Merge pull request #75 from chainsawriot/frex

fix #74
gesistsa · Aug 25, 2023 · 68439dc · 68439dc
2 parents 2376967 + c87c6b1
commit 68439dc
Show file tree

Hide file tree

Showing 9 changed files with 222 additions and 38 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -41,3 +41,4 @@
 ^\.github$
 ^rawdata/
 ^CRAN-SUBMISSION$
+^\.editorconfig$
diff --git a/.editorconfig b/.editorconfig
@@ -0,0 +1,16 @@
+root = true
+
+
+[*]
+charset = utf8
+end_of_line = lf
+insert_final_newline = true
+indent_style = space
+
+
+[*.{r,R,rmd,Rmd,cpp}]
+indent_size = 4
+trim_trailing_whitespace = true
+
+[Makefile]
+indent_style = tab
diff --git a/R/oolong.R b/R/oolong.R
@@ -51,21 +51,23 @@ Oolong_test <- R6::R6Class(
 #' For more details, please see the overview vignette: \code{vignette("overview", package = "oolong")}
 #' @param input_model (wi, ti, witi, wsi) a STM, WarpLDA, topicmodels, KeyATM, seededlda, textmodel_nb, or BTM object; if it is NULL, create_oolong assumes that you want to create gold standard.
 #' @param input_corpus (wi, ti, witi, wsi, gs) if input_model is not null, it should be the corpus (character vector or quanteda::corpus object) to generate the model object. If input_model and input_corpus are not NULL, topic intrusion test cases are generated. If input_model is a BTM object, this argument is ignored. If input_model is null, it generates gold standard test cases.
-#' @param n_top_terms (wi, witi) integer, number of top topic words to be included in the candidates of word intrusion test. 
+#' @param n_top_terms (wi, witi) integer, number of top topic words to be included in the candidates of word intrusion test.
 #' @param bottom_terms_percentile (wi, witi) double, a term is considered to be an word intruder when its theta less than the percentile of this theta, must be within the range of 0 to 1
 #' @param exact_n (ti, witi, gs) integer, number of topic intrusion test cases to generate, ignore if frac is not NULL
 #' @param frac (ti, witi, gs) double, fraction of test cases to be generated from the corpus
 #' @param n_top_topics (wi, witi) integer, number of most relevant topics to be shown alongside the intruder topic
 #' @param n_topiclabel_words (witi, ti, wsi) integer, number of topic words to be shown as the topic ("ti" and "witi") / word set ("wsi") label
 #' @param use_frex_words (wi, witi, ti, wsi) logical, for a STM object, use FREX words if TRUE, use PROB words if FALSE
-#' @param difficulty (wi, witi, ti, wsi) double, adjust the difficulty of the test. Higher value indicates higher difficulty and must be within the range of 0 to 1, no effect for STM if use_frex_words is FALSE. Ignore for topicmodels objects
+#' @param frexweight (wi, witi, ti, wsi) double, adjust the `frexweight` for STM (see [stm::labelTopics()]), no effect for STM if use_frex_words is FALSE
+#' @param lambda (wi, witi, ti, wsi) double, adjust the `lambda` for WarpLDA (see [text2vec::LatentDirichletAllocation()])
 #' @param type (create_oolong) a character string to denote what you want to create. "wi": word intrusion test; "ti": topic intrusion test; "witi": both word intrusion test and topic intrusion test; "gs": gold standard generation
 #' @param input_dfm (wi, witi, ti, wsi) a dfm object used for training the input_model, if input_model is a WarpLDA object
 #' @param construct (gs) string, an adjective to describe the construct you want your coders to code the the gold standard test cases
 #' @param btm_dataframe (witi, ti) dataframe used for training the input_model, if input_model is a BTM object
 #' @param userid a character string to denote the name of the coder. Default to NA (no userid); not recommended
 #' @param n_correct_ws (wsi) number of word sets to be shown alongside the intruder word set
 #' @param wsi_n_top_terms (wsi) number of top topic words from each topic to be randomized selected as the word set label
+#' @param difficulty (wi, witi, ti, wsi) double, deprecated, for backward compatibility
 #' @return an oolong test object.
 #' @examples
 #' ## Creation of oolong test with only word intrusion test
@@ -93,17 +95,21 @@ Oolong_test <- R6::R6Class(
 #'   Song et al. (2020) In validations we trust? The impact of imperfect human annotations as a gold standard on the quality of validation of automated content analysis. Political Communication.
 #'
 #'   Ying, L., Montgomery, J. M., & Stewart, B. M. (2021). Topics, Concepts, and Measurement: A Crowdsourced Procedure for Validating Topics as Measures. Political Analysis
-#' 
+#'
 #' @export
-create_oolong <- function(input_model = NULL, input_corpus = NULL, n_top_terms = 5, bottom_terms_percentile = 0.6, exact_n = NULL, frac = 0.01, n_top_topics = 3, n_topiclabel_words = 8, use_frex_words = FALSE, difficulty = 1, input_dfm = NULL, construct = "positive", btm_dataframe = NULL, n_correct_ws = 3, wsi_n_top_terms = 20, userid = NA, type = "witi") {
+create_oolong <- function(input_model = NULL, input_corpus = NULL, n_top_terms = 5, bottom_terms_percentile = 0.6, exact_n = NULL, frac = 0.01, n_top_topics = 3, n_topiclabel_words = 8, use_frex_words = FALSE, frexweight = .5, input_dfm = NULL, construct = "positive", btm_dataframe = NULL, n_correct_ws = 3, wsi_n_top_terms = 20, userid = NA, type = "witi", lambda = 1, difficulty = NULL) {
     .cstop(!type %in% c("wi", "witi", "ti", "gs", "wsi"), "Unknown type, available types are 'wi', 'witi', 'ti', 'wsi' and 'gs'")
     .cstop(is.null(input_model) & is.null(input_corpus), "input_model and input_corpus cannot be both NULL.")
     if (!is.null(input_model)) {
         .cstop(!.is_topic_model(input_model), "input_model is not a topic model. If you want to create gold standard with an input_corpus, use: create_oolong(input_corpus = input_corpus) or gs(input_corpus)")
     }
     .cstop(length(userid) > 1, "userid must not be a vector with length > 1.")
+    if (!is.null(difficulty)) {
+        frexweight <- difficulty
+        lambda <- difficulty
+    }
     if (!is.null(input_model) & type %in% c("wi", "ti", "witi", "wsi")) {
-        return(Oolong_test_tm$new(input_model = input_model, input_corpus = input_corpus, n_top_terms = n_top_terms, bottom_terms_percentile = bottom_terms_percentile, exact_n = exact_n, frac = frac, n_top_topics = n_top_topics, n_topiclabel_words = n_topiclabel_words, difficulty = difficulty, use_frex_words = use_frex_words, input_dfm = input_dfm, btm_dataframe = btm_dataframe, n_correct_ws = n_correct_ws, wsi_n_top_terms = wsi_n_top_terms, userid = userid, type = type))
+        return(Oolong_test_tm$new(input_model = input_model, input_corpus = input_corpus, n_top_terms = n_top_terms, bottom_terms_percentile = bottom_terms_percentile, exact_n = exact_n, frac = frac, n_top_topics = n_top_topics, n_topiclabel_words = n_topiclabel_words, frexweight = frexweight, use_frex_words = use_frex_words, input_dfm = input_dfm, btm_dataframe = btm_dataframe, n_correct_ws = n_correct_ws, wsi_n_top_terms = wsi_n_top_terms, userid = userid, type = type, lambda = lambda))
     }
     if (is.null(input_model) | type == "gs") {
         return(Oolong_test_gs$new(input_corpus = input_corpus, exact_n = exact_n, frac = frac, construct = construct, userid = userid))

diff --git a/R/oolong_stm.R b/R/oolong_stm.R
@@ -1,16 +1,16 @@
-.extract_ingredients.input_model_s3_stm <- function(input_model_s3, n_top_terms = 5, difficulty = 1, use_frex_words = FALSE, need_topic = FALSE, n_topiclabel_words = 8, ...) {
+.extract_ingredients.input_model_s3_stm <- function(input_model_s3, n_top_terms = 5, frexweight = .5, use_frex_words = FALSE, need_topic = FALSE, n_topiclabel_words = 8, ...) {
     input_model <- input_model_s3$model
     K <- input_model$settings$dim$K
     V <- input_model$settings$dim$V
     if (use_frex_words) {
-        terms <- stm::labelTopics(input_model, n = input_model$settings$dim$V, frexweight = difficulty)$frex
+        terms <- stm::labelTopics(input_model, n = input_model$settings$dim$V, frexweight = frexweight)$frex
     } else {
         terms <- stm::labelTopics(input_model, n = input_model$settings$dim$V)$prob
     }
     all_terms <- unique(as.vector(terms[,seq_len(n_top_terms)]))
     if (need_topic) {
         if (use_frex_words) {
-            model_terms <- stm::labelTopics(input_model, n = n_topiclabel_words, frexweight = difficulty)$frex
+            model_terms <- stm::labelTopics(input_model, n = n_topiclabel_words, frexweight = frexweight)$frex
         } else {
             model_terms <- stm::labelTopics(input_model, n = n_topiclabel_words)$prob
         }

diff --git a/R/oolong_tm.R b/R/oolong_tm.R
@@ -121,8 +121,8 @@
     return(test_content)
 }
 
-.generate_test_content <- function(input_model, input_corpus = NULL, n_top_terms = 5, bottom_terms_percentile = 0.6, exact_n = NULL, frac = 0.01, n_top_topics = 3, n_topiclabel_words = 8, difficulty = 1, use_frex_words = FALSE, input_dfm = NULL, btm_dataframe = NULL, type = "witi", n_correct_ws = 3, wsi_n_top_terms = 20) {
-    ingredients <- .extract_ingredients(.convert_input_model_s3(input_model), n_top_terms = n_top_terms, difficulty = difficulty, need_topic = !is.null(input_corpus), n_topiclabel_words = n_topiclabel_words, input_dfm = input_dfm, use_frex_words = use_frex_words, input_corpus = input_corpus, btm_dataframe = btm_dataframe)
+.generate_test_content <- function(input_model, input_corpus = NULL, n_top_terms = 5, bottom_terms_percentile = 0.6, exact_n = NULL, frac = 0.01, n_top_topics = 3, n_topiclabel_words = 8, frexweight = .5, use_frex_words = FALSE, input_dfm = NULL, btm_dataframe = NULL, type = "witi", n_correct_ws = 3, wsi_n_top_terms = 20, lambda = 1) {
+    ingredients <- .extract_ingredients(.convert_input_model_s3(input_model), n_top_terms = n_top_terms, frexweight = frexweight, need_topic = !is.null(input_corpus), n_topiclabel_words = n_topiclabel_words, input_dfm = input_dfm, use_frex_words = use_frex_words, input_corpus = input_corpus, btm_dataframe = btm_dataframe, lambda = lambda)
     .cstop(type %in% c("ti") & is.null(ingredients$theta), "input_corpus can't be NULL for generating oolong test object with only topic intrusion test.")
     test_content <- list()
     if (type %in% c("wi", "witi")) {
@@ -133,7 +133,7 @@
     } else if (type %in% c("witi", "ti")) {
         test_content$ti <- .generate_topic_intrusion_test(input_corpus = input_corpus, ingredients = ingredients, exact_n = exact_n, frac = frac, n_top_topics = n_top_topics, n_topiclabel_words = n_topiclabel_words)
     } else {
-        test_content$ti <- NULL        
+        test_content$ti <- NULL
     }
     if (type %in% c("wsi")) {
         test_content$wsi <- .generate_wsi(ingredients, n_correct_ws = n_correct_ws, n_topiclabel_words = n_topiclabel_words, wsi_n_top_terms = wsi_n_top_terms)
@@ -209,8 +209,23 @@ Oolong_test_tm <-
         "oolong_test_tm",
         inherit = Oolong_test,
         public = list(
-            initialize = function(input_model = NULL, input_corpus = NULL, n_top_terms = 5, bottom_terms_percentile = 0.6, exact_n = 15, frac = NULL, n_top_topics = 3, n_topiclabel_words = 8, difficulty = 1, use_frex_words = FALSE, input_dfm = NULL, btm_dataframe = NULL, userid = NA, n_correct_ws = 3, wsi_n_top_terms = 20, type = "witi") {
-                private$test_content <- .generate_test_content(input_model, input_corpus, n_top_terms, bottom_terms_percentile, exact_n, frac, n_top_topics, n_topiclabel_words, difficulty, use_frex_words = use_frex_words, input_dfm = input_dfm, btm_dataframe = btm_dataframe, type = type, n_correct_ws = n_correct_ws, wsi_n_top_terms = wsi_n_top_terms)
+            initialize = function(input_model = NULL, input_corpus = NULL, n_top_terms = 5, bottom_terms_percentile = 0.6, exact_n = 15, frac = NULL, n_top_topics = 3, n_topiclabel_words = 8, frexweight = .5, use_frex_words = FALSE, input_dfm = NULL, btm_dataframe = NULL, userid = NA, n_correct_ws = 3, wsi_n_top_terms = 20, lambda = 1, type = "witi") {
+                private$test_content <- .generate_test_content(input_model = input_model,
+                                                               input_corpus = input_corpus,
+                                                               n_top_terms = n_top_terms,
+                                                               bottom_terms_percentile = bottom_terms_percentile,
+                                                               exact_n = exact_n,
+                                                               frac = frac,
+                                                               n_top_topics = n_top_topics,
+                                                               n_topiclabel_words = n_topiclabel_words,
+                                                               frexweight = frexweight,
+                                                               use_frex_words = use_frex_words,
+                                                               input_dfm = input_dfm,
+                                                               btm_dataframe = btm_dataframe,
+                                                               type = type,
+                                                               n_correct_ws = n_correct_ws,
+                                                               wsi_n_top_terms = wsi_n_top_terms,
+                                                               lambda = lambda)
                 self$userid <- userid
                 private$hash <- .safe_hash(private$test_content)
                 private$hash_input_model <- .safe_hash(input_model)
@@ -242,5 +257,3 @@ Oolong_test_tm <-
             finalized = FALSE
         )
         )
-
-
diff --git a/R/oolong_ui.R b/R/oolong_ui.R
@@ -1,25 +1,69 @@
 #' @rdname create_oolong
 #' @export
-wi <- function(input_model = NULL, userid = NA, n_top_terms = 5, bottom_terms_percentile = 0.6, difficulty = 1, use_frex_words = FALSE) {
-    create_oolong(input_model = input_model, n_top_terms = n_top_terms, bottom_terms_percentile = bottom_terms_percentile, difficulty = difficulty, use_frex_words = use_frex_words, userid = userid, type = "wi")
+wi <- function(input_model = NULL, userid = NA, n_top_terms = 5, bottom_terms_percentile = 0.6, frexweight = .5, use_frex_words = FALSE, lambda = 1, difficulty = NULL) {
+    create_oolong(input_model = input_model,
+                  n_top_terms = n_top_terms,
+                  bottom_terms_percentile = bottom_terms_percentile,
+                  frexweight = frexweight,
+                  use_frex_words = use_frex_words,
+                  userid = userid, type = "wi",
+                  lambda = lambda,
+                  difficulty = difficulty)
 }
 
 #' @rdname create_oolong
 #' @export
-witi <- function(input_model = NULL, input_corpus = NULL, userid = NA, n_top_terms = 5, bottom_terms_percentile = 0.6, exact_n = NULL, frac = 0.01, n_top_topics = 3, n_topiclabel_words = 8, use_frex_words = FALSE, difficulty = 1, input_dfm = NULL, btm_dataframe = NULL) {
-    create_oolong(input_model = input_model, input_corpus = input_corpus, n_top_terms = n_top_terms, bottom_terms_percentile = bottom_terms_percentile, exact_n = exact_n, frac = frac, n_top_topics = n_top_topics, n_topiclabel_words = n_topiclabel_words, use_frex_words = use_frex_words, difficulty = difficulty, input_dfm = input_dfm, btm_dataframe = btm_dataframe, userid = userid, type = "witi")
+witi <- function(input_model = NULL, input_corpus = NULL, userid = NA, n_top_terms = 5, bottom_terms_percentile = 0.6, exact_n = NULL, frac = 0.01, n_top_topics = 3, n_topiclabel_words = 8, frexweight = .5, use_frex_words = FALSE, input_dfm = NULL, btm_dataframe = NULL, lambda = 1, difficulty = NULL) {
+    create_oolong(input_model = input_model,
+                  input_corpus = input_corpus,
+                  n_top_terms = n_top_terms,
+                  bottom_terms_percentile = bottom_terms_percentile,
+                  exact_n = exact_n,
+                  frac = frac,
+                  n_top_topics = n_top_topics,
+                  n_topiclabel_words = n_topiclabel_words,
+                  frexweight = frexweight,
+                  use_frex_words = use_frex_words,
+                  input_dfm = input_dfm,
+                  btm_dataframe = btm_dataframe,
+                  userid = userid,
+                  type = "witi",
+                  lambda = lambda,
+                  difficulty = difficulty)
 }
 
 #' @rdname create_oolong
 #' @export
-ti <- function(input_model = NULL, input_corpus = NULL, userid = NA, exact_n = NULL, frac = 0.01, n_top_topics = 3, n_topiclabel_words = 8, use_frex_words = FALSE, difficulty = 1, input_dfm = NULL, btm_dataframe = NULL) {
-    create_oolong(input_model = input_model, input_corpus = input_corpus, exact_n = exact_n, frac = frac, n_top_topics = n_top_topics, n_topiclabel_words = n_topiclabel_words, use_frex_words = use_frex_words, difficulty = difficulty, input_dfm = input_dfm, btm_dataframe = btm_dataframe, userid = userid, type = "ti")
+ti <- function(input_model = NULL, input_corpus = NULL, userid = NA, exact_n = NULL, frac = 0.01, n_top_topics = 3, n_topiclabel_words = 8, frexweight = .5, use_frex_words = FALSE, input_dfm = NULL, btm_dataframe = NULL, lambda = 1, difficulty = NULL) {
+    create_oolong(input_model = input_model,
+                  input_corpus = input_corpus,
+                  exact_n = exact_n,
+                  frac = frac,
+                  n_top_topics = n_top_topics,
+                  n_topiclabel_words = n_topiclabel_words,
+                  frexweight = frexweight,
+                  use_frex_words = use_frex_words,
+                  input_dfm = input_dfm,
+                  btm_dataframe = btm_dataframe,
+                  userid = userid,
+                  type = "ti",
+                  lambda = lambda,
+                  difficulty = difficulty)
 }
 
 #' @rdname create_oolong
 #' @export
-wsi <- function(input_model = NULL, userid = NA, n_topiclabel_words = 4, n_correct_ws = 3, wsi_n_top_terms = 20, difficulty = 1, use_frex_words = FALSE) {
-    create_oolong(input_model = input_model, difficulty = difficulty, use_frex_words = use_frex_words, n_topiclabel_words = n_topiclabel_words, n_correct_ws = n_correct_ws, wsi_n_top_terms = wsi_n_top_terms, userid = userid, type = "wsi")
+wsi <- function(input_model = NULL, userid = NA, n_topiclabel_words = 4, n_correct_ws = 3, wsi_n_top_terms = 20, frexweight = .5, use_frex_words = FALSE, lambda = 1, difficulty = NULL) {
+    create_oolong(input_model = input_model,
+                  frexweight = frexweight,
+                  use_frex_words = use_frex_words,
+                  n_topiclabel_words = n_topiclabel_words,
+                  n_correct_ws = n_correct_ws,
+                  wsi_n_top_terms = wsi_n_top_terms,
+                  userid = userid,
+                  type = "wsi",
+                  lambda = lambda,
+                  difficulty = difficulty)
 }
 
 #' @rdname create_oolong

diff --git a/R/oolong_warplda.R b/R/oolong_warplda.R
@@ -1,12 +1,12 @@
-.extract_ingredients.input_model_s3_warplda <- function(input_model_s3, n_top_terms = 5, difficulty = 1, need_topic = TRUE, n_topiclabel_words = 8, input_dfm = NULL, ...) {
+.extract_ingredients.input_model_s3_warplda <- function(input_model_s3, n_top_terms = 5, lambda = lambda, need_topic = TRUE, n_topiclabel_words = 8, input_dfm = NULL, ...) {
     input_model <- input_model_s3$model
     K <- input_model$.__enclos_env__$private$n_topics
     V <- length(input_model$.__enclos_env__$private$vocabulary)
-    terms <- t(input_model$get_top_words(n = V, lambda = difficulty))
+    terms <- t(input_model$get_top_words(n = V, lambda = lambda))
     all_terms <- unique(as.vector(terms[,seq_len(n_top_terms)]))
     if (need_topic) {
         .cstop(is.null(input_dfm), "input_dfm must not be NULL when input_model is a WarpLDA object.")
-        model_terms <- t(input_model$get_top_words(n = n_topiclabel_words, lambda = difficulty))
+        model_terms <- t(input_model$get_top_words(n = n_topiclabel_words, lambda = lambda))
         theta <- input_model$transform(input_dfm)
     } else {
         model_terms <- NULL