Skip to content

Commit

Permalink
Merge pull request #75 from chainsawriot/frex
Browse files Browse the repository at this point in the history
fix #74
  • Loading branch information
chainsawriot authored Aug 25, 2023
2 parents 2376967 + c87c6b1 commit 68439dc
Show file tree
Hide file tree
Showing 9 changed files with 222 additions and 38 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,4 @@
^\.github$
^rawdata/
^CRAN-SUBMISSION$
^\.editorconfig$
16 changes: 16 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
root = true


[*]
charset = utf8
end_of_line = lf
insert_final_newline = true
indent_style = space


[*.{r,R,rmd,Rmd,cpp}]
indent_size = 4
trim_trailing_whitespace = true

[Makefile]
indent_style = tab
16 changes: 11 additions & 5 deletions R/oolong.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,21 +51,23 @@ Oolong_test <- R6::R6Class(
#' For more details, please see the overview vignette: \code{vignette("overview", package = "oolong")}
#' @param input_model (wi, ti, witi, wsi) a STM, WarpLDA, topicmodels, KeyATM, seededlda, textmodel_nb, or BTM object; if it is NULL, create_oolong assumes that you want to create gold standard.
#' @param input_corpus (wi, ti, witi, wsi, gs) if input_model is not null, it should be the corpus (character vector or quanteda::corpus object) to generate the model object. If input_model and input_corpus are not NULL, topic intrusion test cases are generated. If input_model is a BTM object, this argument is ignored. If input_model is null, it generates gold standard test cases.
#' @param n_top_terms (wi, witi) integer, number of top topic words to be included in the candidates of word intrusion test.
#' @param n_top_terms (wi, witi) integer, number of top topic words to be included in the candidates of word intrusion test.
#' @param bottom_terms_percentile (wi, witi) double, a term is considered to be an word intruder when its theta less than the percentile of this theta, must be within the range of 0 to 1
#' @param exact_n (ti, witi, gs) integer, number of topic intrusion test cases to generate, ignore if frac is not NULL
#' @param frac (ti, witi, gs) double, fraction of test cases to be generated from the corpus
#' @param n_top_topics (wi, witi) integer, number of most relevant topics to be shown alongside the intruder topic
#' @param n_topiclabel_words (witi, ti, wsi) integer, number of topic words to be shown as the topic ("ti" and "witi") / word set ("wsi") label
#' @param use_frex_words (wi, witi, ti, wsi) logical, for a STM object, use FREX words if TRUE, use PROB words if FALSE
#' @param difficulty (wi, witi, ti, wsi) double, adjust the difficulty of the test. Higher value indicates higher difficulty and must be within the range of 0 to 1, no effect for STM if use_frex_words is FALSE. Ignore for topicmodels objects
#' @param frexweight (wi, witi, ti, wsi) double, adjust the `frexweight` for STM (see [stm::labelTopics()]), no effect for STM if use_frex_words is FALSE
#' @param lambda (wi, witi, ti, wsi) double, adjust the `lambda` for WarpLDA (see [text2vec::LatentDirichletAllocation()])
#' @param type (create_oolong) a character string to denote what you want to create. "wi": word intrusion test; "ti": topic intrusion test; "witi": both word intrusion test and topic intrusion test; "gs": gold standard generation
#' @param input_dfm (wi, witi, ti, wsi) a dfm object used for training the input_model, if input_model is a WarpLDA object
#' @param construct (gs) string, an adjective to describe the construct you want your coders to code the the gold standard test cases
#' @param btm_dataframe (witi, ti) dataframe used for training the input_model, if input_model is a BTM object
#' @param userid a character string to denote the name of the coder. Default to NA (no userid); not recommended
#' @param n_correct_ws (wsi) number of word sets to be shown alongside the intruder word set
#' @param wsi_n_top_terms (wsi) number of top topic words from each topic to be randomized selected as the word set label
#' @param difficulty (wi, witi, ti, wsi) double, deprecated, for backward compatibility
#' @return an oolong test object.
#' @examples
#' ## Creation of oolong test with only word intrusion test
Expand Down Expand Up @@ -93,17 +95,21 @@ Oolong_test <- R6::R6Class(
#' Song et al. (2020) In validations we trust? The impact of imperfect human annotations as a gold standard on the quality of validation of automated content analysis. Political Communication.
#'
#' Ying, L., Montgomery, J. M., & Stewart, B. M. (2021). Topics, Concepts, and Measurement: A Crowdsourced Procedure for Validating Topics as Measures. Political Analysis
#'
#'
#' @export
create_oolong <- function(input_model = NULL, input_corpus = NULL, n_top_terms = 5, bottom_terms_percentile = 0.6, exact_n = NULL, frac = 0.01, n_top_topics = 3, n_topiclabel_words = 8, use_frex_words = FALSE, difficulty = 1, input_dfm = NULL, construct = "positive", btm_dataframe = NULL, n_correct_ws = 3, wsi_n_top_terms = 20, userid = NA, type = "witi") {
create_oolong <- function(input_model = NULL, input_corpus = NULL, n_top_terms = 5, bottom_terms_percentile = 0.6, exact_n = NULL, frac = 0.01, n_top_topics = 3, n_topiclabel_words = 8, use_frex_words = FALSE, frexweight = .5, input_dfm = NULL, construct = "positive", btm_dataframe = NULL, n_correct_ws = 3, wsi_n_top_terms = 20, userid = NA, type = "witi", lambda = 1, difficulty = NULL) {
.cstop(!type %in% c("wi", "witi", "ti", "gs", "wsi"), "Unknown type, available types are 'wi', 'witi', 'ti', 'wsi' and 'gs'")
.cstop(is.null(input_model) & is.null(input_corpus), "input_model and input_corpus cannot be both NULL.")
if (!is.null(input_model)) {
.cstop(!.is_topic_model(input_model), "input_model is not a topic model. If you want to create gold standard with an input_corpus, use: create_oolong(input_corpus = input_corpus) or gs(input_corpus)")
}
.cstop(length(userid) > 1, "userid must not be a vector with length > 1.")
if (!is.null(difficulty)) {
frexweight <- difficulty
lambda <- difficulty
}
if (!is.null(input_model) & type %in% c("wi", "ti", "witi", "wsi")) {
return(Oolong_test_tm$new(input_model = input_model, input_corpus = input_corpus, n_top_terms = n_top_terms, bottom_terms_percentile = bottom_terms_percentile, exact_n = exact_n, frac = frac, n_top_topics = n_top_topics, n_topiclabel_words = n_topiclabel_words, difficulty = difficulty, use_frex_words = use_frex_words, input_dfm = input_dfm, btm_dataframe = btm_dataframe, n_correct_ws = n_correct_ws, wsi_n_top_terms = wsi_n_top_terms, userid = userid, type = type))
return(Oolong_test_tm$new(input_model = input_model, input_corpus = input_corpus, n_top_terms = n_top_terms, bottom_terms_percentile = bottom_terms_percentile, exact_n = exact_n, frac = frac, n_top_topics = n_top_topics, n_topiclabel_words = n_topiclabel_words, frexweight = frexweight, use_frex_words = use_frex_words, input_dfm = input_dfm, btm_dataframe = btm_dataframe, n_correct_ws = n_correct_ws, wsi_n_top_terms = wsi_n_top_terms, userid = userid, type = type, lambda = lambda))
}
if (is.null(input_model) | type == "gs") {
return(Oolong_test_gs$new(input_corpus = input_corpus, exact_n = exact_n, frac = frac, construct = construct, userid = userid))
Expand Down
6 changes: 3 additions & 3 deletions R/oolong_stm.R
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
.extract_ingredients.input_model_s3_stm <- function(input_model_s3, n_top_terms = 5, difficulty = 1, use_frex_words = FALSE, need_topic = FALSE, n_topiclabel_words = 8, ...) {
.extract_ingredients.input_model_s3_stm <- function(input_model_s3, n_top_terms = 5, frexweight = .5, use_frex_words = FALSE, need_topic = FALSE, n_topiclabel_words = 8, ...) {
input_model <- input_model_s3$model
K <- input_model$settings$dim$K
V <- input_model$settings$dim$V
if (use_frex_words) {
terms <- stm::labelTopics(input_model, n = input_model$settings$dim$V, frexweight = difficulty)$frex
terms <- stm::labelTopics(input_model, n = input_model$settings$dim$V, frexweight = frexweight)$frex
} else {
terms <- stm::labelTopics(input_model, n = input_model$settings$dim$V)$prob
}
all_terms <- unique(as.vector(terms[,seq_len(n_top_terms)]))
if (need_topic) {
if (use_frex_words) {
model_terms <- stm::labelTopics(input_model, n = n_topiclabel_words, frexweight = difficulty)$frex
model_terms <- stm::labelTopics(input_model, n = n_topiclabel_words, frexweight = frexweight)$frex
} else {
model_terms <- stm::labelTopics(input_model, n = n_topiclabel_words)$prob
}
Expand Down
27 changes: 20 additions & 7 deletions R/oolong_tm.R
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,8 @@
return(test_content)
}

.generate_test_content <- function(input_model, input_corpus = NULL, n_top_terms = 5, bottom_terms_percentile = 0.6, exact_n = NULL, frac = 0.01, n_top_topics = 3, n_topiclabel_words = 8, difficulty = 1, use_frex_words = FALSE, input_dfm = NULL, btm_dataframe = NULL, type = "witi", n_correct_ws = 3, wsi_n_top_terms = 20) {
ingredients <- .extract_ingredients(.convert_input_model_s3(input_model), n_top_terms = n_top_terms, difficulty = difficulty, need_topic = !is.null(input_corpus), n_topiclabel_words = n_topiclabel_words, input_dfm = input_dfm, use_frex_words = use_frex_words, input_corpus = input_corpus, btm_dataframe = btm_dataframe)
.generate_test_content <- function(input_model, input_corpus = NULL, n_top_terms = 5, bottom_terms_percentile = 0.6, exact_n = NULL, frac = 0.01, n_top_topics = 3, n_topiclabel_words = 8, frexweight = .5, use_frex_words = FALSE, input_dfm = NULL, btm_dataframe = NULL, type = "witi", n_correct_ws = 3, wsi_n_top_terms = 20, lambda = 1) {
ingredients <- .extract_ingredients(.convert_input_model_s3(input_model), n_top_terms = n_top_terms, frexweight = frexweight, need_topic = !is.null(input_corpus), n_topiclabel_words = n_topiclabel_words, input_dfm = input_dfm, use_frex_words = use_frex_words, input_corpus = input_corpus, btm_dataframe = btm_dataframe, lambda = lambda)
.cstop(type %in% c("ti") & is.null(ingredients$theta), "input_corpus can't be NULL for generating oolong test object with only topic intrusion test.")
test_content <- list()
if (type %in% c("wi", "witi")) {
Expand All @@ -133,7 +133,7 @@
} else if (type %in% c("witi", "ti")) {
test_content$ti <- .generate_topic_intrusion_test(input_corpus = input_corpus, ingredients = ingredients, exact_n = exact_n, frac = frac, n_top_topics = n_top_topics, n_topiclabel_words = n_topiclabel_words)
} else {
test_content$ti <- NULL
test_content$ti <- NULL
}
if (type %in% c("wsi")) {
test_content$wsi <- .generate_wsi(ingredients, n_correct_ws = n_correct_ws, n_topiclabel_words = n_topiclabel_words, wsi_n_top_terms = wsi_n_top_terms)
Expand Down Expand Up @@ -209,8 +209,23 @@ Oolong_test_tm <-
"oolong_test_tm",
inherit = Oolong_test,
public = list(
initialize = function(input_model = NULL, input_corpus = NULL, n_top_terms = 5, bottom_terms_percentile = 0.6, exact_n = 15, frac = NULL, n_top_topics = 3, n_topiclabel_words = 8, difficulty = 1, use_frex_words = FALSE, input_dfm = NULL, btm_dataframe = NULL, userid = NA, n_correct_ws = 3, wsi_n_top_terms = 20, type = "witi") {
private$test_content <- .generate_test_content(input_model, input_corpus, n_top_terms, bottom_terms_percentile, exact_n, frac, n_top_topics, n_topiclabel_words, difficulty, use_frex_words = use_frex_words, input_dfm = input_dfm, btm_dataframe = btm_dataframe, type = type, n_correct_ws = n_correct_ws, wsi_n_top_terms = wsi_n_top_terms)
initialize = function(input_model = NULL, input_corpus = NULL, n_top_terms = 5, bottom_terms_percentile = 0.6, exact_n = 15, frac = NULL, n_top_topics = 3, n_topiclabel_words = 8, frexweight = .5, use_frex_words = FALSE, input_dfm = NULL, btm_dataframe = NULL, userid = NA, n_correct_ws = 3, wsi_n_top_terms = 20, lambda = 1, type = "witi") {
private$test_content <- .generate_test_content(input_model = input_model,
input_corpus = input_corpus,
n_top_terms = n_top_terms,
bottom_terms_percentile = bottom_terms_percentile,
exact_n = exact_n,
frac = frac,
n_top_topics = n_top_topics,
n_topiclabel_words = n_topiclabel_words,
frexweight = frexweight,
use_frex_words = use_frex_words,
input_dfm = input_dfm,
btm_dataframe = btm_dataframe,
type = type,
n_correct_ws = n_correct_ws,
wsi_n_top_terms = wsi_n_top_terms,
lambda = lambda)
self$userid <- userid
private$hash <- .safe_hash(private$test_content)
private$hash_input_model <- .safe_hash(input_model)
Expand Down Expand Up @@ -242,5 +257,3 @@ Oolong_test_tm <-
finalized = FALSE
)
)


60 changes: 52 additions & 8 deletions R/oolong_ui.R
Original file line number Diff line number Diff line change
@@ -1,25 +1,69 @@
#' @rdname create_oolong
#' @export
wi <- function(input_model = NULL, userid = NA, n_top_terms = 5, bottom_terms_percentile = 0.6, difficulty = 1, use_frex_words = FALSE) {
create_oolong(input_model = input_model, n_top_terms = n_top_terms, bottom_terms_percentile = bottom_terms_percentile, difficulty = difficulty, use_frex_words = use_frex_words, userid = userid, type = "wi")
wi <- function(input_model = NULL, userid = NA, n_top_terms = 5, bottom_terms_percentile = 0.6, frexweight = .5, use_frex_words = FALSE, lambda = 1, difficulty = NULL) {
create_oolong(input_model = input_model,
n_top_terms = n_top_terms,
bottom_terms_percentile = bottom_terms_percentile,
frexweight = frexweight,
use_frex_words = use_frex_words,
userid = userid, type = "wi",
lambda = lambda,
difficulty = difficulty)
}

#' @rdname create_oolong
#' @export
witi <- function(input_model = NULL, input_corpus = NULL, userid = NA, n_top_terms = 5, bottom_terms_percentile = 0.6, exact_n = NULL, frac = 0.01, n_top_topics = 3, n_topiclabel_words = 8, use_frex_words = FALSE, difficulty = 1, input_dfm = NULL, btm_dataframe = NULL) {
create_oolong(input_model = input_model, input_corpus = input_corpus, n_top_terms = n_top_terms, bottom_terms_percentile = bottom_terms_percentile, exact_n = exact_n, frac = frac, n_top_topics = n_top_topics, n_topiclabel_words = n_topiclabel_words, use_frex_words = use_frex_words, difficulty = difficulty, input_dfm = input_dfm, btm_dataframe = btm_dataframe, userid = userid, type = "witi")
witi <- function(input_model = NULL, input_corpus = NULL, userid = NA, n_top_terms = 5, bottom_terms_percentile = 0.6, exact_n = NULL, frac = 0.01, n_top_topics = 3, n_topiclabel_words = 8, frexweight = .5, use_frex_words = FALSE, input_dfm = NULL, btm_dataframe = NULL, lambda = 1, difficulty = NULL) {
create_oolong(input_model = input_model,
input_corpus = input_corpus,
n_top_terms = n_top_terms,
bottom_terms_percentile = bottom_terms_percentile,
exact_n = exact_n,
frac = frac,
n_top_topics = n_top_topics,
n_topiclabel_words = n_topiclabel_words,
frexweight = frexweight,
use_frex_words = use_frex_words,
input_dfm = input_dfm,
btm_dataframe = btm_dataframe,
userid = userid,
type = "witi",
lambda = lambda,
difficulty = difficulty)
}

#' @rdname create_oolong
#' @export
ti <- function(input_model = NULL, input_corpus = NULL, userid = NA, exact_n = NULL, frac = 0.01, n_top_topics = 3, n_topiclabel_words = 8, use_frex_words = FALSE, difficulty = 1, input_dfm = NULL, btm_dataframe = NULL) {
create_oolong(input_model = input_model, input_corpus = input_corpus, exact_n = exact_n, frac = frac, n_top_topics = n_top_topics, n_topiclabel_words = n_topiclabel_words, use_frex_words = use_frex_words, difficulty = difficulty, input_dfm = input_dfm, btm_dataframe = btm_dataframe, userid = userid, type = "ti")
ti <- function(input_model = NULL, input_corpus = NULL, userid = NA, exact_n = NULL, frac = 0.01, n_top_topics = 3, n_topiclabel_words = 8, frexweight = .5, use_frex_words = FALSE, input_dfm = NULL, btm_dataframe = NULL, lambda = 1, difficulty = NULL) {
create_oolong(input_model = input_model,
input_corpus = input_corpus,
exact_n = exact_n,
frac = frac,
n_top_topics = n_top_topics,
n_topiclabel_words = n_topiclabel_words,
frexweight = frexweight,
use_frex_words = use_frex_words,
input_dfm = input_dfm,
btm_dataframe = btm_dataframe,
userid = userid,
type = "ti",
lambda = lambda,
difficulty = difficulty)
}

#' @rdname create_oolong
#' @export
wsi <- function(input_model = NULL, userid = NA, n_topiclabel_words = 4, n_correct_ws = 3, wsi_n_top_terms = 20, difficulty = 1, use_frex_words = FALSE) {
create_oolong(input_model = input_model, difficulty = difficulty, use_frex_words = use_frex_words, n_topiclabel_words = n_topiclabel_words, n_correct_ws = n_correct_ws, wsi_n_top_terms = wsi_n_top_terms, userid = userid, type = "wsi")
wsi <- function(input_model = NULL, userid = NA, n_topiclabel_words = 4, n_correct_ws = 3, wsi_n_top_terms = 20, frexweight = .5, use_frex_words = FALSE, lambda = 1, difficulty = NULL) {
create_oolong(input_model = input_model,
frexweight = frexweight,
use_frex_words = use_frex_words,
n_topiclabel_words = n_topiclabel_words,
n_correct_ws = n_correct_ws,
wsi_n_top_terms = wsi_n_top_terms,
userid = userid,
type = "wsi",
lambda = lambda,
difficulty = difficulty)
}

#' @rdname create_oolong
Expand Down
6 changes: 3 additions & 3 deletions R/oolong_warplda.R
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
.extract_ingredients.input_model_s3_warplda <- function(input_model_s3, n_top_terms = 5, difficulty = 1, need_topic = TRUE, n_topiclabel_words = 8, input_dfm = NULL, ...) {
.extract_ingredients.input_model_s3_warplda <- function(input_model_s3, n_top_terms = 5, lambda = lambda, need_topic = TRUE, n_topiclabel_words = 8, input_dfm = NULL, ...) {
input_model <- input_model_s3$model
K <- input_model$.__enclos_env__$private$n_topics
V <- length(input_model$.__enclos_env__$private$vocabulary)
terms <- t(input_model$get_top_words(n = V, lambda = difficulty))
terms <- t(input_model$get_top_words(n = V, lambda = lambda))
all_terms <- unique(as.vector(terms[,seq_len(n_top_terms)]))
if (need_topic) {
.cstop(is.null(input_dfm), "input_dfm must not be NULL when input_model is a WarpLDA object.")
model_terms <- t(input_model$get_top_words(n = n_topiclabel_words, lambda = difficulty))
model_terms <- t(input_model$get_top_words(n = n_topiclabel_words, lambda = lambda))
theta <- input_model$transform(input_dfm)
} else {
model_terms <- NULL
Expand Down
Loading

0 comments on commit 68439dc

Please sign in to comment.