diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml
index 5497bfa..90ae24a 100644
--- a/.JuliaFormatter.toml
+++ b/.JuliaFormatter.toml
@@ -1,13 +1,3 @@
-indent = 2
-margin = 68
-format_markdown = true
-verbose = true
-always_for_in = true
-whitespace_ops_in_indices = true
-remove_extra_newlines = true
-pipe_to_function_call = true
-always_use_return = true
-whitespace_in_kwargs = false
+style = "yas"
 format_docstrings = true
-conditional_to_if = true
-trailing_comma = true
+format_markdown = true
diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml
new file mode 100644
index 0000000..cba9134
--- /dev/null
+++ b/.github/workflows/CompatHelper.yml
@@ -0,0 +1,16 @@
+name: CompatHelper
+on:
+  schedule:
+    - cron: 0 0 * * *
+  workflow_dispatch:
+jobs:
+  CompatHelper:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Pkg.add("CompatHelper")
+        run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
+      - name: CompatHelper.main()
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}
+        run: julia -e 'using CompatHelper; CompatHelper.main()'
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..d9ccab1
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,43 @@
+name: CI
+on:
+  push:
+    paths-ignore:
+      - 'README.md'
+    branches:
+      - main
+  pull_request:
+    paths-ignore:
+      - 'README.md'
+    branches:
+      - main
+jobs:
+  test:
+    name: Julia ${{ matrix.version }} - ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        version:
+          - '1'
+          - 'min'
+          - 'nightly'
+        os:
+          - ubuntu-latest
+          - macOS-latest
+          - windows-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: julia-actions/setup-julia@v2
+        with:
+          version: ${{ matrix.version }}
+      - name: Cache
+        uses: julia-actions/cache@v2
+        with:
+          cache-compiled: "true"
+      - uses: julia-actions/julia-buildpkg@v1
+      - uses: julia-actions/julia-runtest@v1
+      - uses: julia-actions/julia-processcoverage@v1
+      - uses: codecov/codecov-action@v4
+        with:
+          files: lcov.info
+          token: ${{ secrets.CODECOV_TOKEN }}
diff --git a/.github/workflows/formatcheck.yml b/.github/workflows/formatcheck.yml
new file mode 100644
index 0000000..84ed363
--- /dev/null
+++ b/.github/workflows/formatcheck.yml
@@ -0,0 +1,48 @@
+---
+name: Format Check
+on:
+  push:
+    branches:
+      - main
+      - 'release-*'
+    tags: ["*"]
+    paths:
+      - "**/*.jl"
+      - ".github/workflows/formatcheck.yml"
+  pull_request:
+    types: [opened, synchronize, reopened, ready_for_review]
+    paths:
+      - "**/*.jl"
+      - ".github/workflows/formatcheck.yml"
+jobs:
+  format-check:
+    name: Julia
+    # These permissions are needed to:
+    # - Delete old caches: https://github.com/julia-actions/cache#usage
+    # - Post formatting suggestions: https://github.com/reviewdog/action-suggester#required-permissions
+    permissions:
+      actions: write
+      contents: read
+      pull-requests: write
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: julia-actions/setup-julia@v2
+        with:
+          version: "1"
+      - uses: julia-actions/cache@v2
+      - name: Install JuliaFormatter
+        shell: julia --project=@format --color=yes {0}
+        run: |
+          using Pkg
+          Pkg.add(PackageSpec(; name="JuliaFormatter", version="1"))
+      - name: Check formatting
+        shell: julia --project=@format --color=yes {0}
+        run: |
+          using JuliaFormatter
+          format("."; verbose=true) || exit(1)
+      # Add formatting suggestions to non-draft PRs even when "Check formatting" fails
+      - uses: reviewdog/action-suggester@db4abb16fbaabe386831e5addb7be1485d0d63d3 # v1.18.0
+        if: ${{ !cancelled() && github.event_name == 'pull_request' && github.event.pull_request.draft == false }}
+        with:
+          tool_name: JuliaFormatter
diff --git a/.gitignore b/.gitignore
index 64760cf..a4cf022 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,3 +16,5 @@
 data/
 /.quarto/
 /.luarc.json
+lcov.info
+*.cov
diff --git a/.zenodo.json b/.zenodo.json
new file mode 100644
index 0000000..69d303b
--- /dev/null
+++ b/.zenodo.json
@@ -0,0 +1,24 @@
+{
+  "access_right": "open",
+  "license": "MIT",
+  "title": "Embrace Uncertainty",
+  "description": "A Julia package accompanying the online book Embrace Uncertainty",
+  "upload_type": "software",
+  "creators": [
+    {
+      "affiliation": "Beacon Biosignals",
+      "name": "Alday, Phillip M.",
+      "orcid": "0000-0002-9984-5745"
+    },
+    {
+      "affiliation": "University of Potsdam",
+      "name": "Kliegl, Reinhold",
+      "orcid": "0000-0002-0180-8488"
+    },
+    {
+      "affiliation": "University of Wisconsin - Madison",
+      "name": "Bates, Douglas",
+      "orcid": "0000-0001-8316-9503"
+    }
+  ]
+}
diff --git a/LDT_accuracy.qmd b/LDT_accuracy.qmd
index b729406..bbcdf26 100644
--- a/LDT_accuracy.qmd
+++ b/LDT_accuracy.qmd
@@ -27,20 +27,17 @@ and define some constants
 ```{julia}
 #| code-fold: true
 #| output: false
-@isdefined(contrasts) || const contrasts = Dict{Symbol, Any}()
+@isdefined(contrasts) || const contrasts = Dict{Symbol,Any}()
 @isdefined(progress) || const progress = false
 ```
 
 ## Create the dataset
 
-
 ```{julia}
 #| output: false
-trials = innerjoin(
-  DataFrame(dataset(:ELP_ldt_trial)),
-  select(DataFrame(dataset(:ELP_ldt_item)), :item, :isword, :wrdlen),
-  on = :item
-)
+trials = innerjoin(DataFrame(dataset(:ELP_ldt_trial)),
+                   select(DataFrame(dataset(:ELP_ldt_item)), :item, :isword, :wrdlen);
+                   on=:item)
 ```
 
 ```{julia}
@@ -54,20 +51,18 @@ This takes about ten to fifteen minutes on a recent laptop
 ```{julia}
 contrasts[:isword] = EffectsCoding()
 contrasts[:wrdlen] = Center(8)
-@time gm1 = let f = @formula(acc ~ 1 + isword * wrdlen + (1|item) + (1|subj))
+@time gm1 = let f = @formula(acc ~ 1 + isword * wrdlen + (1 | item) + (1 | subj))
   fit(MixedModel, f, trials, Bernoulli(); contrasts, progress, init_from_lmm=(:β, :θ))
 end
 ```
-
 
 ```{julia}
 print(gm1)
 ```
-
 
 ```{julia}
 #| fig-cap: Conditional modes and 95% prediction intervals on random effects for subject in model gm1
 #| label: fig-gm1condmodesubj
 #| code-fold: true
-qqcaterpillar!(Figure(; size=(800,800)), gm1, :subj)
-```
\ No newline at end of file
+qqcaterpillar!(Figure(; size=(800, 800)), gm1, :subj)
+```
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..c145bda
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020-2024 Phillip Alday, Reinhold Kliegl and Douglas Bates
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Project.toml b/Project.toml index 5a824dd..1ef9624 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "EmbraceUncertainty" uuid = "81be8970-b14c-4107-8889-06289fef228c" -authors = ["Phillip Alday , Reinhold Kliegl , Douglas Bates "] +authors = ["Phillip Alday ", "Reinhold Kliegl ", "Douglas Bates "] version = "0.1.0" [deps] @@ -44,7 +44,9 @@ ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" [compat] AlgebraOfGraphics = "0.6, 0.7, 0.8" +Aqua = "0.8.7" Arrow = "2" +BenchmarkTools = "1" CSV = "0.10" CairoMakie = "0.11, 0.12" CategoricalArrays = "0.10" @@ -52,17 +54,22 @@ Chain = "0.5,0.6" DataAPI = "1" DataFrameMacros = "0.3,0.4" DataFrames = "1.3" +Dates = "1" Distributions = "0.25" Downloads = "1" Effects = "1" FreqTables = "0.4" GLM = "1" LinearAlgebra = "1" +Markdown = "1" MixedModels = "4,5" +MixedModelsDatasets = "0.1.1" MixedModelsMakie = "0.4" NLopt = "1" +Pkg = "1" PooledArrays = "1" RCall = "0.13,0.14" +RectangularFullPacked = "0.2" Random = "1" SHA = "0.7" Scratch = "1" @@ -71,6 +78,17 @@ StatsAPI = "1" StatsBase = "0.33, 0.34" StatsModels = "0.7" Tables = "1" +TidierPlots = "0.8" +Test = "1.11.0" +TestSetExtensions = "3.0.0" TypedTables = "1" ZipFile = "0.10" -julia = "1.8" +julia = "1.10" + +[extras] +Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +TestSetExtensions = "98d24dd4-01ad-11ea-1b02-c9a08f80db04" + +[targets] +test = ["Aqua", "Test", "TestSetExtensions"] diff --git a/aGHQ.qmd b/aGHQ.qmd index a8a82ea..342d96b 100644 --- a/aGHQ.qmd +++ b/aGHQ.qmd @@ -204,8 +204,8 @@ A Julia function to evaluate both the mean and the unit deviance can be written ```{julia} #| output: false function meanunitdev(y::T, η::T) where {T<:AbstractFloat} - expmη = exp(-η) - return (; μ=inv(1 + expmη), dev=2 * ((1 - y) * η + log1p(expmη))) + expmη = exp(-η) + return (; μ=inv(1 + expmη), dev=2 * ((1 - y) * η + log1p(expmη))) end ``` @@ -217,10 +217,10 @@ Mathematically `log1p`, read *log of 1 plus*, is defined as $\mathrm{log1p}(x)=\ ```{julia} let small = eps() / 10 - @show small - @show 1 + small - @show log(1 + small) - @show log1p(small) + @show small + @show 1 + small + @show log(1 + small) + @show log1p(small) end; ``` @@ -245,19 +245,16 @@ We illustrate some of these computations using only the fixed-effects specificat #| output: false #| label: com05 contra = let tbl = dataset(:contra) - Table(tbl; ch=tbl.livch .≠ "0") + Table(tbl; ch=tbl.livch .≠ "0") end contrasts[:urban] = HelmertCoding() contrasts[:ch] = HelmertCoding() -com05 = - let d = contra, +com05 = let d = contra, ds = Bernoulli(), - f = @formula( - use ~ 1 + urban + ch * age + age & age + (1 | dist & urban) - ) + f = @formula(use ~ 1 + urban + ch * age + age & age + (1 | dist & urban)) fit(MixedModel, f, d, ds; contrasts, nAGQ=9, progress) - end +end ``` Extract the fixed-effects model matrix, $\bbX$, and initialize the coefficient vector, $\bbbeta$, to a copy (in case we modify it) of the estimated fixed-effects. 
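A quick numerical aside on the `meanunitdev` function reformatted a few hunks above: at `η = 0` the two responses are equally likely, so `μ = 1/2` and the unit deviance is `2log(2)` for either value of `y`. A sketch of that check, assuming the definition above is in scope:

```julia
meanunitdev(1.0, 0.0)  # (μ = 0.5, dev = 1.3862943611198906), i.e. 2log(2)
meanunitdev(0.0, 0.0)  # same deviance, by symmetry at η = 0
meanunitdev(1.0, 5.0)  # μ ≈ 0.993, dev ≈ 0.013: a well-predicted response
```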
@@ -291,10 +288,10 @@ When minimizing the deviance it is convenient to have the different components o ```{julia} struct BernoulliGLM{T<:AbstractFloat} - X::Matrix{T} - β::Vector{T} - ytbl::NamedTuple{(:y, :η),NTuple{2,Vector{T}}} - rtbl::Vector{NamedTuple{(:μ, :dev),NTuple{2,T}}} + X::Matrix{T} + β::Vector{T} + ytbl::NamedTuple{(:y, :η),NTuple{2,Vector{T}}} + rtbl::Vector{NamedTuple{(:μ, :dev),NTuple{2,T}}} end ``` @@ -303,22 +300,20 @@ In this case the external constructor creates a `BernoulliGLM` from the model ma ```{julia} #| output: false -function BernoulliGLM( - X::Matrix{T}, - y::Vector{T}, -) where {T<:AbstractFloat} - - # check consistency of arguments - n = size(X, 1) # number of rows in X - if length(y) ≠ n || any(!in([0, 1]), y) - throw(ArgumentError("y is not an $n-vector of 0's and 1's")) - end - - # initial β from linear regression of y in {-1,1} coding - β = X \ replace(y, 0 => -1) - η = X * β - - return BernoulliGLM(X, β, (; y, η), meanunitdev.(y, η)) +function BernoulliGLM(X::Matrix{T}, + y::Vector{T}) where {T<:AbstractFloat} + + # check consistency of arguments + n = size(X, 1) # number of rows in X + if length(y) ≠ n || any(!in([0, 1]), y) + throw(ArgumentError("y is not an $n-vector of 0's and 1's")) + end + + # initial β from linear regression of y in {-1,1} coding + β = X \ replace(y, 0 => -1) + η = X * β + + return BernoulliGLM(X, β, (; y, η), meanunitdev.(y, η)) end ``` @@ -341,10 +336,10 @@ We also define a mutating function, `setβ!`, that installs a new value of `β` ```{julia} #| output: false function setβ!(m::BernoulliGLM, newβ) - (; y, η) = m.ytbl # destructure ytbl - mul!(η, m.X, copyto!(m.β, newβ)) # η = X * newβ in place - m.rtbl .= meanunitdev.(y, η) # update rtbl in place - return m + (; y, η) = m.ytbl # destructure ytbl + mul!(η, m.X, copyto!(m.β, newβ)) # η = X * newβ in place + m.rtbl .= meanunitdev.(y, η) # update rtbl in place + return m end ``` @@ -377,17 +372,15 @@ Following the instructions given at that package's repository, we create an `Opt ```{julia} function StatsAPI.fit!(m::BernoulliGLM{T}) where {T} - opt = Opt(:LN_BOBYQA, length(m.β)) - function objective(x::Vector{T}, g::Vector{T}) where {T} - isempty(g) || throw( - ArgumentError("Gradient not available, g must be empty"), - ) - return deviance(setβ!(m, x)) - end - opt.min_objective = objective - minf, minx, ret = optimize(opt, copy(m.β)) - @info (; code=ret, nevals=opt.numevals, minf) - return m + opt = Opt(:LN_BOBYQA, length(m.β)) + function objective(x::Vector{T}, g::Vector{T}) where {T} + isempty(g) || throw(ArgumentError("Gradient not available, g must be empty")) + return deviance(setβ!(m, x)) + end + opt.min_objective = objective + minf, minx, ret = optimize(opt, copy(m.β)) + @info (; code=ret, nevals=opt.numevals, minf) + return m end ``` @@ -402,8 +395,7 @@ Each evaluation of the deviance is fast, requiring only a fraction of a millisec ```{julia} βopt = copy(com05fe.β) -@benchmark deviance(setβ!(m, β)) seconds = 1 setup = - (m = com05fe; β = βopt) +@benchmark deviance(setβ!(m, β)) seconds = 1 setup = (m = com05fe; β = βopt) ``` but the already large number of evaluations for these six coefficients would not scale well as this dimension increases. 
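For readers following along, a minimal end-to-end sketch of this machinery on simulated data, assuming the `BernoulliGLM` constructor and the `deviance`, `setβ!`, and `fit!` methods above are in scope; the seed, sample size, and true coefficients here are arbitrary choices for illustration:

```julia
using Random

rng = Xoshiro(42)
n = 1_000
X = hcat(ones(n), randn(rng, n))  # intercept plus one covariate
η = X * [-0.5, 1.0]               # true linear predictor
μ = @. inv(1 + exp(-η))           # true probabilities
y = Float64.(rand(rng, n) .< μ)   # simulated 0/1 responses

m = fit!(BernoulliGLM(X, y))      # BOBYQA minimization of the deviance
m.β                               # should be reasonably close to [-0.5, 1.0]
```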
@@ -505,15 +497,13 @@ We also add fields `Xqr`, in which the weighted model matrix, $\bbW^{1/2}\bbX$, ```{julia} #| output: false struct BernoulliIRLS{T<:AbstractFloat} - X::Matrix{T} - Xqr::Matrix{T} # copy of X used in the QR decomp - β::Vector{T} - βcp::Vector{T} # copy of previous β - Whalf::Diagonal{T,Vector{T}} # rtwwt as a Diagonal matrix - ytbl::NamedTuple{(:y, :η),NTuple{2,Vector{T}}} - rtbl::Vector{ - NamedTuple{(:μ, :dev, :rtwwt, :wwres, :wwresp),NTuple{5,T}}, - } + X::Matrix{T} + Xqr::Matrix{T} # copy of X used in the QR decomp + β::Vector{T} + βcp::Vector{T} # copy of previous β + Whalf::Diagonal{T,Vector{T}} # rtwwt as a Diagonal matrix + ytbl::NamedTuple{(:y, :η),NTuple{2,Vector{T}}} + rtbl::Vector{NamedTuple{(:μ, :dev, :rtwwt, :wwres, :wwresp),NTuple{5,T}}} end ``` @@ -521,22 +511,20 @@ with constructor ```{julia} #| output: false -function BernoulliIRLS( - X::Matrix{T}, - y::Vector{T}, -) where {T<:AbstractFloat} - n = size(X, 1) # number of rows of X - if length(y) ≠ n || !all(v -> (iszero(v) || isone(v)), y) - throw(ArgumentError("y is not an $n-vector of 0's and 1's")) - end - # initial β from linear least squares fit of y in {-1,1} coding - Xqr = copy(X) - β = qr!(Xqr) \ replace(y, 0 => -1) - βcp = copy(β) - η = X * β - rtbl = tblrow.(y, η) - Whalf = Diagonal([r.rtwwt for r in rtbl]) - return BernoulliIRLS(X, Xqr, β, βcp, Whalf, (; y, η), rtbl) +function BernoulliIRLS(X::Matrix{T}, + y::Vector{T}) where {T<:AbstractFloat} + n = size(X, 1) # number of rows of X + if length(y) ≠ n || !all(v -> (iszero(v) || isone(v)), y) + throw(ArgumentError("y is not an $n-vector of 0's and 1's")) + end + # initial β from linear least squares fit of y in {-1,1} coding + Xqr = copy(X) + β = qr!(Xqr) \ replace(y, 0 => -1) + βcp = copy(β) + η = X * β + rtbl = tblrow.(y, η) + Whalf = Diagonal([r.rtwwt for r in rtbl]) + return BernoulliIRLS(X, Xqr, β, βcp, Whalf, (; y, η), rtbl) end ``` @@ -545,20 +533,18 @@ The `offset` argument, which defaults to zero, is not used in calls for `Bernoul ```{julia} #| output: false -function tblrow( - y::T, - η::T, - offset::T=zero(T), -) where {T<:AbstractFloat} - rtexpmη = exp(-η / 2) # square root of exp(-η) - expmη = abs2(rtexpmη) # exp(-η) - denom = 1 + expmη - μ = inv(denom) - dev = 2 * ((1 - y) * η + log1p(expmη)) - rtwwt = rtexpmη / denom # sqrt of working wt - wwres = (y - μ) / rtwwt # weighted working resid - wwresp = wwres + rtwwt * (η - offset) - return (; μ, dev, rtwwt, wwres, wwresp) +function tblrow(y::T, + η::T, + offset::T=zero(T)) where {T<:AbstractFloat} + rtexpmη = exp(-η / 2) # square root of exp(-η) + expmη = abs2(rtexpmη) # exp(-η) + denom = 1 + expmη + μ = inv(denom) + dev = 2 * ((1 - y) * η + log1p(expmη)) + rtwwt = rtexpmη / denom # sqrt of working wt + wwres = (y - μ) / rtwwt # weighted working resid + wwresp = wwres + rtwwt * (η - offset) + return (; μ, dev, rtwwt, wwres, wwresp) end ``` @@ -572,15 +558,15 @@ Next we define a mutating function, `updateβ!`, that evaluates $\bbbeta^{(k)}$, ```{julia} #| output: false function updateβ!(m::BernoulliIRLS) - (; X, Xqr, β, βcp, Whalf, ytbl, rtbl) = m # destructure m & ytbl - (; y, η) = ytbl - copyto!(βcp, β) # keep a copy of β - copyto!(Whalf.diag, r.rtwwt for r in rtbl) # rtwwt -> Whalf - mul!(Xqr, Whalf, X) # weighted model matrix - copyto!(η, r.wwresp for r in rtbl) # use η as temp storage - ldiv!(β, qr!(Xqr), η) # weighted least squares - rtbl .= tblrow.(y, mul!(η, X, β)) # update η and rtbl - return m + (; X, Xqr, β, βcp, Whalf, ytbl, rtbl) = m # destructure m & ytbl + (; y, η) = 
ytbl + copyto!(βcp, β) # keep a copy of β + copyto!(Whalf.diag, r.rtwwt for r in rtbl) # rtwwt -> Whalf + mul!(Xqr, Whalf, X) # weighted model matrix + copyto!(η, r.wwresp for r in rtbl) # use η as temp storage + ldiv!(β, qr!(Xqr), η) # weighted least squares + rtbl .= tblrow.(y, mul!(η, X, β)) # update η and rtbl + return m end ``` @@ -604,26 +590,26 @@ We create a `fit!` method to iterate to convergence. ```{julia} #| output: false function StatsAPI.fit!(m::BernoulliIRLS, β₀=m.β; verbose::Bool=true) - (; X, β, βcp, ytbl, rtbl) = m - (; y, η) = ytbl - rtbl .= tblrow.(y, mul!(η, X, copyto!(β, β₀))) - olddev = deviance(m) - verbose && @info 0, olddev # record the deviance at initial β - for i in 1:100 # perform at most 100 iterations - newdev = deviance(updateβ!(m)) - verbose && @info i, newdev # iteration number and deviance - if newdev > olddev - @warn "failure to decrease deviance" - copyto!(β, βcp) # roll back changes to β, η, and rtbl - rtbl = tblrow.(y, mul!(η, X, β)) - break - elseif (olddev - newdev) < (1.0e-10 * olddev) - break # exit loop if deviance is stable - else - olddev = newdev + (; X, β, βcp, ytbl, rtbl) = m + (; y, η) = ytbl + rtbl .= tblrow.(y, mul!(η, X, copyto!(β, β₀))) + olddev = deviance(m) + verbose && @info 0, olddev # record the deviance at initial β + for i in 1:100 # perform at most 100 iterations + newdev = deviance(updateβ!(m)) + verbose && @info i, newdev # iteration number and deviance + if newdev > olddev + @warn "failure to decrease deviance" + copyto!(β, βcp) # roll back changes to β, η, and rtbl + rtbl = tblrow.(y, mul!(η, X, β)) + break + elseif (olddev - newdev) < (1.0e-10 * olddev) + break # exit loop if deviance is stable + else + olddev = newdev + end end - end - return m + return m end ``` @@ -696,19 +682,13 @@ We define a struct ```{julia} #| output: false struct BernoulliPIRLS{T<:AbstractFloat,S<:Integer} - X::Matrix{T} - θβ::Vector{T} - ytbl::NamedTuple{ # column-table - (:refs, :y, :η, :offset), - Tuple{Vector{S},Vector{T},Vector{T},Vector{T}}, - } - utbl::NamedTuple{ # column-table - (:u, :u0, :Ldiag, :pdev, :pdev0, :aGHQ), - NTuple{6,Vector{T}}, - } - rtbl::Vector{ # row-table - NamedTuple{(:μ, :dev, :rtwwt, :wwres, :wwresp),NTuple{5,T}}, - } + X::Matrix{T} + θβ::Vector{T} + ytbl::NamedTuple{(:refs, :y, :η, :offset), + Tuple{Vector{S},Vector{T},Vector{T},Vector{T}}} + utbl::NamedTuple{(:u, :u0, :Ldiag, :pdev, :pdev0, :aGHQ), + NTuple{6,Vector{T}}} + rtbl::Vector{NamedTuple{(:μ, :dev, :rtwwt, :wwres, :wwresp),NTuple{5,T}}} end ``` @@ -716,33 +696,29 @@ with an external constructor ```{julia} #| output: false -function BernoulliPIRLS( - X::Matrix{T}, - y::Vector{T}, - refs::Vector{S}, -) where {T<:AbstractFloat,S<:Integer} - # use IRLS to check X and y, obtain initial β, and establish rtbl - irls = fit!(BernoulliIRLS(X, y); verbose=false) - β = irls.β - θβ = append!(ones(T, 1), β) # initial θ = 1 - η = X * β - - # refs should contain all values from 1 to maximum(refs) - refvals = sort!(unique(refs)) - q = length(refvals) - if refvals ≠ 1:q - throw(ArgumentError("sort!(unique(refs)) must be 1:$q")) - end - length(refs) == length(y) || - throw(ArgumentError("lengths of y and refs aren't equal")) - - ytbl = (; refs, y, η, offset=copy(η)) - - utbl = NamedTuple( - nm => zeros(T, q) for - nm in (:u, :u0, :Ldiag, :pdev, :pdev0, :aGHQ) - ) - return updatetbl!(BernoulliPIRLS(X, θβ, ytbl, utbl, irls.rtbl)) +function BernoulliPIRLS(X::Matrix{T}, + y::Vector{T}, + refs::Vector{S}) where {T<:AbstractFloat,S<:Integer} + # use IRLS to check X and y, obtain 
initial β, and establish rtbl + irls = fit!(BernoulliIRLS(X, y); verbose=false) + β = irls.β + θβ = append!(ones(T, 1), β) # initial θ = 1 + η = X * β + + # refs should contain all values from 1 to maximum(refs) + refvals = sort!(unique(refs)) + q = length(refvals) + if refvals ≠ 1:q + throw(ArgumentError("sort!(unique(refs)) must be 1:$q")) + end + length(refs) == length(y) || + throw(ArgumentError("lengths of y and refs aren't equal")) + + ytbl = (; refs, y, η, offset=copy(η)) + + utbl = NamedTuple(nm => zeros(T, q) for + nm in (:u, :u0, :Ldiag, :pdev, :pdev0, :aGHQ)) + return updatetbl!(BernoulliPIRLS(X, θβ, ytbl, utbl, irls.rtbl)) end ``` @@ -763,16 +739,16 @@ For this model $\bbtheta$ is one-dimensional and $\bbLambda_{\bbtheta}$ is a sca ```{julia} #| output: false function updatetbl!(m::BernoulliPIRLS) - (; refs, y, η, offset) = m.ytbl - u = m.utbl.u - θ = first(m.θβ) - # evaluate η = offset + ZΛu where Λ is θ * I and Z is one-hot - fill!(η, 0) - @inbounds for i in eachindex(η, refs, offset) - η[i] += offset[i] + u[refs[i]] * θ - end - m.rtbl .= tblrow.(y, η, offset) - return m + (; refs, y, η, offset) = m.ytbl + u = m.utbl.u + θ = first(m.θβ) + # evaluate η = offset + ZΛu where Λ is θ * I and Z is one-hot + fill!(η, 0) + @inbounds for i in eachindex(η, refs, offset) + η[i] += offset[i] + u[refs[i]] * θ + end + m.rtbl .= tblrow.(y, η, offset) + return m end ``` @@ -781,7 +757,7 @@ The `pdeviance` method returns the deviance for the GLM model plus the penalty o ```{julia} #| output: false function pdeviance(m::BernoulliPIRLS) - return sum(r.dev for r in m.rtbl) + sum(abs2, m.utbl.u) + return sum(r.dev for r in m.rtbl) + sum(abs2, m.utbl.u) end ``` @@ -792,23 +768,23 @@ At convergence of the PIRLS algorithm the elements of `Ldiag` are replaced by th ```{julia} #| output: false function updateu!(m::BernoulliPIRLS) - (; u, u0, Ldiag) = m.utbl - copyto!(u0, u) # keep a copy of u - θ = first(m.θβ) # extract the scalar θ - fill!(u, 0) - if iszero(θ) # skip the update if θ == 0 - fill!(Ldiag, 1) # L is the identity if θ == 0 - return updatetbl!(m) - end - fill!(Ldiag, 0) - @inbounds for (ri, ti) in zip(m.ytbl.refs, m.rtbl) - rtWΛ = θ * ti.rtwwt # non-zero in i'th row of √WZΛ - Ldiag[ri] += abs2(rtWΛ) # accumulate Λ'Z'WZΛ - u[ri] += rtWΛ * ti.wwresp # accumulate Λ'Z'Wỹ - end - Ldiag .+= 1 # form diagonal of Λ'Z'WZΛ + I = LL' - u ./= Ldiag # solve for u with diagonal LL' - return updatetbl!(m) # and update η and rtbl + (; u, u0, Ldiag) = m.utbl + copyto!(u0, u) # keep a copy of u + θ = first(m.θβ) # extract the scalar θ + fill!(u, 0) + if iszero(θ) # skip the update if θ == 0 + fill!(Ldiag, 1) # L is the identity if θ == 0 + return updatetbl!(m) + end + fill!(Ldiag, 0) + @inbounds for (ri, ti) in zip(m.ytbl.refs, m.rtbl) + rtWΛ = θ * ti.rtwwt # non-zero in i'th row of √WZΛ + Ldiag[ri] += abs2(rtWΛ) # accumulate Λ'Z'WZΛ + u[ri] += rtWΛ * ti.wwresp # accumulate Λ'Z'Wỹ + end + Ldiag .+= 1 # form diagonal of Λ'Z'WZΛ + I = LL' + u ./= Ldiag # solve for u with diagonal LL' + return updatetbl!(m) # and update η and rtbl end ``` @@ -830,29 +806,29 @@ Create a `pirls!` method for this struct. 
```{julia} #| output: false function pirls!(m::BernoulliPIRLS; verbose::Bool=false) - (; u, u0, Ldiag) = m.utbl - fill!(u, 0) # start from u == 0 - copyto!(u0, u) # keep a copy of u - oldpdev = pdeviance(updatetbl!(m)) - verbose && @info 0, oldpdev - for i in 1:10 # maximum of 10 PIRLS iterations - newpdev = pdeviance(updateu!(m)) - verbose && @info i, newpdev - if newpdev > oldpdev # PIRLS iteration failed - @warn "PIRLS iteration did not reduce penalized deviance" - copyto!(u, u0) # restore previous u - updatetbl!(m) # restore η and rtbl - break - elseif (oldpdev - newpdev) < (1.0e-8 * oldpdev) - copyto!(u0, u) # keep a copy of u - break - else - copyto!(u0, u) # keep a copy of u - oldpdev = newpdev + (; u, u0, Ldiag) = m.utbl + fill!(u, 0) # start from u == 0 + copyto!(u0, u) # keep a copy of u + oldpdev = pdeviance(updatetbl!(m)) + verbose && @info 0, oldpdev + for i in 1:10 # maximum of 10 PIRLS iterations + newpdev = pdeviance(updateu!(m)) + verbose && @info i, newpdev + if newpdev > oldpdev # PIRLS iteration failed + @warn "PIRLS iteration did not reduce penalized deviance" + copyto!(u, u0) # restore previous u + updatetbl!(m) # restore η and rtbl + break + elseif (oldpdev - newpdev) < (1.0e-8 * oldpdev) + copyto!(u0, u) # keep a copy of u + break + else + copyto!(u0, u) # keep a copy of u + oldpdev = newpdev + end end - end - map!(sqrt, Ldiag, Ldiag) # replace diag(LL') by diag(L) - return m + map!(sqrt, Ldiag, Ldiag) # replace diag(LL') by diag(L) + return m end ``` @@ -915,7 +891,7 @@ $$ ```{julia} #| output: false function laplaceapprox(m::BernoulliPIRLS) - return pdeviance(m) + 2 * sum(log, m.utbl.Ldiag) + return pdeviance(m) + 2 * sum(log, m.utbl.Ldiag) end ``` @@ -928,25 +904,25 @@ The remaining step is to optimize Laplace's approximation to the GLMM deviance w ```{julia} #| output: false function StatsAPI.fit!(m::BernoulliPIRLS) - θβ = m.θβ - pp1 = length(θβ) # length(β) = p and length(θ) = 1 - opt = Opt(:LN_BOBYQA, pp1) - mβ = view(θβ, 2:pp1) - function obj(x, g) - if !isempty(g) - throw(ArgumentError("gradient not provided, g must be empty")) + θβ = m.θβ + pp1 = length(θβ) # length(β) = p and length(θ) = 1 + opt = Opt(:LN_BOBYQA, pp1) + mβ = view(θβ, 2:pp1) + function obj(x, g) + if !isempty(g) + throw(ArgumentError("gradient not provided, g must be empty")) + end + copyto!(θβ, x) + mul!(m.ytbl.offset, m.X, mβ) + return laplaceapprox(pirls!(m)) end - copyto!(θβ, x) - mul!(m.ytbl.offset, m.X, mβ) - return laplaceapprox(pirls!(m)) - end - opt.min_objective = obj - lb = fill!(similar(θβ), -Inf) # vector of lower bounds - lb[1] = 0 # scalar θ must be non-negative - NLopt.lower_bounds!(opt, lb) - minf, minx, ret = optimize(opt, copy(θβ)) - @info (; ret, fevals=opt.numevals, minf) - return m + opt.min_objective = obj + lb = fill!(similar(θβ), -Inf) # vector of lower bounds + lb[1] = 0 # scalar θ must be non-negative + NLopt.lower_bounds!(opt, lb) + minf, minx, ret = optimize(opt, copy(θβ)) + @info (; ret, fevals=opt.numevals, minf) + return m end ``` @@ -956,26 +932,22 @@ fit!(m); ```{julia} #| code-fold: true -print( - "Converged to θ = ", - first(m.θβ), - " and β =", - view(m.θβ, 2:lastindex(m.θβ)), -) +print("Converged to θ = ", + first(m.θβ), + " and β =", + view(m.θβ, 2:lastindex(m.θβ))) ``` These estimates differ somewhat from those for model `com05`. 
```{julia} #| code-fold: true -print( - "Estimates for com05: θ = ", - only(com05.θ), - ", fmin = ", - deviance(com05), - ", and β =", - com05.β, -) +print("Estimates for com05: θ = ", + only(com05.θ), + ", fmin = ", + deviance(com05), + ", and β =", + com05.β) ``` The discrepancy in the results is because the `com05` results are based on a more accurate approximation to the integral called *adaptive Gauss-Hermite Quadrature*, which is discussed in @sec-aGHQ. @@ -1107,11 +1079,10 @@ A function of `k` to evaluate the abscissae and weights is ```{julia} #| output: false function gausshermitenorm(k) - ev = eigen(SymTridiagonal(zeros(k), sqrt.(1:(k - 1)))) - return Table((; - abscissae=ev.values, - weights=abs2.(ev.vectors[1, :]), - )) + ev = eigen(SymTridiagonal(zeros(k), sqrt.(1:(k - 1)))) + return Table((; + abscissae=ev.values, + weights=abs2.(ev.vectors[1, :]),)) end ``` @@ -1128,11 +1099,9 @@ The weights and positions for the 9th order rule are shown in @fig-ghnine. #| fig-cap: Weights and positions for the 9th order normalized Gauss-Hermite quadrature rule #| label: fig-ghnine #| warning: false -draw( - data(gausshermitenorm(9)) * - mapping(:abscissae => "Positions", :weights); - figure=(; size=(600, 450)), -) +draw(data(gausshermitenorm(9)) * + mapping(:abscissae => "Positions", :weights); + figure=(; size=(600, 450)),) ``` Notice that the magnitudes of the weights drop quite dramatically away from zero, even on a logarithmic scale (@fig-ghninelog) @@ -1142,20 +1111,16 @@ Notice that the magnitudes of the weights drop quite dramatically away from zero #| fig-cap: Weights (logarithm base 2) and positions for the 9th order normalized Gauss-Hermite quadrature rule #| label: fig-ghninelog #| warning: false -draw( - data(gausshermitenorm(9)) * mapping( - :abscissae => "Positions", - :weights => log2 => "log₂(weight)", - ); - figure=(; size=(600, 450)), -) +draw(data(gausshermitenorm(9)) * mapping(:abscissae => "Positions", + :weights => log2 => "log₂(weight)"); + figure=(; size=(600, 450)),) ``` The definition of `MixedModels.GHnorm` is similar to the `gausshermitenorm` function with some extra provisions for ensuring symmetry of the abscissae and of the weights and for caching values once they have been calculated. ```{julia} let tbl = GHnorm(9) - Table(abscissae=tbl.z, weights=tbl.w) + Table(; abscissae=tbl.z, weights=tbl.w) end ``` @@ -1171,7 +1136,7 @@ For example, $\mathbb{E}[\mathcal{X}^2]$ where $\mathcal{X}\sim\mathcal{N}(2, 3^ ```{julia} let μ = 2, σ = 3, ghn3 = GHnorm(3) - sum(@. ghn3.w * abs2(μ + σ * ghn3.z)) # should be μ² + σ² = 13 + sum(@. 
ghn3.w * abs2(μ + σ * ghn3.z)) # should be μ² + σ² = 13 end ``` @@ -1185,12 +1150,12 @@ We have provided a `pdev` vector in the `utbl` field of a `BernoulliPIRLS` objec ```{julia} #| output: false function pdevcomps!(m::BernoulliPIRLS) - (; u, pdev) = m.utbl - pdev .= abs2.(u) # initialize pdevj to square of uj - @inbounds for (ri, ti) in zip(m.ytbl.refs, m.rtbl) - pdev[ri] += ti.dev - end - return m + (; u, pdev) = m.utbl + pdev .= abs2.(u) # initialize pdevj to square of uj + @inbounds for (ri, ti) in zip(m.ytbl.refs, m.rtbl) + pdev[ri] += ti.dev + end + return m end ``` @@ -1208,29 +1173,27 @@ Consider the change in the penalized deviance from that at the conditional mode ```{julia} #| output: false #| code-fold: show -function pdevdiff( - m::BernoulliPIRLS{T}; - zvals=collect(-3.5:inv(32):3.5), - inds=1:5, -) where {T} - (; u, u0, Ldiag, pdev, pdev0) = m.utbl - pdevcomps!(pirls!(m)) # assign u0 - copyto!(pdev0, pdev) # and pdev0 - ni = length(inds) - nz = length(zvals) - uvals = Array{T}(undef, ni, nz) - exact = Array{T}(undef, ni, nz) - for (j, z) in enumerate(zvals) - u .= u0 .+ z ./ Ldiag # evaluate u from z - pdevcomps!(updatetbl!(m)) # evaluate pdev - for (i, ind) in enumerate(inds) # store selected u and pdev - uvals[i, j] = u[ind] - exact[i, j] = pdev[ind] - pdev0[ind] +function pdevdiff(m::BernoulliPIRLS{T}; + zvals=collect(-3.5:inv(32):3.5), + inds=1:5,) where {T} + (; u, u0, Ldiag, pdev, pdev0) = m.utbl + pdevcomps!(pirls!(m)) # assign u0 + copyto!(pdev0, pdev) # and pdev0 + ni = length(inds) + nz = length(zvals) + uvals = Array{T}(undef, ni, nz) + exact = Array{T}(undef, ni, nz) + for (j, z) in enumerate(zvals) + u .= u0 .+ z ./ Ldiag # evaluate u from z + pdevcomps!(updatetbl!(m)) # evaluate pdev + for (i, ind) in enumerate(inds) # store selected u and pdev + uvals[i, j] = u[ind] + exact[i, j] = pdev[ind] - pdev0[ind] + end end - end - uvals = collect(uvals') # transpose uvals - exact = collect(exact') # and exact - return (; zvals, inds, uvals, exact) + uvals = collect(uvals') # transpose uvals + exact = collect(exact') # and exact + return (; zvals, inds, uvals, exact) end m05pdevdiff = pdevdiff(m); ``` @@ -1243,19 +1206,15 @@ We begin with plots of the difference in the penalized deviance from its value a #| label: fig-uscalepdev #| warning: false let (; zvals, inds, uvals, exact) = m05pdevdiff, - fig = Figure(; size=(600, 375)), - ax = Axis( - fig[1, 1]; - xlabel="u", - ylabel="Change in penalized deviance", - ) - - lins = [ - lines!(ax, view(uvals, :, j), view(exact, :, j)) for - j in axes(uvals, 2) - ] - Legend(fig[1, 2], lins, string.(inds)) - fig + fig = Figure(; size=(600, 375)), + ax = Axis(fig[1, 1]; + xlabel="u", + ylabel="Change in penalized deviance",) + + lins = [lines!(ax, view(uvals, :, j), view(exact, :, j)) for + j in axes(uvals, 2)] + Legend(fig[1, 2], lins, string.(inds)) + fig end ``` @@ -1267,17 +1226,14 @@ then shift and scale the horizontal axis to the $z$ scale for this difference in #| label: fig-zscalepdev #| warning: false let (; zvals, inds, uvals, exact) = m05pdevdiff, - fig = Figure(; size=(600, 375)), - ax = Axis( - fig[1, 1]; - xlabel="z", - ylabel="Change in penalized deviance", - ) - - lins = - [lines!(ax, zvals, view(exact, :, j)) for j in axes(uvals, 2)] - Legend(fig[1, 2], lins, string.(inds)) - fig + fig = Figure(; size=(600, 375)), + ax = Axis(fig[1, 1]; + xlabel="z", + ylabel="Change in penalized deviance",) + + lins = [lines!(ax, zvals, view(exact, :, j)) for j in axes(uvals, 2)] + Legend(fig[1, 2], lins, string.(inds)) + fig end 
``` @@ -1289,19 +1245,15 @@ The next stage is to plot, on the $z$ scale, the difference between the penalize #| label: fig-zpdevdiff #| warning: false let (; zvals, inds, uvals, exact) = m05pdevdiff, - fig = Figure(; size=(600, 375)), - ax = Axis( - fig[1, 1]; - xlabel="z", - ylabel="Penalized deviance minus quadratic approx", - ) - - lins = [ - lines!(ax, zvals, view(exact, :, j) .- abs2.(zvals)) for - j in axes(uvals, 2) - ] - Legend(fig[1, 2], lins, string.(inds)) - fig + fig = Figure(; size=(600, 375)), + ax = Axis(fig[1, 1]; + xlabel="z", + ylabel="Penalized deviance minus quadratic approx",) + + lins = [lines!(ax, zvals, view(exact, :, j) .- abs2.(zvals)) for + j in axes(uvals, 2)] + Legend(fig[1, 2], lins, string.(inds)) + fig end ``` @@ -1313,22 +1265,17 @@ and, finally, the exponential of negative half of this difference in @fig-zexpne #| label: fig-zexpneghalfdiff #| warning: false let (; zvals, inds, uvals, exact) = m05pdevdiff, - fig = Figure(; size=(600, 375)), - ax = Axis( - fig[1, 1]; - xlabel="z", - ylabel="Exp of half the quadratic minus penalized deviance", - ) - - lins = [ - lines!( - ax, - zvals, - exp.(0.5 .* (abs2.(zvals) .- view(exact, :, j))), - ) for j in axes(uvals, 2) - ] - Legend(fig[1, 2], lins, string.(inds)) - fig + fig = Figure(; size=(600, 375)), + ax = Axis(fig[1, 1]; + xlabel="z", + ylabel="Exp of half the quadratic minus penalized deviance",) + + lins = [lines!(ax, + zvals, + exp.(0.5 .* (abs2.(zvals) .- view(exact, :, j)))) + for j in axes(uvals, 2)] + Legend(fig[1, 2], lins, string.(inds)) + fig end ``` @@ -1353,24 +1300,24 @@ We use the `aGHQ` column of `m.utbl` to accumulate these contributions and to ta #| output: false #| code-fold: show function evalGHQ!(m::BernoulliPIRLS; nGHQ::Integer=9) - (; ytbl, utbl, rtbl) = m - (; u, u0, Ldiag, pdev, pdev0, aGHQ) = utbl - ghqtbl = GHnorm(nGHQ) - pdevcomps!(pirls!(m)) # ensure that u0 and pdev0 are current - copyto!(pdev0, pdev) - fill!(aGHQ, 0) - for (z, w) in zip(ghqtbl.z, ghqtbl.w) - if iszero(z) # exp term is one when z == 0 - aGHQ .+= w - else - u .= u0 .+ z ./ Ldiag - pdevcomps!(updatetbl!(m)) - aGHQ .+= w .* exp.((abs2(z) .+ pdev0 .- pdev) ./ 2) + (; ytbl, utbl, rtbl) = m + (; u, u0, Ldiag, pdev, pdev0, aGHQ) = utbl + ghqtbl = GHnorm(nGHQ) + pdevcomps!(pirls!(m)) # ensure that u0 and pdev0 are current + copyto!(pdev0, pdev) + fill!(aGHQ, 0) + for (z, w) in zip(ghqtbl.z, ghqtbl.w) + if iszero(z) # exp term is one when z == 0 + aGHQ .+= w + else + u .= u0 .+ z ./ Ldiag + pdevcomps!(updatetbl!(m)) + aGHQ .+= w .* exp.((abs2(z) .+ pdev0 .- pdev) ./ 2) + end end - end - map!(log, aGHQ, aGHQ) # log.(aGHQ) in place - aGHQ .*= -2 - return m + map!(log, aGHQ, aGHQ) # log.(aGHQ) in place + aGHQ .*= -2 + return m end ``` @@ -1395,27 +1342,27 @@ When we consider $f_j(z_j)$, shown in @fig-zexpneghalfdiff, the exponential func ```{julia} #| output: false function fitGHQ!(m::BernoulliPIRLS; nGHQ::Integer=9) - (; Ldiag, pdev0, aGHQ) = m.utbl - θβ = m.θβ - pp1 = length(θβ) # length(β) = p and length(θ) = 1 - opt = Opt(:LN_BOBYQA, pp1) - mβ = view(θβ, 2:pp1) - function obj(x, g) - if !isempty(g) - throw(ArgumentError("gradient not provided, g must be empty")) + (; Ldiag, pdev0, aGHQ) = m.utbl + θβ = m.θβ + pp1 = length(θβ) # length(β) = p and length(θ) = 1 + opt = Opt(:LN_BOBYQA, pp1) + mβ = view(θβ, 2:pp1) + function obj(x, g) + if !isempty(g) + throw(ArgumentError("gradient not provided, g must be empty")) + end + copyto!(θβ, x) + mul!(m.ytbl.offset, m.X, mβ) + evalGHQ!(m; nGHQ) + return sum(pdev0) + sum(aGHQ) + 2 * 
sum(log, Ldiag) end - copyto!(θβ, x) - mul!(m.ytbl.offset, m.X, mβ) - evalGHQ!(m; nGHQ) - return sum(pdev0) + sum(aGHQ) + 2 * sum(log, Ldiag) - end - opt.min_objective = obj - lb = fill!(similar(θβ), -Inf) # vector of lower bounds - lb[1] = 0 # scalar θ must be non-negative - NLopt.lower_bounds!(opt, lb) - minf, minx, ret = optimize(opt, copy(θβ)) - @info (; ret, fevals=opt.numevals, minf) - return m + opt.min_objective = obj + lb = fill!(similar(θβ), -Inf) # vector of lower bounds + lb[1] = 0 # scalar θ must be non-negative + NLopt.lower_bounds!(opt, lb) + minf, minx, ret = optimize(opt, copy(θβ)) + @info (; ret, fevals=opt.numevals, minf) + return m end ``` diff --git a/datatables.qmd b/datatables.qmd index 9eea74f..6e44ec9 100644 --- a/datatables.qmd +++ b/datatables.qmd @@ -194,10 +194,8 @@ As shown in the last code block, the process of iterating over the rows of a `Ta In @sec-glmmbinomial a `newdata` table is constructed from the Cartesian product of vectors of covariates as the `newdata` table. ```{julia} -newdata = Table( - (; age=a, ch=c, urban=u) # NamedTuple from iterator product - for a in -10:3:20, c in [false, true], u in ["N", "Y"] -) +newdata = Table((; age=a, ch=c, urban=u) # NamedTuple from iterator product + for a in -10:3:20, c in [false, true], u in ["N", "Y"]) ``` ::: {.callout-note collapse="true"} @@ -218,10 +216,9 @@ In this expression the `;` is not necessary but there is another form where we j Thus the `newdata` table could be constructed as ```{julia} -newdata = Table( - (; age, ch, urban) for age in -10:3:20, ch in [false, true], - urban in ["N", "Y"] -) +newdata = Table((; age, ch, urban) + for age in -10:3:20, ch in [false, true], + urban in ["N", "Y"]) ``` ::: @@ -245,9 +242,7 @@ contratbl = Table(contratbl; ch=contratbl.livch .== "0") If later we decide that we can do without this column we can drop it using `getproperties` whose second argument should be a `Tuple` of `Symbol`s of the names of the columns to retain. ```{julia} -contratbl = Table( - getproperties(contratbl, (:dist, :urban, :livch, :age, :use)), -) +contratbl = Table(getproperties(contratbl, (:dist, :urban, :livch, :age, :use))) ``` ## The DataFrames package diff --git a/glmmbernoulli.qmd b/glmmbernoulli.qmd index fd49de2..97d6084 100644 --- a/glmmbernoulli.qmd +++ b/glmmbernoulli.qmd @@ -80,7 +80,7 @@ with summary ```{julia} #| code-fold: true let df = DataFrame(contra) - describe(df, :mean, :min, :median, :max, :nunique, :eltype) + describe(df, :mean, :min, :median, :max, :nunique, :eltype) end ``` @@ -109,17 +109,13 @@ as in @fig-contradata, #| fig-cap: Smoothed relative frequency of contraception use versus centered age for women in the 1989 Bangladesh Fertility Survey #| label: fig-contradata #| warning: false -draw( - data(contra) * - mapping( - :age => "Centered age (yr)", - :use => ==("Y") => "Frequency of contraceptive use"; - col=:urban => renamer(["N" => "Rural", "Y" => "Urban"]), - color=:livch, - ) * - smooth(); - figure=(; size=(650, 400)), -) +draw(data(contra) * + mapping(:age => "Centered age (yr)", + :use => ==("Y") => "Frequency of contraceptive use"; + col=:urban => renamer(["N" => "Rural", "Y" => "Urban"]), + color=:livch,) * + smooth(); + figure=(; size=(650, 400)),) ``` to show the trend in the response with respect to the covariate. 
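The `==("Y")` inside the `mapping` call above is Julia's partially applied equality: calling `==` with a single argument returns a one-argument predicate (a `Base.Fix2`), which is then applied to each element of `use`. A tiny illustration of the idiom:

```julia
isY = ==("Y")              # a callable object that tests equality with "Y"
isY("Y"), isY("N")         # (true, false)
map(isY, ["Y", "N", "Y"])  # Bool[1, 0, 1], the Boolean coding used in the plot
```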
@@ -146,13 +142,11 @@ Establishing the contrasts and fitting a preliminary model with random effects f contrasts[:livch] = EffectsCoding(; base="0") contrasts[:urban] = HelmertCoding() -com01 = - let d = contra, +com01 = let d = contra, ds = Bernoulli(), - f = @formula(use ~ - 1 + livch + (age + abs2(age)) * urban + (1 | dist)) + f = @formula(use ~ 1 + livch + (age + abs2(age)) * urban + (1 | dist)) - fit(MixedModel, f, d, ds; contrasts, nAGQ, progress) + fit(MixedModel, f, d, ds; contrasts, nAGQ, progress) end ``` @@ -194,20 +188,19 @@ contra = Table(contra; ch=contra.livch .!= "0") ```{julia} #| code-fold: true let df = DataFrame(contra) - describe(df, :mean, :min, :median, :max, :nunique, :eltype) + describe(df, :mean, :min, :median, :max, :nunique, :eltype) end ``` we fit a reduced model. ```{julia} -com02 = - let d = contra, +com02 = let d = contra, ds = Bernoulli(), f = @formula(use ~ 1 + ch + age * age * urban + (1 | dist)) fit(MixedModel, f, d, ds; contrasts, nAGQ, progress) - end +end ``` Comparing this model to the previous model @@ -221,13 +214,12 @@ indicates that the reduced model is adequate. Apparently neither the second-order interaction `age & urban` nor the third-order interaction `age & age & urban` is significant and we fit a model without these terms. ```{julia} -com03 = - let f = @formula(use ~ 1 + urban + ch + age * age + (1 | dist)), +com03 = let f = @formula(use ~ 1 + urban + ch + age * age + (1 | dist)), d = contra, ds = Bernoulli() fit(MixedModel, f, d, ds; contrasts, nAGQ, progress) - end +end ``` A likelihood ratio test @@ -245,31 +237,25 @@ A plot of the smoothed observed proportions versus centered age by `urban` and ` #| fig-cap: Smoothed relative frequency of contraception use versus centered age for women in the 1989 Bangladesh Fertility Survey. The livch factor has been collapsed to children/nochildren. #| label: fig-contradata2 #| warning: false -draw( - data(contra) * - mapping( - :age => "Centered age (yr)", - :use => ==("Y") => "Frequency of contraceptive use"; - col=:urban => renamer(["N" => "Rural", "Y" => "Urban"]), - color=:ch => "Children", - ) * - smooth(); - figure=(; size=(650, 400)), -) +draw(data(contra) * + mapping(:age => "Centered age (yr)", + :use => ==("Y") => "Frequency of contraceptive use"; + col=:urban => renamer(["N" => "Rural", "Y" => "Urban"]), + color=:ch => "Children",) * + smooth(); + figure=(; size=(650, 400)),) ``` indicates that all four groups have a quadratic trend with respect to age but the location of the peak proportion is shifted for those without children relative to those with children. Incorporating an interaction of `age` and `ch` allows for such a shift. ```{julia} -com04 = - let f = - @formula(use ~ 1 + urban + ch * age + abs2(age) + (1 | dist)), +com04 = let f = @formula(use ~ 1 + urban + ch * age + abs2(age) + (1 | dist)), d = contra, ds = Bernoulli() fit(MixedModel, f, d, ds; contrasts, nAGQ, progress) - end +end ``` Comparing this fitted model to the previous one @@ -283,14 +269,12 @@ confirms the usefulness of this term. A series of such model fits led to a model with random effects for the combinations of `dist` and `urban`, because differences between urban and rural women in the same district were comparable to differences between districts, even after accounting for an effect of `urban` at the fixed-effects (or population) level. 
```{julia} -com05 = - let f = @formula(use ~ - 1 + urban + ch * age + abs2(age) + (1 | dist & urban)), +com05 = let f = @formula(use ~ 1 + urban + ch * age + abs2(age) + (1 | dist & urban)), d = contra, ds = Bernoulli() fit(MixedModel, f, d, ds; contrasts, nAGQ, progress) - end +end ``` In more detail, @@ -332,19 +316,16 @@ We want to create a plot like @fig-contradata2 by evaluating the linear predicto First we construct a table of covariate values, `newdata`, containing the *Cartesian product* of values of these covariates, created using a [generator expression](https://docs.julialang.org/en/v1/manual/arrays/#Generator-Expressions) returning [NamedTuples](https://docs.julialang.org/en/v1/base/base/#Core.NamedTuple), which is the standard row-wise representation of tabular data. ```{julia} -newdata = Table( - (; age, ch, urban) for age in -10:3:20, ch in [false, true], - urban in ["N", "Y"] -) +newdata = Table((; age, ch, urban) + for age in -10:3:20, ch in [false, true], + urban in ["N", "Y"]) ``` Next we isolate the fixed-effects terms from the formula for model `com05` by selecting its `rhs` property (the right-hand side of the formula) then filtering out any random-effects terms in the expression. ```{julia} -feform = filter( - t -> !isa(t, MixedModels.AbstractReTerm), - com05.formula.rhs, -); +feform = filter(t -> !isa(t, MixedModels.AbstractReTerm), + com05.formula.rhs); ``` ::: {.callout-note} @@ -371,17 +352,13 @@ Plotting the result, @fig-com05pred, #| fig-cap: Linear predictor versus centered age from model com05 #| label: fig-com05pred #| warning: false -draw( - data(newdata) * - mapping( - :age => "Centered age (yr)", - :η => "Linear predictor value from model com05"; - col=:urban => renamer(["N" => "Rural", "Y" => "Urban"]), - color=:ch => "Children", - ) * - smooth(); - figure=(; size=(650, 400)), -) +draw(data(newdata) * + mapping(:age => "Centered age (yr)", + :η => "Linear predictor value from model com05"; + col=:urban => renamer(["N" => "Rural", "Y" => "Urban"]), + color=:ch => "Children",) * + smooth(); + figure=(; size=(650, 400)),) ``` shows that these curves follow the general trends of the data plot. @@ -421,10 +398,10 @@ is called the *logistic* function and is shown in @fig-logitinv. #| label: fig-logitinv #| warning: false let - fig = Figure(; size=(650, 300)) - ax = Axis(fig[1, 1]; xlabel="η", ylabel="μ") - lines!(ax, -5.5 .. 5.5, η -> inv(1 + exp(-η))) - fig + fig = Figure(; size=(650, 300)) + ax = Axis(fig[1, 1]; xlabel="η", ylabel="μ") + lines!(ax, -5.5 .. 5.5, η -> inv(1 + exp(-η))) + fig end ``` @@ -470,17 +447,13 @@ producing the population predictions on the probability scale, as shown in @fig- #| fig-cap: Predicted probability of contraception use versus centered age from model com05. #| label: fig-com05predmu #| warning: false -draw( - data(newdata) * - mapping( - :age => "Centered age (yr)", - :μ => "Probability of contraceptive use"; - col=:urban => renamer(["N" => "Rural", "Y" => "Urban"]), - color=:ch => "Children", - ) * - smooth(); - figure=(; size=(650, 400)), -) +draw(data(newdata) * + mapping(:age => "Centered age (yr)", + :μ => "Probability of contraceptive use"; + col=:urban => renamer(["N" => "Rural", "Y" => "Urban"]), + color=:ch => "Children",) * + smooth(); + figure=(; size=(650, 400)),) ``` On the probability scale we can compare the predictions to the observed frequencies shown with scatterplot smoother lines in @fig-contradata2. 
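Because the discussion now moves between the linear-predictor (η) and probability (μ) scales, a two-line sketch of the link pair may help; the helper names here are illustrative, not part of the package API:

```julia
logistic(η) = inv(1 + exp(-η))  # inverse link: η ↦ μ, the curve plotted above
logit(μ) = log(μ / (1 - μ))     # link: μ ↦ η

logit(logistic(1.5))            # ≈ 1.5, confirming the two are mutual inverses
```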
@@ -514,7 +487,7 @@ The numerical values and the identifier of the combination of `dist` and `urban` ```{julia} srtdre = let retbl = only(raneftables(com05)) - retbl[sortperm(last.(retbl))] + retbl[sortperm(last.(retbl))] end first(srtdre, 6) ``` diff --git a/intro.qmd b/intro.qmd index df93173..c46d553 100644 --- a/intro.qmd +++ b/intro.qmd @@ -212,23 +212,14 @@ Although this table does show us an important property of the data, namely that #| code-fold: true #| warning: false let - sumry = sort!( - combine(groupby(dyestuff, :batch), :yield => mean => :yield), - :yield, - ) - mp = mapping( - :yield => "Yield of dyestuff [g]", - :batch => - sorter(sumry.batch) => "Batch of intermediate product", - ) - draw( - ( - data(dyestuff) * - mp * - visual(Scatter; marker='○', markersize=12) - ) + (data(sumry) * mp * visual(Lines)); - figure=(; size=(600, 300)), - ) + sumry = sort!(combine(groupby(dyestuff, :batch), :yield => mean => :yield), + :yield) + mp = mapping(:yield => "Yield of dyestuff [g]", + :batch => sorter(sumry.batch) => "Batch of intermediate product") + draw((data(dyestuff) * + mp * + visual(Scatter; marker='○', markersize=12)) + (data(sumry) * mp * visual(Lines)); + figure=(; size=(600, 300)),) end ``` @@ -288,22 +279,14 @@ A data plot (@fig-dyestuff2data) #| code-fold: true #| warning: false let - sumry = sort!( - combine(groupby(dyestuff2, :batch), :yield => mean => :yield), - :yield, - ) - mp = mapping( - :yield => "Simulated yield", - :batch => sorter(sumry.batch) => "Batch", - ) - draw( - ( - data(dyestuff2) * - mp * - visual(Scatter; marker='○', markersize=12) - ) + (data(sumry) * mp * visual(Lines)); - figure=(; size=(600, 300)), - ) + sumry = sort!(combine(groupby(dyestuff2, :batch), :yield => mean => :yield), + :yield) + mp = mapping(:yield => "Simulated yield", + :batch => sorter(sumry.batch) => "Batch") + draw((data(dyestuff2) * + mp * + visual(Scatter; marker='○', markersize=12)) + (data(sumry) * mp * visual(Lines)); + figure=(; size=(600, 300)),) end ``` @@ -328,7 +311,7 @@ We fit a model to the data allowing for an overall level of the `yield` and for ```{julia} #| label: dsm01 dsm01 = let f = @formula(yield ~ 1 + (1 | batch)) - fit(MixedModel, f, dyestuff; progress=false) + fit(MixedModel, f, dyestuff; progress=false) end ``` @@ -417,7 +400,7 @@ Fitting a similar model to the `dyestuff2` data produces an estimate $\widehat{\ ```{julia} #| label: dsm02 dsm02 = let f = @formula(yield ~ 1 + (1 | batch)) - fit(MixedModel, f, dyestuff2; progress=false) + fit(MixedModel, f, dyestuff2; progress=false) end print(dsm02) ``` @@ -632,15 +615,11 @@ You can check the details by clicking on the "Code" button in the HTML version o #| fig-cap: Kernel density plot of bootstrap fixed-effects parameter estimates from dsm01 #| label: fig-dsm01_bs_beta_density #| warning: false -draw( - data(@subset(dsm01pars, :type == "β")) * - mapping( - :value => "Bootstrap samples of β"; - color=(:names => "Names"), - ) * - AlgebraOfGraphics.density(); - figure=(; size=(600, 340)), -) +draw(data(@subset(dsm01pars, :type == "β")) * + mapping(:value => "Bootstrap samples of β"; + color=(:names => "Names"),) * + AlgebraOfGraphics.density(); + figure=(; size=(600, 340)),) ``` The distribution of the estimates of `β₁` is more-or-less a Gaussian (or "normal") shape, with a mean value of `{julia} repr(mean(βdf.value), context=:compact=>true)` which is close to the estimated `β₁` of `{julia} repr(only(dsm01.β), context=:compact=>true)`. 
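The `dsm01pars` table plotted in these hunks comes from a parametric bootstrap performed earlier in the chapter, outside this diff. As a sketch of that workflow, assuming `dsm01` from above (the seed and sample size here are illustrative, not necessarily the chapter's own):

```julia
using DataFrames, MixedModels, Random

dsm01samp = parametricbootstrap(Xoshiro(4321), 10_000, dsm01)  # refit to simulated data
dsm01pars = DataFrame(dsm01samp.allpars)  # long table with :type, :group, :names, :value
```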
@@ -657,15 +636,11 @@ The situation is different for the estimates of the standard deviation parameter #| fig-cap: Kernel density plot of bootstrap variance-component parameter estimates from model dsm01 #| label: fig-dsm01_bs_sigma_density #| warning: false -draw( - data(@subset(dsm01pars, :type == "σ")) * - mapping( - :value => "Bootstrap samples of σ"; - color=(:group => "Group"), - ) * - AlgebraOfGraphics.density(); - figure=(; size=(600, 340)), -) +draw(data(@subset(dsm01pars, :type == "σ")) * + mapping(:value => "Bootstrap samples of σ"; + color=(:group => "Group"),) * + AlgebraOfGraphics.density(); + figure=(; size=(600, 340)),) ``` The estimator for the residual standard deviation, $\sigma$, is approximately normally distributed but the estimator for $\sigma_1$, the standard deviation of the `batch` random effects is bimodal (i.e. has two "modes" or local maxima). @@ -687,15 +662,11 @@ Use a lower alpha in the colors for multiple histograms so the bars behind anoth #| fig-cap: Histogram of bootstrap variance-components as standard deviations from model dsm01 #| label: fig-dsm01_bs_sigma_hist #| warning: false -draw( - data(@subset(dsm01pars, :type == "σ")) * - mapping( - :value => "Bootstrap parameter estimates of σ"; - color=(:group => "Group"), - ) * - AlgebraOfGraphics.histogram(; bins=80); - figure=(; size=(600, 340)), -) +draw(data(@subset(dsm01pars, :type == "σ")) * + mapping(:value => "Bootstrap parameter estimates of σ"; + color=(:group => "Group"),) * + AlgebraOfGraphics.histogram(; bins=80); + figure=(; size=(600, 340)),) ``` Nearly 1000 of the 10,000 bootstrap fits are "singular" in that the estimated unconditional distribution of the random effects is degenerate. @@ -718,15 +689,11 @@ In many cases standard errors are quoted for estimates of the variance component #| fig-cap: Histogram of bootstrap variance-components from model dsm01 #| label: fig-dsm01_bs_sigmasqr_hist #| warning: false -draw( - data(@transform(@subset(dsm01pars, :type == "σ"), abs2(:value))) * - mapping( - :value_abs2 => "Bootstrap sample of estimates of σ²", - color=:group, - ) * - AlgebraOfGraphics.histogram(; bins=200); - figure=(; size=(600, 340)), -) +draw(data(@transform(@subset(dsm01pars, :type == "σ"), abs2(:value))) * + mapping(:value_abs2 => "Bootstrap sample of estimates of σ²"; + color=:group) * + AlgebraOfGraphics.histogram(; bins=200); + figure=(; size=(600, 340)),) ``` The approach of creating coverage intervals from a bootstrap sample of parameter estimates, described in the next section, does accommodate non-normal distributions. 
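As a preview of those coverage intervals, and assuming the `dsm01samp` bootstrap object sketched above, MixedModels provides `shortestcovint` for the narrowest interval containing a given proportion (95% by default) of each parameter's bootstrap estimates:

```julia
DataFrame(shortestcovint(dsm01samp))  # one 95% shortest coverage interval per parameter
```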
@@ -760,13 +727,11 @@ The optional argument, `thin=1`, in a call to `fit` causes all the values of $\b ```{julia} #| label: dsm01trace -dsm01trace = fit( - MixedModel, - @formula(yield ~ 1 + (1 | batch)), - dyestuff; - thin=1, - progress=false, -) +dsm01trace = fit(MixedModel, + @formula(yield ~ 1 + (1 | batch)), + dyestuff; + thin=1, + progress=false,) DataFrame(dsm01trace.optsum.fitlog) ``` @@ -824,7 +789,7 @@ A plot of these prediction intervals is sometimes called a *caterpillar plot* be #| fig-cap: Caterpillar plot of prediction intervals for dsm01 random effects #| label: fig-caterpillar_dsm01 #| warning: false -caterpillar!(Figure(size=(600, 200)), ranefinfo(dsm01, :batch)) +caterpillar!(Figure(; size=(600, 200)), ranefinfo(dsm01, :batch)) ``` The `caterpillar` function returns a plot with linear spacing of the @@ -836,7 +801,7 @@ An alternative, the `qqcaterpillar` function #| fig-cap: Quantile caterpillar plot of prediction intervals for dsm01 random effects #| label: fig-qqcaterpillar_dsm01 #| warning: false -qqcaterpillar!(Figure(size=(600, 200)), ranefinfo(dsm01, :batch)) +qqcaterpillar!(Figure(; size=(600, 200)), ranefinfo(dsm01, :batch)) ``` returns a plot like @fig-qqcaterpillar_dsm01 where the intervals are plotted with vertical spacing corresponding to the quantiles of the standard normal distribution. @@ -873,13 +838,11 @@ There is an advantage in assigning the contrasts dictionary to the name `contras ```{julia} #| code-fold: false #| eval: false -dsm01 = fit( - MixedModel, - @formula(yield ~ 1 + (1 | batch)), - dyestuff; - contrasts=contrasts, - progress=false, -) +dsm01 = fit(MixedModel, + @formula(yield ~ 1 + (1 | batch)), + dyestuff; + contrasts=contrasts, + progress=false,) ``` can be condensed to @@ -887,13 +850,11 @@ can be condensed to ```{julia} #| code-fold: false #| eval: false -dsm01 = fit( - MixedModel, - @formula(yield ~ 1 + (1 | batch)), - dyestuff; - contrasts, - progress=false, -) +dsm01 = fit(MixedModel, + @formula(yield ~ 1 + (1 | batch)), + dyestuff; + contrasts, + progress=false,) ``` That is, if the name of the object to be passed as a named argument is the same as the name of the argument, like `contrasts=contrasts`, then the name does not need to be repeated. @@ -915,7 +876,7 @@ Thus `dsm01` can be assigned as #| code-fold: false #| eval: false dsm01 = let f = @formula(yield ~ 1 + (1 | batch)) - fit(MixedModel, f, dyestuff; contrasts, progress=false) + fit(MixedModel, f, dyestuff; contrasts, progress=false) end ``` diff --git a/largescaledesigned.qmd b/largescaledesigned.qmd index cdb2bca..2c90b9f 100644 --- a/largescaledesigned.qmd +++ b/largescaledesigned.qmd @@ -65,10 +65,8 @@ Subject identifiers are coded in integers from 1 to 816. We prefer them as strin We prefix the subject number with 'S' and leftpad the number with zeros to three digits. ```{julia} -ldttrial = transform!( - DataFrame(ldttrial), - :seq => ByRow(>(2000)) => :s2; -) +ldttrial = transform!(DataFrame(ldttrial), + :seq => ByRow(>(2000)) => :s2;) describe(ldttrial) ``` @@ -93,17 +91,13 @@ As we will create similar summaries by subject, we incorporate an 'i' in the nam ```{julia} byitem = @chain ldttrial begin - groupby(:item) - @combine( - :ni = length(:acc), # no. of obs - :imiss = count(ismissing, :acc), # no. of missing acc - :iacc = count(skipmissing(:acc)), # no. of accurate - :imedianrt = median(:rt), - ) - @transform!( - :wrdlen = Int8(length(:item)), - :ipropacc = :iacc / :ni - ) + groupby(:item) + @combine(:ni = length(:acc), # no. 
of obs + :imiss = count(ismissing, :acc), # no. of missing acc + :iacc = count(skipmissing(:acc)), # no. of accurate + :imedianrt = median(:rt),) + @transform!(:wrdlen = Int8(length(:item)), + :ipropacc = :iacc / :ni) end ``` @@ -130,13 +124,9 @@ It is called a *left* join because the left (or first) table takes precedence, i If there is no matching row in the second table then missing values are inserted for the columns from the right table in the result. ```{julia} -describe( - leftjoin!( - ldttrial, - select(byitem, :item, :wrdlen, :isword); - on=:item, - ), -) +describe(leftjoin!(ldttrial, + select(byitem, :item, :wrdlen, :isword); + on=:item,)) ``` Notice that the `wrdlen` and `isword` variables in this table allow for missing values, because they are derived from the second argument, but there are no missing values for these variables. @@ -164,12 +154,10 @@ A barchart of the word length counts, @fig-ldtwrdlenhist, shows that the majorit #| label: fig-ldtwrdlenhist #| warning: false let wlen = 1:21 - draw( - data((; wrdlen=wlen, count=counts(byitem.wrdlen, wlen))) * - mapping(:wrdlen => "Length of word", :count) * - visual(BarPlot); - figure=(; size=(600, 450)), - ) + draw(data((; wrdlen=wlen, count=counts(byitem.wrdlen, wlen))) * + mapping(:wrdlen => "Length of word", :count) * + visual(BarPlot); + figure=(; size=(600, 450)),) end ``` @@ -182,16 +170,12 @@ but accuracy decreases with increasing word length for the nonwords. #| fig-cap: "Smoothed curves of accuracy versus word length in the lexical decision task." #| label: fig-ldtaccsmooth #| warning: false -draw( - data(filter(:acc => !ismissing, ldttrial)) * - mapping( - :wrdlen => "Word length", - :acc => "Accuracy"; - color=:isword, - ) * - smooth(); - figure=(; size=(600, 340)), -) +draw(data(filter(:acc => !ismissing, ldttrial)) * + mapping(:wrdlen => "Word length", + :acc => "Accuracy"; + color=:isword,) * + smooth(); + figure=(; size=(600, 340)),) ``` @fig-ldtaccsmooth may be a bit misleading because the largest discrepancies in proportion of accurate identifications of words and nonwords occur for the longest words, of which there are few. @@ -208,16 +192,12 @@ If we restrict the smoother curves to this range, as in @fig-ldtaccsmoothrestric #| fig-cap: "Smoothed curves of accuracy versus word length in the range 4 to 13 characters in the lexical decision task." #| label: fig-ldtaccsmoothrestrict #| warning: false -draw( - data(@subset(ldttrial, !ismissing(:acc), 4 ≤ :wrdlen ≤ 13)) * - mapping( - :wrdlen => "Word length", - :acc => "Accuracy"; - color=:isword, - ) * - smooth(); - figure=(; size=(600, 340)), -) +draw(data(@subset(ldttrial, !ismissing(:acc), 4 ≤ :wrdlen ≤ 13)) * + mapping(:wrdlen => "Word length", + :acc => "Accuracy"; + color=:isword,) * + smooth(); + figure=(; size=(600, 340)),) ``` the differences are less dramatic. 
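For readers new to the `@chain`/`groupby`/`@combine` idiom used to build `byitem` above, here is a toy sketch with the same packages (the data frame is hypothetical):

```julia
using Chain, DataFrameMacros, DataFrames

toy = DataFrame(g=["a", "a", "b"], x=[1, 2, 3])
@chain toy begin
    groupby(:g)                                 # one group per value of g
    @combine(:n = length(:x), :xsum = sum(:x))  # one summary row per group
end
# 2×3 DataFrame: ("a", 2, 3) and ("b", 1, 3)
```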
@@ -230,26 +210,18 @@ Another way to visualize these results is by plotting the proportion accurate ve #| label: fig-propvswrdlen #| warning: false let - itemsummry = combine( - groupby(byitem, [:wrdlen, :isword]), - :ni => sum, - :imiss => sum, - :iacc => sum, - ) - @transform!( - itemsummry, - :iacc_mean = :iacc_sum / (:ni_sum - :imiss_sum) - ) - @transform!(itemsummry, :msz = sqrt((:ni_sum - :imiss_sum) / 800)) - draw( - data(itemsummry) * mapping( - :wrdlen => "Word length", - :iacc_mean => "Proportion accurate"; - color=:isword, - markersize=:msz, - ); - figure=(; size=(600, 340)), - ) + itemsummry = combine(groupby(byitem, [:wrdlen, :isword]), + :ni => sum, + :imiss => sum, + :iacc => sum) + @transform!(itemsummry, + :iacc_mean = :iacc_sum / (:ni_sum - :imiss_sum)) + @transform!(itemsummry, :msz = sqrt((:ni_sum - :imiss_sum) / 800)) + draw(data(itemsummry) * mapping(:wrdlen => "Word length", + :iacc_mean => "Proportion accurate"; + color=:isword, + markersize=:msz,); + figure=(; size=(600, 340)),) end ``` @@ -261,14 +233,12 @@ A summary of accuracy and median response time by subject ```{julia} bysubj = @chain ldttrial begin - groupby(:subj) - @combine( - :ns = length(:acc), # no. of obs - :smiss = count(ismissing, :acc), # no. of missing acc - :sacc = count(skipmissing(:acc)), # no. of accurate - :smedianrt = median(:rt), - ) - @transform!(:spropacc = :sacc / :ns) + groupby(:subj) + @combine(:ns = length(:acc), # no. of obs + :smiss = count(ismissing, :acc), # no. of missing acc + :sacc = count(skipmissing(:acc)), # no. of accurate + :smedianrt = median(:rt),) + @transform!(:spropacc = :sacc / :ns) end ``` @@ -286,15 +256,11 @@ A plot of the median response time versus proportion accurate, @fig-ldtmedianrtv #| fig-cap: "Median response time versus proportion accurate by subject in the LDT." #| label: fig-ldtmedianrtvspropacc #| warning: false -draw( - data(bysubj) * - mapping( - :spropacc => "Proportion accurate", - :smedianrt => "Median response time (ms)", - ) * - (smooth() + visual(Scatter)); - figure=(; size=(600, 450)), -) +draw(data(bysubj) * + mapping(:spropacc => "Proportion accurate", + :smedianrt => "Median response time (ms)") * + (smooth() + visual(Scatter)); + figure=(; size=(600, 450)),) ``` As described in @Balota_2007, the participants performed the trials in blocks of 250 followed by a short break. @@ -327,9 +293,9 @@ To examine the effects of the fast but inaccurate responders we will fit models ```{julia} pruned = @chain ldttrial begin - @subset(!ismissing(:acc), 200 ≤ :rt ≤ 3000,) - leftjoin!(select(bysubj, :subj, :spropacc); on=:subj) - dropmissing! + @subset(!ismissing(:acc), 200 ≤ :rt ≤ 3000,) + leftjoin!(select(bysubj, :subj, :spropacc); on=:subj) + dropmissing! end size(pruned) ``` @@ -351,12 +317,10 @@ A density plot of the pruned response times, @fig-elpldtrtdens, shows they are s #| fig-cap: Kernel density plot of the pruned response times (ms.) in the LDT. #| label: fig-elpldtrtdens #| warning: false -draw( - data(pruned) * - mapping(:rt => "Response time (ms.) for correct responses") * - AlgebraOfGraphics.density(); - figure=(; size=(600, 340)), -) +draw(data(pruned) * + mapping(:rt => "Response time (ms.) for correct responses") * + AlgebraOfGraphics.density(); + figure=(; size=(600, 340)),) ``` In such cases it is common to transform the response to a scale such as the logarithm of the response time or to the speed of the response, which is the inverse of the response time. 
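To make the two transformations just mentioned concrete (the latencies below are invented values in milliseconds): the speed scale is simply the reciprocal latency, rescaled to responses per second.

```julia
rt = [512.0, 650.0, 1020.0]   # hypothetical response times (ms)

speed = 1000 ./ rt            # response speed (s⁻¹), as used below
logrt = log.(rt)              # the logarithmic alternative
```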
@@ -368,27 +332,18 @@ The density of the response speed, in responses per second, is shown in @fig-elp #| fig-cap: Kernel density plot of the pruned response speed in the LDT. #| label: fig-elpldtspeeddens #| warning: false -draw( - data(pruned) * - mapping( - :rt => - ( - x -> 1000 / x - ) => "Response speed (s⁻¹) for correct responses", - ) * - AlgebraOfGraphics.density(); - figure=(; size=(600, 340)), -) +draw(data(pruned) * + mapping(:rt => (x -> 1000 / x) => "Response speed (s⁻¹) for correct responses") * + AlgebraOfGraphics.density(); + figure=(; size=(600, 340)),) ``` @fig-elpldtrtdens and @fig-elpldtspeeddens indicate that it may be more reasonable to establish a lower bound of 1/3 second (333 ms) on the response latency, corresponding to an upper bound of 3 per second on the response speed. However, only about one half of one percent of the correct responses have latencies in the range of 200 ms. to 333 ms. ```{julia} -count( - r -> !ismissing(r.acc) && 200 < r.rt < 333, - eachrow(ldttrial), -) / count(!ismissing, ldttrial.acc) +count(r -> !ismissing(r.acc) && 200 < r.rt < 333, + eachrow(ldttrial)) / count(!ismissing, ldttrial.acc) ``` so the exact position of the lower cut-off point on the response latencies is unlikely to be very important. @@ -401,16 +356,12 @@ For example, @fig-ldtrtvswrdlen shows the smoothed relationship between word len #| fig-cap: "Scatterplot smooths of response time versus word length in the LDT." #| label: fig-ldtrtvswrdlen #| warning: false -draw( - data(pruned) * - mapping( - :wrdlen => "Word length", - :rt => "Response time (ms)"; - :color => :isword, - ) * - smooth(); - figure=(; size=(600, 450)), -) +draw(data(pruned) * + mapping(:wrdlen => "Word length", + :rt => "Response time (ms)"; + :color => :isword,) * + smooth(); + figure=(; size=(600, 450)),) ``` and @fig-ldtspeedvswrdlen shows the similar relationships for speed @@ -420,16 +371,12 @@ and @fig-ldtspeedvswrdlen shows the similar relationships for speed #| fig-cap: "Scatterplot smooths of response speed versus word length in the LDT." #| label: fig-ldtspeedvswrdlen #| warning: false -draw( - data(pruned) * - mapping( - :wrdlen => "Word length", - :rt => (x -> 1000 / x) => "Speed of response (s⁻¹)"; - :color => :isword, - ) * - smooth(); - figure=(; size=(600, 450)), -) +draw(data(pruned) * + mapping(:wrdlen => "Word length", + :rt => (x -> 1000 / x) => "Speed of response (s⁻¹)"; + :color => :isword,) * + smooth(); + figure=(; size=(600, 450)),) ``` For the most part the smoother lines in @fig-ldtspeedvswrdlen are reasonably straight. 
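These `mapping` calls rely on the AlgebraOfGraphics `column => function => label` idiom, which applies the function to each value at plot time and relabels the axis without modifying the underlying table. A minimal sketch with invented data:

```julia
using AlgebraOfGraphics, CairoMakie

tbl = (; rt=[400.0, 800.0, 1200.0])   # invented latencies (ms)
# 1000 / x converts latency (ms) to speed (s⁻¹) only for plotting
plt = data(tbl) *
      mapping(:rt => (x -> 1000 / x) => "Speed (s⁻¹)") *
      AlgebraOfGraphics.density()
draw(plt)
```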
@@ -443,17 +390,14 @@ The small amount of curvature is associated with short word lengths, say less th #| label: fig-speedviolin #| warning: false let - plt = data(@subset(pruned, :wrdlen > 3, :wrdlen < 14)) - plt *= mapping( - :wrdlen => "Word length", - :rt => (x -> 1000 / x) => "Speed of response (s⁻¹)", - color=:isword) - plt *= (mapping(; side=:isword) * visual(Violin) + linear(; interval=:confidence)) - draw( - plt, - axis=(; limits=(nothing, (0.0, 2.8))); - figure=(; size=(600, 450)), - ) + plt = data(@subset(pruned, :wrdlen > 3, :wrdlen < 14)) + plt *= mapping(:wrdlen => "Word length", + :rt => (x -> 1000 / x) => "Speed of response (s⁻¹)"; + color=:isword) + plt *= (mapping(; side=:isword) * visual(Violin) + linear(; interval=:confidence)) + draw(plt; + axis=(; limits=(nothing, (0.0, 2.8))), + figure=(; size=(600, 450)),) end ``` @@ -486,11 +430,9 @@ contrasts[:wrdlen] = Center(8); and fit a first model with simple, scalar, random effects for `subj` and `item`. ```{julia} -elm01 = - let f = @formula 1000 / rt ~ - 1 + isword * wrdlen + (1 | item) + (1 | subj) +elm01 = let f = @formula 1000 / rt ~ 1 + isword * wrdlen + (1 | item) + (1 | subj) fit(MixedModel, f, pruned; contrasts, progress) - end +end ``` The predicted response speed by word length and word/nonword status can be summarized as @@ -502,12 +444,10 @@ effects(Dict(:isword => [false, true], :wrdlen => 4:2:12), elm01) If we restrict to only those subjects with 80% accuracy or greater the model becomes ```{julia} -elm02 = - let f = @formula 1000 / rt ~ - 1 + isword * wrdlen + (1 | item) + (1 | subj) +elm02 = let f = @formula 1000 / rt ~ 1 + isword * wrdlen + (1 | item) + (1 | subj) dat = filter(:spropacc => >(0.8), pruned) fit(MixedModel, f, dat; contrasts, progress) - end +end ``` ```{julia} @@ -523,23 +463,18 @@ To compare the conditional means of the random effects for item in these two mod #| warning: false CairoMakie.activate!(; type="png") -condmeans = leftjoin!( - leftjoin!( - rename!(DataFrame(raneftables(elm01)[:item]), [:item, :elm01]), - rename!(DataFrame(raneftables(elm02)[:item]), [:item, :elm02]); - on=:item, - ), - select(byitem, :item, :isword; copycols=false); - on=:item, -) -draw( - data(condmeans) * mapping( - :elm01 => "Conditional means of item random effects for model elm01", - :elm02 => "Conditional means of item random effects for model elm02"; - color=:isword, - ); - figure=(; size=(600, 400)), -) +condmeans = leftjoin!(leftjoin!(rename!(DataFrame(raneftables(elm01)[:item]), + [:item, :elm01]), + rename!(DataFrame(raneftables(elm02)[:item]), + [:item, :elm02]); + on=:item,), + select(byitem, :item, :isword; copycols=false); + on=:item,) +draw(data(condmeans) * + mapping(:elm01 => "Conditional means of item random effects for model elm01", + :elm02 => "Conditional means of item random effects for model elm02"; + color=:isword,); + figure=(; size=(600, 400)),) ``` ::: {.callout-note} @@ -563,13 +498,11 @@ saveoptsum("./optsums/elm01.json", elm01); ``` ```{julia} -elm01a = restoreoptsum!( - let f = @formula 1000 / rt ~ - 1 + isword * wrdlen + (1 | item) + (1 | subj) - MixedModel(f, pruned; contrasts) - end, - "./optsums/elm01.json", -) +elm01a = restoreoptsum!(let f = @formula 1000 / rt ~ 1 + isword * wrdlen + (1 | item) + + (1 | subj) + MixedModel(f, pruned; contrasts) + end, + "./optsums/elm01.json") ``` Other covariates associated with the item are available as diff --git a/largescaleobserved.qmd b/largescaleobserved.qmd index 895855e..cfe390a 100644 --- a/largescaleobserved.qmd +++ 
b/largescaleobserved.qmd @@ -38,8 +38,7 @@ and define some constants and a utility function #| code-fold: true #| output: false #| label: constants05 -optsumdir(paths::AbstractString...) = - joinpath(@__DIR__, "optsums", paths...) +optsumdir(paths::AbstractString...) = joinpath(@__DIR__, "optsums", paths...) @isdefined(contrasts) || const contrasts = Dict{Symbol,Any}() @isdefined(progress) || const progress = false ``` @@ -70,8 +69,7 @@ Convert this Arrow table to a `Table` and drop the `timestamp` column, which we (For small data sets dropping such columns is not important but with over 25 million ratings it does help to drop unnecessary columns to save some memory space.) ```{julia} -ratings = - Table(getproperties(ratings, (:userId, :movieId, :rating))) +ratings = Table(getproperties(ratings, (:userId, :movieId, :rating))) ``` Information on the movies is available in the `movies` dataset. @@ -90,13 +88,9 @@ extrema(movies.nrtngs) The number of ratings per user is also highly skewed ```{julia} -users = Table( - combine( - groupby(DataFrame(ratings), :userId), - nrow => :urtngs, - :rating => mean => :umnrtng, - ), -) +users = Table(combine(groupby(DataFrame(ratings), :userId), + nrow => :urtngs, + :rating => mean => :umnrtng)) ``` ```{julia} @@ -114,43 +108,35 @@ Because the distribution of the number of ratings per movie or per user is so hi #| label: fig-nrtngsecdf #| warning: false let - f = Figure(; size=(700, 350)) - xscale = log10 - xminorticksvisible = true - xminorgridvisible = true - yminorticksvisible = true - xminorticks = IntervalsBetween(10) - ylabel = "Relative cumulative frequency" - nrtngs = sort(movies.nrtngs) - ecdfplot( - f[1, 1], - nrtngs; - npoints=last(nrtngs), - axis=( - xlabel="Number of ratings per movie (logarithmic scale)", - xminorgridvisible, - xminorticks, - xminorticksvisible, - xscale, - ylabel, - yminorticksvisible, - ), - ) - urtngs = sort(users.urtngs) - ecdfplot( - f[1, 2], - urtngs; - npoints=last(urtngs), - axis=( - xlabel="Number of ratings per user (logarithmic scale)", - xminorgridvisible, - xminorticks, - xminorticksvisible, - xscale, - yminorticksvisible, - ), - ) - f + f = Figure(; size=(700, 350)) + xscale = log10 + xminorticksvisible = true + xminorgridvisible = true + yminorticksvisible = true + xminorticks = IntervalsBetween(10) + ylabel = "Relative cumulative frequency" + nrtngs = sort(movies.nrtngs) + ecdfplot(f[1, 1], + nrtngs; + npoints=last(nrtngs), + axis=(xlabel="Number of ratings per movie (logarithmic scale)", + xminorgridvisible, + xminorticks, + xminorticksvisible, + xscale, + ylabel, + yminorticksvisible),) + urtngs = sort(users.urtngs) + ecdfplot(f[1, 2], + urtngs; + npoints=last(urtngs), + axis=(xlabel="Number of ratings per user (logarithmic scale)", + xminorgridvisible, + xminorticks, + xminorticksvisible, + xscale, + yminorticksvisible),) + f end ``` @@ -172,19 +158,12 @@ Furthermore, the distribution of ratings for movies with only one rating is syst First, add `nrtngs` and `urtngs` as columns of the `ratings` table. 
```{julia} -ratings = Table( - disallowmissing!( - leftjoin!( - leftjoin!( - DataFrame(ratings), - select!(DataFrame(movies), :movieId, :nrtngs); - on=:movieId, - ), - select!(DataFrame(users), :userId, :urtngs); - on=:userId, - ), - ), -) +ratings = Table(disallowmissing!(leftjoin!(leftjoin!(DataFrame(ratings), + select!(DataFrame(movies), :movieId, + :nrtngs); + on=:movieId,), + select!(DataFrame(users), :userId, :urtngs); + on=:userId,))) ``` then create a bar chart of the two distributions @@ -195,33 +174,28 @@ then create a bar chart of the two distributions #| label: fig-ratingsbarcharts #| warning: false let - fiveplus = zeros(Int, 10) - onlyone = zeros(Int, 10) - for (i, n) in - zip(round.(Int, Float32(2) .* ratings.rating), ratings.nrtngs) - if n > 4 - fiveplus[i] += 1 - elseif isone(n) - onlyone[i] += 1 + fiveplus = zeros(Int, 10) + onlyone = zeros(Int, 10) + for (i, n) in + zip(round.(Int, Float32(2) .* ratings.rating), ratings.nrtngs) + if n > 4 + fiveplus[i] += 1 + elseif isone(n) + onlyone[i] += 1 + end end - end - fiveprop = fiveplus ./ sum(fiveplus) - oneprop = onlyone ./ sum(onlyone) - draw( - data((; - props=vcat(fiveprop, oneprop), - rating=repeat(0.5:0.5:5.0; outer=2), - nratings=repeat(["≥5", "only 1"]; inner=10), - )) * - mapping( - :rating => nonnumeric, - :props => "Proportion of ratings"; - color=:nratings => "Ratings/movie", - dodge=:nratings, - ) * - visual(BarPlot); - figure=(; size=(600, 400)), - ) + fiveprop = fiveplus ./ sum(fiveplus) + oneprop = onlyone ./ sum(onlyone) + draw(data((; + props=vcat(fiveprop, oneprop), + rating=repeat(0.5:0.5:5.0; outer=2), + nratings=repeat(["≥5", "only 1"]; inner=10),)) * + mapping(:rating => nonnumeric, + :props => "Proportion of ratings"; + color=:nratings => "Ratings/movie", + dodge=:nratings,) * + visual(BarPlot); + figure=(; size=(600, 400)),) end ``` @@ -270,9 +244,9 @@ As shown if @fig-nratingsbycutoff, the number of ratings varies from a little ov #| label: fig-nratingsbycutoff #| fig-cap: "Number of ratings in reduced table by movie cutoff value" ggplot(sizespeed, aes(; x=:mc, y=:nratings, color=:ucutoff)) + - geom_point() + - geom_line() + - labs(x="Minimum number of ratings per movie", y="Number of ratings") +geom_point() + +geom_line() + +labs(; x="Minimum number of ratings per movie", y="Number of ratings") ``` For this range of choices of cutoffs, the user cutoff has more impact on the number of ratings in the reduced dataset than does the movie cutoff. 
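The cutoff pairs compared in these figures amount to a row filter over the augmented `ratings` table; the same predicate reappears in `ratingsoptsum` below. Sketched in isolation, with arbitrary cutoff values and assuming the `ratings` table with `nrtngs` and `urtngs` columns is in scope:

```julia
mcutoff, ucutoff = 50, 80   # arbitrary example cutoffs
# keep ratings of movies with ≥ mcutoff ratings, made by users with ≥ ucutoff ratings
reduced = filter(r -> (r.nrtngs ≥ mcutoff) & (r.urtngs ≥ ucutoff), ratings)
```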
@@ -286,9 +260,9 @@ A glance at the table shows that the number of users, `nusers`, is essentially a #| label: fig-nusersbycutoff #| fig-cap: "Number of users in reduced table by movie cutoff value" ggplot(sizespeed, aes(; x=:mc, y=:nmvie, color=:ucutoff)) + - geom_point() + - geom_line() + - labs(x="Minimum number of ratings per movie", y="Number of movies in table") +geom_point() + +geom_line() + +labs(; x="Minimum number of ratings per movie", y="Number of movies in table") ``` ```{julia} @@ -305,31 +279,23 @@ This, and any of the models shown in the table, can be restored in a few minutes ```{julia} #| code-fold: true -function ratingsoptsum( - mcutoff::Integer, - ucutoff::Integer; - data=Table(ratings), - form=@formula(rating ~ 1 + (1 | userId) + (1 | movieId)), - contrasts=contrasts, -) - optsumfnm = optsumdir( - "mvm$(lpad(mcutoff, 2, '0'))u$(lpad(ucutoff, 2, '0')).json", - ) - model = LinearMixedModel( - form, - filter( - r -> (r.nrtngs ≥ mcutoff) & (r.urtngs ≥ ucutoff), - data, - ); - contrasts, - ) - isfile(optsumfnm) && return restoreoptsum!(model, optsumfnm) - - @warn "File $optsumfnm is not available, fitting model." - model.optsum.initial .= 0.5 - fit!(model; thin=1) - saveoptsum(optsumfnm, model) - return model +function ratingsoptsum(mcutoff::Integer, + ucutoff::Integer; + data=Table(ratings), + form=@formula(rating ~ 1 + (1 | userId) + (1 | movieId)), + contrasts=contrasts,) + optsumfnm = optsumdir("mvm$(lpad(mcutoff, 2, '0'))u$(lpad(ucutoff, 2, '0')).json") + model = LinearMixedModel(form, + filter(r -> (r.nrtngs ≥ mcutoff) & (r.urtngs ≥ ucutoff), + data); + contrasts,) + isfile(optsumfnm) && return restoreoptsum!(model, optsumfnm) + + @warn "File $optsumfnm is not available, fitting model." + model.optsum.initial .= 0.5 + fit!(model; thin=1) + saveoptsum(optsumfnm, model) + return model end mvm50u80 = ratingsoptsum(50, 80) print(mvm50u80) @@ -354,17 +320,16 @@ The memory footprint (bytes) of each of the blocks is ```{julia} #| code-fold: true let - block = String[] - for i in 1:3 - for j in 1:i - push!(block, "[$i,$j]") + block = String[] + for i in 1:3 + for j in 1:i + push!(block, "[$i,$j]") + end end - end - Table((; - block, - Abytes=Base.summarysize.(mvm50u80.A), - Lbytes=Base.summarysize.(mvm50u80.L), - )) + Table((; + block, + Abytes=Base.summarysize.(mvm50u80.A), + Lbytes=Base.summarysize.(mvm50u80.L),)) end ``` @@ -372,9 +337,7 @@ resulting in total memory footprints (GiB) of ```{julia} #| code-fold: true -NamedTuple{(:A, :L)}( - Base.summarysize.(getproperty.(Ref(mvm50u80), (:A, :L))) ./ 2^30, -) +NamedTuple{(:A, :L)}(Base.summarysize.(getproperty.(Ref(mvm50u80), (:A, :L))) ./ 2^30) ``` That is, `L` requires roughly 10 times the amount of storage as does `A`, and that difference is entirely due to the different structure of the [2,2] block. @@ -386,8 +349,8 @@ The matrix is about 98% zeros or, equivalently, a little over 2% nonzeros, ```{julia} let - L21 = mvm50u80.L[2] # blocks are stored in a one-dimensional array - nnz(L21) / length(L21) + L21 = mvm50u80.L[2] # blocks are stored in a one-dimensional array + nnz(L21) / length(L21) end ``` @@ -400,10 +363,10 @@ The memory footprint of the model representation depends strongly on the number #| code-fold: true #| fig-cap: Memory footprint of the model representation by minimum number of ratings per user and per movie." 
#| label: fig-memoryfootprint -ggplot(sizespeed, aes(x=:mc, y=:modelsz, color=:ucutoff)) + - geom_point() + - geom_line() + - labs(x="Minimum number of ratings per movie", y="Size of model (GiB)") +ggplot(sizespeed, aes(; x=:mc, y=:modelsz, color=:ucutoff)) + +geom_point() + +geom_line() + +labs(; x="Minimum number of ratings per movie", y="Size of model (GiB)") ``` @fig-memoryvsl22 shows the dominance of the `[2, 2]` block of `L` in the overall memory footprint of the model @@ -413,9 +376,9 @@ ggplot(sizespeed, aes(x=:mc, y=:modelsz, color=:ucutoff)) + #| fig-cap: Memory footprint of the model representation (GiB) versus the size of the [2, 2] block of L (GiB) #| label: fig-memoryvsl22 ggplot(sizespeed, aes(; x=:L22sz, y=:modelsz, color=:ucutoff)) + - geom_point() + - geom_line() + - labs(y="Size of model representation (GiB)", x="Size of [2,2] block of L (GiB)") +geom_point() + +geom_line() + +labs(; y="Size of model representation (GiB)", x="Size of [2,2] block of L (GiB)") ``` @fig-memoryfootprint shows that when all the movies are included in the data to which the model is fit (i.e. `mc == 1`) the total memory footprint is over 20 GiB, and nearly 90% of that memory is that required for the `[2,2]` block of `L`. @@ -441,10 +404,10 @@ As shown in @fig-evtimevsl22 the evaluation time for the objective is predominan #| code-fold: true #| fig-cap: "Evaluation time for the objective (s) versus size of the [2, 2] block of L (GiB)" #| label: fig-evtimevsl22 -ggplot(sizespeed, aes(x=:L22sz, y=:evtime, color=:ucutoff)) + - geom_point() + - geom_line() + - labs(x="Size of [2,2] block of L (GiB)", y="Time for one evaluation of objective (s)") +ggplot(sizespeed, aes(; x=:L22sz, y=:evtime, color=:ucutoff)) + +geom_point() + +geom_line() + +labs(; x="Size of [2,2] block of L (GiB)", y="Time for one evaluation of objective (s)") ``` However the middle panel shows that the number of iterations to convergence is highly variable. diff --git a/linalg.qmd b/linalg.qmd index d9a7e34..04e7384 100644 --- a/linalg.qmd +++ b/linalg.qmd @@ -436,14 +436,10 @@ For the extended Cholesky factor, create the extended matrix of sums of squares ```{julia} crprod = let x = S372.days - Symmetric( - [ - length(x) sum(x) sum(y) - 0.0 sum(abs2, x) dot(x, y) - 0.0 0.0 sum(abs2, y) - ], - :U, - ) + Symmetric([length(x) sum(x) sum(y) + 0.0 sum(abs2, x) dot(x, y) + 0.0 0.0 sum(abs2, y)], + :U) end ``` @@ -458,10 +454,8 @@ extchfac = cholesky(crprod) and information from which the parameter estimates can be evaluated. ```{julia} -β̂ ≈ ldiv!( - UpperTriangular(view(extchfac.U, 1:2, 1:2)), - copy(view(extchfac.U, 1:2, 3)), -) +β̂ ≈ ldiv!(UpperTriangular(view(extchfac.U, 1:2, 1:2)), + copy(view(extchfac.U, 1:2, 3))) ``` The operator `≈` is a check of approximate equality of floating point numbers or arrays. diff --git a/longitudinal.qmd b/longitudinal.qmd index 9000d24..d98cf48 100644 --- a/longitudinal.qmd +++ b/longitudinal.qmd @@ -89,16 +89,12 @@ A common way of plotting such longitudinal data is response versus time on a sin #| fig-cap: Length of ramus bone versus age for a sample of 20 boys. 
#| label: fig-egspaghetti #| warning: false -draw( - data(egdf) * - mapping( - :time => "Age (yr)", - :resp => "Ramus bone length (mm)", - color=:Subj, - ) * - (visual(Scatter) + visual(Lines)); - figure=(; size=(600, 450)), -) +draw(data(egdf) * + mapping(:time => "Age (yr)", + :resp => "Ramus bone length (mm)"; + color=:Subj) * + (visual(Scatter) + visual(Lines)); + figure=(; size=(600, 450)),) ``` Unfortunately, unless there are very few subjects, such figures, sometimes called "spaghetti plots", are difficult to decipher. @@ -137,7 +133,7 @@ Although it seems that there isn't a strong correlation between initial bone len ```{julia} egm01 = let f = @formula resp ~ 1 + time + (1 + time | Subj) - fit(MixedModel, f, egdf; contrasts, progress) + fit(MixedModel, f, egdf; contrasts, progress) end print(egm01) ``` @@ -154,7 +150,7 @@ A caterpillar plot of the random effects for intercept and slope, @fig-egm01cate #| fig-cap: Conditional modes and 95% prediction intervals on the random effects for slope and intercept in model egm01 #| label: fig-egm01caterpillar #| warning: false -caterpillar!(Figure(size=(600, 320)), ranefinfo(egm01, :Subj)) +caterpillar!(Figure(; size=(600, 320)), ranefinfo(egm01, :Subj)) ``` ### Centering the time values @@ -182,7 +178,7 @@ A model with `time` centered at 8 years of age can be fit as ```{julia} contrasts[:time] = Center(8) egm02 = let f = @formula resp ~ 1 + time + (1 + time | Subj) - fit(MixedModel, f, egdf; contrasts, progress) + fit(MixedModel, f, egdf; contrasts, progress) end print(egm02) ``` @@ -197,7 +193,7 @@ A caterpillar plot, @fig-egm02caterpillar, for `egm02` shows much smaller spread #| label: fig-egm02caterpillar #| code-fold: true #| warning: false -caterpillar!(Figure(size=(600, 380)), ranefinfo(egm02, :Subj)) +caterpillar!(Figure(; size=(600, 380)), ranefinfo(egm02, :Subj)) ``` A third option is to center the `time` covariate at the mean of the original `time` values. @@ -205,7 +201,7 @@ A third option is to center the `time` covariate at the mean of the original `ti ```{julia} contrasts[:time] = Center() egm03 = let f = @formula resp ~ 1 + time + (1 + time | Subj) - fit(MixedModel, f, egdf; contrasts, progress) + fit(MixedModel, f, egdf; contrasts, progress) end print(egm03) ``` @@ -253,12 +249,10 @@ The usual cautions about polynomial terms in regression models apply even more e For balanced data like `egdf` we usually center the `time` axis about the mean, producing ```{julia} -egm04 = - let f = @formula resp ~ - 1 + ctime + ctime^2 + (1 + ctime + ctime^2 | Subj) +egm04 = let f = @formula resp ~ 1 + ctime + ctime^2 + (1 + ctime + ctime^2 | Subj) dat = @transform(egdf, :ctime = :time - 8.75) fit(MixedModel, f, dat; contrasts, progress) - end +end print(egm04) ``` @@ -314,10 +308,8 @@ Because the number of combinations of `Subj` and `Group` is equal to the number These data are balanced with respect to `time` (i.e. each rat is weighed at the same set of times) but not with respect to treatment, as can be seen by checking the number of rats in each treatment group. ```{julia} -combine( - groupby(unique(select(bxdf, :Group, :Subj)), :Group), - nrow => :n, -) +combine(groupby(unique(select(bxdf, :Group, :Subj)), :Group), + nrow => :n) ``` ### Within-group variation @@ -329,16 +321,13 @@ Considering first the control group, whose trajectories can be plotted in a "spa #| fig-cap: Weight (g) of rats in the control group of bxdf versus time in trial (wk). 
#| label: fig-bxctrlspaghetti #| warning: false -bxaxes = - mapping(:time => "Time in trial (wk)", :resp => "Body weight (g)") +bxaxes = mapping(:time => "Time in trial (wk)", :resp => "Body weight (g)") bxgdf = groupby(bxdf, :Group) -draw( - data(bxgdf[("Control",)]) * - bxaxes * - mapping(; color=:Subj) * - (visual(Scatter) + visual(Lines)); - figure=(; size=(600, 450)), -) +draw(data(bxgdf[("Control",)]) * + bxaxes * + mapping(; color=:Subj) * + (visual(Scatter) + visual(Lines)); + figure=(; size=(600, 450)),) ``` or in separate panels ordered by initial weight, @fig-bxctrllayout. @@ -349,16 +338,12 @@ or in separate panels ordered by initial weight, @fig-bxctrllayout. #| label: fig-bxctrllayout #| warning: false let df = bxgdf[("Control",)] - draw( - data(df) * - bxaxes * - mapping(; - layout=:Subj => - sorter(sort!(filter(:time => iszero, df), :resp).Subj), - ) * - visual(ScatterLines); - axis=(height=180, width=108), - ) + draw(data(df) * + bxaxes * + mapping(; + layout=:Subj => sorter(sort!(filter(:time => iszero, df), :resp).Subj),) * + visual(ScatterLines); + axis=(height=180, width=108),) end ``` @@ -372,17 +357,12 @@ A multi-panel plot for the Thioracil group, @fig-bxthiolayout, #| label: fig-bxthiolayout #| warning: false let - df = bxgdf[("Thioracil",)] - draw( - data(df) * - bxaxes * - mapping( - layout=:Subj => - sorter(sort!(filter(:time => iszero, df), :resp).Subj), - ) * - visual(ScatterLines); - axis=(height=180, width=108), - ) + df = bxgdf[("Thioracil",)] + draw(data(df) * + bxaxes * + mapping(; layout=:Subj => sorter(sort!(filter(:time => iszero, df), :resp).Subj)) * + visual(ScatterLines); + axis=(height=180, width=108),) end ``` @@ -396,17 +376,12 @@ By contrast, in the Thyroxin group, @fig-bxthyrlayout, #| label: fig-bxthyrlayout #| warning: false let - df = bxgdf[("Thyroxin",)] - draw( - data(df) * - bxaxes * - mapping( - layout=:Subj => - sorter(sort!(filter(:time => iszero, df), :resp).Subj), - ) * - visual(ScatterLines); - axis=(height=180, width=108), - ) + df = bxgdf[("Thyroxin",)] + draw(data(df) * + bxaxes * + mapping(; layout=:Subj => sorter(sort!(filter(:time => iszero, df), :resp).Subj)) * + visual(ScatterLines); + axis=(height=180, width=108),) end ``` @@ -427,21 +402,17 @@ We can begin with a main effect for `Group`, as in ```{julia} delete!(contrasts, :time) -bxm01 = - let f = @formula resp ~ - (1 + time + time^2) * Group + (1 + time + time^2 | Subj) +bxm01 = let f = @formula resp ~ (1 + time + time^2) * Group + (1 + time + time^2 | Subj) fit(MixedModel, f, bxdf; contrasts, progress) - end +end ``` but we expect that a model without the main effect for `Group`, ```{julia} -bxm02 = - let f = @formula resp ~ - 1 + (time + time^2) & Group + (1 + time + time^2 | Subj) +bxm02 = let f = @formula resp ~ 1 + (time + time^2) & Group + (1 + time + time^2 | Subj) fit(MixedModel, f, bxdf; contrasts, progress) - end +end ``` will be adequate, as confirmed by @@ -470,11 +441,9 @@ A model without systematic differences between groups in the initial weight and It would indicate that the groups are initially homogeneous both in weight and growth rate but, as the trial proceeds, the different treatments change the rate of growth. ```{julia} -bxm03 = - let f = @formula resp ~ - 1 + time + time^2 & Group + (1 + time + time^2 | Subj) +bxm03 = let f = @formula resp ~ 1 + time + time^2 & Group + (1 + time + time^2 | Subj) fit(MixedModel, f, bxdf; contrasts, progress) - end +end ``` ```{julia} @@ -532,37 +501,34 @@ can help to see that these points lie on a plane. 
#| label: fig-bxm03plane #| warning: false let - bpts = Point3f.(eachcol(only(bxm03.b))) - Upts = Point3f.(eachcol(svd(only(bxm03.λ)).U)) - origin = Point3(zeros(Float32, 3)) - xlabel, ylabel, zlabel = only(bxm03.reterms).cnames - zlabel = "time²" - perspectiveness = 0.5 - aspect = :data - f = Figure(; size=(600, 250)) - u, v, w = -Upts[2] # second principle direction flipped to get positive w - elevation = asin(w) - azimuth = atan(v, u) - ax1 = - Axis3(f[1, 1]; aspect, xlabel, ylabel, zlabel, perspectiveness) - ax2 = Axis3( - f[1, 2]; - aspect, - xlabel, - ylabel, - zlabel, - perspectiveness, - elevation, - azimuth, - ) - scatter!(ax1, bpts; marker='∘', markersize=20) - scatter!(ax2, bpts; marker='∘', markersize=20) - for p in Upts - seg = [origin, p] - lines!(ax1, seg) - lines!(ax2, seg) - end - f + bpts = Point3f.(eachcol(only(bxm03.b))) + Upts = Point3f.(eachcol(svd(only(bxm03.λ)).U)) + origin = Point3(zeros(Float32, 3)) + xlabel, ylabel, zlabel = only(bxm03.reterms).cnames + zlabel = "time²" + perspectiveness = 0.5 + aspect = :data + f = Figure(; size=(600, 250)) + u, v, w = -Upts[2] # second principle direction flipped to get positive w + elevation = asin(w) + azimuth = atan(v, u) + ax1 = Axis3(f[1, 1]; aspect, xlabel, ylabel, zlabel, perspectiveness) + ax2 = Axis3(f[1, 2]; + aspect, + xlabel, + ylabel, + zlabel, + perspectiveness, + elevation, + azimuth,) + scatter!(ax1, bpts; marker='∘', markersize=20) + scatter!(ax2, bpts; marker='∘', markersize=20) + for p in Upts + seg = [origin, p] + lines!(ax1, seg) + lines!(ax2, seg) + end + f end ``` @@ -580,12 +546,10 @@ Coverage intervals from a parametric bootstrap sample for this model ```{julia} #| code-fold: true -bxm03samp = parametricbootstrap( - Xoshiro(8642468), - 10_000, - bxm03; - progress=false, -) +bxm03samp = parametricbootstrap(Xoshiro(8642468), + 10_000, + bxm03; + progress=false,) bxm03pars = DataFrame(bxm03samp.allpars) DataFrame(shortestcovint(bxm03samp)) ``` @@ -599,15 +563,11 @@ A kernel density plot, @fig-bxm03rhodens, of the parametric bootstrap estimates #| fig-cap: Kernel density plots of parametric bootstrap estimates of correlation estimates from model bxm03 #| label: fig-bxm03rhodens #| warning: false -draw( - data(@subset(bxm03pars, :type == "ρ")) * - mapping( - :value => "Bootstrap replicates of correlation estimates"; - color=(:names => "Variables"), - ) * - AlgebraOfGraphics.density(); - figure=(; size=(600, 400)), -) +draw(data(@subset(bxm03pars, :type == "ρ")) * + mapping(:value => "Bootstrap replicates of correlation estimates"; + color=(:names => "Variables"),) * + AlgebraOfGraphics.density(); + figure=(; size=(600, 400)),) ``` Even on the scale of [Fisher's z transformation](https://en.wikipedia.org/wiki/Fisher_transformation), @fig-bxm03rhodensatanh, these estimates are highly skewed. 
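Fisher's z is simply `atanh` applied to the correlation; the clamp in the next hunk keeps the result finite near ±1, where `atanh` diverges. A standalone sketch:

```julia
fisherz(ρ) = atanh(clamp(ρ, -0.99999, 0.99999))   # mirrors the @transform below

fisherz(0.0)   # 0.0
fisherz(0.9)   # ≈ 1.47
fisherz(1.0)   # ≈ 6.10 (finite only because of the clamp; atanh(1.0) is Inf)
```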
@@ -618,29 +578,21 @@ Even on the scale of [Fisher's z transformation](https://en.wikipedia.org/wiki/F #| label: fig-bxm03rhodensatanh #| warning: false let - dat = @transform( - @subset(bxm03pars, :type == "ρ"), - :z = atanh(clamp(:value, -0.99999, 0.99999)) - ) - mp = mapping( - :z => "Fisher's z transformation of correlation estimates"; - color=(:names => "Variables"), - ) - draw( - data(dat) * mp * AlgebraOfGraphics.density(); - figure=(; size=(600, 400)), - ) + dat = @transform(@subset(bxm03pars, :type == "ρ"), + :z = atanh(clamp(:value, -0.99999, 0.99999))) + mp = mapping(:z => "Fisher's z transformation of correlation estimates"; + color=(:names => "Variables"),) + draw(data(dat) * mp * AlgebraOfGraphics.density(); + figure=(; size=(600, 400)),) end ``` Because of these high correlations, trying to deal with the degenerate random effects distribution by simply removing the random effects for `time ^ 2` reduces the model too much. ```{julia} -bxm04 = - let f = - @formula resp ~ 1 + time + time^2 & Group + (1 + time | Subj) +bxm04 = let f = @formula resp ~ 1 + time + time^2 & Group + (1 + time | Subj) fit(MixedModel, f, bxdf; contrasts, progress) - end +end ``` ```{julia} @@ -650,15 +602,13 @@ MixedModels.likelihoodratiotest(bxm04, bxm03) as does eliminating within-subject correlations between the random-effects for the `time^2` random effect and the other random effects. ```{julia} -bxm05 = - let f = @formula resp ~ - 1 + - time + - time^2 & Group + - (1 + time | Subj) + - (0 + time^2 | Subj) +bxm05 = let f = @formula resp ~ 1 + + time + + time^2 & Group + + (1 + time | Subj) + + (0 + time^2 | Subj) fit(MixedModel, f, bxdf; contrasts, progress) - end +end ``` ```{julia} @@ -681,51 +631,36 @@ The models give essentially the same predictions of weight versus time for a "ty #| label: fig-bxmodfitted #| warning: false let - times = Float32.((0:256) / 64) - times² = abs2.(times) - z257 = zeros(Float32, 257) - tmat = hcat( - ones(Float32, 257 * 3), - repeat(times, 3), - vcat(times², z257, z257), - vcat(z257, times², z257), - vcat(z257, z257, times²), - ) - grp = repeat(["Control", "Thioracil", "Thyroxin"]; inner=257) - draw( - data( - append!( - append!( - DataFrame(; - times=tmat[:, 2], - wt=tmat * Float32.(bxm03.beta), - Group=grp, - model="bxm03", - ), - DataFrame(; - times=tmat[:, 2], - wt=tmat * Float32.(bxm04.beta), - Group=grp, - model="bxm04", - ), - ), - DataFrame(; - times=tmat[:, 2], - wt=tmat * Float32.(bxm05.beta), - Group=grp, - model="bxm05", - ), - ), - ) * - mapping( - :times => "Time in trial (wk)", - :wt => "Weight (gm)"; - color=:Group, - col=:model, - ) * - visual(Lines); - axis=(width=120, height=130), - ) + times = Float32.((0:256) / 64) + times² = abs2.(times) + z257 = zeros(Float32, 257) + tmat = hcat(ones(Float32, 257 * 3), + repeat(times, 3), + vcat(times², z257, z257), + vcat(z257, times², z257), + vcat(z257, z257, times²)) + grp = repeat(["Control", "Thioracil", "Thyroxin"]; inner=257) + draw(data(append!(append!(DataFrame(; + times=tmat[:, 2], + wt=tmat * Float32.(bxm03.beta), + Group=grp, + model="bxm03",), + DataFrame(; + times=tmat[:, 2], + wt=tmat * Float32.(bxm04.beta), + Group=grp, + model="bxm04",)), + DataFrame(; + times=tmat[:, 2], + wt=tmat * Float32.(bxm05.beta), + Group=grp, + model="bxm05",))) * + mapping(:times => "Time in trial (wk)", + :wt => "Weight (gm)"; + color=:Group, + col=:model,) * + visual(Lines); + axis=(width=120, height=130),) end ``` diff --git a/multiple.qmd b/multiple.qmd index 011a4a4..fbf3aed 100644 --- a/multiple.qmd +++ 
b/multiple.qmd @@ -119,41 +119,32 @@ Returns a `DataFrame` created from df with the levels of `grps` reordered accord `combine(groupby(df, grps), resp => sumryf)` and this summary DataFrame, also with the levels of `grps` reordered. """ -function _meanrespfrm( - df, - resp::Symbol, - grps::Symbol; - sumryf::Function=mean, -) - # ensure the relevant columns are types that Makie can deal with - df = transform( - df, - resp => Array, - grps => CategoricalArray; - renamecols=false, - ) - # create a summary table by mean resp - sumry = - sort!(combine(groupby(df, grps), resp => sumryf => resp), resp) - glevs = string.(sumry[!, grps]) # group levels in ascending order of mean resp - levels!(df[!, grps], glevs) - levels!(sumry[!, grps], glevs) - return df, sumry +function _meanrespfrm(df, + resp::Symbol, + grps::Symbol; + sumryf::Function=mean,) + # ensure the relevant columns are types that Makie can deal with + df = transform(df, + resp => Array, + grps => CategoricalArray; + renamecols=false,) + # create a summary table by mean resp + sumry = sort!(combine(groupby(df, grps), resp => sumryf => resp), resp) + glevs = string.(sumry[!, grps]) # group levels in ascending order of mean resp + levels!(df[!, grps], glevs) + levels!(sumry[!, grps], glevs) + return df, sumry end let - df, _ = _meanrespfrm(penicillin, :diameter, :plate) - sort!(df, [:plate, :sample]) - - mp = mapping( - :diameter => "Diameter of inhibition zone [mm]", - :plate => "Plate"; - color=:sample, - ) - draw( - data(df) * mp * visual(ScatterLines; marker='○', markersize=12); - figure=(; size=(600, 450)), - ) + df, _ = _meanrespfrm(penicillin, :diameter, :plate) + sort!(df, [:plate, :sample]) + + mp = mapping(:diameter => "Diameter of inhibition zone [mm]", + :plate => "Plate"; + color=:sample,) + draw(data(df) * mp * visual(ScatterLines; marker='○', markersize=12); + figure=(; size=(600, 450)),) end ``` @@ -187,7 +178,7 @@ A model incorporating random effects for both the `plate` and the `sample` is st ```{julia} pnm01 = let f = @formula diameter ~ 1 + (1 | plate) + (1 | sample) - fit(MixedModel, f, penicillin; contrasts, progress) + fit(MixedModel, f, penicillin; contrasts, progress) end ``` @@ -271,15 +262,11 @@ As for model `dsm01` the bootstrap parameter estimates of the fixed-effects para #| fig-cap: "Parametric bootstrap estimates of fixed-effects parameters in model pnm01" #| label: fig-pnm01bsbeta #| warning: false -draw( - data(@subset(pnm01pars, :type == "β")) * - mapping( - :value => "Bootstrap samples of β"; - color=(:names => "Names"), - ) * - AlgebraOfGraphics.density(); - figure=(; size=(600, 340)), -) +draw(data(@subset(pnm01pars, :type == "β")) * + mapping(:value => "Bootstrap samples of β"; + color=(:names => "Names"),) * + AlgebraOfGraphics.density(); + figure=(; size=(600, 340)),) ``` and the shortest coverage interval on this parameter is close to the Wald interval @@ -296,15 +283,11 @@ The densities of the variance-components, on the scale of the standard deviation #| label: fig-pnm01bssigma #| code-fold: true #| warning: false -draw( - data(@subset(pnm01pars, :type == "σ")) * - mapping( - :value => "Bootstrap samples of σ"; - color=(:group => "Group"), - ) * - AlgebraOfGraphics.density(); - figure=(; size=(600, 340)), -) +draw(data(@subset(pnm01pars, :type == "σ")) * + mapping(:value => "Bootstrap samples of σ"; + color=(:group => "Group"),) * + AlgebraOfGraphics.density(); + figure=(; size=(600, 340)),) ``` The lack of precision in the estimate of $\sigma_2$, the standard deviation of the random effects 
for `sample`, is a consequence of only having 6 distinct levels of the `sample` factor. @@ -334,10 +317,8 @@ pastes = dataset(:pastes) ``` ```{julia} -pastes = transform( - DataFrame(pastes), - [:batch, :cask] => ByRow((x, y) -> string(x, y)) => :sample, -) +pastes = transform(DataFrame(pastes), + [:batch, :cask] => ByRow((x, y) -> string(x, y)) => :sample) describe(pastes) ``` @@ -413,7 +394,7 @@ We include random-effects terms for each factor, as in ```{julia} psm01 = let f = @formula strength ~ 1 + (1 | sample) + (1 | batch) - fit(MixedModel, f, pastes; contrasts, progress) + fit(MixedModel, f, pastes; contrasts, progress) end ``` @@ -449,7 +430,7 @@ Plots of the prediction intervals of the random effects (@fig-psm01batchcaterpil #| fig-cap: "Plot of *batch* prediction intervals from *psm01*" #| label: fig-psm01batchcaterpillar #| warning: false -caterpillar!(Figure(size=(600, 240)), ranefinfo(psm01, :batch)) +caterpillar!(Figure(; size=(600, 240)), ranefinfo(psm01, :batch)) ``` confirm this impression in that all the prediction intervals for the random effects for `batch` contain zero. @@ -467,15 +448,11 @@ psm01pars = DataFrame(psm01samp.allpars); #| label: fig-psm01bssampdens #| code-fold: true #| warning: false -draw( - data(@subset(psm01pars, :type == "σ")) * - mapping( - :value => "Bootstrap samples of σ"; - color=(:group => "Group"), - ) * - AlgebraOfGraphics.density(); - figure=(; size=(600, 340)), -) +draw(data(@subset(psm01pars, :type == "σ")) * + mapping(:value => "Bootstrap samples of σ"; + color=(:group => "Group"),) * + AlgebraOfGraphics.density(); + figure=(; size=(600, 340)),) ``` Because there are several indications that $\sigma_2$ could reasonably be zero, resulting in a simpler model incorporating random effects for only `sample`, we perform a statistical test of this hypothesis. @@ -506,7 +483,7 @@ The restricted model fit ```{julia} psm02 = let f = @formula strength ~ 1 + (1 | sample) - fit(MixedModel, f, pastes; contrasts, progress) + fit(MixedModel, f, pastes; contrasts, progress) end ``` @@ -535,12 +512,10 @@ DataFrame(shortestcovint(psm01samp)) ```{julia} #| code-fold: true -psm02samp = parametricbootstrap( - Random.seed!(9753579), - 10_000, - psm02; - progress, -) +psm02samp = parametricbootstrap(Random.seed!(9753579), + 10_000, + psm02; + progress,) DataFrame(shortestcovint(psm02samp)) ``` @@ -587,12 +562,10 @@ Although the response, `y`, is on a scale of 1 to 5, #| fig-cap: "Histogram of instructor ratings in the *insteval* data" #| label: fig-instevalhist #| warning: false -draw( - data(Table(response=1:5, count=counts(insteval.y))) * - mapping(:response => "Rating", :count => "Count") * - visual(BarPlot); - figure=(; size=(600, 340)), -) +draw(data(Table(; response=1:5, count=counts(insteval.y))) * + mapping(:response => "Rating", :count => "Count") * + visual(BarPlot); + figure=(; size=(600, 340)),) ``` it is sufficiently diffuse to warrant treating it as if it were a continuous response.
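Returning briefly to the pastes example above: with `psm01` and `psm02` both fit, the test of $H_0:\sigma_2=0$ is a single call, using the same `likelihoodratiotest` pattern as the other nested model pairs in this diff (a sketch, assuming both fits are in scope):

```julia
# H₀: σ₂ = 0, i.e. the batch variance component can be dropped
MixedModels.likelihoodratiotest(psm02, psm01)
```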
@@ -601,7 +574,7 @@ At this point we will fit models that have random effects for student, instructo ```{julia} iem01 = let f = @formula y ~ 1 + (1 | s) + (1 | d) + (1 | dept) - fit(MixedModel, f, insteval; contrasts, progress) + fit(MixedModel, f, insteval; contrasts, progress) end ``` @@ -614,14 +587,14 @@ It is not surprising that zero is within most of the prediction intervals on the #| fig-cap: "Prediction intervals on random effects for department in model iem01" #| label: fig-iem01caterpillardept #| warning: false -caterpillar!(Figure(size=(600, 300)), ranefinfo(iem01, :dept)) +caterpillar!(Figure(; size=(600, 300)), ranefinfo(iem01, :dept)) ``` However, the p-value for the LRT of $H_0:\sigma_3=0$ versus $H_a:\sigma_3>0$ ```{julia} iem02 = let f = @formula y ~ 1 + (1 | s) + (1 | d) - fit(MixedModel, f, insteval; contrasts, progress) + fit(MixedModel, f, insteval; contrasts, progress) end MixedModels.likelihoodratiotest(iem02, iem01) ``` @@ -661,10 +634,9 @@ There are several ways in which `service` can be incorporated in a model like th The simplest approach is to add `service` to the fixed-effects specification ```{julia} -iem03 = - let f = @formula y ~ 1 + service + (1 | s) + (1 | d) + (1 | dept) +iem03 = let f = @formula y ~ 1 + service + (1 | s) + (1 | d) + (1 | dept) fit(MixedModel, f, insteval; contrasts, progress) - end +end ``` In model `iem03` the effect of `service` is considered to be constant across departments and is modeled with a single fixed-effects parameter, which is the difference in a typical rating in a service course to a non-service course. @@ -693,12 +665,10 @@ shows the differences in precision due to different numbers of observations for #| fig-cap: "Histogram of the number of observations per instructor in the insteval data" #| label: fig-iednobshist #| warning: false -draw( - data(combine(groupby(DataFrame(insteval), :d), nrow => :n)) * - mapping(:n => "Number of observations") * - AlgebraOfGraphics.histogram(; bins=410); - figure=(; size=(600, 450)), -) +draw(data(combine(groupby(DataFrame(insteval), :d), nrow => :n)) * + mapping(:n => "Number of observations") * + AlgebraOfGraphics.histogram(; bins=410); + figure=(; size=(600, 450)),) ``` The precision of the conditional distributions of the random effects, as measured by the width of the intervals, varies considerably between instructors. @@ -706,10 +676,8 @@ The precision of the conditional distributions of the random effects, as measure We can determine that instructor `I1258` has the largest mean of the conditional distributions of the random effects ```{julia} -last( - sort(DataFrame(ranefinfotable(ranefinfo(iem01, :d))), :cmode), - 5, -) +last(sort(DataFrame(ranefinfotable(ranefinfo(iem01, :d))), :cmode), + 5) ``` but the conditional distribution of this random effect clearly overlaps significantly with others. @@ -734,8 +702,8 @@ For convenience, multiple, nested random-effects terms can be specified using a ```{julia} #| label: psm01alt print(let f = @formula(strength ~ 1 + (1 | batch / cask)) - fit(MixedModel, f, pastes; progress) -end) + fit(MixedModel, f, pastes; progress) + end) ``` We will avoid terms of this form, preferring instead an explicit specification with simple, scalar terms based on unambiguous grouping factors. 
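For reference, the `/` shorthand shown above expands to a term for the outer grouping factor plus one for its interaction with the nested factor, so the two formulas below should specify the same model (a sketch, not taken from the diff):

```julia
f_nested   = @formula(strength ~ 1 + (1 | batch / cask))
f_explicit = @formula(strength ~ 1 + (1 | batch) + (1 | batch & cask))
```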
diff --git a/src/EmbraceUncertainty.jl b/src/EmbraceUncertainty.jl index 4241f3c..978f552 100644 --- a/src/EmbraceUncertainty.jl +++ b/src/EmbraceUncertainty.jl @@ -19,7 +19,7 @@ const MMDS = String[] function __init__() CACHE[] = @get_scratch!("data") mkpath(CACHE[]) - append!(MMDS, MixedModelsDatasets.datasets()) + return append!(MMDS, MixedModelsDatasets.datasets()) end include("datasets.jl") @@ -38,7 +38,7 @@ function age_at_event(edate::TimeType, dob::TimeType) end export GENRES, - age_at_event, - tagpad - + age_at_event, + tagpad + end # module EmbraceUncertainty diff --git a/src/datasets.jl b/src/datasets.jl index 83654d4..ee9d82d 100644 --- a/src/datasets.jl +++ b/src/datasets.jl @@ -2,36 +2,31 @@ _file(x) = joinpath(CACHE[], string(x, ".arrow")) clear_scratchspaces!() = Scratch.clear_scratchspaces!(@__MODULE__) -const datasets = - CSV.read( - IOBuffer( -""" -dsname,filename,version,sha2 -box,tkxnh,1,4ac2038735c5286ca0d6a706b4feef5b34bd93560bc4cabc7223addf2366e4c0 -elstongrizzle,5vrbw,1,f33e08ad5a91ab5dd9889953dbb97cc52a299fdac9db8b37ec4f87ad2dacadbd -oxboys,cz6g3,1,079f46e404e43c1848a65d3a4e5b6a14b2cb17565c77819d7ee3effe72f5ebd0 -sizespeed,kazgm,2,d3512795afc101445fdd78e183919378ef7d993d66b4fd55307a0bf57fba1e0b -ELP_ldt_item,c6gxd,1,f851910e1435659ca662ad49cfb8deb6b7cf287e4ce4969103dba11b32ab2e6c -ELP_ldt_subj,rqenu,2,d9c88915681b64fc9db975f9bb2d6f402058fee5cb35887f9de7d07776efdd56 -ELP_ldt_trial,3evhy,2,57a83679f8f458b1d9bb56e05099a556ffdaf15af67464e9ee608c844fc4fa9c -movies,,1,0bdd5e9565328f07bbcfd83c634951ab2933fc160cc07e9bad8200b4c84d90ee -ratings,,1,a2187528c1c6b2a87b1cc09845c0a5591a2fb01088ed7173c187be4a9bee83e2 -fggk21,vwecy,1,0fa959f095f8b92135496b6f8c8a8b5a3e896e8875f0ba6928bd074559d8a796 -fggk21_Child,c2fmn,1,61c91e00336e6f804e9f6b86986ebb4a14561cc4908b3a21cb27c113d2b51a5c -fggk21_Score,7fqx3,1,99d73ee705aaf5f4ee696eadbba992d0113ba6f467ce337a62a63853e4617400 -kkl15,p8cea,2,90d7bb137c8613d7a15c8597c461aee7c7cb0f0989a07c80fc93e1fbe2e5c156 -kwdyz11,4cv52,3,2fa23aa8aa25e1adb10183c8d29646ae0d19d6baef9d711c9906f7fa1b225571 -""" - ), - Table; - downcast=true, - pool=false, - ) +const datasets = CSV.read(IOBuffer(""" + dsname,filename,version,sha2 + box,tkxnh,1,4ac2038735c5286ca0d6a706b4feef5b34bd93560bc4cabc7223addf2366e4c0 + elstongrizzle,5vrbw,1,f33e08ad5a91ab5dd9889953dbb97cc52a299fdac9db8b37ec4f87ad2dacadbd + oxboys,cz6g3,1,079f46e404e43c1848a65d3a4e5b6a14b2cb17565c77819d7ee3effe72f5ebd0 + sizespeed,kazgm,2,d3512795afc101445fdd78e183919378ef7d993d66b4fd55307a0bf57fba1e0b + ELP_ldt_item,c6gxd,1,f851910e1435659ca662ad49cfb8deb6b7cf287e4ce4969103dba11b32ab2e6c + ELP_ldt_subj,rqenu,2,d9c88915681b64fc9db975f9bb2d6f402058fee5cb35887f9de7d07776efdd56 + ELP_ldt_trial,3evhy,2,57a83679f8f458b1d9bb56e05099a556ffdaf15af67464e9ee608c844fc4fa9c + movies,,1,0bdd5e9565328f07bbcfd83c634951ab2933fc160cc07e9bad8200b4c84d90ee + ratings,,1,a2187528c1c6b2a87b1cc09845c0a5591a2fb01088ed7173c187be4a9bee83e2 + fggk21,vwecy,1,0fa959f095f8b92135496b6f8c8a8b5a3e896e8875f0ba6928bd074559d8a796 + fggk21_Child,c2fmn,1,61c91e00336e6f804e9f6b86986ebb4a14561cc4908b3a21cb27c113d2b51a5c + fggk21_Score,7fqx3,1,99d73ee705aaf5f4ee696eadbba992d0113ba6f467ce337a62a63853e4617400 + kkl15,p8cea,2,90d7bb137c8613d7a15c8597c461aee7c7cb0f0989a07c80fc93e1fbe2e5c156 + kwdyz11,4cv52,3,2fa23aa8aa25e1adb10183c8d29646ae0d19d6baef9d711c9906f7fa1b225571 + """), + Table; + downcast=true, + pool=false,) if @isdefined(_cacheddatasets) empty!(_cacheddatasets) # start from an empty cache in case datasets has changed else - const 
_cacheddatasets = Dict{Symbol, Arrow.Table}() + const _cacheddatasets = Dict{Symbol,Arrow.Table}() end """ @@ -58,10 +53,9 @@ function dataset(nm::AbstractString) if ismissing(row.filename) load_quiver() # special-case `ratings` and `movies` else - Downloads.download( - string("https://osf.io/", row.filename, "/download?version=", row.version), - fnm, - ) + Downloads.download(string("https://osf.io/", row.filename, + "/download?version=", row.version), + fnm) end end if row.sha2 ≠ bytes2hex(open(sha2_256, fnm)) diff --git a/src/movielens.jl b/src/movielens.jl index cc3deb6..cc10957 100644 --- a/src/movielens.jl +++ b/src/movielens.jl @@ -31,31 +31,25 @@ function load_quiver() open(Downloads.download(ML_25M_URL), "r") do io zipfile = ZipFile.Reader(io) @info "Extracting and saving ratings" - ratings = DataFrame( - extract_csv( - zipfile, - "ratings.csv"; - types=[Int32, Int32, Float32, Int32], - pool=[true, true, true, false], - ) - ) + ratings = DataFrame(extract_csv(zipfile, + "ratings.csv"; + types=[Int32, Int32, Float32, Int32], + pool=[true, true, true, false],)) ratings.movieId = tagpad(ratings.movieId, "M") ratings.userId = tagpad(ratings.userId, "U") push!(quiver, create_arrow("ratings.csv", ratings)) @info "Extracting movies that are in the ratings table" - movies = extract_csv(zipfile, "movies.csv"; types=[Int32,String,String], pool=false) + movies = extract_csv(zipfile, "movies.csv"; types=[Int32, String, String], + pool=false) movies.movieId = tagpad(movies.movieId, "M") - links = extract_csv(zipfile, "links.csv"; types=[Int32,Int32,Int32]) + links = extract_csv(zipfile, "links.csv"; types=[Int32, Int32, Int32]) links.movieId = tagpad(links.movieId, "M") - movies = leftjoin!( - leftjoin!( - sort!(combine(groupby(ratings, :movieId), nrow => :nrtngs), :nrtngs; rev=true), - movies, - on=:movieId, - ), - links; - on=:movieId, - ) + movies = leftjoin!(leftjoin!(sort!(combine(groupby(ratings, :movieId), + nrow => :nrtngs), :nrtngs; rev=true), + movies; + on=:movieId), + links; + on=:movieId,) disallowmissing!(movies; error=false) movies.nrtngs = Int32.(movies.nrtngs) for g in GENRES @@ -66,7 +60,7 @@ function load_quiver() @info "Extracting and saving README" readme = only(filter(f -> endswith(f.name, "README.txt"), zipfile.files)) open(joinpath(CACHE[], "README.txt"), "w") do io - write(io, read(readme)) + return write(io, read(readme)) end return nothing diff --git a/src/tagpad.jl b/src/tagpad.jl index 1aa94e8..a3da40e 100644 --- a/src/tagpad.jl +++ b/src/tagpad.jl @@ -15,10 +15,11 @@ The single-argument version, e.g. `tagpad(:I)`, returns a partially-applied func in a `transform` or `select` call. ```@example -show(tagpad(repeat(1:10, inner=2))) +show(tagpad(repeat(1:10; inner=2))) ``` """ -function tagpad(v::AbstractVector{<:Integer}, ndig::Integer, tag::AbstractString="S"; pool::Bool=true) +function tagpad(v::AbstractVector{<:Integer}, ndig::Integer, tag::AbstractString="S"; + pool::Bool=true) tagged = string.(tag, lpad.(v, ndig, '0')) return pool ? 
PooledArray(tagged; signed=true, compress=true) : tagged end diff --git a/test/runtests.jl b/test/runtests.jl new file mode 100644 index 0000000..fed2815 --- /dev/null +++ b/test/runtests.jl @@ -0,0 +1,12 @@ +using Aqua +using EmbraceUncertainty +using Test +using TestSetExtensions + +@testset ExtendedTestSet "EmbraceUncertainty" begin + @testset "Aqua" begin + # we can't test stale deps because the dependencies aren't + # just for the package, but also for entire book + Aqua.test_all(EmbraceUncertainty; ambiguities=false, stale_deps=false) + end +end
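Based on the signature in the `src/tagpad.jl` hunk above, `tagpad` usage looks like this, with the expected results shown as comments (assuming `EmbraceUncertainty` is loaded):

```julia
tagpad([1, 2, 10], 3, "S")              # PooledArray: ["S001", "S002", "S010"]
tagpad([1, 2, 10], 3, "S"; pool=false)  # plain Vector{String}
```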