diff --git a/Project.toml b/Project.toml index f3ba6d17..b98134c4 100644 --- a/Project.toml +++ b/Project.toml @@ -6,6 +6,7 @@ version = "0.7.1" [deps] Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" Folds = "41a02a25-b8f0-4f67-bc48-60067656b558" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" IrrationalConstants = "92d709cd-6900-40b7-9082-c6be49f344b6" @@ -38,6 +39,7 @@ TuringExt = ["DynamicPPL", "MCMCChains", "Turing"] [compat] Accessors = "0.1" Distributions = "0.25" +DocStringExtensions = "0.9" DynamicPPL = "0.20, 0.21" Folds = "0.2" ForwardDiff = "0.10" diff --git a/src/Pathfinder.jl b/src/Pathfinder.jl index 912558eb..bc78faaa 100644 --- a/src/Pathfinder.jl +++ b/src/Pathfinder.jl @@ -1,6 +1,7 @@ module Pathfinder using Distributions: Distributions +using DocStringExtensions using Folds: Folds # ensure that ForwardDiff is conditionally loaded by Optimization using ForwardDiff: ForwardDiff @@ -36,6 +37,8 @@ end include("transducers.jl") include("woodbury.jl") +include("trace.jl") +include("callbacks.jl") include("optimize.jl") include("inverse_hessian.jl") include("mvnormal.jl") diff --git a/src/callbacks.jl b/src/callbacks.jl new file mode 100644 index 00000000..c3e67813 --- /dev/null +++ b/src/callbacks.jl @@ -0,0 +1,175 @@ +# utility macro to annotate a field as const only if supported +@eval macro $(Symbol("const"))(x) + if VERSION ≥ v"1.8" + Expr(:const, esc(x)) + else + esc(x) + end +end + +# callbacks for Optimization.jl + +""" +$(TYPEDEF) + +Abstract type for Optimization.jl callbacks. + +A callback should be a callable with the signature + + (x, f(x), args...) -> Bool + +where `x` is the parameter being optimized, and `f` is the objective function. A return +value of `true` signals that optimization should stop. + +See the [Optimization.jl docs](https://docs.sciml.ai/Optimization/stable/API/solve/) for +more information. +""" +abstract type AbstractOptimizationCallback end + +""" +$(TYPEDEF) + +A sequence of Optimization.jl callbacks to be executed in order. +""" +struct CallbackSequence{C<:Tuple} <: AbstractOptimizationCallback + "Tuple of Optimization.jl callbacks to be called in order" + callbacks::C +end + +""" +$(SIGNATURES) + +Wrap the sequence `callbacks`. +""" +CallbackSequence(callbacks...) = CallbackSequence(callbacks) + +function (callback::CallbackSequence)(args...) + return mapfoldl(|, callback.callbacks; init=false) do cb + cb === nothing && return false + return cb(args...) + end +end + +""" +$(TYPEDEF) + +A callback that signals termination if the objective value is non-finite and `fail=true`. +""" +struct CheckFiniteValueCallback <: AbstractOptimizationCallback + "Whether to raise an error if the objective function is non-finite" + fail::Bool +end + +function (cb::CheckFiniteValueCallback)(x, fx, args...) + return cb.fail && (isnan(fx) || fx == -Inf) +end + +""" +$(TYPEDEF) + +A callback to fill an [`OptimizationTrace`](@ref) + +!!! note + Optimization doesn't have an interface for accessing the gradient trace, so this + callback recomputes the gradient. + +# Fields + +$(FIELDS) +""" +struct FillTraceCallback{G,T<:OptimizationTrace} <: AbstractOptimizationCallback + "A function to compute the gradient of the objective function" + grad::G + "An `OptimizationTrace` with empty vectors to be filled." + trace::T +end + +function (cb::FillTraceCallback)(x, fx, args...) 
+    # NOTE: Optimization doesn't have an interface for accessing the gradient trace,
+    # so we need to recompute it ourselves
+    # see https://github.com/SciML/Optimization.jl/issues/149
+    ∇fx = cb.grad(x)
+    rmul!(∇fx, -1)
+
+    trace = cb.trace
+    # some backends mutate x, so we must copy it
+    push!(trace.points, copy(x))
+    push!(trace.log_densities, -fx)
+    push!(trace.gradients, ∇fx)
+    return false
+end
+
+"""
+$(TYPEDEF)
+
+A callback to log progress with a `reporter`.
+
+# Fields
+
+$(FIELDS)
+"""
+Base.@kwdef mutable struct ProgressCallback{R,U} <: AbstractOptimizationCallback
+    "Reporter function, called with signature `report(progress_id, maxiters, try_id, iter_id)`"
+    @const reporter::R
+    "An identifier for the progress bar."
+    @const progress_id::U
+    "Maximum number of iterations"
+    @const maxiters::Int
+    "Index of the current try"
+    try_id::Int
+    "Index of the current iteration"
+    iter_id::Int
+end
+
+function (cb::ProgressCallback)(args...)
+    reporter = cb.reporter
+    reporter === nothing || reporter(cb.progress_id, cb.maxiters, cb.try_id, cb.iter_id)
+    cb.iter_id += 1
+    return false
+end
+
+"""
+$(SIGNATURES)
+
+Report progress using ProgressLogging.jl.
+"""
+function report_progress(progress_id, maxiters::Int, try_id::Int, iter_id::Int)
+    Base.@logmsg ProgressLogging.ProgressLevel "Optimizing (try $(try_id))" progress =
+        iter_id / maxiters _id = progress_id
+    return nothing
+end
+
+# callbacks for Optim.jl
+
+"""
+$(TYPEDEF)
+
+Abstract type for Optim.jl callbacks.
+
+A callback should be a callable with the signature
+
+    (states::Vector{<:AbstractOptimizerState}) -> Bool
+
+where `states` is a vector of the optimizer states from each iteration so far. A return
+value of `true` signals that optimization should stop.
+"""
+abstract type AbstractOptimJLCallback end
+
+"""
+$(TYPEDEF)
+
+Adaptor for an Optimization.jl callback to be an Optim.jl callback.
+"""
+struct OptimJLCallbackAdaptor{C} <: AbstractOptimJLCallback
+    "An Optimization.jl callback to be called."
+    callback::C
+end
+
+function (cb::OptimJLCallbackAdaptor)(states)
+    state = states[end]
+    md = state.metadata
+    x = md["x"]
+    fx = state.value
+    return cb.callback(x, fx, md["g(x)"])
+end
+(::OptimJLCallbackAdaptor{Nothing})(states) = false
diff --git a/src/multipath.jl b/src/multipath.jl
index 150b6905..6479cdd8 100644
--- a/src/multipath.jl
+++ b/src/multipath.jl
@@ -123,14 +123,23 @@ for approximating expectations with respect to ``p``.
 """
 function multipathfinder end
 
-function multipathfinder(ℓ, ndraws::Int; input=ℓ, kwargs...)
+function multipathfinder(
+    ℓ,
+    ndraws::Int;
+    input=ℓ,
+    history_length::Int=DEFAULT_HISTORY_LENGTH,
+    optimizer=default_optimizer(history_length),
+    kwargs...,
+)
     _check_log_density_problem(ℓ)
     dim = LogDensityProblems.dimension(ℓ)
-    optim_fun = build_optim_function(ℓ)
-    return multipathfinder(optim_fun, ndraws; input, dim, kwargs...)
+    optim_fun = build_optim_function(ℓ, optimizer)
+    return multipathfinder(
+        optim_fun, ndraws; input, dim, history_length, optimizer, kwargs...
+ ) end function multipathfinder( - optim_fun::SciMLBase.OptimizationFunction, + optim_fun::Union{SciMLBase.OptimizationFunction,OptimJLFunction}, ndraws::Int; init=nothing, input=optim_fun, @@ -145,9 +154,8 @@ function multipathfinder( importance::Bool=true, kwargs..., ) - if optim_fun.grad === nothing || optim_fun.grad isa Bool + _defines_gradient(optim_fun) || throw(ArgumentError("optimization function must define a gradient function.")) - end if init === nothing nruns > 0 || throw( ArgumentError("A positive `nruns` must be set or `init` must be provided.") @@ -159,7 +167,7 @@ function multipathfinder( if ndraws > ndraws_per_run * nruns @warn "More draws requested than total number of draws across replicas. Draws will not be unique." end - logp(x) = -optim_fun.f(x, nothing) + logp = get_logp(optim_fun) # run pathfinder independently from each starting point trans = Transducers.Map() do (init_i) diff --git a/src/optimize.jl b/src/optimize.jl index db1c4f11..23d0c159 100644 --- a/src/optimize.jl +++ b/src/optimize.jl @@ -1,4 +1,95 @@ -function build_optim_function(ℓ) +""" +$(TYPEDEF) + +A callable that wraps a LogDensityProblem to be an `fgh!` callable for Optim.jl. + +See the [Optim.jl docs](https://julianlsolvers.github.io/Optim.jl/stable/#user/tipsandtricks/#avoid-repeating-computations) +for details. + +# Fields + +$(TYPEDFIELDS) +""" +struct OptimJLFunction{P} + "An object that implements the LogDensityProblem interface" + prob::P +end + +# avoid repeated computation by computing the highest order derivative required and not +# recomputing the lower order ones +function (fun::OptimJLFunction)(F, G, H, x) + prob = fun.prob + if H !== nothing + lp, glp, Hlp = LogDensityProblems.logdensity_gradient_and_hessian(prob, x) + @. H = -Hlp + if G !== nothing + @. G = -glp + end + F === nothing || return -lp + elseif G !== nothing + lp, glp = LogDensityProblems.logdensity_and_gradient(prob, x) + @. G = -glp + F === nothing || return -lp + elseif F !== nothing + return -LogDensityProblems.logdensity(prob, x) + end + return nothing +end + +""" +$(TYPEDEF) + +A utility object to mimic a `SciMLBase.OptimizationProblem` for use with Optim.jl. + +# Fields + +$(TYPEDFIELDS) +""" +struct OptimJLProblem{F<:OptimJLFunction,U<:AbstractVector{<:Real}} + "An optimization function." + fun::F + "Initial point" + u0::U +end + +function _defines_gradient(fun::SciMLBase.OptimizationFunction) + return fun.grad !== nothing && !(fun.grad isa Bool) +end +_defines_gradient(prob::SciMLBase.OptimizationProblem) = _defines_gradient(prob.f) +_defines_gradient(::Any) = true + +""" +$(SIGNATURES) + +Construct a log-density function with signature `x -> Real` from an optimization function. +""" +get_logp(fun::SciMLBase.OptimizationFunction) = Base.Fix2((-) ∘ fun.f, nothing) +get_logp(fun::OptimJLFunction) = Base.Fix1(LogDensityProblems.logdensity, fun.prob) + +""" +$(SIGNATURES) + +Construct a log-density function with signature `x -> Real` from an optimization problem. +""" +get_logp(prob::SciMLBase.OptimizationProblem) = get_logp(prob.f) +get_logp(prob::OptimJLProblem) = get_logp(prob.fun) + +""" +$(SIGNATURES) + +Build an optimization function from the LogDensityProblem `ℓ`. + +The type of the returned object is determined by `optimizer`, either an +[`OptimJLFunction`](@ref) or a `SciMLBase.OptimizationFunction`. 
+""" +build_optim_function(ℓ, optimizer) = _build_sciml_optim_function(ℓ) +function build_optim_function( + ℓ, ::Union{Optim.FirstOrderOptimizer,Optim.SecondOrderOptimizer} +) + return OptimJLFunction(ℓ) +end + +function _build_sciml_optim_function(ℓ) f(x, p) = -LogDensityProblems.logdensity(ℓ, x) function grad(res, x, p) _, ∇fx = LogDensityProblems.logdensity_and_gradient(ℓ, x) @@ -13,14 +104,36 @@ function build_optim_function(ℓ) return SciMLBase.OptimizationFunction{true}(f; grad, hess) end +""" +$(SIGNATURES) + +Build an optimization problem from the LogDensityProblem `ℓ` and initial point `x₀`. + +The type of the returned object is determined by `optimizer`, either an +[`OptimJLProbkem`](@ref) or a `SciMLBase.OptimizationProblem`. +""" function build_optim_problem(optim_fun, x₀) return SciMLBase.OptimizationProblem(optim_fun, x₀, nothing) end +function build_optim_problem(optim_fun::OptimJLFunction, x₀) + return OptimJLProblem(optim_fun, x₀) +end +""" +$(SIGNATURES) + +# Returns + +- `sol`: The optimization solution, either a `SciMLBase.OptimizationSolution` or a + `Optim.MultivariateOptimizationResults` +- `trace::OptimizationTrace`: the optimization trace, where the first point is the initial + one. +""" function optimize_with_trace( - prob, + prob::SciMLBase.OptimizationProblem, optimizer; - progress_name="Optimizing", + reporter=report_progress, + try_id=1, progress_id=nothing, maxiters=1_000, callback=nothing, @@ -29,59 +142,66 @@ function optimize_with_trace( ) u0 = prob.u0 fun = prob.f - function ∇f(x) + + function grad(x) SciMLBase.isinplace(fun) || return fun.grad(x, nothing) res = similar(x) fun.grad(res, x, nothing) - rmul!(res, -1) return res end - # caches for the trace of x and f(x) + + # allocate containers for the trace of x and f(x) xs = typeof(u0)[] fxs = typeof(fun.f(u0, nothing))[] ∇fxs = typeof(u0)[] - _callback = _make_optimization_callback( - xs, fxs, ∇fxs, ∇f; progress_name, progress_id, maxiters, callback, fail_on_nonfinite + trace = OptimizationTrace(xs, fxs, ∇fxs) + + _callback = CallbackSequence( + callback, + ProgressCallback(; reporter, progress_id, try_id, maxiters, iter_id=0), + CheckFiniteValueCallback(fail_on_nonfinite), + FillTraceCallback(grad, trace), ) sol = Optimization.solve(prob, optimizer; callback=_callback, maxiters, kwargs...) - return sol, OptimizationTrace(xs, fxs, ∇fxs) + return sol, trace end - -function _make_optimization_callback( - xs, fxs, ∇fxs, ∇f; progress_name, progress_id, maxiters, callback, fail_on_nonfinite +function optimize_with_trace( + prob::OptimJLProblem, + optimizer::Union{Optim.FirstOrderOptimizer,Optim.SecondOrderOptimizer}; + reporter=report_progress, + try_id=1, + progress_id=nothing, + maxiters=1_000, + callback=nothing, + fail_on_nonfinite=true, + kwargs..., ) - return function (x, nfx, args...) - ret = callback !== nothing && callback(x, nfx, args...) 
- iteration = length(xs) - Base.@logmsg ProgressLogging.ProgressLevel progress_name progress = - iteration / maxiters _id = progress_id - - # some backends mutate x, so we must copy it - push!(xs, copy(x)) - push!(fxs, -nfx) - # NOTE: Optimization doesn't have an interface for accessing the gradient trace, - # so we need to recompute it ourselves - # see https://github.com/SciML/Optimization.jl/issues/149 - ∇fx = ∇f(x) - push!(∇fxs, ∇fx) - - if fail_on_nonfinite && !ret - ret = (isnan(nfx) || nfx == -Inf || any(!isfinite, ∇fx))::Bool - end - - return ret - end -end + _callback = OptimJLCallbackAdaptor( + CallbackSequence( + callback, + ProgressCallback(; reporter, progress_id, try_id, maxiters, iter_id=0), + CheckFiniteValueCallback(fail_on_nonfinite), + ), + ) + options = Optim.Options(; + callback=_callback, + store_trace=true, + extended_trace=true, + iterations=maxiters, + kwargs..., + ) + result = Optim.optimize(Optim.only_fgh!(prob.fun), prob.u0, optimizer, options) -struct OptimizationTrace{P,L} - points::P - log_densities::L - gradients::P -end + u0 = prob.u0 + xtrace = Optim.x_trace(result) + iterations = min(length(xtrace) - 1, Optim.iterations(result)) -Base.length(trace::OptimizationTrace) = length(trace.points) + # containers for the trace of x and ∇f(x) + xs = Vector{typeof(u0)}(undef, iterations + 1) + ∇fxs = Vector{typeof(u0)}(undef, iterations + 1) -function Base.show(io::IO, trace::OptimizationTrace) - print(io, "OptimizationTrace with $(length(trace) - 1) iterations") - return nothing + copyto!(xs, xtrace) + fxs = -Optim.f_trace(result) + map!(tr -> -tr.metadata["g(x)"], ∇fxs, Optim.trace(result)) + return result, OptimizationTrace(xs, fxs, ∇fxs) end diff --git a/src/singlepath.jl b/src/singlepath.jl index 4603a2c3..4096c156 100644 --- a/src/singlepath.jl +++ b/src/singlepath.jl @@ -21,7 +21,8 @@ Container for results of single-path Pathfinder. - `draws_transformed`: `draws` transformed to be draws from `fit_distribution_transformed`. - `fit_iteration::Int`: Iteration at which ELBO estimate was maximized - `num_tries::Int`: Number of tries until Pathfinder succeeded -- `optim_solution::SciMLBase.OptimizationSolution`: Solution object of optimization. +- `optim_solution::Union{Optim.MultivariateOptimizationResults,SciMLBase.OptimizationSolution}`: + Solution object of optimization. - `optim_trace::Pathfinder.OptimizationTrace`: container for optimization trace of points, log-density, and gradient. The first point is the initial point. - `fit_distributions::AbstractVector{Distributions.MvNormal}`: Multivariate normal @@ -92,7 +93,8 @@ constructed using at most the previous `history_length` steps. [Optimization.jl: OptimizationFunction](https://optimization.sciml.ai/stable/API/optimization_function/). - `prob::SciMLBase.OptimizationProblem`: an optimization problem containing a function with the same properties as `fun`, as well as an initial point, in which case `init` and - `dim` are ignored. + `dim` are ignored. If `ntries > 1`, then the initial point `prob.u0` must also be + mutable and may be overwritten. # Keywords - `dim::Int`: dimension of the target distribution, needed only if `fun` is provided and @@ -132,14 +134,20 @@ constructed using at most the previous `history_length` steps. """ function pathfinder end -function pathfinder(ℓ; input=ℓ, kwargs...) 
+function pathfinder( + ℓ; + input=ℓ, + history_length::Int=DEFAULT_HISTORY_LENGTH, + optimizer=default_optimizer(history_length), + kwargs..., +) _check_log_density_problem(ℓ) dim = LogDensityProblems.dimension(ℓ) - optim_fun = build_optim_function(ℓ) - return pathfinder(optim_fun; input, dim, kwargs...) + optim_fun = build_optim_function(ℓ, optimizer) + return pathfinder(optim_fun; input, dim, history_length, optimizer, kwargs...) end function pathfinder( - optim_fun::SciMLBase.OptimizationFunction; + optim_fun::Union{SciMLBase.OptimizationFunction,OptimJLFunction}; rng=Random.GLOBAL_RNG, init=nothing, dim::Int=-1, @@ -149,20 +157,19 @@ function pathfinder( kwargs..., ) if init !== nothing - _init = init - allow_mutating_init = false + _init = similar(init) + copyto!(_init, init) elseif init === nothing && dim > 0 _init = Vector{Float64}(undef, dim) init_sampler(rng, _init) - allow_mutating_init = true else throw(ArgumentError("An initial point `init` or dimension `dim` must be provided.")) end prob = build_optim_problem(optim_fun, _init) - return pathfinder(prob; rng, input, init_sampler, allow_mutating_init, kwargs...) + return pathfinder(prob; rng, input, init_sampler, kwargs...) end function pathfinder( - prob::SciMLBase.OptimizationProblem; + prob::Union{<:SciMLBase.OptimizationProblem,<:OptimJLProblem}; rng::Random.AbstractRNG=Random.GLOBAL_RNG, history_length::Int=DEFAULT_HISTORY_LENGTH, optimizer=default_optimizer(history_length), @@ -171,10 +178,9 @@ function pathfinder( input=prob, kwargs..., ) - if prob.f.grad === nothing || prob.f.grad isa Bool + _defines_gradient(prob) || throw(ArgumentError("optimization function must define a gradient function.")) - end - logp(x) = -prob.f.f(x, nothing) + logp = get_logp(prob) path_result = ProgressLogging.progress(; name="Optimizing") do progress_id return _pathfinder_try_until_succeed( rng, @@ -188,7 +194,7 @@ function pathfinder( ) end @unpack ( - itry, + try_id, success, optim_prob, optim_solution, @@ -201,7 +207,7 @@ function pathfinder( if !success ndraws_elbo_actual = 0 - @warn "Pathfinder failed after $itry tries. Increase `ntries`, inspect the model for numerical instability, or provide a more suitable `init_sampler`." + @warn "Pathfinder failed after $try_id tries. Increase `ntries`, inspect the model for numerical instability, or provide a more suitable `init_sampler`." else elbo_estimate_opt = elbo_estimates[fit_iteration] ndraws_elbo_actual = ndraws_elbo @@ -238,7 +244,7 @@ function pathfinder( fit_distribution_transformed, draws_transformed, fit_iteration, - itry, + try_id, optim_solution, optim_trace, fit_distributions, @@ -254,23 +260,16 @@ function _pathfinder_try_until_succeed( ntries::Int=1_000, init_scale=2, init_sampler=UniformSampler(init_scale), - allow_mutating_init::Bool=false, kwargs..., ) - itry = 1 - progress_name = "Optimizing (try 1)" - result = _pathfinder(rng, prob, logp; progress_name, kwargs...) - _prob = prob - while !result.success && itry < ntries - if itry == 1 && !allow_mutating_init - _prob = deepcopy(prob) - end - itry += 1 - init_sampler(rng, _prob.u0) - progress_name = "Optimizing (try $itry)" - result = _pathfinder(rng, _prob, logp; progress_name, kwargs...) + try_id = 1 + result = _pathfinder(rng, prob, logp; try_id, kwargs...) + while !result.success && try_id < ntries + try_id += 1 + init_sampler(rng, prob.u0) + result = _pathfinder(rng, prob, logp; try_id, kwargs...) end - return (; itry, optim_prob=_prob, result...) + return (; try_id, optim_prob=prob, result...) 
 end
 
 function _pathfinder(
diff --git a/src/trace.jl b/src/trace.jl
new file mode 100644
index 00000000..1b58aace
--- /dev/null
+++ b/src/trace.jl
@@ -0,0 +1,24 @@
+"""
+$(TYPEDEF)
+
+A container for the trajectory of points and values computed during optimization.
+
+# Fields
+
+$(FIELDS)
+"""
+struct OptimizationTrace{P,L}
+    "Points in the optimization trajectory"
+    points::P
+    "Log-density (negative objective function) values at `points`"
+    log_densities::L
+    "Gradients of the log-density at `points`"
+    gradients::P
+end
+
+Base.length(trace::OptimizationTrace) = length(trace.points)
+
+function Base.show(io::IO, trace::OptimizationTrace)
+    print(io, "OptimizationTrace with $(length(trace) - 1) iterations")
+    return nothing
+end
diff --git a/test/callbacks.jl b/test/callbacks.jl
new file mode 100644
index 00000000..26134c75
--- /dev/null
+++ b/test/callbacks.jl
@@ -0,0 +1,126 @@
+using ForwardDiff, Optim, Pathfinder, ProgressLogging, Test, UUIDs
+
+include("test_utils.jl")
+
+struct DummyCallback{C,D}
+    cond1::C
+    cond2::D
+end
+
+function (cb::DummyCallback)(x, fx, args...)
+    return cb.cond1(x) || cb.cond2(fx)
+end
+
+@testset "callbacks" begin
+    @testset "CallbackSequence" begin
+        cb = Pathfinder.CallbackSequence(
+            DummyCallback(iszero, iszero), DummyCallback(isnan, Base.Fix1(any, isnan))
+        )
+        @inferred cb(1.0, [2.0, 3.0], nothing)
+        @test !cb(1.0, [2.0, 3.0], nothing)
+        @test cb(0.0, [2.0, 3.0], nothing)
+        @test cb(1.0, [0.0, 0.0])
+        @test cb(NaN, [1.0, 2.0])
+        @test cb(1.0, [NaN, 2.0])
+    end
+
+    @testset "CheckFiniteValueCallback" begin
+        x = randn(3)
+        cb = Pathfinder.CheckFiniteValueCallback(true)
+        @inferred cb(x, 1.0, nothing)
+        @test !cb(x, 1.0)
+        @test !cb(x, 2.0)
+        @test cb(x, NaN)
+        @test cb(x, -Inf)
+
+        cb2 = Pathfinder.CheckFiniteValueCallback(false)
+        @inferred cb2(x, 1.0, nothing)
+        @test !cb2(x, 1.0)
+        @test !cb2(x, 2.0)
+        @test !cb2(x, NaN)
+        @test !cb2(x, -Inf)
+    end
+
+    @testset "FillTraceCallback" begin
+        trace = Pathfinder.OptimizationTrace(
+            Vector{Float64}[], Float64[], Vector{Float64}[]
+        )
+        grad(x) = x / 4
+        cb = Pathfinder.FillTraceCallback(grad, trace)
+        xs = [randn(10) for _ in 1:3]
+        fxs = rand(3)
+        for i in eachindex(xs, fxs)
+            @test !cb(xs[i], fxs[i], nothing)
+            @test length(trace.points) ==
+                length(trace.log_densities) ==
+                length(trace.gradients) ==
+                i
+            @test trace.points[i] == xs[i]
+            @test trace.log_densities[i] == -fxs[i]
+            @test trace.gradients[i] == -grad(xs[i])
+        end
+    end
+
+    @testset "ProgressCallback" begin
+        @testset for maxiters in [10, 100], try_id in 1:2
+            progress_id = UUIDs.uuid1()
+            progress_trace = []
+            reporter = function (progress_id, maxiters, try_id, iter_id)
+                push!(progress_trace, (progress_id, maxiters, try_id, iter_id))
+                return nothing
+            end
+            cb = Pathfinder.ProgressCallback(;
+                reporter, progress_id, maxiters, try_id, iter_id=0
+            )
+            @testset for i in 1:3
+                @test !cb(randn(10), rand(), nothing)
+                @test length(progress_trace) == i
+                @test progress_trace[i] == (progress_id, maxiters, try_id, i - 1)
+            end
+            cb2 = Pathfinder.ProgressCallback(;
+                reporter=nothing, progress_id, maxiters, try_id, iter_id=0
+            )
+            @test !cb2(randn(10), rand(), nothing)
+        end
+    end
+
+    @testset "report_progress" begin
+        @testset for maxiters in (10, 100), try_id in (1, 2)
+            logs, = Test.collect_test_logs(; min_level=ProgressLogging.ProgressLevel) do
+                ProgressLogging.progress(; name="Optimizing") do progress_id
+                    for iter_id in 0:maxiters
+                        Pathfinder.report_progress(progress_id, maxiters, try_id, iter_id)
+                    end
+                end
+            end
+            @test length(logs) == maxiters 
+ 3 + @test logs[1].kwargs[:progress] === nothing + @test logs[1].message == "Optimizing" + for i in 0:maxiters + @test logs[i + 2].kwargs[:progress] == i / maxiters + @test logs[i + 2].message == "Optimizing (try $try_id)" + end + @test logs[maxiters + 3].kwargs[:progress] == "done" + end + end + + @testset "OptimJLCallback" begin + f(x) = -logp_banana(x) + grad(x) = ForwardDiff.gradient(f, x) + trace = Pathfinder.OptimizationTrace( + Vector{Float64}[], Float64[], Vector{Float64}[] + ) + callback = Pathfinder.OptimJLCallbackAdaptor( + Pathfinder.FillTraceCallback(grad, trace) + ) + x0 = randn(2) + options = Optim.Options(; callback, store_trace=true, extended_trace=true) + sol = Optim.optimize(f, x0, LBFGS(), options) + @test trace.points == Optim.x_trace(sol) + @test trace.log_densities == -Optim.f_trace(sol) + @test trace.gradients ≈ -[t.metadata["g(x)"] for t in sol.trace] + + cb2 = Pathfinder.OptimJLCallbackAdaptor(nothing) + @test !cb2([]) + end +end diff --git a/test/multipath.jl b/test/multipath.jl index ace61956..ec1ed0ee 100644 --- a/test/multipath.jl +++ b/test/multipath.jl @@ -37,7 +37,7 @@ include("test_utils.jl") ) @test result isa MultiPathfinderResult @test result.input === ℓ - @test result.optim_fun isa SciMLBase.OptimizationFunction + @test result.optim_fun isa Pathfinder.OptimJLFunction @test result.rng === rng @test result.optimizer === Pathfinder.default_optimizer(Pathfinder.DEFAULT_HISTORY_LENGTH) diff --git a/test/optimize.jl b/test/optimize.jl index e4785677..d7cc78b9 100644 --- a/test/optimize.jl +++ b/test/optimize.jl @@ -1,4 +1,5 @@ using LinearAlgebra +using LogDensityProblems using Optim using OptimizationNLopt using Pathfinder @@ -12,68 +13,48 @@ include("test_utils.jl") n = 20 ℓ = build_logdensityproblem(logp_banana, n) x = randn(n) - fun = Pathfinder.build_optim_function(ℓ) - @test fun isa SciMLBase.OptimizationFunction - @test SciMLBase.isinplace(fun) - @test fun.f(x, nothing) ≈ -ℓ.logp(x) - ∇fx = similar(x) - fun.grad(∇fx, x, nothing) - @test ∇fx ≈ -ℓ.∇logp(x) - H = similar(x, n, n) - fun.hess(H, x, nothing) - @test H ≈ -ℓ.∇²logp(x) + + @testset for optimizer in (Optim.LBFGS(), Optim.Newton()) + fun = @inferred Pathfinder.build_optim_function(ℓ, optimizer) + @test fun isa Pathfinder.OptimJLFunction + @test fun.prob === ℓ + end + + @testset for optimizer in (NLopt.Opt(:LD_LBFGS, n),) + fun = @inferred Pathfinder.build_optim_function(ℓ, optimizer) + @test fun isa SciMLBase.OptimizationFunction + @test SciMLBase.isinplace(fun) + @test fun.f(x, nothing) ≈ -ℓ.logp(x) + ∇fx = similar(x) + fun.grad(∇fx, x, nothing) + @test ∇fx ≈ -ℓ.∇logp(x) + H = similar(x, n, n) + fun.hess(H, x, nothing) + @test H ≈ -ℓ.∇²logp(x) + end end @testset "build_optim_problem" begin n = 20 ℓ = build_logdensityproblem(logp_banana, n) x0 = randn(n) - fun = Pathfinder.build_optim_function(ℓ) - prob = Pathfinder.build_optim_problem(fun, x0) - @test SciMLBase.isinplace(prob) - @test prob isa SciMLBase.OptimizationProblem - @test prob.f === fun - @test prob.u0 == x0 - @test prob.p === nothing -end -@testset "_make_optimization_callback" begin - @testset "callback return value" begin - progress_name = "Optimizing" - progress_id = nothing - maxiters = 1_000 - x = randn(5) - check_vals = [0.0, NaN, -Inf, Inf] - @testset for fail_on_nonfinite in [true, false], - fval in check_vals, - gval in check_vals, - cbfail in [true, false] + @testset "OptimJLFunction" begin + fun = Pathfinder.OptimJLFunction(ℓ) + prob = @inferred Pathfinder.build_optim_problem(fun, x0) + @test prob isa 
Pathfinder.OptimJLProblem + @test prob.fun === fun + @test prob.u0 === x0 + end - xs = Vector{Float64}[] - fxs = Float64[] - ∇fxs = Vector{Float64}[] - ∇f = function (x) - g = -x - g[end] = gval - return g - end - callback = (x, fx, args...) -> cbfail - cb = Pathfinder._make_optimization_callback( - xs, - fxs, - ∇fxs, - ∇f; - progress_name, - progress_id, - maxiters, - callback, - fail_on_nonfinite, - ) - should_fail = - cbfail || - (fail_on_nonfinite && (isnan(fval) || fval == Inf || !isfinite(gval))) - @test cb(x, -fval) == should_fail - end + @testset "SciML.OptimizationFunction" begin + fun = Pathfinder._build_sciml_optim_function(ℓ) + prob = Pathfinder.build_optim_problem(fun, x0) + @test SciMLBase.isinplace(prob) + @test prob isa SciMLBase.OptimizationProblem + @test prob.f === fun + @test prob.u0 == x0 + @test prob.p === nothing end end @@ -85,8 +66,6 @@ end ℓ = build_logdensityproblem(f, n) x0 = randn(n) - fun = Pathfinder.build_optim_function(ℓ) - prob = Pathfinder.build_optim_problem(fun, x0) optimizers = [ "Optim.BFGS" => Optim.BFGS(), @@ -95,37 +74,44 @@ end "NLopt.Opt" => NLopt.Opt(:LD_LBFGS, n), ] @testset "$name" for (name, optimizer) in optimizers + fun = Pathfinder.build_optim_function(ℓ, optimizer) + prob = Pathfinder.build_optim_problem(fun, x0) optim_sol, optim_trace = Pathfinder.optimize_with_trace(prob, optimizer) - @test optim_sol isa SciMLBase.OptimizationSolution @test optim_trace isa Pathfinder.OptimizationTrace @test optim_trace.points[1] ≈ x0 @test optim_trace.points[end] ≈ μ - @test optim_sol.u ≈ μ @test optim_trace.log_densities ≈ f.(optim_trace.points) @test optim_trace.gradients ≈ ℓ.∇logp.(optim_trace.points) atol = 1e-4 - if !(optimizer isa NLopt.Opt) + if optimizer isa Union{Optim.FirstOrderOptimizer,Optim.SecondOrderOptimizer} + @test optim_sol isa Optim.MultivariateOptimizationResults + options = Optim.Options(; store_trace=true, extended_trace=true) res = Optim.optimize( x -> -f(x), (y, x) -> copyto!(y, -ℓ.∇logp(x)), x0, optimizer, options ) - @test Optim.iterations(res) == length(optim_trace.points) - 1 - @test Optim.x_trace(res) ≈ optim_trace.points - @test Optim.minimizer(res) ≈ optim_trace.points[end] + @test Optim.iterations(res) == Optim.iterations(optim_sol) + @test Optim.x_trace(res) ≈ Optim.x_trace(optim_sol) + @test Optim.minimizer(res) ≈ Optim.minimizer(optim_sol) + else + @test optim_sol isa SciMLBase.OptimizationSolution + @test optim_sol.u ≈ μ end - end - @testset "progress logging" begin - logs, = Test.collect_test_logs(; min_level=ProgressLogging.ProgressLevel) do - ProgressLogging.progress(; name="Optimizing") do progress_id - Pathfinder.optimize_with_trace(prob, Optim.LBFGS(); progress_id) + @testset "progress logging" begin + @testset for try_id in 1:2 + logs, = Test.collect_test_logs(; min_level=ProgressLogging.ProgressLevel) do + ProgressLogging.progress(; name="Optimizing") do progress_id + Pathfinder.optimize_with_trace(prob, Optim.LBFGS(); progress_id, try_id) + end + end + @test logs[1].kwargs[:progress] === nothing + @test logs[1].message == "Optimizing" + @test logs[2].kwargs[:progress] == 0.0 + @test logs[2].message == "Optimizing (try $try_id)" + @test logs[3].kwargs[:progress] == 0.001 + @test logs[end].kwargs[:progress] == "done" end end - @test logs[1].kwargs[:progress] === nothing - @test logs[1].message == "Optimizing" - @test logs[2].kwargs[:progress] == 0.0 - @test logs[2].message == "Optimizing" - @test logs[3].kwargs[:progress] == 0.001 - @test logs[end].kwargs[:progress] == "done" end end diff --git 
a/test/runtests.jl b/test/runtests.jl index 9bd8462a..d05adaa5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -7,6 +7,7 @@ Random.seed!(0) @testset "Pathfinder.jl" begin include("transducers.jl") include("woodbury.jl") + include("callbacks.jl") include("optimize.jl") include("inverse_hessian.jl") include("mvnormal.jl") diff --git a/test/singlepath.jl b/test/singlepath.jl index f70a062f..e298e8c7 100644 --- a/test/singlepath.jl +++ b/test/singlepath.jl @@ -27,10 +27,13 @@ include("test_utils.jl") ℓ = build_logdensityproblem(logp, 5) init = randn(dim) Random.seed!(rng, seed) - result = @inferred pathfinder(ℓ; init, ndraws, rng, executor) + result = @inferred Pathfinder.PathfinderResult pathfinder( + ℓ; init, ndraws, rng, executor + ) @test result isa PathfinderResult @test result.input === ℓ - @test result.optim_prob isa SciMLBase.OptimizationProblem + @test result.optim_prob isa Pathfinder.OptimJLProblem + @test result.optim_solution isa Optim.MultivariateOptimizationResults @test result.logp(init) ≈ logp(init) @test result.rng === rng @test result.optimizer === @@ -46,7 +49,6 @@ include("test_utils.jl") @test result.fit_distribution_transformed === result.fit_distribution @test result.draws_transformed === result.draws @test result.num_tries ≥ 1 - @test result.optim_solution isa SciMLBase.OptimizationSolution @test result.optim_trace isa Pathfinder.OptimizationTrace @test result.fit_distributions isa Vector{typeof(fit_distribution)} @test length(result.fit_distributions) == length(result.optim_trace) @@ -93,7 +95,9 @@ include("test_utils.jl") executor = rng isa MersenneTwister ? SequentialEx() : ThreadedEx() Random.seed!(rng, seed) - result = @inferred pathfinder(ℓ; rng, optimizer, ndraws_elbo, executor) + result = @inferred Pathfinder.PathfinderResult pathfinder( + ℓ; rng, optimizer, ndraws_elbo, executor + ) @test result.input === ℓ @test result.fit_distribution.Σ ≈ Σ rtol = 1e-1 @test result.optimizer == optimizer
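
Usage sketch (not part of the patch): the following illustrates how the new callback pieces in src/callbacks.jl compose, mirroring what `optimize_with_trace` now does internally. The quadratic objective `f`, its gradient `grad`, and the 2-dimensional test point are placeholders invented for illustration.

    using Pathfinder, UUIDs

    # placeholder objective (a negative log-density) and its gradient
    f(x) = sum(abs2, x) / 2
    grad(x) = copy(x)

    # empty trace to be filled by FillTraceCallback
    trace = Pathfinder.OptimizationTrace(Vector{Float64}[], Float64[], Vector{Float64}[])

    cb = Pathfinder.CallbackSequence(
        nothing,  # slot for a user callback; `nothing` entries are skipped
        Pathfinder.ProgressCallback(;
            reporter=Pathfinder.report_progress,
            progress_id=UUIDs.uuid4(),
            maxiters=100,
            try_id=1,
            iter_id=0,
        ),
        Pathfinder.CheckFiniteValueCallback(true),
        Pathfinder.FillTraceCallback(grad, trace),
    )

    x = randn(2)
    cb(x, f(x))  # false: continue; the point, log-density, and gradient are pushed to `trace`
    cb(x, NaN)   # true: CheckFiniteValueCallback signals that optimization should stop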
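Dispatch sketch (not part of the patch): `build_optim_function` now selects the representation based on the optimizer type, and `optimize_with_trace` has a method for each. The toy LogDensityProblems implementation below (`FlatNormal`) is a placeholder; any object supporting `logdensity_and_gradient` behaves the same way.

    using LogDensityProblems, Optim, Pathfinder

    # placeholder target: an isotropic normal implementing the LogDensityProblems API
    struct FlatNormal
        dim::Int
    end
    LogDensityProblems.dimension(ℓ::FlatNormal) = ℓ.dim
    function LogDensityProblems.capabilities(::Type{FlatNormal})
        return LogDensityProblems.LogDensityOrder{1}()
    end
    LogDensityProblems.logdensity(::FlatNormal, x) = -sum(abs2, x) / 2
    function LogDensityProblems.logdensity_and_gradient(ℓ::FlatNormal, x)
        return LogDensityProblems.logdensity(ℓ, x), -x
    end

    ℓ = FlatNormal(5)

    # Optim.jl first-/second-order optimizers now get the Optim-native wrappers ...
    fun = Pathfinder.build_optim_function(ℓ, Optim.LBFGS())  # Pathfinder.OptimJLFunction
    prob = Pathfinder.build_optim_problem(fun, randn(5))     # Pathfinder.OptimJLProblem
    sol, trace = Pathfinder.optimize_with_trace(prob, Optim.LBFGS())
    # ... while any other optimizer (e.g. an NLopt.Opt) falls back to the SciML types via
    # _build_sciml_optim_function and SciMLBase.OptimizationProblem, as before.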
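Finally, a sketch of the `fgh!` convention that `OptimJLFunction` implements (and that `Optim.only_fgh!` consumes), together with `get_logp`. It continues from the previous sketch and reuses the placeholder `ℓ` defined there.

    fun = Pathfinder.OptimJLFunction(ℓ)
    x = randn(5)

    F = fun(0.0, nothing, nothing, x)  # objective only: returns -logdensity(ℓ, x)
    G = similar(x)
    F = fun(0.0, G, nothing, x)        # fills G with the objective's gradient, returns F
    fun(nothing, G, nothing, x)        # gradient only: fills G and returns nothing

    logp = Pathfinder.get_logp(fun)    # x -> LogDensityProblems.logdensity(ℓ, x)
    logp(x) ≈ -F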