From 35aad3d7a4fc1a0682e59c7bebae06e00c9c1724 Mon Sep 17 00:00:00 2001
From: teddygroves The next step is to write functions that return I had to update the To take into account the inconsistency between the two raw data sources, I first had to change the variable Next I changed the To finish off I deleted the unused global variables To check that all this worked, I ran the data preparation script manually:2 2 I could also have just run Now the folder Preparing the data
int] = pa.Field(ge = 1)
n_attempt: Series[int] = pa.Field(ge = 0)
n_success: Series[PreparedData
objects. In this case I wrote a couple of data preparation functions: prepare_data_2006
and prepare_data_bdb
:def prepare_data_2006(measurements_raw: pd.DataFrame) -> PreparedData:
-"""Prepare the 2006 data."""
- = measurements_raw.rename(
- measurements ={"K": "n_attempt", "y": "n_success"}
- columns
- ).assign(="2006",
- season=lambda df: [f"2006-player-{i+1}" for i in range(len(df))],
- player_season
- )return PreparedData(
- ="2006",
- name={
- coords"player_season": measurements["player_season"].tolist(),
- "season": measurements["season"].tolist(),
-
- },=measurements,
- measurements
+ )
def prepare_data_2006(measurements_raw: pd.DataFrame) -> PreparedData:
+"""Prepare the 2006 data."""
+ = measurements_raw.rename(
+ measurements ={"K": "n_attempt", "y": "n_success"}
+ columns
+ ).assign(="2006",
+ season=lambda df: [f"2006-player-{i+1}" for i in range(len(df))],
+ player_season
+ )return PreparedData(
+ ="2006",
+ name={
+ coords"player_season": measurements["player_season"].tolist(),
+ "season": measurements["season"].astype(str).tolist(),
+
+ },=DataFrame[MeasurementsDF](measurements),
+ measurements
+ )
+
+def prepare_data_bdb(
+
+ measurements_main: pd.DataFrame,
+ measurements_post: pd.DataFrame,
+ appearances: pd.DataFrame,-> PreparedData:
+ ) """Prepare the baseballdatabank data.
+
+ There are a few substantive data choices here.
+
+ First, the function excludes players who have a '1' in their position as
+ these are likely pitchers, as well as players with fewer than 20 at bats.
+
+ Second, the function defines a successes and attempts according to the
+ 'on-base percentage' metric, so a success is a time when a player got a hit,
+ a base on ball/walk or a hit-by-pitch and an attempt is an at-bat or a
+ base-on-ball/walk or a hit-by-pitch or a sacrifice fly. This could have
+ alternatively been calculated as just hits divided by at-bats, but my
+ understanding is that this method underrates players who are good at getting
+ walks.
+
+ """
+= appearances.loc[
+ pitchers lambda df: df["G_p"] == df["G_all"], "playerID"
+
].unique()
-
-def prepare_data_bdb(
-
- measurements_main: pd.DataFrame,
- measurements_post: pd.DataFrame,
- appearances: pd.DataFrame,-> PreparedData:
- ) """Prepare the baseballdatabank data.
-
- There are a few substantive data choices here.
-
- First, the function excludes players who have a '1' in their position as
- these are likely pitchers, as well as players with fewer than 20 at bats.
-
- Second, the function defines a successes and attempts according to the
- 'on-base percentage' metric, so a success is a time when a player got a hit,
- a base on ball/walk or a hit-by-pitch and an attempt is an at-bat or a
- base-on-ball/walk or a hit-by-pitch or a sacrifice fly. This could have
- alternatively been calculated as just hits divided by at-bats, but my
- understanding is that this method underrates players who are good at getting
- walks.
-
- """
-= appearances.loc[
- pitchers lambda df: df["G_p"] == df["G_all"], "playerID"
-
- ].unique()
-def filter_batters(df: pd.DataFrame):
- return (
- "AB"] >= 20)
- (df[& (df["season"].ge(2017))
- & (~df["player"].isin(pitchers))
-
- )
-= (
- measurements_main, measurements_post ={"yearID": "season", "playerID": "player"})
- m.rename(columns
- .assign(=lambda df: df["player"].str.cat(
- player_season"season"].astype(str)
- df[
- ),=lambda df: df[["AB", "BB", "HBP", "SF"]]
- n_attempt0)
- .fillna(sum(axis=1)
- .int),
- .astype(=lambda df: (
- n_success"H", "BB", "HBP"]].fillna(0).sum(axis=1).astype(int)
- df[[
- ),
- )
- .loc[
- filter_batters,"player_season", "season", "n_attempt", "n_success"],
- [
- ]
- .copy()for m in [measurements_main, measurements_post]
-
- )= (
- measurements
- pd.concat([measurements_main, measurements_post])"player_season", "season"])
- .groupby([sum()
- .
- .reset_index()
- )return PreparedData(
- ="bdb",
- name={
- coords"player_season": measurements["player_season"].tolist(),
- "season": measurements["season"].tolist(),
-
- },=measurements,
- measurements )
prepare_data
function to take into account the two raw data sources:def prepare_data():
-"""Run main function."""
- print("Reading raw data...")
- = {
- raw_data file, index_col=None) for file in v]
- k: [pd.read_csv(for k, v in RAW_DATA_FILES.items()
-
- }= {
- data_preparation_functions_to_run "2006": prepare_data_2006,
- "bdb": prepare_data_bdb,
-
- }print("Preparing data...")
- for name, dpf in data_preparation_functions_to_run.items():
- print(f"Running data preparation function {dpf.__name__}...")
- = dpf(*raw_data[name])
- prepared_data = os.path.join(PREPARED_DIR, prepared_data.name)
- output_dir print(f"\twriting files to {output_dir}")
- if not os.path.exists(PREPARED_DIR):
-
- os.mkdir(PREPARED_DIR) write_prepared_data(prepared_data, output_dir)
RAW_DATA_FILES
:= {
+ RAW_DATA_FILES "2006": [os.path.join(RAW_DIR, "2006.csv")],
+ "bdb": [
+ "bdb-main.csv"),
+ os.path.join(RAW_DIR, "bdb-post.csv"),
+ os.path.join(RAW_DIR, "bdb-apps.csv"),
+ os.path.join(RAW_DIR,
+ ], }
prepare_data
function to handle the two different data sources.def prepare_data():
+"""Run main function."""
+ print("Reading raw data...")
+ = {
+ raw_data file, index_col=None) for file in v]
+ k: [pd.read_csv(for k, v in RAW_DATA_FILES.items()
+
+ }= {
+ data_preparation_functions_to_run "2006": prepare_data_2006,
+ "bdb": prepare_data_bdb,
+
+ }print("Preparing data...")
+ for name, dpf in data_preparation_functions_to_run.items():
+ print(f"Running data preparation function {dpf.__name__}...")
+ = dpf(*raw_data[name])
+ prepared_data = os.path.join(PREPARED_DIR, prepared_data.name)
+ output_dir print(f"\twriting files to {output_dir}")
+ if not os.path.exists(PREPARED_DIR):
+
+ os.mkdir(PREPARED_DIR) write_prepared_data(prepared_data, output_dir)
NEW_COLNAMES
, DROPNA_COLS
and DIMS
, then checked if the function load_prepared_data
needed any changes: I was pretty sure it didn’t.make analysis
again. This would have caused an error on the step after prepare_data.py, which is fine!> source .venv/bin/activate
-(baseball) > python baseball/prepare_data.py
> source .venv/bin/activate
+(baseball) > python baseball/prepare_data.py
data/prepared/bdb
contained a file data/prepared/bdb/measurements.csv
that looked like this:,player_season,season,n_attempt,n_success
0,abreujo02,2018,553,180
@@ -3449,214 +3458,189 @@
Specifying statistical models
\end{align*}\]
Again I would choose priors for the hyperparameters that put most of the alphas between 0.1 and 0.4.
To implement these models using Stan I first added the following function to the file custom_functions.stan
. This was simply copied from the relevant part of the geomagnetic storms case study.
real gpareto_lpdf(vector y, real ymin, real k, real sigma) {
-// generalised Pareto log pdf
- int N = rows(y);
- real inv_k = inv(k);
- if (k<0 && max(y-ymin)/sigma > -inv_k)
- reject("k<0 and max(y-ymin)/sigma > -1/k; found k, sigma =", k, ", ", sigma);
- if (sigma<=0)
- reject("sigma<=0; found sigma =", sigma);
- if (fabs(k) > 1e-15)
- return -(1+inv_k)*sum(log1p((y-ymin) * (k/sigma))) -N*log(sigma);
- else
- return -sum(y-ymin)/sigma -N*log(sigma); // limit k->0
- }
real gpareto_lpdf(vector y, real ymin, real k, real sigma) {
+// generalised Pareto log pdf
+ int N = rows(y);
+ real inv_k = inv(k);
+ if (k<0 && max(y-ymin)/sigma > -inv_k)
+ reject("k<0 and max(y-ymin)/sigma > -1/k; found k, sigma =", k, ", ", sigma);
+ if (sigma<=0)
+ reject("sigma<=0; found sigma =", sigma);
+ if (fabs(k) > 1e-15)
+ return -(1+inv_k)*sum(log1p((y-ymin) * (k/sigma))) -N*log(sigma);
+ else
+ return -sum(y-ymin)/sigma -N*log(sigma); // limit k->0
+ }
Next I wrote a file gpareto.stan
:
functions {
-#include custom_functions.stan
-
- }data {
-int<lower=0> N; // items
- array[N] int<lower=0> K; // trials
- array[N] int<lower=0> y; // successes
- real min_alpha; // noone worse than this would be in the dataset
- real max_alpha;
- array[2] real prior_sigma;
- array[2] real prior_k;
- int<lower=0,upper=1> likelihood;
-
- }parameters {
-real<lower=0.001> sigma; // scale parameter of generalised pareto distribution
- real<lower=-sigma/(max_alpha-min_alpha)> k; // shape parameter of generalised pareto distribution
- vector<lower=min_alpha,upper=max_alpha>[N] alpha; // success log-odds
-
- }model {
-1], prior_sigma[2]);
- sigma ~ normal(prior_sigma[1], prior_k[2]);
- k ~ normal(prior_k[
- alpha ~ gpareto(min_alpha, k, sigma);if (likelihood){
-
- y ~ binomial_logit(K, alpha);
- }
- }generated quantities {
-vector[N] yrep;
- vector[N] llik;
- for (n in 1:N){
-
- yrep[n] = binomial_rng(K[n], inv_logit(alpha[n]));
- llik[n] = binomial_logit_lpmf(y[n] | K[n], alpha[n]);
- } }
Finally I wrote a file normal.stan
:
data {
-int<lower=0> N; // items
- array[N] int<lower=0> K; // trials
- array[N] int<lower=0> y; // successes
- array[2] real prior_mu;
- array[2] real prior_tau;
- array[2] real prior_b_K;
- int<lower=0,upper=1> likelihood;
-
- }transformed data {
-vector[N] log_K = log(to_vector(K));
- vector[N] log_K_std = (log_K - mean(log_K)) / sd(log_K);
+ functions {
+#include custom_functions.stan
+
+ }data {
+int<lower=0> N; // items
+ array[N] int<lower=0> K; // trials
+ array[N] int<lower=0> y; // successes
+ real min_alpha; // noone worse than this would be in the dataset
+ real max_alpha;
+ array[2] real prior_sigma;
+ array[2] real prior_k;
+ int<lower=0,upper=1> likelihood;
}parameters {
-real mu; // population mean of success log-odds
- real<lower=0> tau; // population sd of success log-odds
- real b_K;
- vector[N] alpha_std; // success log-odds (standardized)
-
- }model {
-1], prior_b_K[2]);
- b_K ~ normal(prior_b_K[1], prior_mu[2]);
- mu ~ normal(prior_mu[1], prior_tau[2]);
- tau ~ normal(prior_tau[0, 1);
- alpha_std ~ normal(if (likelihood){
-
- y ~ binomial_logit(K, mu + b_K * log_K_std + tau * alpha_std);
- }
- }generated quantities {
-vector[N] alpha = mu + b_K * log_K_std + tau * alpha_std;
- vector[N] yrep;
- vector[N] llik;
- for (n in 1:N){
-
- yrep[n] = binomial_rng(K[n], inv_logit(alpha[n]));
- llik[n] = binomial_logit_lpmf(y[n] | K[n], alpha[n]);
- } }
+real<lower=0.001> sigma; // scale parameter of generalised pareto distribution
+ real<lower=-sigma/(max_alpha-min_alpha)> k; // shape parameter of generalised pareto distribution
+ vector<lower=min_alpha,upper=max_alpha>[N] alpha; // success log-odds
+
+ }model {
+1], prior_sigma[2]);
+ sigma ~ normal(prior_sigma[1], prior_k[2]);
+ k ~ normal(prior_k[
+ alpha ~ gpareto(min_alpha, k, sigma);if (likelihood){
+
+ y ~ binomial_logit(K, alpha);
+ }
+ }generated quantities {
+vector[N] yrep;
+ vector[N] llik;
+ for (n in 1:N){
+
+ yrep[n] = binomial_rng(K[n], inv_logit(alpha[n]));
+ llik[n] = binomial_logit_lpmf(y[n] | K[n], alpha[n]);
+ } }
Finally I wrote a file normal.stan
:
data {
+int<lower=0> N; // items
+ array[N] int<lower=0> K; // trials
+ array[N] int<lower=0> y; // successes
+ array[2] real prior_mu;
+ array[2] real prior_tau;
+ array[2] real prior_b_K;
+ int<lower=0,upper=1> likelihood;
+
+ }transformed data {
+vector[N] log_K = log(to_vector(K));
+ vector[N] log_K_std = (log_K - mean(log_K)) / sd(log_K);
+
+ }parameters {
+real mu; // population mean of success log-odds
+ real<lower=0> tau; // population sd of success log-odds
+ real b_K;
+ vector[N] alpha_std; // success log-odds (standardized)
+
+ }model {
+1], prior_b_K[2]);
+ b_K ~ normal(prior_b_K[1], prior_mu[2]);
+ mu ~ normal(prior_mu[1], prior_tau[2]);
+ tau ~ normal(prior_tau[0, 1);
+ alpha_std ~ normal(if (likelihood){
+
+ y ~ binomial_logit(K, mu + b_K * log_K_std + tau * alpha_std);
+ }
+ }generated quantities {
+vector[N] alpha = mu + b_K * log_K_std + tau * alpha_std;
+ vector[N] yrep;
+ vector[N] llik;
+ for (n in 1:N){
+
+ yrep[n] = binomial_rng(K[n], inv_logit(alpha[n]));
+ llik[n] = binomial_logit_lpmf(y[n] | K[n], alpha[n]);
+ } }
Next I needed to tell the analysis how to turn some prepared data into a dictionary that can be used as input for Stan. Bibat assumes that this task is handled by functions that live in the file baseball/stan_input_functions.py
, each of which takes in a PreparedData
and returns a Python dictionary. You can write as many Stan input functions as you like and choose which one to run for any given inference.
I started by defining some Stan input functions that pass arbitrary prepared data on to each of the models:3
3 Note that this code uses the scipy function logit
, which it imported like this: from scipy.special import logit
-def get_stan_input_normal(ppd: PreparedData) -> Dict:
-"""General function for creating a Stan input."""
- return {
- "N": len(ppd.measurements),
- "K": ppd.measurements["n_attempt"].values,
- "y": ppd.measurements["n_success"].values,
- "prior_mu": [logit(0.25), 0.2],
- "prior_tau": [0.2, 0.1],
- "prior_b_K": [0, 0.03],
-
- }
-
-def get_stan_input_gpareto(ppd: PreparedData) -> Dict:
-"""General function for creating a Stan input."""
- return {
- "N": len(ppd.measurements),
- "K": ppd.measurements["n_attempt"].values,
- "y": ppd.measurements["n_success"].values,
- "min_alpha": logit(0.07),
- "max_alpha": logit(0.5),
- "prior_sigma": [1.5, 0.4],
- "prior_k": [-0.5, 1],
- }
"""General function for creating a Stan input."""
+ return {
+ "N": len(ppd.measurements),
+ "K": ppd.measurements["n_attempt"].values,
+ "y": ppd.measurements["n_success"].values,
+ "prior_mu": [logit(0.25), 0.2],
+ "prior_tau": [0.2, 0.1],
+ "prior_b_K": [0, 0.03],
+
+ }
+
+def get_stan_input_gpareto(ppd: PreparedData) -> Dict:
+"""General function for creating a Stan input."""
+ return {
+ "N": len(ppd.measurements),
+ "K": ppd.measurements["n_attempt"].values,
+ "y": ppd.measurements["n_success"].values,
+ "min_alpha": logit(0.07),
+ "max_alpha": logit(0.5),
+ "prior_sigma": [1.5, 0.4],
+ "prior_k": [-0.5, 1],
+
+ }
+def get_stan_input_normal_fake(ppd: PreparedData) -> Dict:
But why stop there? It can also be useful to generate Stan input data with a model, by running it in simulation mode with hardcoded parameter values. Here are some functions that do this for both of our models:
-
-def get_stan_input_normal_fake(ppd: PreparedData) -> Dict:
-"""Generate fake Stan input consistent with the normal model."""
- = len(ppd.measurements)
- N = np.random.default_rng(seed=1234)
- rng = {
- true_param_values "mu": logit(0.25),
- "tau": 0.18, # 2sds is 0.19 to 0.32 batting average
- "b_K": 0.04, # slight effect of more attempts
- "alpha_std": rng.random.normal(loc=0, scale=1, size=N),
-
- }= ppd.measurements["n_attempt"].values
- K = (np.log(K) - np.log(K).mean()) / np.log(K).std()
- log_K_std = (
- alpha "mu"]
- true_param_values[+ true_param_values["b_K"] * log_K_std
- + true_param_values["tau"] * true_param_values["alpha_std"]
-
- )= rng.random.binomial(K, expit(alpha))
- y return {"N": N, "K": K, "y": y} | true_param_values
-
-
-def get_stan_input_gpareto_fake(ppd: PreparedData) -> Dict:
-"""Generate fake Stan input consistent with the gpareto model."""
- = len(ppd.measurements)
- N = ppd.measurements["n_attempt"].values
- K = 0.1
- min_alpha = np.random.default_rng(seed=1234)
- rng = {"sigma": -1.098, "k": 0.18}
- true_param_values "alpha"] = gpareto_rvs(
- true_param_values[
- rng,
- N,
- min_alpha,"k"],
- true_param_values["sigma"],
- true_param_values[
- )= rng.binomial(K, expit(true_param_values["alpha"]))
- y return {"N": N, "K": K, "y": y, "min_alpha": min_alpha} | true_param_values
-
-
-def gpareto_rvs(
-int, mu: float, k: float, sigma: float
- rng: np.random.Generator, size:
- ):"""Generate random numbers from a generalised pareto distribution.
-
- See https://en.wikipedia.org/wiki/Generalized_Pareto_distribution for
- source.
-
- """
-= rng.uniform(size)
- U if k == 0:
- return mu - sigma * np.log(U)
- else:
- return mu + (sigma * (U**-k) - 1) / sigma
= len(ppd.measurements)
+ N = np.random.default_rng(seed=1234)
+ rng = {
+ true_param_values "mu": logit(0.25),
+ "tau": 0.18, # 2sds is 0.19 to 0.32 batting average
+ "b_K": 0.04, # slight effect of more attempts
+ "alpha_std": rng.random.normal(loc=0, scale=1, size=N),
+
+ }= ppd.measurements["n_attempt"].values
+ K = (np.log(K) - np.log(K).mean()) / np.log(K).std()
+ log_K_std = (
+ alpha "mu"]
+ true_param_values[+ true_param_values["b_K"] * log_K_std
+ + true_param_values["tau"] * true_param_values["alpha_std"]
+
+ )= rng.random.binomial(K, expit(alpha))
+ y return {"N": N, "K": K, "y": y} | true_param_values
+
+
+def get_stan_input_gpareto_fake(ppd: PreparedData) -> Dict:
+"""Generate fake Stan input consistent with the gpareto model."""
+ = len(ppd.measurements)
+ N = ppd.measurements["n_attempt"].values
+ K = 0.1
+ min_alpha = np.random.default_rng(seed=1234)
+ rng = {"sigma": -1.098, "k": 0.18}
+ true_param_values "alpha"] = gpareto_rvs(
+ true_param_values[
+ rng,
+ N,
+ min_alpha,"k"],
+ true_param_values["sigma"],
+ true_param_values[
+ )= rng.binomial(K, expit(true_param_values["alpha"]))
+ y return {"N": N, "K": K, "y": y, "min_alpha": min_alpha} | true_param_values
+
+
+def gpareto_rvs(
+int, mu: float, k: float, sigma: float
+ rng: np.random.Generator, size:
+ ):"""Generate random numbers from a generalised pareto distribution.
+
+ See https://en.wikipedia.org/wiki/Generalized_Pareto_distribution for
+ source.
+
+ """
+= rng.uniform(size)
+ U if k == 0:
+ return mu - sigma * np.log(U)
+ else:
+ return mu + (sigma * (U**-k) - 1) / sigma
Now all the building blocks for making statistical inferences – raw data, data preparation rules, statistical models and recipes for turning prepared data into model inputs – were in place. The last step before actually running Stan was to write down how put the blocks together. Bibat has another concept for this, called ‘inferences’.
An inference in bibat is a folder containing a special file called config.toml
. This file sets out what inferences you want to make: which statistical model, which prepared data function, which Stan input function, which parameters have which dimensions, which sampling modes to use and how to configure the sampler. The folder will later be filled up with the results of performing the specified inferences.
I started by deleting the example inferences and creating two fresh folders, leaving me with an inferences
folder looking like this:
inferences
-├── gpareto2006
-│ └── config.toml
-└── normal2006
-└── config.toml
inferences
+├── gpareto2006
+│ └── config.toml
+└── normal2006
+└── config.toml
Here is the file inferences/gpareto2006/config.toml
:
name = "gpareto2006"
-stan_file = "gpareto.stan"
-prepared_data_dir = "2006"
-stan_input_function = "get_stan_input_gpareto"
-modes = ["prior", "posterior"]
-
-[dims]
-alpha = ["player"]
-
-[stanc_options]
-warn-pedantic = true
-
-[sample_kwargs]
-save_warmup = false
-iter_warmup = 2000
-iter_sampling = 2000
-
-[mode_options.prior]
-chains = 2
-iter_warmup = 1000
-iter_sampling = 1000
Here is the file inferences/normal2006/config.toml
:
name = "normal2006"
-stan_file = "normal.stan"
+name = "gpareto2006"
+stan_file = "gpareto.stan"
prepared_data_dir = "2006"
-stan_input_function = "get_stan_input_normal"
+stan_input_function = "get_stan_input_gpareto"
modes = ["prior", "posterior"]
[dims]
@@ -3674,6 +3658,28 @@ Specifying inferences
chains = 2
iter_warmup = 1000
iter_sampling = 1000
+Here is the file inferences/normal2006/config.toml
:
+name = "normal2006"
+stan_file = "normal.stan"
+prepared_data_dir = "2006"
+stan_input_function = "get_stan_input_normal"
+modes = ["prior", "posterior"]
+
+[dims]
+alpha = ["player"]
+
+[stanc_options]
+warn-pedantic = true
+
+[sample_kwargs]
+save_warmup = false
+iter_warmup = 2000
+iter_sampling = 2000
+
+[mode_options.prior]
+chains = 2
+iter_warmup = 1000
+iter_sampling = 1000
Note that:
- The Stan file, prepared data folder and Stan input function are referred to by strings. The analysis should raise an error if you enter a non-existing value.
@@ -3682,13 +3688,13 @@ Specifying inferences
- You can enter mode-specific overrides in
[mode_options.<MODE>]
. This can be handy if you want to run more or fewer iterations for a certain mode.
With all these changes made I ran make analysis
again. There were no errors until after sampling, which was expected as I hadn’t yet customised the investigation code, and I saw messages indicating that Stan had run. I also found that the inferences
subfolders had been populated:
-inferences
-├── gpareto2006
-│ ├── config.toml
-│ └── idata.json
-└── normal2006
-├── config.toml
- └── idata.json
+inferences
+├── gpareto2006
+│ ├── config.toml
+│ └── idata.json
+└── normal2006
+├── config.toml
+ └── idata.json
Investigating the inferences
@@ -3699,34 +3705,34 @@ Investigating the inferences
Choosing priors using push-forward calibration
The trickiest thing about my analysis was setting prior distributions for the parameters \(k\) and \(\sigma\) in the generalised Pareto models. To choose some more or less plausible values I did a few prior-mode model runs and checked the distributions of the resulting alpha
variables. I wanted to make sure that they all lay in the range corresponding to batting averages between about 0.1 and a little over 0.4. Here is a graph that shows the 1% to 99% prior quantiles for each player’s latent success percentage in both datasets alongside their actually realised success rates.
-
+
Extending the analysis to the baseballdatabank data
To model the more recent data, all I had to do was create some new inference folders with appropriate prepared_data_dir
fields in their config.toml
files. For example, here is the config.toml
file for the gparetobdb
inference:
-name = "gparetobdb"
-stan_file = "gpareto.stan"
-prepared_data_dir = "bdb"
-stan_input_function = "get_stan_input_gpareto"
-modes = ["prior", "posterior"]
-
-[dims]
-alpha = ["player"]
-
-[stanc_options]
-warn-pedantic = true
-
-[sample_kwargs]
-save_warmup = false
-iter_warmup = 2000
-iter_sampling = 2000
-
-[mode_options.prior]
-chains = 2
-iter_warmup = 2000
-iter_sampling = 1000
+name = "gparetobdb"
+stan_file = "gpareto.stan"
+prepared_data_dir = "bdb"
+stan_input_function = "get_stan_input_gpareto"
+modes = ["prior", "posterior"]
+
+[dims]
+alpha = ["player"]
+
+[stanc_options]
+warn-pedantic = true
+
+[sample_kwargs]
+save_warmup = false
+iter_warmup = 2000
+iter_sampling = 2000
+
+[mode_options.prior]
+chains = 2
+iter_warmup = 2000
+iter_sampling = 1000
After running make analysis
one more time, I went back to the notebook baseball/investigate.ipynb
and made plots of both models’ posterior 1%-99% success rate intervals for both datasets:
-
+
I think this is very interesting. Both models’ prior distributions had similar regularisation levels, and they more or less agree about the abilities of the players with the most at-bats, both in terms of locations and the widths of the plausible intervals for the true success rate. However, the models ended up with dramatically different certainty levels about the abilities of players with few at-bats. This pattern was true both for the small 2006 dataset and the much larger baseballdatabank dataset.