From 35aad3d7a4fc1a0682e59c7bebae06e00c9c1724 Mon Sep 17 00:00:00 2001
From: teddygroves
Date: Fri, 7 Jul 2023 13:24:17 +0200
Subject: [PATCH] Update vignette

---
 docs/_static/report.html | 662 ++++++++++++++++++++-------------------
 1 file changed, 334 insertions(+), 328 deletions(-)

diff --git a/docs/_static/report.html b/docs/_static/report.html
index 7459361..1f6b978 100644
--- a/docs/_static/report.html
+++ b/docs/_static/report.html
@@ -3312,116 +3312,125 @@

Preparing the data

n_attempt: Series[int] = pa.Field(ge = 1)
n_success: Series[int] = pa.Field(ge = 0)
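
These two fields belong to a pandera schema that validates the measurements table. For context, here is a minimal sketch of how the full schema might look; the class name MeasurementsDF appears in the code below, but the player_season and season columns and their types are assumptions based on the prepared CSV shown later:

import pandera as pa
from pandera.typing import Series


class MeasurementsDF(pa.DataFrameModel):
    """Schema for the measurements table (a sketch; see the caveats above)."""

    player_season: Series[str]
    season: Series[str]
    n_attempt: Series[int] = pa.Field(ge=1)
    n_success: Series[int] = pa.Field(ge=0)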

The next step is to write functions that return PreparedData objects. In this case I wrote a couple of data preparation functions: prepare_data_2006 and prepare_data_bdb:

-
def prepare_data_2006(measurements_raw: pd.DataFrame) -> PreparedData:
-    """Prepare the 2006 data."""
-    measurements = measurements_raw.rename(
-        columns={"K": "n_attempt", "y": "n_success"}
-    ).assign(
-        season="2006",
-        player_season=lambda df: [f"2006-player-{i+1}" for i in range(len(df))],
-    )
-    return PreparedData(
-        name="2006",
-        coords={
-            "player_season": measurements["player_season"].tolist(),
-            "season": measurements["season"].tolist(),
-        },
-        measurements=measurements,
-    )
+
def prepare_data_2006(measurements_raw: pd.DataFrame) -> PreparedData:
+    """Prepare the 2006 data."""
+    measurements = measurements_raw.rename(
+        columns={"K": "n_attempt", "y": "n_success"}
+    ).assign(
+        season="2006",
+        player_season=lambda df: [f"2006-player-{i+1}" for i in range(len(df))],
+    )
+    return PreparedData(
+        name="2006",
+        coords={
+            "player_season": measurements["player_season"].tolist(),
+            "season": measurements["season"].astype(str).tolist(),
+        },
+        measurements=DataFrame[MeasurementsDF](measurements),
+    )
+
+
+def prepare_data_bdb(
+    measurements_main: pd.DataFrame,
+    measurements_post: pd.DataFrame,
+    appearances: pd.DataFrame,
+) -> PreparedData:
+    """Prepare the baseballdatabank data.
+
+    There are a few substantive data choices here.
+
+    First, the function excludes players who have a '1' in their position as
+    these are likely pitchers, as well as players with fewer than 20 at bats.
+
+    Second, the function defines successes and attempts according to the
+    'on-base percentage' metric, so a success is a time when a player got a hit,
+    a base on ball/walk or a hit-by-pitch and an attempt is an at-bat or a
+    base-on-ball/walk or a hit-by-pitch or a sacrifice fly. This could have
+    alternatively been calculated as just hits divided by at-bats, but my
+    understanding is that this method underrates players who are good at getting
+    walks.
+
+    """
+    pitchers = appearances.loc[
+        lambda df: df["G_p"] == df["G_all"], "playerID"
+    ].unique()
 
-
-def prepare_data_bdb(
-    measurements_main: pd.DataFrame,
-    measurements_post: pd.DataFrame,
-    appearances: pd.DataFrame,
-) -> PreparedData:
-    """Prepare the baseballdatabank data.
-
-    There are a few substantive data choices here.
-
-    First, the function excludes players who have a '1' in their position as
-    these are likely pitchers, as well as players with fewer than 20 at bats.
-
-    Second, the function defines a successes and attempts according to the
-    'on-base percentage' metric, so a success is a time when a player got a hit,
-    a base on ball/walk or a hit-by-pitch and an attempt is an at-bat or a
-    base-on-ball/walk or a hit-by-pitch or a sacrifice fly. This could have
-    alternatively been calculated as just hits divided by at-bats, but my
-    understanding is that this method underrates players who are good at getting
-    walks.
-
-    """
-    pitchers = appearances.loc[
-        lambda df: df["G_p"] == df["G_all"], "playerID"
-    ].unique()
-
-    def filter_batters(df: pd.DataFrame):
-        return (
-            (df["AB"] >= 20)
-            & (df["season"].ge(2017))
-            & (~df["player"].isin(pitchers))
-        )
-
-    measurements_main, measurements_post = (
-        m.rename(columns={"yearID": "season", "playerID": "player"})
-        .assign(
-            player_season=lambda df: df["player"].str.cat(
-                df["season"].astype(str)
-            ),
-            n_attempt=lambda df: df[["AB", "BB", "HBP", "SF"]]
-            .fillna(0)
-            .sum(axis=1)
-            .astype(int),
-            n_success=lambda df: (
-                df[["H", "BB", "HBP"]].fillna(0).sum(axis=1).astype(int)
-            ),
-        )
-        .loc[
-            filter_batters,
-            ["player_season", "season", "n_attempt", "n_success"],
-        ]
-        .copy()
-        for m in [measurements_main, measurements_post]
-    )
-    measurements = (
-        pd.concat([measurements_main, measurements_post])
-        .groupby(["player_season", "season"])
-        .sum()
-        .reset_index()
-    )
-    return PreparedData(
-        name="bdb",
-        coords={
-            "player_season": measurements["player_season"].tolist(),
-            "season": measurements["season"].tolist(),
-        },
-        measurements=measurements,
-    )
-
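
Both functions return a PreparedData object. For orientation, here is a rough sketch of what that container could look like; the real definition ships with the bibat template, so every detail below is an assumption inferred from the constructor calls above:

from dataclasses import dataclass
from typing import Dict, List

import pandas as pd


@dataclass
class PreparedData:
    """A named dataset with coordinates (a sketch, not bibat's definition)."""

    name: str
    coords: Dict[str, List[str]]
    measurements: pd.DataFrame  # constructed above as DataFrame[MeasurementsDF]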

I had to update the prepare_data function to take into account the two raw data sources:

-
def prepare_data():
-    """Run main function."""
-    print("Reading raw data...")
-    raw_data = {
-        k: [pd.read_csv(file, index_col=None) for file in v]
-        for k, v in RAW_DATA_FILES.items()
-    }
-    data_preparation_functions_to_run = {
-        "2006": prepare_data_2006,
-        "bdb": prepare_data_bdb,
-    }
-    print("Preparing data...")
-    for name, dpf in data_preparation_functions_to_run.items():
-        print(f"Running data preparation function {dpf.__name__}...")
-        prepared_data = dpf(*raw_data[name])
-        output_dir = os.path.join(PREPARED_DIR, prepared_data.name)
-        print(f"\twriting files to {output_dir}")
-        if not os.path.exists(PREPARED_DIR):
-            os.mkdir(PREPARED_DIR)
-        write_prepared_data(prepared_data, output_dir)
+    def filter_batters(df: pd.DataFrame):
+        return (
+            (df["AB"] >= 20)
+            & (df["season"].ge(2017))
+            & (~df["player"].isin(pitchers))
+        )
+
+    measurements_main, measurements_post = (
+        m.rename(columns={"yearID": "season", "playerID": "player"})
+        .assign(
+            player_season=lambda df: df["player"].str.cat(
+                df["season"].astype(str)
+            ),
+            n_attempt=lambda df: df[["AB", "BB", "HBP", "SF"]]
+            .fillna(0)
+            .sum(axis=1)
+            .astype(int),
+            n_success=lambda df: (
+                df[["H", "BB", "HBP"]].fillna(0).sum(axis=1).astype(int)
+            ),
+        )
+        .loc[
+            filter_batters,
+            ["player_season", "season", "n_attempt", "n_success"],
+        ]
+        .copy()
+        for m in [measurements_main, measurements_post]
+    )
+    measurements = (
+        pd.concat([measurements_main, measurements_post])
+        .groupby(["player_season", "season"])
+        .sum()
+        .reset_index()
+    )
+    return PreparedData(
+        name="bdb",
+        coords={
+            "player_season": measurements["player_season"].tolist(),
+            "season": measurements["season"].astype(str).tolist(),
+        },
+        measurements=DataFrame[MeasurementsDF](measurements),
+    )
+

To account for the differences between the two raw data sources, I first had to change the variable RAW_DATA_FILES:

+
RAW_DATA_FILES = {
+    "2006": [os.path.join(RAW_DIR, "2006.csv")],
+    "bdb": [
+        os.path.join(RAW_DIR, "bdb-main.csv"),
+        os.path.join(RAW_DIR, "bdb-post.csv"),
+        os.path.join(RAW_DIR, "bdb-apps.csv"),
+    ],
+}
+

Next I changed the prepare_data function to handle the two different data sources.

+
def prepare_data():
+    """Run main function."""
+    print("Reading raw data...")
+    raw_data = {
+        k: [pd.read_csv(file, index_col=None) for file in v]
+        for k, v in RAW_DATA_FILES.items()
+    }
+    data_preparation_functions_to_run = {
+        "2006": prepare_data_2006,
+        "bdb": prepare_data_bdb,
+    }
+    print("Preparing data...")
+    for name, dpf in data_preparation_functions_to_run.items():
+        print(f"Running data preparation function {dpf.__name__}...")
+        prepared_data = dpf(*raw_data[name])
+        output_dir = os.path.join(PREPARED_DIR, prepared_data.name)
+        print(f"\twriting files to {output_dir}")
+        if not os.path.exists(PREPARED_DIR):
+            os.mkdir(PREPARED_DIR)
+        write_prepared_data(prepared_data, output_dir)

To finish off I deleted the unused global variables NEW_COLNAMES, DROPNA_COLS and DIMS, then checked if the function load_prepared_data needed any changes: I was pretty sure it didn’t.

To check that all this worked, I ran the data preparation script manually:2

  • 2 I could also have just run make analysis again. This would have caused an error on the step after prepare_data.py, which is fine!

-
    > source .venv/bin/activate
    -(baseball) > python baseball/prepare_data.py
    +
    > source .venv/bin/activate
    +(baseball) > python baseball/prepare_data.py

    Now the folder data/prepared/bdb contained a file data/prepared/bdb/measurements.csv that looked like this:

    ,player_season,season,n_attempt,n_success
     0,abreujo02,2018,553,180
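
As a quick check, the prepared file can be read back and re-validated against the schema. This is a sketch assuming the MeasurementsDF schema is importable; the astype(str) mirrors the coordinate handling above, since read_csv parses the season column as integers:

import pandas as pd
from pandera.typing import DataFrame

measurements = pd.read_csv(
    "data/prepared/bdb/measurements.csv", index_col=0
).assign(season=lambda df: df["season"].astype(str))
# raises a pandera SchemaError if any field constraint is violated
validated = DataFrame[MeasurementsDF](measurements)
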
    @@ -3449,214 +3458,189 @@ 

    Specifying statistical models

    \end{align*}\]

    Again I would choose priors for the hyperparameters that put most of the alphas between 0.1 and 0.4.

    To implement these models using Stan I first added the following function to the file custom_functions.stan. This was simply copied from the relevant part of the geomagnetic storms case study.

    -
    real gpareto_lpdf(vector y, real ymin, real k, real sigma) {
    -  // generalised Pareto log pdf
    -  int N = rows(y);
    -  real inv_k = inv(k);
    -  if (k<0 && max(y-ymin)/sigma > -inv_k)
    -    reject("k<0 and max(y-ymin)/sigma > -1/k; found k, sigma =", k, ", ", sigma);
    -  if (sigma<=0)
    -    reject("sigma<=0; found sigma =", sigma);
    -  if (fabs(k) > 1e-15)
    -    return -(1+inv_k)*sum(log1p((y-ymin) * (k/sigma))) -N*log(sigma);
    -  else
    -    return -sum(y-ymin)/sigma -N*log(sigma); // limit k->0
    -}
    +
    real gpareto_lpdf(vector y, real ymin, real k, real sigma) {
    +  // generalised Pareto log pdf
    +  int N = rows(y);
    +  real inv_k = inv(k);
    +  if (k<0 && max(y-ymin)/sigma > -inv_k)
    +    reject("k<0 and max(y-ymin)/sigma > -1/k; found k, sigma =", k, ", ", sigma);
    +  if (sigma<=0)
    +    reject("sigma<=0; found sigma =", sigma);
    +  if (fabs(k) > 1e-15)
    +    return -(1+inv_k)*sum(log1p((y-ymin) * (k/sigma))) -N*log(sigma);
    +  else
    +    return -sum(y-ymin)/sigma -N*log(sigma); // limit k->0
    +}
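
As a sanity check, the same log density can be reproduced in Python with scipy's genpareto, whose shape parameter c corresponds to Stan's k and whose loc corresponds to ymin. A minimal sketch:

import numpy as np
from scipy.stats import genpareto


def gpareto_lpdf(y, ymin, k, sigma):
    """Python mirror of the Stan function above."""
    y = np.asarray(y)
    if abs(k) > 1e-15:
        return -(1 + 1 / k) * np.sum(
            np.log1p((y - ymin) * (k / sigma))
        ) - len(y) * np.log(sigma)
    return -np.sum(y - ymin) / sigma - len(y) * np.log(sigma)


y = np.array([0.5, 1.2, 2.0])
print(gpareto_lpdf(y, ymin=0.1, k=0.3, sigma=1.5))
print(genpareto.logpdf(y, c=0.3, loc=0.1, scale=1.5).sum())  # same value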

    Next I wrote a file gpareto.stan:

    -
    functions {
    -#include custom_functions.stan
    -}
    -data {
    -  int<lower=0> N; // items
    -  array[N] int<lower=0> K; // trials
    -  array[N] int<lower=0> y; // successes
    -  real min_alpha; // noone worse than this would be in the dataset
    -  real max_alpha;
    -  array[2] real prior_sigma;
    -  array[2] real prior_k;
    -  int<lower=0,upper=1> likelihood;
    -}
    -parameters {
    -  real<lower=0.001> sigma; // scale parameter of generalised pareto distribution
    -  real<lower=-sigma/(max_alpha-min_alpha)> k; // shape parameter of generalised pareto distribution
    -  vector<lower=min_alpha,upper=max_alpha>[N] alpha; // success log-odds
    -}
    -model {
    -  sigma ~ normal(prior_sigma[1], prior_sigma[2]);
    -  k ~ normal(prior_k[1], prior_k[2]);
    -  alpha ~ gpareto(min_alpha, k, sigma);
    -  if (likelihood){
    -    y ~ binomial_logit(K, alpha);
    -  }
    -}
    -generated quantities {
    -  vector[N] yrep;
    -  vector[N] llik;
    -  for (n in 1:N){
    -    yrep[n] = binomial_rng(K[n], inv_logit(alpha[n]));
    -    llik[n] = binomial_logit_lpmf(y[n] | K[n], alpha[n]);
    -  }
    -}
    -

    Finally I wrote a file normal.stan:

    -
    data {
    -  int<lower=0> N; // items
    -  array[N] int<lower=0> K; // trials
    -  array[N] int<lower=0> y; // successes
    -  array[2] real prior_mu;
    -  array[2] real prior_tau;
    -  array[2] real prior_b_K;
    -  int<lower=0,upper=1> likelihood;
    -}
    -transformed data {
    -  vector[N] log_K = log(to_vector(K));
    -  vector[N] log_K_std = (log_K - mean(log_K)) / sd(log_K);
    +
    functions {
    +#include custom_functions.stan
    +}
    +data {
    +  int<lower=0> N; // items
    +  array[N] int<lower=0> K; // trials
    +  array[N] int<lower=0> y; // successes
+  real min_alpha; // no one worse than this would be in the dataset
    +  real max_alpha;
    +  array[2] real prior_sigma;
    +  array[2] real prior_k;
    +  int<lower=0,upper=1> likelihood;
     }
     parameters {
    -  real mu; // population mean of success log-odds
    -  real<lower=0> tau; // population sd of success log-odds
    -  real b_K;
    -  vector[N] alpha_std; // success log-odds (standardized)
    -}
    -model {
    -  b_K ~ normal(prior_b_K[1], prior_b_K[2]);
    -  mu ~ normal(prior_mu[1], prior_mu[2]);
    -  tau ~ normal(prior_tau[1], prior_tau[2]);
    -  alpha_std ~ normal(0, 1);
    -  if (likelihood){
    -    y ~ binomial_logit(K, mu + b_K * log_K_std + tau * alpha_std);
    -  }
    -}
    -generated quantities {
    -  vector[N] alpha = mu + b_K * log_K_std + tau * alpha_std;
    -  vector[N] yrep;
    -  vector[N] llik;
    -  for (n in 1:N){
    -    yrep[n] = binomial_rng(K[n], inv_logit(alpha[n]));
    -    llik[n] = binomial_logit_lpmf(y[n] | K[n], alpha[n]);
    -  }
    -}
+  real<lower=0.001> sigma; // scale parameter of generalised pareto distribution
+  real<lower=-sigma/(max_alpha-min_alpha)> k; // shape parameter of generalised pareto distribution
+  vector<lower=min_alpha,upper=max_alpha>[N] alpha; // success log-odds
+}
+model {
+  sigma ~ normal(prior_sigma[1], prior_sigma[2]);
+  k ~ normal(prior_k[1], prior_k[2]);
+  alpha ~ gpareto(min_alpha, k, sigma);
+  if (likelihood){
+    y ~ binomial_logit(K, alpha);
+  }
+}
+generated quantities {
+  vector[N] yrep;
+  vector[N] llik;
+  for (n in 1:N){
+    yrep[n] = binomial_rng(K[n], inv_logit(alpha[n]));
+    llik[n] = binomial_logit_lpmf(y[n] | K[n], alpha[n]);
+  }
+}
    +

    Finally I wrote a file normal.stan:

    +
    data {
    +  int<lower=0> N; // items
    +  array[N] int<lower=0> K; // trials
    +  array[N] int<lower=0> y; // successes
    +  array[2] real prior_mu;
    +  array[2] real prior_tau;
    +  array[2] real prior_b_K;
    +  int<lower=0,upper=1> likelihood;
    +}
    +transformed data {
    +  vector[N] log_K = log(to_vector(K));
    +  vector[N] log_K_std = (log_K - mean(log_K)) / sd(log_K);
    +}
    +parameters {
    +  real mu; // population mean of success log-odds
    +  real<lower=0> tau; // population sd of success log-odds
    +  real b_K;
    +  vector[N] alpha_std; // success log-odds (standardized)
    +}
    +model {
    +  b_K ~ normal(prior_b_K[1], prior_b_K[2]);
    +  mu ~ normal(prior_mu[1], prior_mu[2]);
    +  tau ~ normal(prior_tau[1], prior_tau[2]);
    +  alpha_std ~ normal(0, 1);
    +  if (likelihood){
    +    y ~ binomial_logit(K, mu + b_K * log_K_std + tau * alpha_std);
    +  }
    +}
    +generated quantities {
    +  vector[N] alpha = mu + b_K * log_K_std + tau * alpha_std;
    +  vector[N] yrep;
    +  vector[N] llik;
    +  for (n in 1:N){
    +    yrep[n] = binomial_rng(K[n], inv_logit(alpha[n]));
    +    llik[n] = binomial_logit_lpmf(y[n] | K[n], alpha[n]);
    +  }
    +}

    Generating Stan inputs

    Next I needed to tell the analysis how to turn some prepared data into a dictionary that can be used as input for Stan. Bibat assumes that this task is handled by functions that live in the file baseball/stan_input_functions.py, each of which takes in a PreparedData and returns a Python dictionary. You can write as many Stan input functions as you like and choose which one to run for any given inference.

    I started by defining some Stan input functions that pass arbitrary prepared data on to each of the models:3

  • 3 Note that this code uses the scipy function logit, which it imported like this: from scipy.special import logit

-
    
    -def get_stan_input_normal(ppd: PreparedData) -> Dict:
    -    """General function for creating a Stan input."""
    -    return {
    -        "N": len(ppd.measurements),
    -        "K": ppd.measurements["n_attempt"].values,
    -        "y": ppd.measurements["n_success"].values,
    -        "prior_mu": [logit(0.25), 0.2],
    -        "prior_tau": [0.2, 0.1],
    -        "prior_b_K": [0, 0.03],
    -    }
    -
    -
    -def get_stan_input_gpareto(ppd: PreparedData) -> Dict:
    -    """General function for creating a Stan input."""
    -    return {
    -        "N": len(ppd.measurements),
    -        "K": ppd.measurements["n_attempt"].values,
    -        "y": ppd.measurements["n_success"].values,
    -        "min_alpha": logit(0.07),
    -        "max_alpha": logit(0.5),
    -        "prior_sigma": [1.5, 0.4],
    -        "prior_k": [-0.5, 1],
    -    }
+
def get_stan_input_normal(ppd: PreparedData) -> Dict:
+    """General function for creating a Stan input."""
    +    return {
    +        "N": len(ppd.measurements),
    +        "K": ppd.measurements["n_attempt"].values,
    +        "y": ppd.measurements["n_success"].values,
    +        "prior_mu": [logit(0.25), 0.2],
    +        "prior_tau": [0.2, 0.1],
    +        "prior_b_K": [0, 0.03],
    +    }
    +
    +
    +def get_stan_input_gpareto(ppd: PreparedData) -> Dict:
    +    """General function for creating a Stan input."""
    +    return {
    +        "N": len(ppd.measurements),
    +        "K": ppd.measurements["n_attempt"].values,
    +        "y": ppd.measurements["n_success"].values,
    +        "min_alpha": logit(0.07),
    +        "max_alpha": logit(0.5),
    +        "prior_sigma": [1.5, 0.4],
    +        "prior_k": [-0.5, 1],
    +    }
    +
    +def get_stan_input_normal_fake(ppd: PreparedData) -> Dict:

    But why stop there? It can also be useful to generate Stan input data with a model, by running it in simulation mode with hardcoded parameter values. Here are some functions that do this for both of our models:

    -
    
    -def get_stan_input_normal_fake(ppd: PreparedData) -> Dict:
    -    """Generate fake Stan input consistent with the normal model."""
    -    N = len(ppd.measurements)
    -    rng = np.random.default_rng(seed=1234)
    -    true_param_values = {
    -        "mu": logit(0.25),
    -        "tau": 0.18,  # 2sds is 0.19 to 0.32 batting average
    -        "b_K": 0.04,  # slight effect of more attempts
    -        "alpha_std": rng.random.normal(loc=0, scale=1, size=N),
    -    }
    -    K = ppd.measurements["n_attempt"].values
    -    log_K_std = (np.log(K) - np.log(K).mean()) / np.log(K).std()
    -    alpha = (
    -        true_param_values["mu"]
    -        + true_param_values["b_K"] * log_K_std
    -        + true_param_values["tau"] * true_param_values["alpha_std"]
    -    )
    -    y = rng.random.binomial(K, expit(alpha))
    -    return {"N": N, "K": K, "y": y} | true_param_values
    -
    -
    -def get_stan_input_gpareto_fake(ppd: PreparedData) -> Dict:
    -    """Generate fake Stan input consistent with the gpareto model."""
    -    N = len(ppd.measurements)
    -    K = ppd.measurements["n_attempt"].values
    -    min_alpha = 0.1
    -    rng = np.random.default_rng(seed=1234)
    -    true_param_values = {"sigma": -1.098, "k": 0.18}
    -    true_param_values["alpha"] = gpareto_rvs(
    -        rng,
    -        N,
    -        min_alpha,
    -        true_param_values["k"],
    -        true_param_values["sigma"],
    -    )
    -    y = rng.binomial(K, expit(true_param_values["alpha"]))
    -    return {"N": N, "K": K, "y": y, "min_alpha": min_alpha} | true_param_values
    -
    -
    -def gpareto_rvs(
    -    rng: np.random.Generator, size: int, mu: float, k: float, sigma: float
    -):
    -    """Generate random numbers from a generalised pareto distribution.
    -
    -    See https://en.wikipedia.org/wiki/Generalized_Pareto_distribution for
    -    source.
    -
    -    """
    -    U = rng.uniform(size)
    -    if k == 0:
    -        return mu - sigma * np.log(U)
    -    else:
    -        return mu + (sigma * (U**-k) - 1) / sigma
+
    """Generate fake Stan input consistent with the normal model."""
+    N = len(ppd.measurements)
    +    rng = np.random.default_rng(seed=1234)
    +    true_param_values = {
    +        "mu": logit(0.25),
    +        "tau": 0.18,  # 2sds is 0.19 to 0.32 batting average
    +        "b_K": 0.04,  # slight effect of more attempts
    +        "alpha_std": rng.random.normal(loc=0, scale=1, size=N),
    +    }
    +    K = ppd.measurements["n_attempt"].values
    +    log_K_std = (np.log(K) - np.log(K).mean()) / np.log(K).std()
    +    alpha = (
    +        true_param_values["mu"]
    +        + true_param_values["b_K"] * log_K_std
    +        + true_param_values["tau"] * true_param_values["alpha_std"]
    +    )
+    y = rng.binomial(K, expit(alpha))
    +    return {"N": N, "K": K, "y": y} | true_param_values
    +
    +
    +def get_stan_input_gpareto_fake(ppd: PreparedData) -> Dict:
    +    """Generate fake Stan input consistent with the gpareto model."""
    +    N = len(ppd.measurements)
    +    K = ppd.measurements["n_attempt"].values
    +    min_alpha = 0.1
    +    rng = np.random.default_rng(seed=1234)
    +    true_param_values = {"sigma": -1.098, "k": 0.18}
    +    true_param_values["alpha"] = gpareto_rvs(
    +        rng,
    +        N,
    +        min_alpha,
    +        true_param_values["k"],
    +        true_param_values["sigma"],
    +    )
    +    y = rng.binomial(K, expit(true_param_values["alpha"]))
    +    return {"N": N, "K": K, "y": y, "min_alpha": min_alpha} | true_param_values
    +
    +
    +def gpareto_rvs(
    +    rng: np.random.Generator, size: int, mu: float, k: float, sigma: float
    +):
    +    """Generate random numbers from a generalised pareto distribution.
    +
    +    See https://en.wikipedia.org/wiki/Generalized_Pareto_distribution for
    +    source.
    +
    +    """
+    U = rng.uniform(size=size)
    +    if k == 0:
    +        return mu - sigma * np.log(U)
    +    else:
+        return mu + sigma * (U**-k - 1) / k  # inverse-CDF transform
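
To check the sampler, its empirical quantiles can be compared with scipy's genpareto quantiles. A sketch, assuming gpareto_rvs as defined above is in scope:

import numpy as np
from scipy.stats import genpareto

rng = np.random.default_rng(1234)
draws = gpareto_rvs(rng, 100_000, 0.1, 0.18, 0.33)  # rng, size, mu, k, sigma
print(np.quantile(draws, [0.25, 0.5, 0.75]))
print(genpareto.ppf([0.25, 0.5, 0.75], c=0.18, loc=0.1, scale=0.33))  # close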

    Specifying inferences

Now all the building blocks for making statistical inferences – raw data, data preparation rules, statistical models and recipes for turning prepared data into model inputs – were in place. The last step before actually running Stan was to write down how to put the blocks together. Bibat has another concept for this, called ‘inferences’.

    An inference in bibat is a folder containing a special file called config.toml. This file sets out what inferences you want to make: which statistical model, which prepared data function, which Stan input function, which parameters have which dimensions, which sampling modes to use and how to configure the sampler. The folder will later be filled up with the results of performing the specified inferences.
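
Bibat reads these files itself, but for illustration a config like the ones below can be inspected with Python's standard tomllib module (Python 3.11+; the tomli package provides the same API for older versions):

import tomllib

with open("inferences/gpareto2006/config.toml", "rb") as f:
    config = tomllib.load(f)
print(config["stan_file"], config["modes"], config["sample_kwargs"])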

    I started by deleting the example inferences and creating two fresh folders, leaving me with an inferences folder looking like this:

    -
    inferences
    -├── gpareto2006
    -│   └── config.toml
    -└── normal2006
    -    └── config.toml
    +
    inferences
    +├── gpareto2006
    +│   └── config.toml
    +└── normal2006
    +    └── config.toml

    Here is the file inferences/gpareto2006/config.toml:

    -
    name = "gpareto2006"
    -stan_file = "gpareto.stan"
    -prepared_data_dir = "2006"
    -stan_input_function = "get_stan_input_gpareto"
    -modes = ["prior", "posterior"]
    -
    -[dims]
    -alpha = ["player"]
    -
    -[stanc_options]
    -warn-pedantic = true
    -
    -[sample_kwargs]
    -save_warmup = false
    -iter_warmup = 2000
    -iter_sampling = 2000
    -
    -[mode_options.prior]
    -chains = 2
    -iter_warmup = 1000
    -iter_sampling = 1000
    -

    Here is the file inferences/normal2006/config.toml:

    -
    name = "normal2006"
    -stan_file = "normal.stan"
    +
    name = "gpareto2006"
    +stan_file = "gpareto.stan"
     prepared_data_dir = "2006"
    -stan_input_function = "get_stan_input_normal"
    +stan_input_function = "get_stan_input_gpareto"
     modes = ["prior", "posterior"]
     
     [dims]
    @@ -3674,6 +3658,28 @@ 

    Specifying inferences

chains = 2
iter_warmup = 1000
iter_sampling = 1000
    +

    Here is the file inferences/normal2006/config.toml:

    +
    name = "normal2006"
    +stan_file = "normal.stan"
    +prepared_data_dir = "2006"
    +stan_input_function = "get_stan_input_normal"
    +modes = ["prior", "posterior"]
    +
    +[dims]
    +alpha = ["player"]
    +
    +[stanc_options]
    +warn-pedantic = true
    +
    +[sample_kwargs]
    +save_warmup = false
    +iter_warmup = 2000
    +iter_sampling = 2000
    +
    +[mode_options.prior]
    +chains = 2
    +iter_warmup = 1000
    +iter_sampling = 1000

    Note that:

    • The Stan file, prepared data folder and Stan input function are referred to by strings. The analysis should raise an error if you enter a non-existing value.
@@ -3682,13 +3688,13 @@

Specifying inferences

• You can enter mode-specific overrides in [mode_options.<MODE>]. This can be handy if you want to run more or fewer iterations for a certain mode; the sketch after this list shows one way such an override could work.
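
For instance, treating the TOML tables as dictionaries, the prior-mode sampler settings could come from a plain merge like this (assumed behaviour for illustration, not bibat's actual code):

sample_kwargs = {"save_warmup": False, "iter_warmup": 2000, "iter_sampling": 2000}
mode_options = {"prior": {"chains": 2, "iter_warmup": 1000, "iter_sampling": 1000}}
# mode-specific values win over the general ones
kwargs_prior = sample_kwargs | mode_options["prior"]
# {'save_warmup': False, 'iter_warmup': 1000, 'iter_sampling': 1000, 'chains': 2}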

    With all these changes made I ran make analysis again. There were no errors until after sampling, which was expected as I hadn’t yet customised the investigation code, and I saw messages indicating that Stan had run. I also found that the inferences subfolders had been populated:

    -
    inferences
    -├── gpareto2006
    -│   ├── config.toml
    -│   └── idata.json
    -└── normal2006
    -    ├── config.toml
    -    └── idata.json
    +
    inferences
    +├── gpareto2006
    +│   ├── config.toml
    +│   └── idata.json
    +└── normal2006
    +    ├── config.toml
    +    └── idata.json

    Investigating the inferences

    @@ -3699,34 +3705,34 @@

    Investigating the inferences

    Choosing priors using push-forward calibration

    The trickiest thing about my analysis was setting prior distributions for the parameters \(k\) and \(\sigma\) in the generalised Pareto models. To choose some more or less plausible values I did a few prior-mode model runs and checked the distributions of the resulting alpha variables. I wanted to make sure that they all lay in the range corresponding to batting averages between about 0.1 and a little over 0.4. Here is a graph that shows the 1% to 99% prior quantiles for each player’s latent success percentage in both datasets alongside their actually realised success rates.
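
The quantiles behind this graph can be pulled out of the prior-mode results roughly like this; a sketch that assumes arviz, the idata.json files shown above, and that the prior group stores alpha on the log-odds scale:

import arviz as az
from scipy.special import expit

idata = az.from_json("inferences/gpareto2006/idata.json")
alpha = idata.prior["alpha"]  # prior draws of success log-odds
# 1% to 99% prior quantiles of each player's latent success rate
quantiles = expit(alpha).quantile([0.01, 0.99], dim=["chain", "draw"])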

[figure: 1%-99% prior quantiles of each player's latent success rate, alongside realised success rates, for both datasets]
    Extending the analysis to the baseballdatabank data

    To model the more recent data, all I had to do was create some new inference folders with appropriate prepared_data_dir fields in their config.toml files. For example, here is the config.toml file for the gparetobdb inference:

    -
    name = "gparetobdb"
    -stan_file = "gpareto.stan"
    -prepared_data_dir = "bdb"
    -stan_input_function = "get_stan_input_gpareto"
    -modes = ["prior", "posterior"]
    -
    -[dims]
    -alpha = ["player"]
    -
    -[stanc_options]
    -warn-pedantic = true
    -
    -[sample_kwargs]
    -save_warmup = false
    -iter_warmup = 2000
    -iter_sampling = 2000
    -
    -[mode_options.prior]
    -chains = 2
    -iter_warmup = 2000
    -iter_sampling = 1000
    +
    name = "gparetobdb"
    +stan_file = "gpareto.stan"
    +prepared_data_dir = "bdb"
    +stan_input_function = "get_stan_input_gpareto"
    +modes = ["prior", "posterior"]
    +
    +[dims]
    +alpha = ["player"]
    +
    +[stanc_options]
    +warn-pedantic = true
    +
    +[sample_kwargs]
    +save_warmup = false
    +iter_warmup = 2000
    +iter_sampling = 2000
    +
    +[mode_options.prior]
    +chains = 2
    +iter_warmup = 2000
    +iter_sampling = 1000

    After running make analysis one more time, I went back to the notebook baseball/investigate.ipynb and made plots of both models’ posterior 1%-99% success rate intervals for both datasets:

[figure: posterior 1%-99% success rate intervals from both models, for both datasets]
I think this is very interesting. Both models’ prior distributions had similar regularisation levels, and they more or less agreed about the abilities of the players with the most at-bats, both in terms of the locations and the widths of the plausible intervals for the true success rate. However, the models ended up with dramatically different certainty levels about the abilities of players with few at-bats. This pattern held both for the small 2006 dataset and the much larger baseballdatabank dataset.