diff --git a/bibat/examples/baseball/baseball/data_preparation.py b/bibat/examples/baseball/baseball/data_preparation.py index 374dfd7..52e1fca 100644 --- a/bibat/examples/baseball/baseball/data_preparation.py +++ b/bibat/examples/baseball/baseball/data_preparation.py @@ -7,7 +7,6 @@ import json import os -import numpy as np import pandas as pd import pandera as pa from pandera.typing import DataFrame, Series diff --git a/bibat/examples/baseball/docs/report.html b/bibat/examples/baseball/docs/report.html index 1f6b978..fe8af7b 100644 --- a/bibat/examples/baseball/docs/report.html +++ b/bibat/examples/baseball/docs/report.html @@ -3312,91 +3312,91 @@

Preparing the data

n_attempt: Series[int] = pa.Field(ge = 1) n_success: Series[int] = pa.Field(ge = 0)

The next step is to write functions that return PreparedData objects. In this case I wrote a couple of data preparation functions: prepare_data_2006 and prepare_data_bdb:

-
def prepare_data_2006(measurements_raw: pd.DataFrame) -> PreparedData:
-    """Prepare the 2006 data."""
-    measurements = measurements_raw.rename(
-        columns={"K": "n_attempt", "y": "n_success"}
-    ).assign(
-        season="2006",
-        player_season=lambda df: [f"2006-player-{i+1}" for i in range(len(df))],
-    )
-    return PreparedData(
-        name="2006",
-        coords={
-            "player_season": measurements["player_season"].tolist(),
-            "season": measurements["season"].astype(str).tolist(),
-        },
-        measurements=DataFrame[MeasurementsDF](measurements),
-    )
-
+
def prepare_data_2006(measurements_raw: pd.DataFrame) -> PreparedData:
+    """Prepare the 2006 data."""
+    measurements = measurements_raw.rename(
+        columns={"K": "n_attempt", "y": "n_success"}
+    ).assign(
+        season="2006",
+        player_season=lambda df: [f"2006-player-{i+1}" for i in range(len(df))],
+    )
+    return PreparedData(
+        name="2006",
+        coords={
+            "player_season": measurements["player_season"].tolist(),
+            "season": measurements["season"].astype(str).tolist(),
+        },
+        measurements=DataFrame[MeasurementsDF](measurements),
+    )
 
-def prepare_data_bdb(
-    measurements_main: pd.DataFrame,
-    measurements_post: pd.DataFrame,
-    appearances: pd.DataFrame,
-) -> PreparedData:
-    """Prepare the baseballdatabank data.
-
-    There are a few substantive data choices here.
-
-    First, the function excludes players who have a '1' in their position as
-    these are likely pitchers, as well as players with fewer than 20 at bats.
-
-    Second, the function defines a successes and attempts according to the
-    'on-base percentage' metric, so a success is a time when a player got a hit,
-    a base on ball/walk or a hit-by-pitch and an attempt is an at-bat or a
-    base-on-ball/walk or a hit-by-pitch or a sacrifice fly. This could have
-    alternatively been calculated as just hits divided by at-bats, but my
-    understanding is that this method underrates players who are good at getting
-    walks.
-
-    """
-    pitchers = appearances.loc[
-        lambda df: df["G_p"] == df["G_all"], "playerID"
-    ].unique()
-
-    def filter_batters(df: pd.DataFrame):
-        return (
-            (df["AB"] >= 20)
-            & (df["season"].ge(2017))
-            & (~df["player"].isin(pitchers))
-        )
-
-    measurements_main, measurements_post = (
-        m.rename(columns={"yearID": "season", "playerID": "player"})
-        .assign(
-            player_season=lambda df: df["player"].str.cat(
-                df["season"].astype(str)
-            ),
-            n_attempt=lambda df: df[["AB", "BB", "HBP", "SF"]]
-            .fillna(0)
-            .sum(axis=1)
-            .astype(int),
-            n_success=lambda df: (
-                df[["H", "BB", "HBP"]].fillna(0).sum(axis=1).astype(int)
-            ),
-        )
-        .loc[
-            filter_batters,
-            ["player_season", "season", "n_attempt", "n_success"],
-        ]
-        .copy()
-        for m in [measurements_main, measurements_post]
-    )
-    measurements = (
-        pd.concat([measurements_main, measurements_post])
-        .groupby(["player_season", "season"])
-        .sum()
-        .reset_index()
-    )
-    return PreparedData(
-        name="bdb",
-        coords={
-            "player_season": measurements["player_season"].tolist(),
-            "season": measurements["season"].astype(str).tolist(),
-        },
-        measurements=DataFrame[MeasurementsDF](measurements),
-    )
+ +def prepare_data_bdb( + measurements_main: pd.DataFrame, + measurements_post: pd.DataFrame, + appearances: pd.DataFrame, +) -> PreparedData: + """Prepare the baseballdatabank data. + + There are a few substantive data choices here. + + First, the function excludes players who have a '1' in their position as + these are likely pitchers, as well as players with fewer than 20 at bats. + + Second, the function defines a successes and attempts according to the + 'on-base percentage' metric, so a success is a time when a player got a hit, + a base on ball/walk or a hit-by-pitch and an attempt is an at-bat or a + base-on-ball/walk or a hit-by-pitch or a sacrifice fly. This could have + alternatively been calculated as just hits divided by at-bats, but my + understanding is that this method underrates players who are good at getting + walks. + + """ + pitchers = appearances.loc[ + lambda df: df["G_p"] == df["G_all"], "playerID" + ].unique() + + def filter_batters(df: pd.DataFrame): + return ( + (df["AB"] >= 20) + & (df["season"].ge(2017)) + & (~df["player"].isin(pitchers)) + ) + + measurements_main, measurements_post = ( + m.rename(columns={"yearID": "season", "playerID": "player"}) + .assign( + player_season=lambda df: df["player"].str.cat( + df["season"].astype(str) + ), + n_attempt=lambda df: df[["AB", "BB", "HBP", "SF"]] + .fillna(0) + .sum(axis=1) + .astype(int), + n_success=lambda df: ( + df[["H", "BB", "HBP"]].fillna(0).sum(axis=1).astype(int) + ), + ) + .loc[ + filter_batters, + ["player_season", "season", "n_attempt", "n_success"], + ] + .copy() + for m in [measurements_main, measurements_post] + ) + measurements = ( + pd.concat([measurements_main, measurements_post]) + .groupby(["player_season", "season"]) + .sum() + .reset_index() + ) + return PreparedData( + name="bdb", + coords={ + "player_season": measurements["player_season"].tolist(), + "season": measurements["season"].astype(str).tolist(), + }, + measurements=DataFrame[MeasurementsDF](measurements), + )

To take into account the inconsistency between the two raw data sources, I first had to change the variable RAW_DATA_FILES:

RAW_DATA_FILES = {
     "2006": [os.path.join(RAW_DIR, "2006.csv")],
@@ -3407,26 +3407,27 @@ 

Preparing the data

], }

Next I changed the prepare_data function to handle the two different data sources.

-
def prepare_data():
-    """Run main function."""
-    print("Reading raw data...")
-    raw_data = {
-        k: [pd.read_csv(file, index_col=None) for file in v]
-        for k, v in RAW_DATA_FILES.items()
-    }
-    data_preparation_functions_to_run = {
-        "2006": prepare_data_2006,
-        "bdb": prepare_data_bdb,
-    }
-    print("Preparing data...")
-    for name, dpf in data_preparation_functions_to_run.items():
-        print(f"Running data preparation function {dpf.__name__}...")
-        prepared_data = dpf(*raw_data[name])
-        output_dir = os.path.join(PREPARED_DIR, prepared_data.name)
-        print(f"\twriting files to {output_dir}")
-        if not os.path.exists(PREPARED_DIR):
-            os.mkdir(PREPARED_DIR)
-        write_prepared_data(prepared_data, output_dir)
+
def prepare_data():
+    """Run main function."""
+    print("Reading raw data...")
+    raw_data = {
+        k: [pd.read_csv(file, index_col=None) for file in v]
+        for k, v in RAW_DATA_FILES.items()
+    }
+    data_preparation_functions_to_run = {
+        "2006": prepare_data_2006,
+        "bdb": prepare_data_bdb,
+    }
+    print("Preparing data...")
+    for name, dpf in data_preparation_functions_to_run.items():
+        print(f"Running data preparation function {dpf.__name__}...")
+        prepared_data = dpf(*raw_data[name])
+        output_dir = os.path.join(PREPARED_DIR, prepared_data.name)
+        print(f"\twriting files to {output_dir}")
+        if not os.path.exists(PREPARED_DIR):
+            os.mkdir(PREPARED_DIR)
+        write_prepared_data(prepared_data, output_dir)
+

To finish off I deleted the unused global variables NEW_COLNAMES, DROPNA_COLS and DIMS, then checked if the function load_prepared_data needed any changes: I was pretty sure it didn’t.

To check that all this worked, I ran the data preparation script manually:2

  • 2 I could also have just run make analysis again. This would have caused an error on the step after prepare_data.py, which is fine!

  • > source .venv/bin/activate
    @@ -3549,82 +3550,84 @@ 

    Specifying statistical models

    Generating Stan inputs

    Next I needed to tell the analysis how to turn some prepared data into a dictionary that can be used as input for Stan. Bibat assumes that this task is handled by functions that live in the file baseball/stan_input_functions.py, each of which takes in a PreparedData and returns a Python dictionary. You can write as many Stan input functions as you like and choose which one to run for any given inference.

    I started by defining some Stan input functions that pass arbitrary prepared data on to each of the models:3

  • 3 Note that this code uses the scipy function logit, which it imported like this: from scipy.special import logit

  • -
        """General function for creating a Stan input."""
    -    return {
    -        "N": len(ppd.measurements),
    -        "K": ppd.measurements["n_attempt"].values,
    -        "y": ppd.measurements["n_success"].values,
    -        "prior_mu": [logit(0.25), 0.2],
    -        "prior_tau": [0.2, 0.1],
    -        "prior_b_K": [0, 0.03],
    -    }
    -
    +
    def get_stan_input_normal(ppd: PreparedData) -> Dict:
    +    """General function for creating a Stan input."""
    +    return {
    +        "N": len(ppd.measurements),
    +        "K": ppd.measurements["n_attempt"].values,
    +        "y": ppd.measurements["n_success"].values,
    +        "prior_mu": [logit(0.25), 0.2],
    +        "prior_tau": [0.2, 0.1],
    +        "prior_b_K": [0, 0.03],
    +    }
     
    -def get_stan_input_gpareto(ppd: PreparedData) -> Dict:
    -    """General function for creating a Stan input."""
    -    return {
    -        "N": len(ppd.measurements),
    -        "K": ppd.measurements["n_attempt"].values,
    -        "y": ppd.measurements["n_success"].values,
    -        "min_alpha": logit(0.07),
    -        "max_alpha": logit(0.5),
    -        "prior_sigma": [1.5, 0.4],
    -        "prior_k": [-0.5, 1],
    -    }
    -
    -def get_stan_input_normal_fake(ppd: PreparedData) -> Dict:
    + +def get_stan_input_gpareto(ppd: PreparedData) -> Dict: + """General function for creating a Stan input.""" + return { + "N": len(ppd.measurements), + "K": ppd.measurements["n_attempt"].values, + "y": ppd.measurements["n_success"].values, + "min_alpha": logit(0.07), + "max_alpha": logit(0.5), + "prior_sigma": [1.5, 0.4], + "prior_k": [-0.5, 1], + } +

    But why stop there? It can also be useful to generate Stan input data with a model, by running it in simulation mode with hardcoded parameter values. Here are some functions that do this for both of our models:

    -
        N = len(ppd.measurements)
    -    rng = np.random.default_rng(seed=1234)
    -    true_param_values = {
    -        "mu": logit(0.25),
    -        "tau": 0.18,  # 2sds is 0.19 to 0.32 batting average
    -        "b_K": 0.04,  # slight effect of more attempts
    -        "alpha_std": rng.random.normal(loc=0, scale=1, size=N),
    -    }
    -    K = ppd.measurements["n_attempt"].values
    -    log_K_std = (np.log(K) - np.log(K).mean()) / np.log(K).std()
    -    alpha = (
    -        true_param_values["mu"]
    -        + true_param_values["b_K"] * log_K_std
    -        + true_param_values["tau"] * true_param_values["alpha_std"]
    -    )
    -    y = rng.random.binomial(K, expit(alpha))
    -    return {"N": N, "K": K, "y": y} | true_param_values
    -
    -
    -def get_stan_input_gpareto_fake(ppd: PreparedData) -> Dict:
    -    """Generate fake Stan input consistent with the gpareto model."""
    -    N = len(ppd.measurements)
    -    K = ppd.measurements["n_attempt"].values
    -    min_alpha = 0.1
    -    rng = np.random.default_rng(seed=1234)
    -    true_param_values = {"sigma": -1.098, "k": 0.18}
    -    true_param_values["alpha"] = gpareto_rvs(
    -        rng,
    -        N,
    -        min_alpha,
    -        true_param_values["k"],
    -        true_param_values["sigma"],
    -    )
    -    y = rng.binomial(K, expit(true_param_values["alpha"]))
    -    return {"N": N, "K": K, "y": y, "min_alpha": min_alpha} | true_param_values
    -
    -
    -def gpareto_rvs(
    -    rng: np.random.Generator, size: int, mu: float, k: float, sigma: float
    -):
    -    """Generate random numbers from a generalised pareto distribution.
    -
    -    See https://en.wikipedia.org/wiki/Generalized_Pareto_distribution for
    -    source.
    -
    -    """
    -    U = rng.uniform(size)
    -    if k == 0:
    -        return mu - sigma * np.log(U)
    -    else:
    -        return mu + (sigma * (U**-k) - 1) / sigma
    +
    def get_stan_input_normal_fake(ppd: PreparedData) -> Dict:
    +    """Generate fake Stan input consistent with the normal model."""
    +    N = len(ppd.measurements)
    +    rng = np.random.default_rng(seed=1234)
    +    true_param_values = {
    +        "mu": logit(0.25),
    +        "tau": 0.18,  # 2sds is 0.19 to 0.32 batting average
    +        "b_K": 0.04,  # slight effect of more attempts
    +        "alpha_std": rng.random.normal(loc=0, scale=1, size=N),
    +    }
    +    K = ppd.measurements["n_attempt"].values
    +    log_K_std = (np.log(K) - np.log(K).mean()) / np.log(K).std()
    +    alpha = (
    +        true_param_values["mu"]
    +        + true_param_values["b_K"] * log_K_std
    +        + true_param_values["tau"] * true_param_values["alpha_std"]
    +    )
    +    y = rng.random.binomial(K, expit(alpha))
    +    return {"N": N, "K": K, "y": y} | true_param_values
    +
    +
    +def get_stan_input_gpareto_fake(ppd: PreparedData) -> Dict:
    +    """Generate fake Stan input consistent with the gpareto model."""
    +    N = len(ppd.measurements)
    +    K = ppd.measurements["n_attempt"].values
    +    min_alpha = 0.1
    +    rng = np.random.default_rng(seed=1234)
    +    true_param_values = {"sigma": -1.098, "k": 0.18}
    +    true_param_values["alpha"] = gpareto_rvs(
    +        rng,
    +        N,
    +        min_alpha,
    +        true_param_values["k"],
    +        true_param_values["sigma"],
    +    )
    +    y = rng.binomial(K, expit(true_param_values["alpha"]))
    +    return {"N": N, "K": K, "y": y, "min_alpha": min_alpha} | true_param_values
    +
    +
    +def gpareto_rvs(
    +    rng: np.random.Generator, size: int, mu: float, k: float, sigma: float
    +):
    +    """Generate random numbers from a generalised pareto distribution.
    +
    +    See https://en.wikipedia.org/wiki/Generalized_Pareto_distribution for
    +    source.
    +
    +    """
    +    U = rng.uniform(size)
    +    if k == 0:
    +        return mu - sigma * np.log(U)
    +    else:
    +        return mu + (sigma * (U**-k) - 1) / sigma

    Specifying inferences

    diff --git a/bibat/examples/baseball/docs/report.qmd b/bibat/examples/baseball/docs/report.qmd index cf0a916..d837f91 100644 --- a/bibat/examples/baseball/docs/report.qmd +++ b/bibat/examples/baseball/docs/report.qmd @@ -212,7 +212,7 @@ The next step is to write functions that return `PreparedData` objects. In this case I wrote a couple of data preparation functions: `prepare_data_2006` and `prepare_data_bdb`: -```{.python include="../baseball/data_preparation.py" start-line=77 end-line=161} +```{.python include="../baseball/data_preparation.py" start-line=78 end-line=162} ``` To take into account the inconsistency between the two raw data sources, I @@ -224,7 +224,7 @@ first had to change the variable `RAW_DATA_FILES`: Next I changed the `prepare_data` function to handle the two different data sources. -```{.python include="../baseball/data_preparation.py" start-line=35 end-line=54} +```{.python include="../baseball/data_preparation.py" start-line=36 end-line=56} ``` To finish off I deleted the unused global variables `NEW_COLNAMES`, diff --git a/docs/_static/report.html b/docs/_static/report.html index 1f6b978..fe8af7b 100644 --- a/docs/_static/report.html +++ b/docs/_static/report.html @@ -3312,91 +3312,91 @@

    Preparing the data

    n_attempt: Series[int] = pa.Field(ge = 1) n_success: Series[int] = pa.Field(ge = 0)

    The next step is to write functions that return PreparedData objects. In this case I wrote a couple of data preparation functions: prepare_data_2006 and prepare_data_bdb:

    -
    def prepare_data_2006(measurements_raw: pd.DataFrame) -> PreparedData:
    -    """Prepare the 2006 data."""
    -    measurements = measurements_raw.rename(
    -        columns={"K": "n_attempt", "y": "n_success"}
    -    ).assign(
    -        season="2006",
    -        player_season=lambda df: [f"2006-player-{i+1}" for i in range(len(df))],
    -    )
    -    return PreparedData(
    -        name="2006",
    -        coords={
    -            "player_season": measurements["player_season"].tolist(),
    -            "season": measurements["season"].astype(str).tolist(),
    -        },
    -        measurements=DataFrame[MeasurementsDF](measurements),
    -    )
    -
    +
    def prepare_data_2006(measurements_raw: pd.DataFrame) -> PreparedData:
    +    """Prepare the 2006 data."""
    +    measurements = measurements_raw.rename(
    +        columns={"K": "n_attempt", "y": "n_success"}
    +    ).assign(
    +        season="2006",
    +        player_season=lambda df: [f"2006-player-{i+1}" for i in range(len(df))],
    +    )
    +    return PreparedData(
    +        name="2006",
    +        coords={
    +            "player_season": measurements["player_season"].tolist(),
    +            "season": measurements["season"].astype(str).tolist(),
    +        },
    +        measurements=DataFrame[MeasurementsDF](measurements),
    +    )
     
    -def prepare_data_bdb(
    -    measurements_main: pd.DataFrame,
    -    measurements_post: pd.DataFrame,
    -    appearances: pd.DataFrame,
    -) -> PreparedData:
    -    """Prepare the baseballdatabank data.
    -
    -    There are a few substantive data choices here.
    -
    -    First, the function excludes players who have a '1' in their position as
    -    these are likely pitchers, as well as players with fewer than 20 at bats.
    -
    -    Second, the function defines a successes and attempts according to the
    -    'on-base percentage' metric, so a success is a time when a player got a hit,
    -    a base on ball/walk or a hit-by-pitch and an attempt is an at-bat or a
    -    base-on-ball/walk or a hit-by-pitch or a sacrifice fly. This could have
    -    alternatively been calculated as just hits divided by at-bats, but my
    -    understanding is that this method underrates players who are good at getting
    -    walks.
    -
    -    """
    -    pitchers = appearances.loc[
    -        lambda df: df["G_p"] == df["G_all"], "playerID"
    -    ].unique()
    -
    -    def filter_batters(df: pd.DataFrame):
    -        return (
    -            (df["AB"] >= 20)
    -            & (df["season"].ge(2017))
    -            & (~df["player"].isin(pitchers))
    -        )
    -
    -    measurements_main, measurements_post = (
    -        m.rename(columns={"yearID": "season", "playerID": "player"})
    -        .assign(
    -            player_season=lambda df: df["player"].str.cat(
    -                df["season"].astype(str)
    -            ),
    -            n_attempt=lambda df: df[["AB", "BB", "HBP", "SF"]]
    -            .fillna(0)
    -            .sum(axis=1)
    -            .astype(int),
    -            n_success=lambda df: (
    -                df[["H", "BB", "HBP"]].fillna(0).sum(axis=1).astype(int)
    -            ),
    -        )
    -        .loc[
    -            filter_batters,
    -            ["player_season", "season", "n_attempt", "n_success"],
    -        ]
    -        .copy()
    -        for m in [measurements_main, measurements_post]
    -    )
    -    measurements = (
    -        pd.concat([measurements_main, measurements_post])
    -        .groupby(["player_season", "season"])
    -        .sum()
    -        .reset_index()
    -    )
    -    return PreparedData(
    -        name="bdb",
    -        coords={
    -            "player_season": measurements["player_season"].tolist(),
    -            "season": measurements["season"].astype(str).tolist(),
    -        },
    -        measurements=DataFrame[MeasurementsDF](measurements),
    -    )
    + +def prepare_data_bdb( + measurements_main: pd.DataFrame, + measurements_post: pd.DataFrame, + appearances: pd.DataFrame, +) -> PreparedData: + """Prepare the baseballdatabank data. + + There are a few substantive data choices here. + + First, the function excludes players who have a '1' in their position as + these are likely pitchers, as well as players with fewer than 20 at bats. + + Second, the function defines a successes and attempts according to the + 'on-base percentage' metric, so a success is a time when a player got a hit, + a base on ball/walk or a hit-by-pitch and an attempt is an at-bat or a + base-on-ball/walk or a hit-by-pitch or a sacrifice fly. This could have + alternatively been calculated as just hits divided by at-bats, but my + understanding is that this method underrates players who are good at getting + walks. + + """ + pitchers = appearances.loc[ + lambda df: df["G_p"] == df["G_all"], "playerID" + ].unique() + + def filter_batters(df: pd.DataFrame): + return ( + (df["AB"] >= 20) + & (df["season"].ge(2017)) + & (~df["player"].isin(pitchers)) + ) + + measurements_main, measurements_post = ( + m.rename(columns={"yearID": "season", "playerID": "player"}) + .assign( + player_season=lambda df: df["player"].str.cat( + df["season"].astype(str) + ), + n_attempt=lambda df: df[["AB", "BB", "HBP", "SF"]] + .fillna(0) + .sum(axis=1) + .astype(int), + n_success=lambda df: ( + df[["H", "BB", "HBP"]].fillna(0).sum(axis=1).astype(int) + ), + ) + .loc[ + filter_batters, + ["player_season", "season", "n_attempt", "n_success"], + ] + .copy() + for m in [measurements_main, measurements_post] + ) + measurements = ( + pd.concat([measurements_main, measurements_post]) + .groupby(["player_season", "season"]) + .sum() + .reset_index() + ) + return PreparedData( + name="bdb", + coords={ + "player_season": measurements["player_season"].tolist(), + "season": measurements["season"].astype(str).tolist(), + }, + measurements=DataFrame[MeasurementsDF](measurements), + )

    To take into account the inconsistency between the two raw data sources, I first had to change the variable RAW_DATA_FILES:

    RAW_DATA_FILES = {
         "2006": [os.path.join(RAW_DIR, "2006.csv")],
    @@ -3407,26 +3407,27 @@ 

    Preparing the data

    ], }

    Next I changed the prepare_data function to handle the two different data sources.

    -
    def prepare_data():
    -    """Run main function."""
    -    print("Reading raw data...")
    -    raw_data = {
    -        k: [pd.read_csv(file, index_col=None) for file in v]
    -        for k, v in RAW_DATA_FILES.items()
    -    }
    -    data_preparation_functions_to_run = {
    -        "2006": prepare_data_2006,
    -        "bdb": prepare_data_bdb,
    -    }
    -    print("Preparing data...")
    -    for name, dpf in data_preparation_functions_to_run.items():
    -        print(f"Running data preparation function {dpf.__name__}...")
    -        prepared_data = dpf(*raw_data[name])
    -        output_dir = os.path.join(PREPARED_DIR, prepared_data.name)
    -        print(f"\twriting files to {output_dir}")
    -        if not os.path.exists(PREPARED_DIR):
    -            os.mkdir(PREPARED_DIR)
    -        write_prepared_data(prepared_data, output_dir)
    +
    def prepare_data():
    +    """Run main function."""
    +    print("Reading raw data...")
    +    raw_data = {
    +        k: [pd.read_csv(file, index_col=None) for file in v]
    +        for k, v in RAW_DATA_FILES.items()
    +    }
    +    data_preparation_functions_to_run = {
    +        "2006": prepare_data_2006,
    +        "bdb": prepare_data_bdb,
    +    }
    +    print("Preparing data...")
    +    for name, dpf in data_preparation_functions_to_run.items():
    +        print(f"Running data preparation function {dpf.__name__}...")
    +        prepared_data = dpf(*raw_data[name])
    +        output_dir = os.path.join(PREPARED_DIR, prepared_data.name)
    +        print(f"\twriting files to {output_dir}")
    +        if not os.path.exists(PREPARED_DIR):
    +            os.mkdir(PREPARED_DIR)
    +        write_prepared_data(prepared_data, output_dir)
    +

    To finish off I deleted the unused global variables NEW_COLNAMES, DROPNA_COLS and DIMS, then checked if the function load_prepared_data needed any changes: I was pretty sure it didn’t.

    To check that all this worked, I ran the data preparation script manually:2

  • 2 I could also have just run make analysis again. This would have caused an error on the step after prepare_data.py, which is fine!

  • > source .venv/bin/activate
    @@ -3549,82 +3550,84 @@ 

    Specifying statistical models

    Generating Stan inputs

    Next I needed to tell the analysis how to turn some prepared data into a dictionary that can be used as input for Stan. Bibat assumes that this task is handled by functions that live in the file baseball/stan_input_functions.py, each of which takes in a PreparedData and returns a Python dictionary. You can write as many Stan input functions as you like and choose which one to run for any given inference.

    I started by defining some Stan input functions that pass arbitrary prepared data on to each of the models:3

  • 3 Note that this code uses the scipy function logit, which it imported like this: from scipy.special import logit

  • -
        """General function for creating a Stan input."""
    -    return {
    -        "N": len(ppd.measurements),
    -        "K": ppd.measurements["n_attempt"].values,
    -        "y": ppd.measurements["n_success"].values,
    -        "prior_mu": [logit(0.25), 0.2],
    -        "prior_tau": [0.2, 0.1],
    -        "prior_b_K": [0, 0.03],
    -    }
    -
    +
    def get_stan_input_normal(ppd: PreparedData) -> Dict:
    +    """General function for creating a Stan input."""
    +    return {
    +        "N": len(ppd.measurements),
    +        "K": ppd.measurements["n_attempt"].values,
    +        "y": ppd.measurements["n_success"].values,
    +        "prior_mu": [logit(0.25), 0.2],
    +        "prior_tau": [0.2, 0.1],
    +        "prior_b_K": [0, 0.03],
    +    }
     
    -def get_stan_input_gpareto(ppd: PreparedData) -> Dict:
    -    """General function for creating a Stan input."""
    -    return {
    -        "N": len(ppd.measurements),
    -        "K": ppd.measurements["n_attempt"].values,
    -        "y": ppd.measurements["n_success"].values,
    -        "min_alpha": logit(0.07),
    -        "max_alpha": logit(0.5),
    -        "prior_sigma": [1.5, 0.4],
    -        "prior_k": [-0.5, 1],
    -    }
    -
    -def get_stan_input_normal_fake(ppd: PreparedData) -> Dict:
    + +def get_stan_input_gpareto(ppd: PreparedData) -> Dict: + """General function for creating a Stan input.""" + return { + "N": len(ppd.measurements), + "K": ppd.measurements["n_attempt"].values, + "y": ppd.measurements["n_success"].values, + "min_alpha": logit(0.07), + "max_alpha": logit(0.5), + "prior_sigma": [1.5, 0.4], + "prior_k": [-0.5, 1], + } +

    But why stop there? It can also be useful to generate Stan input data with a model, by running it in simulation mode with hardcoded parameter values. Here are some functions that do this for both of our models:

    -
        N = len(ppd.measurements)
    -    rng = np.random.default_rng(seed=1234)
    -    true_param_values = {
    -        "mu": logit(0.25),
    -        "tau": 0.18,  # 2sds is 0.19 to 0.32 batting average
    -        "b_K": 0.04,  # slight effect of more attempts
    -        "alpha_std": rng.random.normal(loc=0, scale=1, size=N),
    -    }
    -    K = ppd.measurements["n_attempt"].values
    -    log_K_std = (np.log(K) - np.log(K).mean()) / np.log(K).std()
    -    alpha = (
    -        true_param_values["mu"]
    -        + true_param_values["b_K"] * log_K_std
    -        + true_param_values["tau"] * true_param_values["alpha_std"]
    -    )
    -    y = rng.random.binomial(K, expit(alpha))
    -    return {"N": N, "K": K, "y": y} | true_param_values
    -
    -
    -def get_stan_input_gpareto_fake(ppd: PreparedData) -> Dict:
    -    """Generate fake Stan input consistent with the gpareto model."""
    -    N = len(ppd.measurements)
    -    K = ppd.measurements["n_attempt"].values
    -    min_alpha = 0.1
    -    rng = np.random.default_rng(seed=1234)
    -    true_param_values = {"sigma": -1.098, "k": 0.18}
    -    true_param_values["alpha"] = gpareto_rvs(
    -        rng,
    -        N,
    -        min_alpha,
    -        true_param_values["k"],
    -        true_param_values["sigma"],
    -    )
    -    y = rng.binomial(K, expit(true_param_values["alpha"]))
    -    return {"N": N, "K": K, "y": y, "min_alpha": min_alpha} | true_param_values
    -
    -
    -def gpareto_rvs(
    -    rng: np.random.Generator, size: int, mu: float, k: float, sigma: float
    -):
    -    """Generate random numbers from a generalised pareto distribution.
    -
    -    See https://en.wikipedia.org/wiki/Generalized_Pareto_distribution for
    -    source.
    -
    -    """
    -    U = rng.uniform(size)
    -    if k == 0:
    -        return mu - sigma * np.log(U)
    -    else:
    -        return mu + (sigma * (U**-k) - 1) / sigma
    +
    def get_stan_input_normal_fake(ppd: PreparedData) -> Dict:
    +    """Generate fake Stan input consistent with the normal model."""
    +    N = len(ppd.measurements)
    +    rng = np.random.default_rng(seed=1234)
    +    true_param_values = {
    +        "mu": logit(0.25),
    +        "tau": 0.18,  # 2sds is 0.19 to 0.32 batting average
    +        "b_K": 0.04,  # slight effect of more attempts
    +        "alpha_std": rng.random.normal(loc=0, scale=1, size=N),
    +    }
    +    K = ppd.measurements["n_attempt"].values
    +    log_K_std = (np.log(K) - np.log(K).mean()) / np.log(K).std()
    +    alpha = (
    +        true_param_values["mu"]
    +        + true_param_values["b_K"] * log_K_std
    +        + true_param_values["tau"] * true_param_values["alpha_std"]
    +    )
    +    y = rng.random.binomial(K, expit(alpha))
    +    return {"N": N, "K": K, "y": y} | true_param_values
    +
    +
    +def get_stan_input_gpareto_fake(ppd: PreparedData) -> Dict:
    +    """Generate fake Stan input consistent with the gpareto model."""
    +    N = len(ppd.measurements)
    +    K = ppd.measurements["n_attempt"].values
    +    min_alpha = 0.1
    +    rng = np.random.default_rng(seed=1234)
    +    true_param_values = {"sigma": -1.098, "k": 0.18}
    +    true_param_values["alpha"] = gpareto_rvs(
    +        rng,
    +        N,
    +        min_alpha,
    +        true_param_values["k"],
    +        true_param_values["sigma"],
    +    )
    +    y = rng.binomial(K, expit(true_param_values["alpha"]))
    +    return {"N": N, "K": K, "y": y, "min_alpha": min_alpha} | true_param_values
    +
    +
    +def gpareto_rvs(
    +    rng: np.random.Generator, size: int, mu: float, k: float, sigma: float
    +):
    +    """Generate random numbers from a generalised pareto distribution.
    +
    +    See https://en.wikipedia.org/wiki/Generalized_Pareto_distribution for
    +    source.
    +
    +    """
    +    U = rng.uniform(size)
    +    if k == 0:
    +        return mu - sigma * np.log(U)
    +    else:
    +        return mu + (sigma * (U**-k) - 1) / sigma

    Specifying inferences