diff --git a/bibat/examples/baseball/baseball/data_preparation.py b/bibat/examples/baseball/baseball/data_preparation.py
index 374dfd7..52e1fca 100644
--- a/bibat/examples/baseball/baseball/data_preparation.py
+++ b/bibat/examples/baseball/baseball/data_preparation.py
@@ -7,7 +7,6 @@
import json
import os
-import numpy as np
import pandas as pd
import pandera as pa
from pandera.typing import DataFrame, Series
diff --git a/bibat/examples/baseball/docs/report.html b/bibat/examples/baseball/docs/report.html
index 1f6b978..fe8af7b 100644
--- a/bibat/examples/baseball/docs/report.html
+++ b/bibat/examples/baseball/docs/report.html
@@ -3312,91 +3312,91 @@
The next step is to write functions that return PreparedData objects. In this case I wrote a couple of data preparation functions: prepare_data_2006 and prepare_data_bdb:
-def prepare_data_2006(measurements_raw: pd.DataFrame) -> PreparedData:
-    """Prepare the 2006 data."""
-    measurements = measurements_raw.rename(
-        columns={"K": "n_attempt", "y": "n_success"}
-    ).assign(
-        season="2006",
-        player_season=lambda df: [f"2006-player-{i+1}" for i in range(len(df))],
-    )
-    return PreparedData(
-        name="2006",
-        coords={
-            "player_season": measurements["player_season"].tolist(),
-            "season": measurements["season"].astype(str).tolist(),
-        },
-        measurements=DataFrame[MeasurementsDF](measurements),
-    )
+def prepare_data_2006(measurements_raw: pd.DataFrame) -> PreparedData:
+    """Prepare the 2006 data."""
+    measurements = measurements_raw.rename(
+        columns={"K": "n_attempt", "y": "n_success"}
+    ).assign(
+        season="2006",
+        player_season=lambda df: [f"2006-player-{i+1}" for i in range(len(df))],
+    )
+    return PreparedData(
+        name="2006",
+        coords={
+            "player_season": measurements["player_season"].tolist(),
+            "season": measurements["season"].astype(str).tolist(),
+        },
+        measurements=DataFrame[MeasurementsDF](measurements),
+    )
-def prepare_data_bdb(
-    measurements_main: pd.DataFrame,
-    measurements_post: pd.DataFrame,
-    appearances: pd.DataFrame,
-) -> PreparedData:
-    """Prepare the baseballdatabank data.
-
-    There are a few substantive data choices here.
-
-    First, the function excludes players who have a '1' in their position as
-    these are likely pitchers, as well as players with fewer than 20 at bats.
-
-    Second, the function defines successes and attempts according to the
-    'on-base percentage' metric, so a success is a time when a player got a hit,
-    a base on ball/walk or a hit-by-pitch and an attempt is an at-bat or a
-    base-on-ball/walk or a hit-by-pitch or a sacrifice fly. This could have
-    alternatively been calculated as just hits divided by at-bats, but my
-    understanding is that this method underrates players who are good at getting
-    walks.
-
-    """
-    pitchers = appearances.loc[
-        lambda df: df["G_p"] == df["G_all"], "playerID"
-    ].unique()
-
-    def filter_batters(df: pd.DataFrame):
-        return (
-            (df["AB"] >= 20)
-            & (df["season"].ge(2017))
-            & (~df["player"].isin(pitchers))
-        )
-
-    measurements_main, measurements_post = (
-        m.rename(columns={"yearID": "season", "playerID": "player"})
-        .assign(
-            player_season=lambda df: df["player"].str.cat(
-                df["season"].astype(str)
-            ),
-            n_attempt=lambda df: df[["AB", "BB", "HBP", "SF"]]
-            .fillna(0)
-            .sum(axis=1)
-            .astype(int),
-            n_success=lambda df: (
-                df[["H", "BB", "HBP"]].fillna(0).sum(axis=1).astype(int)
-            ),
-        )
-        .loc[
-            filter_batters,
-            ["player_season", "season", "n_attempt", "n_success"],
-        ]
-        .copy()
-        for m in [measurements_main, measurements_post]
-    )
-    measurements = (
-        pd.concat([measurements_main, measurements_post])
-        .groupby(["player_season", "season"])
-        .sum()
-        .reset_index()
-    )
-    return PreparedData(
-        name="bdb",
-        coords={
-            "player_season": measurements["player_season"].tolist(),
-            "season": measurements["season"].astype(str).tolist(),
-        },
-        measurements=DataFrame[MeasurementsDF](measurements),
-    )
+
+def prepare_data_bdb(
+    measurements_main: pd.DataFrame,
+    measurements_post: pd.DataFrame,
+    appearances: pd.DataFrame,
+) -> PreparedData:
+    """Prepare the baseballdatabank data.
+
+    There are a few substantive data choices here.
+
+    First, the function excludes players who have a '1' in their position as
+    these are likely pitchers, as well as players with fewer than 20 at bats.
+
+    Second, the function defines successes and attempts according to the
+    'on-base percentage' metric, so a success is a time when a player got a hit,
+    a base on ball/walk or a hit-by-pitch and an attempt is an at-bat or a
+    base-on-ball/walk or a hit-by-pitch or a sacrifice fly. This could have
+    alternatively been calculated as just hits divided by at-bats, but my
+    understanding is that this method underrates players who are good at getting
+    walks.
+
+    """
+    pitchers = appearances.loc[
+        lambda df: df["G_p"] == df["G_all"], "playerID"
+    ].unique()
+
+    def filter_batters(df: pd.DataFrame):
+        return (
+            (df["AB"] >= 20)
+            & (df["season"].ge(2017))
+            & (~df["player"].isin(pitchers))
+        )
+
+    measurements_main, measurements_post = (
+        m.rename(columns={"yearID": "season", "playerID": "player"})
+        .assign(
+            player_season=lambda df: df["player"].str.cat(
+                df["season"].astype(str)
+            ),
+            n_attempt=lambda df: df[["AB", "BB", "HBP", "SF"]]
+            .fillna(0)
+            .sum(axis=1)
+            .astype(int),
+            n_success=lambda df: (
+                df[["H", "BB", "HBP"]].fillna(0).sum(axis=1).astype(int)
+            ),
+        )
+        .loc[
+            filter_batters,
+            ["player_season", "season", "n_attempt", "n_success"],
+        ]
+        .copy()
+        for m in [measurements_main, measurements_post]
+    )
+    measurements = (
+        pd.concat([measurements_main, measurements_post])
+        .groupby(["player_season", "season"])
+        .sum()
+        .reset_index()
+    )
+    return PreparedData(
+        name="bdb",
+        coords={
+            "player_season": measurements["player_season"].tolist(),
+            "season": measurements["season"].astype(str).tolist(),
+        },
+        measurements=DataFrame[MeasurementsDF](measurements),
+    )
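To make the on-base-percentage bookkeeping concrete, here is a tiny sketch with invented numbers (none of them come from the baseballdatabank tables) showing how a single player-season turns into the two columns:
# Hypothetical player-season; every figure below is made up for illustration.
row = {"AB": 400, "H": 120, "BB": 50, "HBP": 5, "SF": 5}
n_attempt = row["AB"] + row["BB"] + row["HBP"] + row["SF"]  # 460, the OBP denominator
n_success = row["H"] + row["BB"] + row["HBP"]               # 175, the OBP numerator
print(n_success / n_attempt)                                # roughly 0.380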
To take into account the inconsistency between the two raw data sources, I first had to change the variable RAW_DATA_FILES:
RAW_DATA_FILES = {
    "2006": [os.path.join(RAW_DIR, "2006.csv")],
@@ -3407,26 +3407,27 @@ Preparing the data
    ],
}
Next I changed the prepare_data function to handle the two different data sources.
-def prepare_data():
-    """Run main function."""
-    print("Reading raw data...")
-    raw_data = {
-        k: [pd.read_csv(file, index_col=None) for file in v]
-        for k, v in RAW_DATA_FILES.items()
-    }
-    data_preparation_functions_to_run = {
-        "2006": prepare_data_2006,
-        "bdb": prepare_data_bdb,
-    }
-    print("Preparing data...")
-    for name, dpf in data_preparation_functions_to_run.items():
-        print(f"Running data preparation function {dpf.__name__}...")
-        prepared_data = dpf(*raw_data[name])
-        output_dir = os.path.join(PREPARED_DIR, prepared_data.name)
-        print(f"\twriting files to {output_dir}")
-        if not os.path.exists(PREPARED_DIR):
-            os.mkdir(PREPARED_DIR)
-        write_prepared_data(prepared_data, output_dir)
+def prepare_data():
+    """Run main function."""
+    print("Reading raw data...")
+    raw_data = {
+        k: [pd.read_csv(file, index_col=None) for file in v]
+        for k, v in RAW_DATA_FILES.items()
+    }
+    data_preparation_functions_to_run = {
+        "2006": prepare_data_2006,
+        "bdb": prepare_data_bdb,
+    }
+    print("Preparing data...")
+    for name, dpf in data_preparation_functions_to_run.items():
+        print(f"Running data preparation function {dpf.__name__}...")
+        prepared_data = dpf(*raw_data[name])
+        output_dir = os.path.join(PREPARED_DIR, prepared_data.name)
+        print(f"\twriting files to {output_dir}")
+        if not os.path.exists(PREPARED_DIR):
+            os.mkdir(PREPARED_DIR)
+        write_prepared_data(prepared_data, output_dir)
To finish off I deleted the unused global variables NEW_COLNAMES, DROPNA_COLS and DIMS, then checked if the function load_prepared_data needed any changes: I was pretty sure it didn’t.
To check that all this worked, I ran the data preparation script manually:2
2 I could also have just run make analysis again. This would have caused an error on the step after prepare_data.py, which is fine!
> source .venv/bin/activate
@@ -3549,82 +3550,84 @@ Specifying statistical models
Generating Stan inputs
Next I needed to tell the analysis how to turn some prepared data into a dictionary that can be used as input for Stan. Bibat assumes that this task is handled by functions that live in the file baseball/stan_input_functions.py, each of which takes in a PreparedData and returns a Python dictionary. You can write as many Stan input functions as you like and choose which one to run for any given inference.
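As a rough illustration of what that choice amounts to — this is my own sketch rather than bibat's actual wiring, and it assumes the project is importable as the baseball package — an inference could simply look its input function up by name:
import pandas as pd
from baseball.data_preparation import prepare_data_2006
from baseball.stan_input_functions import get_stan_input_gpareto, get_stan_input_normal

# A tiny made-up table in the 2006 raw format (columns K and y).
raw = pd.DataFrame({"K": [45, 50, 60], "y": [10, 14, 20]})
prepared = prepare_data_2006(raw)

# Keep the available Stan input functions in a dictionary and pick one by name,
# for example based on a setting in an inference's configuration.
stan_input_functions = {
    "normal": get_stan_input_normal,
    "gpareto": get_stan_input_gpareto,
}
stan_input = stan_input_functions["gpareto"](prepared)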
I started by defining some Stan input functions that pass arbitrary prepared data on to each of the models:3
3 Note that this code uses the scipy function logit, which it imported like this: from scipy.special import logit
-"""General function for creating a Stan input."""
- return {
- "N": len(ppd.measurements),
- "K": ppd.measurements["n_attempt"].values,
- "y": ppd.measurements["n_success"].values,
- "prior_mu": [logit(0.25), 0.2],
- "prior_tau": [0.2, 0.1],
- "prior_b_K": [0, 0.03],
-
- }
+def get_stan_input_normal(ppd: PreparedData) -> Dict:
+"""General function for creating a Stan input."""
+ return {
+ "N": len(ppd.measurements),
+ "K": ppd.measurements["n_attempt"].values,
+ "y": ppd.measurements["n_success"].values,
+ "prior_mu": [logit(0.25), 0.2],
+ "prior_tau": [0.2, 0.1],
+ "prior_b_K": [0, 0.03],
+
}
-def get_stan_input_gpareto(ppd: PreparedData) -> Dict:
-"""General function for creating a Stan input."""
- return {
- "N": len(ppd.measurements),
- "K": ppd.measurements["n_attempt"].values,
- "y": ppd.measurements["n_success"].values,
- "min_alpha": logit(0.07),
- "max_alpha": logit(0.5),
- "prior_sigma": [1.5, 0.4],
- "prior_k": [-0.5, 1],
-
- }
-def get_stan_input_normal_fake(ppd: PreparedData) -> Dict:
+
+def get_stan_input_gpareto(ppd: PreparedData) -> Dict:
+"""General function for creating a Stan input."""
+ return {
+ "N": len(ppd.measurements),
+ "K": ppd.measurements["n_attempt"].values,
+ "y": ppd.measurements["n_success"].values,
+ "min_alpha": logit(0.07),
+ "max_alpha": logit(0.5),
+ "prior_sigma": [1.5, 0.4],
+ "prior_k": [-0.5, 1],
+
+ }
But why stop there? It can also be useful to generate Stan input data with a model, by running it in simulation mode with hardcoded parameter values. Here are some functions that do this for both of our models:
-    N = len(ppd.measurements)
-    rng = np.random.default_rng(seed=1234)
-    true_param_values = {
-        "mu": logit(0.25),
-        "tau": 0.18,  # 2sds is 0.19 to 0.32 batting average
-        "b_K": 0.04,  # slight effect of more attempts
-        "alpha_std": rng.normal(loc=0, scale=1, size=N),
-    }
-    K = ppd.measurements["n_attempt"].values
-    log_K_std = (np.log(K) - np.log(K).mean()) / np.log(K).std()
-    alpha = (
-        true_param_values["mu"]
-        + true_param_values["b_K"] * log_K_std
-        + true_param_values["tau"] * true_param_values["alpha_std"]
-    )
-    y = rng.binomial(K, expit(alpha))
-    return {"N": N, "K": K, "y": y} | true_param_values
-
-
-def get_stan_input_gpareto_fake(ppd: PreparedData) -> Dict:
-    """Generate fake Stan input consistent with the gpareto model."""
-    N = len(ppd.measurements)
-    K = ppd.measurements["n_attempt"].values
-    min_alpha = 0.1
-    rng = np.random.default_rng(seed=1234)
-    true_param_values = {"sigma": -1.098, "k": 0.18}
-    true_param_values["alpha"] = gpareto_rvs(
-        rng,
-        N,
-        min_alpha,
-        true_param_values["k"],
-        true_param_values["sigma"],
-    )
-    y = rng.binomial(K, expit(true_param_values["alpha"]))
-    return {"N": N, "K": K, "y": y, "min_alpha": min_alpha} | true_param_values
-
-
-def gpareto_rvs(
-    rng: np.random.Generator, size: int, mu: float, k: float, sigma: float
-):
-    """Generate random numbers from a generalised pareto distribution.
-
-    See https://en.wikipedia.org/wiki/Generalized_Pareto_distribution for
-    source.
-
-    """
-    U = rng.uniform(size=size)
-    if k == 0:
-        return mu - sigma * np.log(U)
-    else:
-        return mu + sigma * (U**-k - 1) / k
+def get_stan_input_normal_fake(ppd: PreparedData) -> Dict:
+    """Generate fake Stan input consistent with the normal model."""
+    N = len(ppd.measurements)
+    rng = np.random.default_rng(seed=1234)
+    true_param_values = {
+        "mu": logit(0.25),
+        "tau": 0.18,  # 2sds is 0.19 to 0.32 batting average
+        "b_K": 0.04,  # slight effect of more attempts
+        "alpha_std": rng.normal(loc=0, scale=1, size=N),
+    }
+    K = ppd.measurements["n_attempt"].values
+    log_K_std = (np.log(K) - np.log(K).mean()) / np.log(K).std()
+    alpha = (
+        true_param_values["mu"]
+        + true_param_values["b_K"] * log_K_std
+        + true_param_values["tau"] * true_param_values["alpha_std"]
+    )
+    y = rng.binomial(K, expit(alpha))
+    return {"N": N, "K": K, "y": y} | true_param_values
+
+
+def get_stan_input_gpareto_fake(ppd: PreparedData) -> Dict:
+    """Generate fake Stan input consistent with the gpareto model."""
+    N = len(ppd.measurements)
+    K = ppd.measurements["n_attempt"].values
+    min_alpha = 0.1
+    rng = np.random.default_rng(seed=1234)
+    true_param_values = {"sigma": -1.098, "k": 0.18}
+    true_param_values["alpha"] = gpareto_rvs(
+        rng,
+        N,
+        min_alpha,
+        true_param_values["k"],
+        true_param_values["sigma"],
+    )
+    y = rng.binomial(K, expit(true_param_values["alpha"]))
+    return {"N": N, "K": K, "y": y, "min_alpha": min_alpha} | true_param_values
+
+
+def gpareto_rvs(
+    rng: np.random.Generator, size: int, mu: float, k: float, sigma: float
+):
+    """Generate random numbers from a generalised pareto distribution.
+
+    See https://en.wikipedia.org/wiki/Generalized_Pareto_distribution for
+    source.
+
+    """
+    U = rng.uniform(size=size)
+    if k == 0:
+        return mu - sigma * np.log(U)
+    else:
+        return mu + sigma * (U**-k - 1) / k
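As a quick sanity check — again just a sketch of mine, assuming the functions and imports above are in scope — I can draw a few values with the same hardcoded settings that get_stan_input_gpareto_fake uses and look at the implied success probabilities:
rng = np.random.default_rng(seed=1234)
# The same hardcoded settings as get_stan_input_gpareto_fake: mu=0.1, k=0.18, sigma=-1.098.
alpha = gpareto_rvs(rng, size=10, mu=0.1, k=0.18, sigma=-1.098)
print(expit(alpha))  # implied success probabilities for ten hypothetical player-seasons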
Specifying inferences
diff --git a/bibat/examples/baseball/docs/report.qmd b/bibat/examples/baseball/docs/report.qmd
index cf0a916..d837f91 100644
--- a/bibat/examples/baseball/docs/report.qmd
+++ b/bibat/examples/baseball/docs/report.qmd
@@ -212,7 +212,7 @@ The next step is to write functions that return `PreparedData` objects. In this
case I wrote a couple of data preparation functions: `prepare_data_2006` and
`prepare_data_bdb`:
-```{.python include="../baseball/data_preparation.py" start-line=77 end-line=161}
+```{.python include="../baseball/data_preparation.py" start-line=78 end-line=162}
```
To take into account the inconsistency between the two raw data sources, I
@@ -224,7 +224,7 @@ first had to change the variable `RAW_DATA_FILES`:
Next I changed the `prepare_data` function to handle the two different data
sources.
-```{.python include="../baseball/data_preparation.py" start-line=35 end-line=54}
+```{.python include="../baseball/data_preparation.py" start-line=36 end-line=56}
```
To finish off I deleted the unused global variables `NEW_COLNAMES`,
diff --git a/docs/_static/report.html b/docs/_static/report.html
index 1f6b978..fe8af7b 100644
--- a/docs/_static/report.html
+++ b/docs/_static/report.html
@@ -3312,91 +3312,91 @@ Preparing the data
    n_attempt: Series[int] = pa.Field(ge=1)
    n_success: Series[int] = pa.Field(ge=0)
The next step is to write functions that return PreparedData objects. In this case I wrote a couple of data preparation functions: prepare_data_2006 and prepare_data_bdb:
-def prepare_data_2006(measurements_raw: pd.DataFrame) -> PreparedData:
-    """Prepare the 2006 data."""
-    measurements = measurements_raw.rename(
-        columns={"K": "n_attempt", "y": "n_success"}
-    ).assign(
-        season="2006",
-        player_season=lambda df: [f"2006-player-{i+1}" for i in range(len(df))],
-    )
-    return PreparedData(
-        name="2006",
-        coords={
-            "player_season": measurements["player_season"].tolist(),
-            "season": measurements["season"].astype(str).tolist(),
-        },
-        measurements=DataFrame[MeasurementsDF](measurements),
-    )
+def prepare_data_2006(measurements_raw: pd.DataFrame) -> PreparedData:
+    """Prepare the 2006 data."""
+    measurements = measurements_raw.rename(
+        columns={"K": "n_attempt", "y": "n_success"}
+    ).assign(
+        season="2006",
+        player_season=lambda df: [f"2006-player-{i+1}" for i in range(len(df))],
+    )
+    return PreparedData(
+        name="2006",
+        coords={
+            "player_season": measurements["player_season"].tolist(),
+            "season": measurements["season"].astype(str).tolist(),
+        },
+        measurements=DataFrame[MeasurementsDF](measurements),
+    )
-def prepare_data_bdb(
-    measurements_main: pd.DataFrame,
-    measurements_post: pd.DataFrame,
-    appearances: pd.DataFrame,
-) -> PreparedData:
-    """Prepare the baseballdatabank data.
-
-    There are a few substantive data choices here.
-
-    First, the function excludes players who have a '1' in their position as
-    these are likely pitchers, as well as players with fewer than 20 at bats.
-
-    Second, the function defines successes and attempts according to the
-    'on-base percentage' metric, so a success is a time when a player got a hit,
-    a base on ball/walk or a hit-by-pitch and an attempt is an at-bat or a
-    base-on-ball/walk or a hit-by-pitch or a sacrifice fly. This could have
-    alternatively been calculated as just hits divided by at-bats, but my
-    understanding is that this method underrates players who are good at getting
-    walks.
-
-    """
-    pitchers = appearances.loc[
-        lambda df: df["G_p"] == df["G_all"], "playerID"
-    ].unique()
-
-    def filter_batters(df: pd.DataFrame):
-        return (
-            (df["AB"] >= 20)
-            & (df["season"].ge(2017))
-            & (~df["player"].isin(pitchers))
-        )
-
-    measurements_main, measurements_post = (
-        m.rename(columns={"yearID": "season", "playerID": "player"})
-        .assign(
-            player_season=lambda df: df["player"].str.cat(
-                df["season"].astype(str)
-            ),
-            n_attempt=lambda df: df[["AB", "BB", "HBP", "SF"]]
-            .fillna(0)
-            .sum(axis=1)
-            .astype(int),
-            n_success=lambda df: (
-                df[["H", "BB", "HBP"]].fillna(0).sum(axis=1).astype(int)
-            ),
-        )
-        .loc[
-            filter_batters,
-            ["player_season", "season", "n_attempt", "n_success"],
-        ]
-        .copy()
-        for m in [measurements_main, measurements_post]
-    )
-    measurements = (
-        pd.concat([measurements_main, measurements_post])
-        .groupby(["player_season", "season"])
-        .sum()
-        .reset_index()
-    )
-    return PreparedData(
-        name="bdb",
-        coords={
-            "player_season": measurements["player_season"].tolist(),
-            "season": measurements["season"].astype(str).tolist(),
-        },
-        measurements=DataFrame[MeasurementsDF](measurements),
-    )
+
+def prepare_data_bdb(
+    measurements_main: pd.DataFrame,
+    measurements_post: pd.DataFrame,
+    appearances: pd.DataFrame,
+) -> PreparedData:
+    """Prepare the baseballdatabank data.
+
+    There are a few substantive data choices here.
+
+    First, the function excludes players who have a '1' in their position as
+    these are likely pitchers, as well as players with fewer than 20 at bats.
+
+    Second, the function defines successes and attempts according to the
+    'on-base percentage' metric, so a success is a time when a player got a hit,
+    a base on ball/walk or a hit-by-pitch and an attempt is an at-bat or a
+    base-on-ball/walk or a hit-by-pitch or a sacrifice fly. This could have
+    alternatively been calculated as just hits divided by at-bats, but my
+    understanding is that this method underrates players who are good at getting
+    walks.
+
+    """
+    pitchers = appearances.loc[
+        lambda df: df["G_p"] == df["G_all"], "playerID"
+    ].unique()
+
+    def filter_batters(df: pd.DataFrame):
+        return (
+            (df["AB"] >= 20)
+            & (df["season"].ge(2017))
+            & (~df["player"].isin(pitchers))
+        )
+
+    measurements_main, measurements_post = (
+        m.rename(columns={"yearID": "season", "playerID": "player"})
+        .assign(
+            player_season=lambda df: df["player"].str.cat(
+                df["season"].astype(str)
+            ),
+            n_attempt=lambda df: df[["AB", "BB", "HBP", "SF"]]
+            .fillna(0)
+            .sum(axis=1)
+            .astype(int),
+            n_success=lambda df: (
+                df[["H", "BB", "HBP"]].fillna(0).sum(axis=1).astype(int)
+            ),
+        )
+        .loc[
+            filter_batters,
+            ["player_season", "season", "n_attempt", "n_success"],
+        ]
+        .copy()
+        for m in [measurements_main, measurements_post]
+    )
+    measurements = (
+        pd.concat([measurements_main, measurements_post])
+        .groupby(["player_season", "season"])
+        .sum()
+        .reset_index()
+    )
+    return PreparedData(
+        name="bdb",
+        coords={
+            "player_season": measurements["player_season"].tolist(),
+            "season": measurements["season"].astype(str).tolist(),
+        },
+        measurements=DataFrame[MeasurementsDF](measurements),
+    )
To take into account the inconsistency between the two raw data sources, I first had to change the variable RAW_DATA_FILES:
RAW_DATA_FILES = {
    "2006": [os.path.join(RAW_DIR, "2006.csv")],
@@ -3407,26 +3407,27 @@ Preparing the data
    ],
}
Next I changed the prepare_data function to handle the two different data sources.
-def prepare_data():
-    """Run main function."""
-    print("Reading raw data...")
-    raw_data = {
-        k: [pd.read_csv(file, index_col=None) for file in v]
-        for k, v in RAW_DATA_FILES.items()
-    }
-    data_preparation_functions_to_run = {
-        "2006": prepare_data_2006,
-        "bdb": prepare_data_bdb,
-    }
-    print("Preparing data...")
-    for name, dpf in data_preparation_functions_to_run.items():
-        print(f"Running data preparation function {dpf.__name__}...")
-        prepared_data = dpf(*raw_data[name])
-        output_dir = os.path.join(PREPARED_DIR, prepared_data.name)
-        print(f"\twriting files to {output_dir}")
-        if not os.path.exists(PREPARED_DIR):
-            os.mkdir(PREPARED_DIR)
-        write_prepared_data(prepared_data, output_dir)
+def prepare_data():
+    """Run main function."""
+    print("Reading raw data...")
+    raw_data = {
+        k: [pd.read_csv(file, index_col=None) for file in v]
+        for k, v in RAW_DATA_FILES.items()
+    }
+    data_preparation_functions_to_run = {
+        "2006": prepare_data_2006,
+        "bdb": prepare_data_bdb,
+    }
+    print("Preparing data...")
+    for name, dpf in data_preparation_functions_to_run.items():
+        print(f"Running data preparation function {dpf.__name__}...")
+        prepared_data = dpf(*raw_data[name])
+        output_dir = os.path.join(PREPARED_DIR, prepared_data.name)
+        print(f"\twriting files to {output_dir}")
+        if not os.path.exists(PREPARED_DIR):
+            os.mkdir(PREPARED_DIR)
+        write_prepared_data(prepared_data, output_dir)
To finish off I deleted the unused global variables NEW_COLNAMES, DROPNA_COLS and DIMS, then checked if the function load_prepared_data needed any changes: I was pretty sure it didn’t.
To check that all this worked, I ran the data preparation script manually:2
2 I could also have just run make analysis again. This would have caused an error on the step after prepare_data.py, which is fine!
> source .venv/bin/activate
@@ -3549,82 +3550,84 @@ Specifying statistical models
Generating Stan inputs
Next I needed to tell the analysis how to turn some prepared data into a dictionary that can be used as input for Stan. Bibat assumes that this task is handled by functions that live in the file baseball/stan_input_functions.py, each of which takes in a PreparedData and returns a Python dictionary. You can write as many Stan input functions as you like and choose which one to run for any given inference.
I started by defining some Stan input functions that pass arbitrary prepared data on to each of the models:3
3 Note that this code uses the scipy function logit, which it imported like this: from scipy.special import logit
-"""General function for creating a Stan input."""
- return {
- "N": len(ppd.measurements),
- "K": ppd.measurements["n_attempt"].values,
- "y": ppd.measurements["n_success"].values,
- "prior_mu": [logit(0.25), 0.2],
- "prior_tau": [0.2, 0.1],
- "prior_b_K": [0, 0.03],
-
- }
+def get_stan_input_normal(ppd: PreparedData) -> Dict:
+"""General function for creating a Stan input."""
+ return {
+ "N": len(ppd.measurements),
+ "K": ppd.measurements["n_attempt"].values,
+ "y": ppd.measurements["n_success"].values,
+ "prior_mu": [logit(0.25), 0.2],
+ "prior_tau": [0.2, 0.1],
+ "prior_b_K": [0, 0.03],
+
}
-def get_stan_input_gpareto(ppd: PreparedData) -> Dict:
-"""General function for creating a Stan input."""
- return {
- "N": len(ppd.measurements),
- "K": ppd.measurements["n_attempt"].values,
- "y": ppd.measurements["n_success"].values,
- "min_alpha": logit(0.07),
- "max_alpha": logit(0.5),
- "prior_sigma": [1.5, 0.4],
- "prior_k": [-0.5, 1],
-
- }
-def get_stan_input_normal_fake(ppd: PreparedData) -> Dict:
+
+def get_stan_input_gpareto(ppd: PreparedData) -> Dict:
+"""General function for creating a Stan input."""
+ return {
+ "N": len(ppd.measurements),
+ "K": ppd.measurements["n_attempt"].values,
+ "y": ppd.measurements["n_success"].values,
+ "min_alpha": logit(0.07),
+ "max_alpha": logit(0.5),
+ "prior_sigma": [1.5, 0.4],
+ "prior_k": [-0.5, 1],
+
+ }
But why stop there? It can also be useful to generate Stan input data with a model, by running it in simulation mode with hardcoded parameter values. Here are some functions that do this for both of our models:
-    N = len(ppd.measurements)
-    rng = np.random.default_rng(seed=1234)
-    true_param_values = {
-        "mu": logit(0.25),
-        "tau": 0.18,  # 2sds is 0.19 to 0.32 batting average
-        "b_K": 0.04,  # slight effect of more attempts
-        "alpha_std": rng.normal(loc=0, scale=1, size=N),
-    }
-    K = ppd.measurements["n_attempt"].values
-    log_K_std = (np.log(K) - np.log(K).mean()) / np.log(K).std()
-    alpha = (
-        true_param_values["mu"]
-        + true_param_values["b_K"] * log_K_std
-        + true_param_values["tau"] * true_param_values["alpha_std"]
-    )
-    y = rng.binomial(K, expit(alpha))
-    return {"N": N, "K": K, "y": y} | true_param_values
-
-
-def get_stan_input_gpareto_fake(ppd: PreparedData) -> Dict:
-    """Generate fake Stan input consistent with the gpareto model."""
-    N = len(ppd.measurements)
-    K = ppd.measurements["n_attempt"].values
-    min_alpha = 0.1
-    rng = np.random.default_rng(seed=1234)
-    true_param_values = {"sigma": -1.098, "k": 0.18}
-    true_param_values["alpha"] = gpareto_rvs(
-        rng,
-        N,
-        min_alpha,
-        true_param_values["k"],
-        true_param_values["sigma"],
-    )
-    y = rng.binomial(K, expit(true_param_values["alpha"]))
-    return {"N": N, "K": K, "y": y, "min_alpha": min_alpha} | true_param_values
-
-
-def gpareto_rvs(
-    rng: np.random.Generator, size: int, mu: float, k: float, sigma: float
-):
-    """Generate random numbers from a generalised pareto distribution.
-
-    See https://en.wikipedia.org/wiki/Generalized_Pareto_distribution for
-    source.
-
-    """
-    U = rng.uniform(size=size)
-    if k == 0:
-        return mu - sigma * np.log(U)
-    else:
-        return mu + sigma * (U**-k - 1) / k
+def get_stan_input_normal_fake(ppd: PreparedData) -> Dict:
+    """Generate fake Stan input consistent with the normal model."""
+    N = len(ppd.measurements)
+    rng = np.random.default_rng(seed=1234)
+    true_param_values = {
+        "mu": logit(0.25),
+        "tau": 0.18,  # 2sds is 0.19 to 0.32 batting average
+        "b_K": 0.04,  # slight effect of more attempts
+        "alpha_std": rng.normal(loc=0, scale=1, size=N),
+    }
+    K = ppd.measurements["n_attempt"].values
+    log_K_std = (np.log(K) - np.log(K).mean()) / np.log(K).std()
+    alpha = (
+        true_param_values["mu"]
+        + true_param_values["b_K"] * log_K_std
+        + true_param_values["tau"] * true_param_values["alpha_std"]
+    )
+    y = rng.binomial(K, expit(alpha))
+    return {"N": N, "K": K, "y": y} | true_param_values
+
+
+def get_stan_input_gpareto_fake(ppd: PreparedData) -> Dict:
+    """Generate fake Stan input consistent with the gpareto model."""
+    N = len(ppd.measurements)
+    K = ppd.measurements["n_attempt"].values
+    min_alpha = 0.1
+    rng = np.random.default_rng(seed=1234)
+    true_param_values = {"sigma": -1.098, "k": 0.18}
+    true_param_values["alpha"] = gpareto_rvs(
+        rng,
+        N,
+        min_alpha,
+        true_param_values["k"],
+        true_param_values["sigma"],
+    )
+    y = rng.binomial(K, expit(true_param_values["alpha"]))
+    return {"N": N, "K": K, "y": y, "min_alpha": min_alpha} | true_param_values
+
+
+def gpareto_rvs(
+    rng: np.random.Generator, size: int, mu: float, k: float, sigma: float
+):
+    """Generate random numbers from a generalised pareto distribution.
+
+    See https://en.wikipedia.org/wiki/Generalized_Pareto_distribution for
+    source.
+
+    """
+    U = rng.uniform(size=size)
+    if k == 0:
+        return mu - sigma * np.log(U)
+    else:
+        return mu + sigma * (U**-k - 1) / k
Specifying inferences