Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Creating test_utils.py #113

Open
wants to merge 13 commits into
base: dev
Choose a base branch
from
11 changes: 0 additions & 11 deletions msdbook/model.py

This file was deleted.

18 changes: 0 additions & 18 deletions msdbook/tests/test_model.py

This file was deleted.

123 changes: 123 additions & 0 deletions msdbook/tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
import pytest
erexer marked this conversation as resolved.
Show resolved Hide resolved
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from msdbook.utils import fit_logit, plot_contour_map
from statsmodels.base.wrapper import ResultsWrapper

@pytest.fixture
def sample_data():
    """Build a reproducible 100-row DataFrame shared by the tests.

    The frame has a binary ``Success`` outcome column and three
    standard-normal columns: ``Predictor1``, ``Predictor2`` and
    ``Interaction``.
    """
    # Seed the global NumPy RNG so every test sees the same draws.
    np.random.seed(0)
    num_rows = 100

    # Draw order is part of the reproducible output: outcome first,
    # then the three continuous columns in this order.
    columns = {'Success': np.random.randint(0, 2, size=num_rows)}
    for column_name in ('Predictor1', 'Predictor2', 'Interaction'):
        columns[column_name] = np.random.randn(num_rows)

    return pd.DataFrame(columns)
def test_fit_logit(sample_data):
    """Check that fit_logit returns a fitted model with sensible contents.

    The previous ``is not None`` / ``hasattr`` checks could never fail for a
    results object, so this test instead verifies which terms were fitted and
    that the estimates, p-values and predicted probabilities are well formed.
    """
    # Review feedback on this PR: the predictor list could be provided
    # alongside the sample_data fixture instead of being defined here.
    predictors = ['Predictor1', 'Predictor2']

    result = fit_logit(sample_data, predictors)

    # Check if result is a statsmodels results wrapper object
    assert isinstance(result, ResultsWrapper)

    # fit_logit regresses on the intercept, the requested predictors and
    # the "Interaction" column, in that order.
    expected_terms = ['Intercept', *predictors, 'Interaction']
    assert list(result.params.index) == expected_terms
    assert list(result.pvalues.index) == expected_terms

    # Coefficients must be real numbers and p-values valid probabilities.
    assert np.isfinite(result.params).all()
    assert ((result.pvalues >= 0) & (result.pvalues <= 1)).all()

    # In-sample predictions are probabilities, one per input row.
    probabilities = np.asarray(result.predict())
    assert probabilities.shape[0] == len(sample_data)
    assert ((probabilities >= 0) & (probabilities <= 1)).all()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible to check that the parameters and values are correct?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, the tests only confirm that the parameters and p-values are not empty or None. They do not ensure that the values are correct in a meaningful way, such as being logically reasonable or statistically valid.


def test_plot_contour_map(sample_data):
    """Check that plot_contour_map draws the contours and the scatter points.

    Verifies the returned contour set, the axis limits and labels, and that a
    scatter collection with one point per 10-row group of the data was added.
    """
    # The column names are reused throughout, so keep them in variables
    # (per review feedback) instead of repeating the string literals.
    x_name = 'Predictor1'
    y_name = 'Predictor2'
    contour_cmap = 'viridis'
    dot_cmap = 'coolwarm'

    fig, ax = plt.subplots()
    try:
        # Fit a logit model for the purpose of plotting
        result = fit_logit(sample_data, [x_name, y_name])

        # Grid spans the data range padded by 1 on each side
        xgrid = np.linspace(
            sample_data[x_name].min() - 1, sample_data[x_name].max() + 1, 50
        )
        ygrid = np.linspace(
            sample_data[y_name].min() - 1, sample_data[y_name].max() + 1, 50
        )
        levels = np.linspace(0, 1, 10)

        contourset = plot_contour_map(
            ax, result, sample_data, contour_cmap, dot_cmap, levels,
            xgrid, ygrid, x_name, y_name, base=0
        )

        # Check if the contour plot is created
        assert contourset is not None

        # Check if the axis limits and labels are set correctly
        assert ax.get_xlim() == pytest.approx((np.min(xgrid), np.max(xgrid)))
        assert ax.get_ylim() == pytest.approx((np.min(ygrid), np.max(ygrid)))
        assert ax.get_xlabel() == x_name
        assert ax.get_ylabel() == y_name

        # contourf alone already populates ax.collections, so a bare
        # "non-empty" check says nothing about the scatter. The scatter plots
        # one averaged point per group of 10 rows; look for that collection.
        expected_points = len(sample_data) // 10
        assert any(
            len(collection.get_offsets()) == expected_points
            for collection in ax.collections
        )
    finally:
        # Release the figure even when an assertion above fails.
        plt.close(fig)

def test_empty_data():
    """fit_logit must raise ValueError for a DataFrame that has no rows.

    The earlier version also carried a plotting branch guarded by
    ``if not empty_df.empty``; that guard is always false for an empty frame,
    so the branch was unreachable (and reported as uncovered). It is removed,
    together with the figure that was created only for it.
    """
    empty_df = pd.DataFrame({
        'Success': [],
        'Predictor1': [],
        'Predictor2': [],
        'Interaction': []
    })

    predictors = ['Predictor1', 'Predictor2']

    # Check if fitting with empty data raises an error
    with pytest.raises(ValueError):
        fit_logit(empty_df, predictors)

def test_invalid_predictors(sample_data):
    """fit_logit should raise KeyError for predictor columns that do not exist."""
    # Neither of these names is a column of the sample data.
    missing_columns = ['InvalidPredictor1', 'InvalidPredictor2']

    expect_key_error = pytest.raises(KeyError)
    with expect_key_error:
        fit_logit(sample_data, missing_columns)

def test_logit_with_interaction(sample_data):
    """Fit with a real product interaction and check the term is estimated."""
    # Overwrite the fixture's random "Interaction" column with the actual
    # product of the two predictors.
    first = sample_data["Predictor1"]
    second = sample_data["Predictor2"]
    sample_data["Interaction"] = first * second

    fitted = fit_logit(sample_data, ['Predictor1', 'Predictor2'])

    # Ensure the interaction term is included in the result
    fitted_terms = fitted.params.index
    assert 'Interaction' in fitted_terms
23 changes: 13 additions & 10 deletions msdbook/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,30 +3,30 @@
import numpy as np
import statsmodels.api as sm


def fit_logit(dta, predictors):
    """Fit a logistic regression of ``Success`` on the given predictors.

    The design matrix is an intercept, the columns named in ``predictors``
    and the ``Interaction`` column, in that order.

    Parameters
    ----------
    dta : pandas.DataFrame
        Must contain a ``Success`` column, every column named in
        ``predictors`` and an ``Interaction`` column. An ``Intercept``
        column of ones is written onto this frame in place.
    predictors : list of str
        Names of the predictor columns to include.

    Returns
    -------
    The fitted results object returned by ``sm.Logit(...).fit``.
    """
    # Add intercept column of 1s (written onto the caller's frame, as before)
    dta["Intercept"] = np.ones(np.shape(dta)[0])

    # Select the design columns by name. Taking "the last column" as the
    # intercept picks the wrong column when "Intercept" already exists in
    # the frame and is not in the last position.
    cols = ["Intercept"] + list(predictors) + ["Interaction"]

    # Build and fit the model once. ``disp`` is an option of ``fit``, not of
    # the ``Logit`` constructor; passing it here keeps the optimizer from
    # printing its convergence summary.
    logit = sm.Logit(dta["Success"], dta[cols])
    result = logit.fit(method="bfgs", disp=False)

    return result


def plot_contour_map(
ax, result, dta, contour_cmap, dot_cmap, levels, xgrid, ygrid, xvar, yvar, base
):
"""Plot the contour map"""

# TODO: see why this warning is being raised about the tight layout
# Ignore tight layout warnings
warnings.filterwarnings("ignore")

# find probability of success for x=xgrid, y=ygrid
# Generate probability of success for x=xgrid, y=ygrid
X, Y = np.meshgrid(xgrid, ygrid)
x = X.flatten()
y = Y.flatten()
Expand All @@ -36,9 +36,12 @@ def plot_contour_map(
Z = np.reshape(z, np.shape(X))

contourset = ax.contourf(X, Y, Z, levels, cmap=contour_cmap, aspect="auto")

# Plot scatter points based on the data
xpoints = np.mean(dta[xvar].values.reshape(-1, 10), axis=1)
ypoints = np.mean(dta[yvar].values.reshape(-1, 10), axis=1)
colors = np.round(np.mean(dta["Success"].values.reshape(-1, 10), axis=1), 0)

ax.scatter(xpoints, ypoints, s=10, c=colors, edgecolor="none", cmap=dot_cmap)
ax.set_xlim(np.min(xgrid), np.max(xgrid))
ax.set_ylim(np.min(ygrid), np.max(ygrid))
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ dependencies = [
"scipy>=1.13.1",
"seaborn>=0.13.2",
"statsmodels>=0.14.2",
"pytest>=7.0.0",
"pytest-mock>=3.10",
erexer marked this conversation as resolved.
Show resolved Hide resolved
]

[project.optional-dependencies]
Expand Down
Loading