From 7b02abbbff4ea20a2dcc7ab6d79ce598cbcd8781 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konstantin=20Sch=C3=BCrholt?= <55194201+kschuerholt@users.noreply.github.com> Date: Fri, 21 Jan 2022 13:42:19 +0100 Subject: [PATCH 1/2] Add confidence interval for MWU #225 Implemented CI from 'Calculating confidence intervals for some non-parametric analyses', Campbell and Gardner 1988. CI Style is adapted from ttest. The same publication offers a solution for wilcoxon, which is not yet implemented but could be added fairly easily. --- pingouin/nonparametric.py | 56 ++++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/pingouin/nonparametric.py b/pingouin/nonparametric.py index da1fe85a..81d3ab7d 100644 --- a/pingouin/nonparametric.py +++ b/pingouin/nonparametric.py @@ -144,7 +144,7 @@ def madmedianrule(a): return (np.fabs(a - np.median(a)) / mad(a)) > k -def mwu(x, y, alternative='two-sided', **kwargs): +def mwu(x, y, alternative='two-sided',confidence=0.95, **kwargs): """Mann-Whitney U Test (= Wilcoxon rank-sum test). It is the non-parametric version of the independent T-test. @@ -157,6 +157,8 @@ def mwu(x, y, alternative='two-sided', **kwargs): Defines the alternative hypothesis, or tail of the test. Must be one of "two-sided" (default), "greater" or "less". See :py:func:`scipy.stats.mannwhitneyu` for more details. + confidence : float + Confidence level on confidence interval of difference of medians between x and y. **kwargs : dict Additional keywords arguments that are passed to :py:func:`scipy.stats.mannwhitneyu`. @@ -169,6 +171,7 @@ def mwu(x, y, alternative='two-sided', **kwargs): * ``'p-val'``: p-value * ``'RBC'`` : rank-biserial correlation * ``'CLES'`` : common language effect size + * ``'CI'`` : confidence interval of difference of medians See also -------- @@ -222,6 +225,10 @@ def mwu(x, y, alternative='two-sided', **kwargs): Association and the American Statistical Association, 25(2), 101–132. https://doi.org/10.2307/1165329 + .. [5] Campbell, M. J. & Gardner, M. J. (1988). Calculating confidence + intervals for some non-parametric analyses. + British Medical Journal Volume 226, 1988. + Examples -------- >>> import numpy as np @@ -229,9 +236,9 @@ def mwu(x, y, alternative='two-sided', **kwargs): >>> np.random.seed(123) >>> x = np.random.uniform(low=0, high=1, size=20) >>> y = np.random.uniform(low=0.2, high=1.2, size=20) - >>> pg.mwu(x, y, alternative='two-sided') - U-val alternative p-val RBC CLES - MWU 97.0 two-sided 0.00556 0.515 0.2425 + >>> pg.mwu(x, y, alternative='two-sided',confidence=0.95) + U-val alternative p-val RBC CLES CI95% + MWU 97.0 two-sided 0.00556 0.515 0.2425 [-0.39290395101879694, -0.09400270319896187] Compare with SciPy @@ -241,19 +248,19 @@ def mwu(x, y, alternative='two-sided', **kwargs): One-sided test - >>> pg.mwu(x, y, alternative='greater') - U-val alternative p-val RBC CLES - MWU 97.0 greater 0.997442 0.515 0.2425 + >>> pg.mwu(x, y, alternative='greater',confidence=0.95) + U-val alternative p-val RBC CLES CI95% + MWU 97.0 greater 0.997442 0.515 0.2425 [-0.3711286134873304, inf] - >>> pg.mwu(x, y, alternative='less') - U-val alternative p-val RBC CLES - MWU 97.0 less 0.00278 0.515 0.7575 + >>> pg.mwu(x, y, alternative='less',confidence=0.95) + U-val alternative p-val RBC CLES CI95% + MWU 97.0 less 0.00278 0.515 0.7575 [-inf, -0.10192641647044609] Passing keyword arguments to :py:func:`scipy.stats.mannwhitneyu`: - >>> pg.mwu(x, y, alternative='two-sided', method='exact') - U-val alternative p-val RBC CLES - MWU 97.0 two-sided 0.004681 0.515 0.2425 + >>> pg.mwu(x, y, alternative='two-sided',confidence=0.95, method='exact') + U-val alternative p-val RBC CLES CI95% + MWU 97.0 two-sided 0.004681 0.515 0.2425 [-0.39290395101879694, -0.09400270319896187] """ x = np.asarray(x) y = np.asarray(y) @@ -281,14 +288,33 @@ def mwu(x, y, alternative='two-sided', **kwargs): # Effect size 2: rank biserial correlation (Wendt 1972) rbc = 1 - (2 * uval) / diff.size # diff.size = x.size * y.size - + + # Confidence interval for the (difference in) medians + # Campbell and Gardner 2000 + if alternative == "two-sided": + alpha = 1.0 - confidence + conf = 1.0 - alpha / 2 # 0.975 + else: + conf = confidence + N = scipy.stats.norm.ppf(conf) + ct1, ct2 = len(x),len(y) # count samples + diffs = sorted([i-j for i in x for j in y]) # get ct1xct2 difference + k = int(round(ct1*ct2/2 - (N * (ct1*ct2*(ct1+ct2+1)/12)**0.5))) + ci = [diffs[k], diffs[len(diffs)-k]] + if alternative == "greater": + ci[1] = np.inf + elif alternative == "less": + ci[0] = -np.inf + # Rename CI + ci_name = 'CI%.0f%%' % (100 * confidence) # Fill output DataFrame stats = pd.DataFrame({ 'U-val': uval, 'alternative': alternative, 'p-val': pval, 'RBC': rbc, - 'CLES': cles}, index=['MWU']) + 'CLES': cles, + ci_name: [ci]}, index=['MWU']) return _postprocess_dataframe(stats) From f31b2e5f3d17116cd435e5df50c3c14fe9054d22 Mon Sep 17 00:00:00 2001 From: Raphael Vallat Date: Mon, 20 Jun 2022 10:22:34 -0700 Subject: [PATCH 2/2] Black formatting --- pingouin/nonparametric.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pingouin/nonparametric.py b/pingouin/nonparametric.py index 159f1b7e..cec7f5ee 100644 --- a/pingouin/nonparametric.py +++ b/pingouin/nonparametric.py @@ -234,7 +234,7 @@ def mwu(x, y, alternative="two-sided", confidence=0.95, **kwargs): Association and the American Statistical Association, 25(2), 101–132. https://doi.org/10.2307/1165329 - .. [5] Campbell, M. J. & Gardner, M. J. (1988). Calculating confidence + .. [5] Campbell, M. J. & Gardner, M. J. (1988). Calculating confidence intervals for some non-parametric analyses. British Medical Journal Volume 226, 1988. @@ -301,25 +301,25 @@ def mwu(x, y, alternative="two-sided", confidence=0.95, **kwargs): # Effect size 2: rank biserial correlation (Wendt 1972) rbc = 1 - (2 * uval) / diff.size # diff.size = x.size * y.size - - # Confidence interval for the (difference in) medians + + # Confidence interval for the (difference in) medians # Campbell and Gardner 2000 if alternative == "two-sided": alpha = 1.0 - confidence conf = 1.0 - alpha / 2 # 0.975 else: conf = confidence - N = scipy.stats.norm.ppf(conf) - ct1, ct2 = len(x),len(y) # count samples - diffs = sorted([i-j for i in x for j in y]) # get ct1xct2 difference - k = int(round(ct1*ct2/2 - (N * (ct1*ct2*(ct1+ct2+1)/12)**0.5))) - ci = [diffs[k], diffs[len(diffs)-k]] + N = scipy.stats.norm.ppf(conf) + ct1, ct2 = len(x), len(y) # count samples + diffs = sorted([i - j for i in x for j in y]) # get ct1xct2 difference + k = int(round(ct1 * ct2 / 2 - (N * (ct1 * ct2 * (ct1 + ct2 + 1) / 12) ** 0.5))) + ci = [diffs[k], diffs[len(diffs) - k]] if alternative == "greater": ci[1] = np.inf elif alternative == "less": ci[0] = -np.inf # Rename CI - ci_name = 'CI%.0f%%' % (100 * confidence) + ci_name = "CI%.0f%%" % (100 * confidence) # Fill output DataFrame stats = pd.DataFrame( {"U-val": uval, "alternative": alternative, "p-val": pval, "RBC": rbc, "CLES": cles},