diff --git a/taxcalc/dropq/dropq.py b/taxcalc/dropq/dropq.py
index e0be06bed..14303a790 100644
--- a/taxcalc/dropq/dropq.py
+++ b/taxcalc/dropq/dropq.py
@@ -193,8 +193,8 @@ def drop_records(df1, df2, mask):
     df2['mask'] = mask
     df1['mask'] = mask
 
-    df2 = add_weighted_decile_bins(df2)
-    df1 = add_weighted_decile_bins(df1)
+    df2 = add_weighted_income_bins(df2)
+    df1 = add_weighted_income_bins(df1)
     gp2_dec = df2.groupby('bins')
 
     income_bins = WEBAPP_INCOME_BINS
diff --git a/taxcalc/dropq/dropq_utils.py b/taxcalc/dropq/dropq_utils.py
index dada94249..247f3f734 100644
--- a/taxcalc/dropq/dropq_utils.py
+++ b/taxcalc/dropq/dropq_utils.py
@@ -73,7 +73,7 @@ def create_json_table(df, row_names=None, column_types=None, num_decimals=2):
 def create_dropq_difference_table(df1, df2, groupby, res_col, diff_col,
                                   suffix, wsum):
     if groupby == "weighted_deciles":
-        df = add_weighted_decile_bins(df2)
+        df = add_weighted_income_bins(df2, num_bins=10)
     elif groupby == "small_income_bins":
         df = add_income_bins(df2, compare_with="soi")
     elif groupby == "large_income_bins":
@@ -141,7 +141,7 @@ def create_dropq_distribution_table(calc, groupby, result_type, suffix):
     res[returnsAMTsuf] = res[s006suf].where(res[c09600suf] > 0, 0)
 
     if groupby == "weighted_deciles":
-        df = add_weighted_decile_bins(res)
+        df = add_weighted_income_bins(res, num_bins=10)
     elif groupby == "small_income_bins":
         df = add_income_bins(res, compare_with="soi")
     elif groupby == "large_income_bins":
diff --git a/taxcalc/tests/test_utils.py b/taxcalc/tests/test_utils.py
index 1d1dc7852..5c63ed4f7 100644
--- a/taxcalc/tests/test_utils.py
+++ b/taxcalc/tests/test_utils.py
@@ -329,18 +329,18 @@ def test_add_income_bins_raises():
         df = add_income_bins(df, compare_with='stuff')
 
 
-def test_add_weighted_decile_bins():
+def test_add_weighted_income_bins():
     df = DataFrame(data=data, columns=['_expanded_income', 's006', 'label'])
-    df = add_weighted_decile_bins(df, num_bins=100)
+    df = add_weighted_income_bins(df, num_bins=100)
     bin_labels = df['bins'].unique()
     default_labels = set(range(1, 101))
     for lab in bin_labels:
         assert lab in default_labels
     # Custom labels
-    df = add_weighted_decile_bins(df, weight_by_income_measure=True)
+    df = add_weighted_income_bins(df, weight_by_income_measure=True)
     assert 'bins' in df
     custom_labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
-    df = add_weighted_decile_bins(df, labels=custom_labels)
+    df = add_weighted_income_bins(df, labels=custom_labels)
     assert 'bins' in df
     bin_labels = df['bins'].unique()
     for lab in bin_labels:
diff --git a/taxcalc/utils.py b/taxcalc/utils.py
index 29b092a5e..126c7f046 100644
--- a/taxcalc/utils.py
+++ b/taxcalc/utils.py
@@ -166,37 +166,27 @@ def weighted_share_of_total(pdf, col_name, total):
     return float(weighted_sum(pdf, col_name)) / (float(total) + EPSILON)
 
 
-def add_weighted_decile_bins(pdf, income_measure='_expanded_income',
-                             num_bins=10, labels=None,
+def add_weighted_income_bins(pdf, num_bins=10, labels=None,
+                             income_measure='_expanded_income',
                              weight_by_income_measure=False):
     """
-    Add a column of income bins based on each 10% of the income_measure,
-    weighted by s006.
-
-    The default income_measure is `expanded_income`, but `c00100` also works.
-
-    This function will server as a 'grouper' later on.
+    Add a column of income bins to specified Pandas DataFrame, pdf, with
+    the new column being named 'bins'. Assumes that specified pdf contains
+    columns for the specified income_measure and for sample weights, s006.
""" - # First, weight income measure by s006 if desired - if weight_by_income_measure: - pdf['s006_weighted'] = np.multiply(pdf[income_measure].values, - pdf['s006'].values) - # Next, sort by income_measure pdf.sort_values(by=income_measure, inplace=True) - # Do a cumulative sum if weight_by_income_measure: - pdf['cumsum_weights'] = np.cumsum(pdf['s006_weighted'].values) + pdf['cumsum_temp'] = np.cumsum(np.multiply(pdf[income_measure].values, + pdf['s006'].values)) else: - pdf['cumsum_weights'] = np.cumsum(pdf['s006'].values) - # Max value of cum sum of weights - max_ = pdf['cumsum_weights'].values[-1] - # Create 10 bins and labels based on this cumulative weight + pdf['cumsum_temp'] = np.cumsum(pdf['s006'].values) + max_cumsum = pdf['cumsum_temp'].values[-1] bin_edges = [0] + list(np.arange(1, (num_bins + 1)) * - (max_ / float(num_bins))) + (max_cumsum / float(num_bins))) if not labels: labels = range(1, (num_bins + 1)) - # Groupby weighted deciles - pdf['bins'] = pd.cut(pdf['cumsum_weights'], bins=bin_edges, labels=labels) + pdf['bins'] = pd.cut(pdf['cumsum_temp'], bins=bin_edges, labels=labels) + pdf.drop('cumsum_temp', axis=1, inplace=True) return pdf @@ -233,13 +223,10 @@ def add_income_bins(pdf, compare_with='soi', bins=None, right=True, if not bins: if compare_with == 'tpc': bins = LARGE_INCOME_BINS - elif compare_with == 'soi': bins = SMALL_INCOME_BINS - elif compare_with == 'webapp': bins = WEBAPP_INCOME_BINS - else: msg = 'Unknown compare_with arg {0}'.format(compare_with) raise ValueError(msg) @@ -423,7 +410,8 @@ def create_distribution_table(obj, groupby, result_type, res['s006'] = res_base['s006'] # sorts the data if groupby == 'weighted_deciles': - pdf = add_weighted_decile_bins(res, income_measure=income_measure) + pdf = add_weighted_income_bins(res, num_bins=10, + income_measure=income_measure) elif groupby == 'small_income_bins': pdf = add_income_bins(res, compare_with='soi', income_measure=income_measure) @@ -495,7 +483,8 @@ def create_difference_table(recs1, recs2, groupby, res2[baseline_income_measure] = res1[income_measure] income_measure = baseline_income_measure if groupby == 'weighted_deciles': - pdf = add_weighted_decile_bins(res2, income_measure=income_measure) + pdf = add_weighted_income_bins(res2, num_bins=10, + income_measure=income_measure) elif groupby == 'small_income_bins': pdf = add_income_bins(res2, compare_with='soi', income_measure=income_measure) @@ -844,13 +833,11 @@ def mtr_graph_data(calc1, calc2, df1 = df1[df1['MARS'] == mars] df2 = df2[df2['MARS'] == mars] # create 'bins' column given specified income_var and dollar_weighting - df1 = add_weighted_decile_bins(df1, + df1 = add_weighted_income_bins(df1, num_bins=100, income_measure=income_var, - num_bins=100, weight_by_income_measure=dollar_weighting) - df2 = add_weighted_decile_bins(df2, + df2 = add_weighted_income_bins(df2, num_bins=100, income_measure=income_var, - num_bins=100, weight_by_income_measure=dollar_weighting) # split into groups specified by 'bins' gdf1 = df1.groupby('bins', as_index=False)