forked from PSLmodels/taxdata
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreatepuf.py
185 lines (173 loc) · 4.81 KB
/
createpuf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
import pandas as pd
import numpy as np
from taxdata import cps, puf
from taxdata.matching import statmatch
from pathlib import Path
CUR_PATH = Path(__file__).resolve().parent
DATA_PATH = Path(CUR_PATH, "data")
CPS_YEAR = 2016
PUF_YEAR = 2011
# variables used in the matching process
PARTITION_VARS = ["dsi", "mars", "agede", "_depne", "people"]
REG_VARS = [
"const",
"agede",
"e00200",
"e00300",
"e00600",
"e00900",
"e02100",
"e01500",
"e02400",
"e02300",
"e00800",
"wage_share",
"cap_share",
"se1",
"se2",
"se3",
]
INC_VARS = [
"e00200",
"e00300",
"e00400",
"e00600",
"bil",
"fil",
"e02400",
"e01500",
"e00800",
"e02300",
]
CAP_VARS = ["e00300", "e00400", "e00600"]
def dataprep(data):
"""
Prep data for matching
"""
# we use a slightly modified version of mars for matching.
# _mars = 1 if single, 3 if HoH, 2 any type of joint filer
data["_mars"] = np.where(data["mars"] == 1, 1, np.where(data["mars"] == 4, 3, 2))
data["const"] = 1
data["bil"] = np.maximum(0, data["e00900"])
data["fil"] = np.maximum(0, data["e02100"])
data["tpi"] = data[INC_VARS].sum(axis=1)
data["wage_share"] = np.divide(
data["e00200"], data["tpi"], out=np.zeros(data.shape[0]), where=data["tpi"] != 0
)
data["cap_inc"] = data[CAP_VARS].sum(axis=1)
data["cap_share"] = np.divide(
data["cap_inc"],
data["tpi"],
out=np.zeros(data.shape[0]),
where=data["tpi"] != 0,
)
data["_depne"] = np.where(
data["dsi"] == 0,
np.where(
data["_mars"] == 2,
np.minimum(data["depne"], 5),
np.minimum(data["depne"], 3),
),
0,
)
data["people"] = np.where(data["_mars"] == 2, data["depne"] + 2, data["depne"] + 1)
data["people"] = np.minimum(5, data["people"])
wage_flag = (data["e00200"] != 0).astype(int)
# self employment flag
se_flag = np.logical_or(data["e00900"] != 0, data["e02100"] != 0).astype(int)
# income source flags
data["se1"] = np.where(wage_flag & ~se_flag, 1, 0)
data["se2"] = np.where(~wage_flag & se_flag, 1, 0)
data["se3"] = np.where(wage_flag & se_flag, 1, 0)
data["_depne"] = np.where(
np.logical_and(data["mars"] == 3, data["_depne"] == 0), 1, data["_depne"]
)
return data
# create CPS tax units
print("Creating CPS tax units")
raw_cps = cps.create(DATA_PATH, exportpkl=True, cps_files=[CPS_YEAR], benefits=False)
# minor PUF prep
print("Prepping PUF")
puf2011 = pd.read_csv(Path(DATA_PATH, "puf2011.csv"))
raw_puf = puf.preppuf(puf2011, PUF_YEAR)
# rename CPS file to match PUF
print("Prepping CPS")
cps_rename = {
"dep_stat": "dsi",
"divs": "e00600",
"statetax": "e18400",
"realest": "e18500",
"miscitem": "e20400",
"medex": "e17500",
}
raw_cps.rename(columns=cps_rename, inplace=True)
raw_cps.columns = map(str.lower, raw_cps.columns)
raw_cps["flpdyr"] = max(raw_puf["flpdyr"])
raw_cps["g20500"] = 0
# Split charitable contributions into cash and non-cash using ratio in PUF
cash = 0.82013
non_cash = 1.0 - cash
raw_cps["e19800"] = raw_cps["charitable"] * cash
raw_cps["e20100"] = raw_cps["charitable"] * non_cash
raw_cps = dataprep(raw_cps)
raw_puf = dataprep(raw_puf)
raw_cps["recid"] = range(1, len(raw_cps.index) + 1)
raw_cps["agerange"] = 0
raw_cps["eic"] = np.minimum(3, raw_cps["eic"])
raw_cps.drop(
[
"mcaid_ben",
"mcare_ben",
"other_ben",
"snap_ben",
"housing_ben",
"ssi_ben",
"tanf_ben",
"vet_ben",
],
axis=1,
inplace=True,
)
raw_cps.to_csv(Path(DATA_PATH, "tu16.csv"), index=False)
# split CPS into filers and non-filers
filers = raw_cps[raw_cps["filer"] == 1].copy()
nonfilers = raw_cps[raw_cps["filer"] == 0].copy()
print("Begining statistical match")
# statistical matching
match_index = statmatch.match(
raw_puf,
filers,
"recid",
"recid",
"s006",
"s006",
"e04800",
REG_VARS,
PARTITION_VARS,
)
# merge all the data together
print("Merging matched data")
data = pd.merge(raw_puf, match_index, how="inner", left_on="recid", right_on="recip")
data = pd.merge(
data,
filers,
how="inner",
left_on="donor",
right_on="recid",
suffixes=(None, "_cps"),
)
# filter off the duplicated columns
data.drop(list(data.filter(regex=".*_cps")), axis=1, inplace=True)
# add back non-filers
print("Adding non-filers")
nonfilers.rename(columns={"s006": "matched_weight"}, inplace=True)
data = pd.concat([data, nonfilers], sort=False, ignore_index=True)
data = data.fillna(0.0)
data.reset_index(inplace=True)
print("Exporting raw data")
data.to_csv(Path(DATA_PATH, "cps-matched-puf.csv"), index=False)
print("Cleaning data")
data = puf.finalprep(data)
print("Exporting data")
data.to_csv(Path(DATA_PATH, "puf.csv"), index=False)
print("Done!")