-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpreprocess.py
54 lines (48 loc) · 1.68 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# Helper function help clean preprocess feature column
def split_multicolumn(col_series):
result_df = col_series.to_frame()
options = []
# Iterate over the column
for idx, value in col_series[col_series.notnull()].items():
# Break each value into list of options
for option in value.split(';'):
# Add the option as a column to result
if not option in result_df.columns:
options.append(option)
result_df[option] = 0.
# Mark the value in the option column as True
result_df.at[idx, option] = 1.
return result_df[options]
def shorten_categories(categories, cutoff):
categorical_map = {}
for i in range(len(categories)):
if categories.values[i] >= cutoff:
categorical_map[categories.index[i]] = categories.index[i]
else:
categorical_map[categories.index[i]] = 'Other'
return categorical_map
def age_process(val):
if val == "45-54 years old" or val == "55-64 years old":
return "Over 45"
return val
def clean_education(x):
if 'Bachelor’s degree' in x:
return 'Bachelor’s degree'
if 'Master’s degree' in x:
return 'Master’s degree'
if 'Professional degree' in x or 'Other doctoral' in x:
return 'Post grad'
return 'Less than a Bachelors'
def YearCodeProProcess(x):
if x == 'More than 50 years':
return 50
if x == 'Less than 1 year':
return 0.5
return float(x)
def remote_work_process(val):
if val == 'Hybrid (some remote, some in-person)':
return "Hybrid"
elif val == 'Fully remote':
return "Remote"
else:
return "Person"