-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdraft.py
141 lines (133 loc) · 6.6 KB
/
draft.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import csv
import sys
#all headers
#def dataHeaders():
# with open('alabama.csv', 'rb') as csvfile:
# spamreader = csv.reader(csvfile)
# for row in spamreader:
# return row
#
#headers in current data
#headers_long = ["ST","CIT","CITWP05","CITWP12","COW","DEAR","DEYE","DOUT","DPHY","DREM","ENG","FER","GCL","GCM","GCR","HINS1","HINS2","HINS3","HINS4","HINS5","HINS7","JWMNP","JWRIP","JWTR","LANX","MAR","MARHD","MARHM","MARHT","MARHW","MARHYP05","MARHYP05","MIG","MIL","MLPA","MLPB","MLPCD","MLPE","MLPH","MLPJ","NWLA","NWLK","OIP","PAP","RELP","RETP","SCH","SCHG","SCHL","SEMP","SEX","WKHP","WKL","WKW","YOEP05","YOEP12","ANC","ANC1P05","ANC1P12","ESP","FOD1P","FOD2P","HICOV","INDP","JWAP","JWDP","LANP05","MSP"]
# Column codes to keep from the full data header row. getHeaderIndex()
# resolves each code to a column index in this order, so this is also the
# column order of the reduced output.
headers = [
    "JWMNP", "CITWP05", "MARHYP05", "WKHP", "WAGP", "JWDP", "ANC", "ST",
    "ANC1P12", "FER", "JWAP", "MARHT", "NWLA", "CIT", "FOD1P", "JWRIP",
    "MARHW", "NWLK", "COW", "GCL", "JWTR", "MIG", "SCHL", "DEAR", "GCM",
    "LANP05", "DEYE", "HICOV", "MAR", "MIL", "VPS", "ENG", "HISP", "MARHD",
    "NATIVITY", "WAOB", "ESP", "INDP", "MARHM", "NOP", "WKW",
]
def dataHeaders(path='alabama.csv'):
    """Return the first row (the header row) of a CSV file.

    Args:
        path: CSV file to read. Defaults to 'alabama.csv', the file this
            function has always read, so existing no-argument calls behave
            as before.

    Returns:
        The first row as a list of strings, or None if the file has no rows.
    """
    # The csv module needs a binary handle on Python 2 and a text handle
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        mode, extra = 'rb', {}
    else:
        mode, extra = 'r', {'newline': ''}
    with open(path, mode, **extra) as csvfile:
        return next(csv.reader(csvfile), None)
# Look up the column index, within the full header row, of each header in use.
def getHeaderIndex():
    """Return the column index of each entry of `headers` in the full header row.

    The full header row is whatever dataHeaders() returns.

    Returns:
        A list of integer indices, in the same order as `headers`.

    Raises:
        ValueError: if a code in `headers` is not present in the header row.
    """
    # Read the header row once; previously the CSV was re-opened and
    # re-parsed for every single header code.
    allHeaders = dataHeaders()
    return [allHeaders.index(header) for header in headers]
def reduceDataByColumn(infile, outfile, dropDuplicates=False):
    """Copy `infile` to `outfile`, keeping only the columns listed in `headers`.

    Every row of `infile` (including its header row) is projected onto the
    column indices returned by getHeaderIndex() and written to `outfile`.

    Args:
        infile: path of the CSV file to read.
        outfile: path of the CSV file to create or overwrite.
        dropDuplicates: when True, a reduced row identical to one already
            written is skipped. Defaults to False, which writes every row.
            That matches what the previous code actually did: its duplicate
            check compared against a list that was never filled, so it could
            never fire.

    Raises:
        IndexError: if a row is shorter than one of the selected indices.
    """
    print("reduce to useful columns ...")
    # NOTE(review): the indices come from getHeaderIndex(), which does not
    # look at `infile`; this assumes both files share the same column layout.
    indexList = getHeaderIndex()

    # The csv module needs binary handles on Python 2 and text handles
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        readMode, writeMode, extra = 'rb', 'wb', {}
    else:
        readMode, writeMode, extra = 'r', 'w', {'newline': ''}

    seenRows = set()  # tuples of rows already written; used only when de-duplicating
    duplicatesSkipped = 0
    with open(outfile, writeMode, **extra) as outputFile:
        spamwriter = csv.writer(outputFile)
        with open(infile, readMode, **extra) as csvfile:
            for row in csv.reader(csvfile):
                reducedRow = [row[index] for index in indexList]
                if dropDuplicates:
                    rowKey = tuple(reducedRow)
                    if rowKey in seenRows:
                        duplicatesSkipped += 1
                        continue
                    seenRows.add(rowKey)
                spamwriter.writerow(reducedRow)
    if duplicatesSkipped:
        print("skipped %d duplicate rows" % duplicatesSkipped)
def columnDicts(infile):
    """Load a lookup CSV into a dict mapping its first column to its second.

    The first row of the file is treated as a header and skipped. Rows with
    fewer than two fields (for example blank lines) are ignored. If a key
    appears more than once, the last occurrence wins.

    Args:
        infile: path of the CSV file to read.

    Returns:
        A dict of {row[0]: row[1]}; empty if the file has no data rows.
    """
    # The csv module needs a binary handle on Python 2 and a text handle
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        mode, extra = 'rb', {}
    else:
        mode, extra = 'r', {'newline': ''}

    lookup = {}
    with open(infile, mode, **extra) as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header row; tolerate an empty file
        for row in reader:
            if len(row) >= 2:
                lookup[row[0]] = row[1]
    return lookup
def _naturalizedPhrase(year):
    """Return the naturalization sentence for a 4-character year, else ""."""
    if year != "-009" and len(year) == 4:
        return "I was naturalized in " + year + ". "
    return ""


def _commutePhrase(minutes):
    """Return the commute sentence for a minutes string, or "" when blank."""
    if minutes == "":
        return ""
    commute = int(minutes)
    # Test the long-commute case first; it used to sit behind `commute > 10`
    # and could never be reached.
    if commute > 120:
        return "It takes me more than 2 hours to get to work. "
    if commute < 11:
        return "it only takes me " + str(commute) + "mins to get to work."
    return "my daily commute is " + str(commute) + "mins long"


def _marriagePhrase(year, timesMarried):
    """Return the marriage-year sentence, or "" when the year is not 4 characters."""
    # Validate the year on both paths; previously the once-married path
    # produced "I got married in . " for a blank year.
    if len(year) != 4:
        return ""
    if timesMarried == "1" or timesMarried == "I have only been married once. ":
        return "I got married in " + year + ". "
    return "Last time I got married was in " + year + ". "


def _hoursPhrase(hours):
    """Return the weekly-hours sentence for an hours string, or "" when blank."""
    if hours == "":
        return ""
    hoursWorkedPerWeek = int(hours)
    # >= so that exactly 40 hours is no longer reported as "less than 40".
    if hoursWorkedPerWeek >= 40:
        return "Usually work " + str(hoursWorkedPerWeek) + "hrs per week. "
    return "I work less than 40 hours per week. "


def fillInData(infile, outfile, loadDictionary=None):
    """Rewrite the coded cells of `infile` as text and write the rows to `outfile`.

    The first row of `infile` is read as the column names; it is printed but
    not written to `outfile`. For every following row, each cell is handled
    according to its column name:

    * If the cell value is a key of that column's lookup dictionary, it is
      replaced by the dictionary value. For column "ST" the value instead
      goes into a greeting stored in row[0], and the ST cell is blanked.
    * Otherwise columns "CITWP05", "JWMNP", "MARHYP05" and "WKHP" are turned
      into sentences (or "" when the value is blank or invalid).
    * Any other cell is left unchanged.

    Args:
        infile: path of the CSV file to read.
        outfile: path of the CSV file to create or overwrite.
        loadDictionary: optional callable taking a column name and returning
            its lookup mapping. Defaults to reading
            "category_dictionaries/<column>.csv" with columnDicts().

    Raises:
        ValueError: if a non-blank JWMNP or WKHP cell is not an integer, or
            if a MARHYP05 cell needs the "MARHT" column and it is missing.
        IndexError: if a row has fewer cells than the header row.
    """
    print("filling in data ...")
    if loadDictionary is None:
        def loadDictionary(header):
            return columnDicts("category_dictionaries/" + header + '.csv')

    # The csv module needs binary handles on Python 2 and text handles
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        readMode, writeMode, extra = 'rb', 'wb', {}
    else:
        readMode, writeMode, extra = 'r', 'w', {'newline': ''}

    rowsDone = 0
    with open(outfile, writeMode, **extra) as outputfile:
        w = csv.writer(outputfile)
        with open(infile, readMode, **extra) as datafile:
            r = csv.reader(datafile)
            columnNames = next(r, None)
            if columnNames is None:
                # Empty input: nothing to convert, leave an empty output file.
                return
            print(columnNames)
            # Load each column's dictionary once, up front; previously every
            # dictionary file was re-read for every cell of every row.
            dictionaries = [loadDictionary(name) for name in columnNames]
            for row in r:
                rowsDone += 1
                if rowsDone % 10000 == 0:
                    print(rowsDone)
                for currentIndex, name in enumerate(columnNames):
                    value = row[currentIndex]
                    currentDictionary = dictionaries[currentIndex]
                    if value in currentDictionary:
                        if name == "ST":
                            # NOTE(review): this overwrites whatever column 0
                            # held (and is itself blanked on the next line if
                            # ST is column 0) -- confirm this is intended.
                            row[0] = "hi from " + currentDictionary[value] + ","
                            row[currentIndex] = ""
                        else:
                            row[currentIndex] = currentDictionary[value]
                    elif name == "CITWP05":
                        row[currentIndex] = _naturalizedPhrase(value)
                    elif name == "JWMNP":
                        row[currentIndex] = _commutePhrase(value)
                    elif name == "MARHYP05":
                        timesMarried = row[columnNames.index("MARHT")]
                        row[currentIndex] = _marriagePhrase(value, timesMarried)
                    elif name == "WKHP":
                        row[currentIndex] = _hoursPhrase(value)
                w.writerow(row)
# Driver: for every data-file suffix, first cut the raw file down to the
# columns in use, then turn the reduced file's coded cells into text.
states = ['ak', 'usa', 'usb', 'usc', 'usd']
fileRoot = 'data/ss13p'
for i, suffix in enumerate(states):
    print(i)
    stem = fileRoot + suffix
    infile = stem + ".csv"
    outfile = stem + "_out.csv"
    outfile2 = stem + "_filledin.csv"
    # Single-argument print keeps the space-separated output of the
    # original statement form.
    print(" ".join((infile, outfile, outfile2)))
    reduceDataByColumn(infile, outfile)
    fillInData(outfile, outfile2)