#!/usr/bin/python
#-*- coding:utf-8 -*-
import json, codecs, nltk
import pandas as pd
import utilsOs, utilsGraph, utilsString
##################################################################################
#TSV FORMATTER
##################################################################################
def dumpFromExcelToTsv(inputPath, outputPath, sheet=None):
	''' takes an excel file and dumps it into a standard tsv '''
	if sheet is not None:
		xclDf = pd.read_excel(inputPath, sheet_name=sheet)
	else:
		xclDf = pd.read_excel(inputPath)
	xclDf.to_csv(outputPath, sep='\t', index=False)
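#Usage sketch (hedged): the Excel path and sheet index below are hypothetical
#examples, not files shipped with this repo.
#	dumpFromExcelToTsv(u'./rawData/jobOffers.xlsx', u'./rawData/jobOffers.tsv', sheet=0)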
def joinMultTsvIntoOne(folderInputPath, outputFilePath=None, idColumnName=None, columnsToDropList=None, rowsToDropList=None, orderOfTheColumnsList=None):
	'''
	takes the tsv files in the given folder path and returns/dumps
	one unified tsv data frame
	'''
	if folderInputPath[-1] != u'/':
		folderInputPath = u'{0}/'.format(folderInputPath)
	#get the non-hidden tsv files in the input folder
	folderTsvFilesList = [file for file in utilsOs.getContentOfFolder(folderInputPath) if u'.tsv' in file if file[0] != u'.']
	#get the first tsv file as the base for the unified dataframe
	if len(folderTsvFilesList) >= 1:
		unifiedDf = utilsGraph.getDataFrameFromArgs(u'{0}{1}'.format(folderInputPath, folderTsvFilesList[0]))
		#if there is more than one tsv file
		if len(folderTsvFilesList) > 1:
			#open each remaining tsv file and append it to the unified dataframe
			for tsvFilePath in folderTsvFilesList[1:]:
				fileDf = utilsGraph.getDataFrameFromArgs(u'{0}{1}'.format(folderInputPath, tsvFilePath))
				unifiedDf = pd.concat([unifiedDf, fileDf])
		#if no column name was given to sort by, take the name of the first column
		if idColumnName is None:
			idColumnName = list(unifiedDf)[0]
		#drop unwanted columns
		if columnsToDropList is not None:
			for columnName in columnsToDropList:
				unifiedDf = unifiedDf.drop(labels=columnName, axis=1)
		#drop unwanted rows
		if rowsToDropList is not None:
			for rowIndex in rowsToDropList:
				unifiedDf = unifiedDf.drop(labels=rowIndex, axis=0)
		#order the columns in the desired order
		if orderOfTheColumnsList is not None:
			unifiedDf = unifiedDf[orderOfTheColumnsList]
		#sort the rows by the id column
		unifiedDf = unifiedDf.sort_values(by=[idColumnName])
		#drop duplicate rows
		unifiedDf = unifiedDf.drop_duplicates()
		#dump the dataframe only if an output path was given
		if outputFilePath is not None:
			unifiedDf.to_csv(outputFilePath, sep='\t', index=False)
		return unifiedDf
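#Usage sketch (hedged): the folder path and column names are hypothetical examples.
#	joinMultTsvIntoOne(u'./tsvParts/', outputFilePath=u'./unified.tsv',
#		idColumnName=u'Id', columnsToDropList=[u'Unnamed: 0'])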
##################################################################################
#LINKEDIN JOB DATA FORMATTING
##################################################################################
def getJobData(profile, dictJobTitlesData=None):
	'''
	Saves the profile's job title (function) information in a dict
	'''
	#avoid sharing a mutable default dict between calls
	if dictJobTitlesData is None:
		dictJobTitlesData = {}
	if u'experiences' in profile:
		for experience in profile[u'experiences']:
			dataDict = {}
			#one fresh skill dict per experience so different job titles do not share the same object
			skillDict = {}
			#if there is a job title in the profile
			if u'function' in experience:
				function = experience[u'function']
				#if it's the first time we encounter this job title
				#we save the function data to the dict
				if function not in dictJobTitlesData:
					listFunction = nltk.word_tokenize(function)
					#coreference: how many times we encountered the same function in the corpus
					dataDict[u'nbCoreferenceJobTitleInCorpus'] = 1
					#tokens
					dataDict[u'tokens'] = listFunction
					dataDict[u'nbOfTokens'] = len(listFunction)
					#pos
					dataDict[u'pos'] = nltk.pos_tag(listFunction)
					#alternative names
					dataDict[u'alternativeNames'] = [function.lower(), function.upper(), (function.lower()).capitalize()]
					#description
					if u'missions' in experience:
						dataDict[u'description'] = [(experience[u'missions']).replace(u'\n', u' ').replace(u'\t', u' ').replace(u'\r', u' ')]
					#pitch
					if u'personalBranding_pitch' in profile:
						dataDict[u'pitch'] = [(profile[u'personalBranding_pitch']).replace(u'\n', u' ').replace(u'\t', u' ').replace(u'\r', u' ')]
					else:
						dataDict[u'pitch'] = []
					#skills
					if u'skills' in profile:
						for skill in profile[u'skills']:
							skillDict[skill[u'name']] = 1
							skillDict[skill[u'name'].lower()] = 1
							skillDict[skill[u'name'].upper()] = 1
							skillDict[skill[u'name'].capitalize()] = 1
					#always present (possibly empty) so manageDuplicates can update it safely
					dataDict[u'possibleSkills'] = skillDict
					#we save the function info
					dictJobTitlesData[function] = dataDict
				#if there is a duplicate
				else:
					dictJobTitlesData = manageDuplicates(function, profile, dictJobTitlesData)
	#we return the dict
	return dictJobTitlesData
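#Shape of the "profile" dict expected by getJobData, inferred from the keys read
#above (a minimal, hypothetical example of one line of the LinkedIn json dump):
#	{u'experiences': [{u'function': u'data analyst', u'missions': u'built dashboards'}],
#	 u'personalBranding_pitch': u'analyst with 5 years of experience',
#	 u'skills': [{u'name': u'SQL'}, {u'name': u'python'}]}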
def manageDuplicates(function, profile, dictJobTitlesData):
	'''
	if the job title is already in the dict, we add the new profile's data to the existing entry
	'''
	#we update the function data in the dict
	dictJobTitlesData[function][u'nbCoreferenceJobTitleInCorpus'] += 1  #how many times we encountered the same function in the corpus
	#if the profile has data for the job description
	#(experiences is a list of dicts, so we look for the experience matching the duplicated job title)
	for experience in profile.get(u'experiences', []):
		if experience.get(u'function') == function and u'missions' in experience:
			dictJobTitlesData[function].setdefault(u'description', []).append((experience[u'missions']).replace(u'\n', u' ').replace(u'\t', u' ').replace(u'\r', u' '))
	#pitch
	if u'personalBranding_pitch' in profile:
		dictJobTitlesData[function].setdefault(u'pitch', []).append((profile[u'personalBranding_pitch']).replace(u'\n', u' ').replace(u'\t', u' ').replace(u'\r', u' '))
	#skills
	if u'skills' in profile:
		possibleSkills = dictJobTitlesData[function].setdefault(u'possibleSkills', {})
		for skill in profile[u'skills']:
			possibleSkills[skill[u'name']] = possibleSkills.get(skill[u'name'], 0) + 1
			if skill[u'name'] != skill[u'name'].lower():
				possibleSkills[skill[u'name'].lower()] = possibleSkills.get(skill[u'name'].lower(), 0) + 1
			if skill[u'name'] != skill[u'name'].upper():
				possibleSkills[skill[u'name'].upper()] = possibleSkills.get(skill[u'name'].upper(), 0) + 1
			if skill[u'name'] != skill[u'name'].capitalize():
				possibleSkills[skill[u'name'].capitalize()] = possibleSkills.get(skill[u'name'].capitalize(), 0) + 1
	return dictJobTitlesData
##################################################################################
#SIMPLIFIED DATA DUMPING
##################################################################################
def dumpJobTitleAndDescription(jsonDict, pathOutputFile='./job+pitch.tsv', addJobDescription=False):
	'''
	Saves the basic linkedIn job data in a tsv file
	each line shows:
	- one job title (or variant)
	- its description(s) / personal pitch(es) / NA (non applicable)
	'''
	#file of job title names (and optional description)
	with utilsOs.createEmptyFile(pathOutputFile) as outputJobtitles:
		for jobTitle, jobData in jsonDict.items():
			#only the job title
			if not addJobDescription:
				content = u'%s\n' %(jobTitle)
			#job title + description
			else:
				#if there are one or multiple specific descriptions of the job
				if u'description' in jobData and len(jobData[u'description']) > 0:
					content = u'%s\t%s\n' %(jobTitle, u' _#####_ '.join(jobData[u'description']))
				else:
					#if there are one or multiple personal pitches that might give us an idea of what the job is
					if u'pitch' in jobData and len(jobData[u'pitch']) > 0:
						content = u'%s\t%s\n' %(jobTitle, u' _#####_ '.join(jobData[u'pitch']))
					#if there is nothing then it's Non Applicable
					else:
						content = u'%s\t%s\n' %(jobTitle, u'NA')
			#dumping to file
			outputJobtitles.write(content)
	return
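#Usage sketch (hedged): jsonDict is the dict returned by getJobData; the output
#path is a hypothetical example. With addJobDescription=True each tsv line is
#"jobTitle<TAB>description(s) joined by ' _#####_ '" (or the pitch, or NA).
#	dumpJobTitleAndDescription(dictJobTitlesData, u'./job+pitch.tsv', addJobDescription=True)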
def dumpSetToJson(aSet, pathOutput):
	'''
	dumps the given set as a json file
	(each element becomes a key mapped to None)
	'''
	jsonDict = {}
	for elem in aSet:
		jsonDict[elem] = None
	utilsOs.dumpDictToJsonFile(jsonDict, pathOutput)
	return
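#Usage sketch (hedged): the output path is a hypothetical example.
#	dumpSetToJson({u'data analyst', u'translator'}, u'./jobSet.json')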
##################################################################################
#JOB SET MAKER FROM TAXONOMY/ONTOLOGY (flatten the tree so we can compare them on the same level)
##################################################################################
def makeJobSetFromOnto(lowercaseItAll, *argv):
'''
Mixes all given taxonomies/ontologies
and returns a set of their content
	regardless of the hierarchy (flattens job titles to the same level)
'''
allSet = set()
for taxoonto in argv:
allSet = dfsExtractor(taxoonto, allSet, lowercaseItAll)
return allSet
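#Usage sketch (hedged): taxo1 and taxo2 are hypothetical uniformized taxonomy
#dicts (see the format example after dfsExtractor below).
#	allJobsSet = makeJobSetFromOnto(True, taxo1, taxo2)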
##################################################################################
#TAXONOMY BROWSING TOOLS (from the uniformized json taxonomy format we made)
##################################################################################
def dfsExtractor(tree, setNodes, lowercaseItAll=False):
	'''
	depth first search of the uniformized taxonomy tree
	key: code___name of job or category
	'''
	if tree is None or len(tree) == 0:
		return None
	elif type(tree) is dict:
		for node, dictNode in tree.items():
			if u'___' in node:
				#we do not take the code into account
				name = node.split(u'___')[1]
				#if we want all in lowercase
				if lowercaseItAll:
					name = name.lower()
				#we add to the set
				setNodes.add(name)
			#recurse into the children and add their names to the set
			result = dfsExtractor(dictNode, setNodes, lowercaseItAll)
			if result is not None:
				setNodes = setNodes.union(result)
	elif type(tree) is list:
		for name in tree:
			if lowercaseItAll:
				name = name.lower()
			setNodes.add(name)
	return setNodes
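#Format example (hedged, inferred from the "code___name" convention above): a
#uniformized taxonomy tree is a nested dict whose keys are u'code___name';
#leaves may be empty dicts or lists of names.
#	tree = {u'1___engineering': {u'11___software engineer': {}, u'12___data engineer': {}}}
#	dfsExtractor(tree, set())  # -> {u'engineering', u'software engineer', u'data engineer'}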
##################################################################################
#DICTIONARY SAMPLE MAKER FROM LINKEDIN DATA
##################################################################################
def makeSampleFileHavingNJobTitles(pathInput, pathOutput, n=1000000, addJobDescription=False):
	'''
	takes the real linkedIn data and makes a sample containing as
	many profiles as necessary to achieve N functions (job titles)
	'''
	dictJobTitlesData = {}
	#sample of all candidates data
	outputJson = utilsOs.createEmptyFile(u'%ssample.json' %(pathOutput))
	#we read the original json file line by line
	with codecs.open(pathInput, u'r', encoding=u'utf8') as jsonFile:
		while len(dictJobTitlesData) < n:
			jsonData = jsonFile.readline()
			#stop at the end of the file
			if not jsonData:
				break
			#we dump each line into the sample file
			outputJson.write(jsonData.replace(u'\r', ''))
			#we make a dict out of the string line
			jsonLine = utilsOs.convertJsonLineToDict(jsonData)
			if jsonLine is not None:
				#we add each job title to the job title dict
				dictJobTitlesData = getJobData(jsonLine, dictJobTitlesData)
	#dumping dict content in json
	utilsOs.dumpDictToJsonFile(dictJobTitlesData, pathOutputFile=u'%sjobTitlesDataDict.json'%(pathOutput))
	#SIMPLIFIED DATA: dumping job title (and optional description) to a file
	dumpJobTitleAndDescription(dictJobTitlesData, u'%sjob+pitch.tsv'%(pathOutput), addJobDescription)
	#closing the files
	outputJson.close()
	return None
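#Usage sketch (hedged): paths are hypothetical examples; pathOutput is used as a
#prefix, so it should end with a folder separator.
#	makeSampleFileHavingNJobTitles(u'./candidates.json', u'./sample/', n=10000, addJobDescription=True)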
##################################################################################
#JOB SET MAKER FROM LINKEDIN PROFILES
##################################################################################
def makeJobSetFromLinkedIn(pathInput, lowercaseItAll=False, pathOutput=None, n=float('inf')):
	'''
	makes a set of jobs taken from linkedIn profiles
	IF a "pathOutput" is given we save the set as a txt file
	of one job per line
	IF an "n" argument is given it makes a sample containing as
	many profiles as necessary to achieve N functions (job titles)
	'''
	jobSet = set()
	#open the output file
	if pathOutput is not None:
		outputTxt = utilsOs.createEmptyFile(u'%slistOfJobs.linkedIn' %(pathOutput))
	#we read the original json file line by line
	with codecs.open(pathInput, u'r', encoding=u'utf8') as jsonFile:
		#if we want to make a sample, n must be an int, otherwise it will keep going until the end of the file
		jsonData = jsonFile.readline()
		while jsonData and len(jobSet) < n:
			jobDict = getJobData(json.loads(jsonData))
			#we add the job titles
			jobsToAdd = set(jobDict.keys())
			#if we want them lowercased
			if lowercaseItAll:
				jobsToAdd = set([e.lower() for e in jobsToAdd])
			#we dump each job title into the txt file
			if pathOutput is not None:
				for jobTitle in jobsToAdd:
					outputTxt.write(u'{0}\n'.format(jobTitle))
			#adding to the set
			jobSet = jobSet.union(jobsToAdd)
			#next line
			jsonData = jsonFile.readline()
	#closing the file
	if pathOutput is not None:
		outputTxt.close()
	return jobSet
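#Usage sketch (hedged): paths are hypothetical examples.
#	linkedInJobs = makeJobSetFromLinkedIn(u'./candidates.json', lowercaseItAll=True, pathOutput=u'./sample/')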
def loadJobSetFromFile(pathInput, n=float('inf')):
	'''
	loads a txt file having one job per line as a set
	'''
	jobSet = set()
	with codecs.open(pathInput, u'r', encoding=u'utf8') as txtFile:
		line = txtFile.readline()
		while line and len(jobSet) < n:
			job = line.split(u'\n')[0]
			jobSet.add(job)
			#next line
			line = txtFile.readline()
	return jobSet
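#Usage sketch (hedged): reads back the one-job-per-line file produced above.
#	jobSet = loadJobSetFromFile(u'./sample/listOfJobs.linkedIn')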
##################################################################################
#LINKEDIN GRAPH FILES MAKER IN O(n^2) where n is the nb of skills
##################################################################################
def linkedInJobSkillEdgeAndNodeList(pathEdgeFileInput, pathEdgeFileOutput, pathNodeFileOutput, lowercaseItAll=False):
'''
takes the linkedin data and makes an edge list of (columns):
- jobNode(source)
- skillNode(target)
- weight(coreference)
- nbOfTimesJobTitleAppeared
'''
pathTempFile = u'./temp.txt'
#we populate the temp file with reliable profiles (having ONLY one job title)
pathCorefDict = utilsGraph.edgeListTemp(pathEdgeFileInput, pathTempFile, pathEdgeFileOutput, lowercaseItAll)
#we populate the final output file
utilsGraph.edgeListDump(pathTempFile, pathEdgeFileOutput, pathCorefDict)
#we populate the node list (defining each node as source or target)
utilsGraph.nodeListIdType(pathEdgeFileOutput, pathNodeFileOutput)
#delete the temp file
utilsOs.deleteTheFile(u'./', u'temp', u'txt')
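#Usage sketch (hedged): paths are hypothetical examples; the heavy lifting is
#delegated to utilsGraph (edgeListTemp, edgeListDump, nodeListIdType).
#	linkedInJobSkillEdgeAndNodeList(u'./candidates.json', u'./edgeList.tsv', u'./nodeList.tsv', lowercaseItAll=True)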
##################################################################################
#GOLD STANDARD
##################################################################################
def makeGoldStandardOrora(inputPath, outputPath, goldStandardPath):
'''
	opens the input and output files and makes one gold standard file pairing each input comment with its corresponding output
'''
#INCLUDING the information block columns
###headerLine = u'Id\tCommentIn\tCommentOut\tInformation blocks\tCoded information block\tBlock type'
	#EXCLUDING the information block columns
headerLine = u'Id\tCommentIn\tCommentOut'
#create empty gold standard file
gsFile = utilsOs.createEmptyFile(goldStandardPath, headerLine)
#browse the output file line by line
with open(outputPath, u'r', encoding=u'utf8') as outFile:
#header line
headerList = (outFile.readline().replace(u'\n', u'')).split(u'\t')
idCodeColName, commentColName = headerList[1], headerList[2]
#dataframe
inputDf = utilsGraph.getDataFrameFromArgs(inputPath)
line = outFile.readline()
#populate edge list
while line:
#get data
lineList = (line.replace(u'\n', u'')).split(u'\t')
#we replace any tabulation in the comments with ' ___ ' so there are no inconsistencies with the tsv (tab based)
idCode, commentOutLabel, theRest = lineList[1], lineList[2].replace(u'\t', u' ___ '), lineList[3:]
#select
selectInputDf = inputDf.loc[ inputDf[u'Id'] == int(idCode) ]
#get the input comment
commentInLabel = ( selectInputDf.loc[ selectInputDf[idCodeColName] == int(idCode) ] )[u'Comment'].tolist()
if len(commentInLabel) == 1:
commentInLabel = commentInLabel[0].replace(u'\t', u' ___ ')
else:
print('there are multiple rows with the same ID code', idCode, commentInLabel)
#write the line INCLUDING the information block columns
###gsFile.write( u'{0}\t{1}\t{2}\t{3}\n'.format(idCode, commentInLabel, commentOutLabel, u'\t'.join(theRest)) )
#write the line EXCLUDING the information block columns
gsFile.write( u'{0}\t{1}\t{2}\n'.format(idCode, commentInLabel, commentOutLabel) )
#next line
line = outFile.readline()
#close the file
gsFile.close()
#remove the row doubles
gsDf = utilsGraph.getDataFrameFromArgs(goldStandardPath)
gsDf = gsDf.drop_duplicates()
gsDf.to_csv(goldStandardPath, sep='\t', index=False)
return gsDf
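#Usage sketch (hedged): the Orora input/output tsv paths are hypothetical examples;
#the input file is expected to have an 'Id' and a 'Comment' column.
#	makeGoldStandardOrora(u'./ororaInput.tsv', u'./ororaOutput.tsv', u'./goldStandard.tsv')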