functions.py
# import packages
import requests
from bs4 import BeautifulSoup
# parse raw HTML text into a BeautifulSoup object
def get_soup(text):
    return BeautifulSoup(text, "lxml", from_encoding="utf-8")
# extract company name
def extract_company(div):
    company = div.find_all(name="span", attrs={"class": "company"})
    if len(company) > 0:
        for b in company:
            return b.text.strip()
    else:
        sec_try = div.find_all(name="span", attrs={"class": "result-link-source"})
        for span in sec_try:
            return span.text.strip()
    return 'NOT_FOUND'
# extract job salary
def extract_salary(div):
    try:
        return div.find('nobr').text
    except AttributeError:
        try:
            div_two = div.find(name='div', attrs={'class': 'sjcl'})
            div_three = div_two.find('div')
            return div_three.text.strip()
        except AttributeError:
            return 'NOT_FOUND'
# extract job location
def extract_location(div):
    for span in div.find_all('span', attrs={'class': 'location'}):
        return span.text
    return 'NOT_FOUND'
# extract job title
def extract_job_title(div):
    for a in div.find_all(name='a', attrs={'data-tn-element': 'jobTitle'}):
        return a['title']
    return 'NOT_FOUND'
# extract job description summary
def extract_summary(div):
    spans = div.find_all('span', attrs={'class': 'summary'})
    for span in spans:
        return span.text.strip()
    return 'NOT_FOUND'
# extract link to the full job description
def extract_link(div):
    for a in div.find_all(name='a', attrs={'data-tn-element': 'jobTitle'}):
        return a['href']
    return 'NOT_FOUND'
# extract date when the job was posted
def extract_date(div):
    for span in div.find_all('span', attrs={'class': 'date'}):
        return span.text.strip()
    return 'NOT_FOUND'
# extract full job description from link
def extract_fulltext(url):
    try:
        page = requests.get('http://www.indeed.com' + url)
        soup = BeautifulSoup(page.text, "lxml", from_encoding="utf-8")
        spans = soup.find_all('span', attrs={'class': 'summary'})
        for span in spans:
            return span.text.strip()
    except requests.exceptions.RequestException:
        return 'NOT_FOUND'
    return 'NOT_FOUND'
# append a line of log output to log.txt
def write_logs(text):
    # print(text + '\n')
    with open('log.txt', 'a') as f:
        f.write(text + '\n')
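
# Example usage: a minimal sketch, not part of the original module. It fetches
# one Indeed search results page and runs the extractors over each job card.
# The search URL, its query parameters, and the 'row' card class are
# assumptions about Indeed's markup at the time this scraper was written.
if __name__ == '__main__':
    # assumed URL format for an Indeed search results page
    search_url = 'http://www.indeed.com/jobs?q=data+scientist&l=New+York'
    page = requests.get(search_url)
    soup = get_soup(page.text)
    # each job posting is assumed to live in a div whose classes include 'row'
    for div in soup.find_all(name='div', attrs={'class': 'row'}):
        write_logs('title:    ' + extract_job_title(div))
        write_logs('company:  ' + extract_company(div))
        write_logs('location: ' + extract_location(div))
        write_logs('salary:   ' + extract_salary(div))
        write_logs('summary:  ' + extract_summary(div))
        write_logs('link:     ' + extract_link(div))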