-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdb.py
87 lines (67 loc) · 3.32 KB
/
db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from constants import DB_FILE_PATH
import sqlite3
import os
#################################################################
def store_results(jobs):
'''Store results into a database.
:param jobs: list of dictionaries which contain job details.
'''
if len(jobs) == 0:
return
con = sqlite3.connect(DB_FILE_PATH)
cur = con.cursor()
cur.execute('''CREATE TABLE IF NOT EXISTS jobs
(uid integer primary key, url text, isUrlHiddenFromCrawler integer, dateCaption text, salary integer, content text, fullContent text,
position text, isNew integer, isPremium integer, isEasyApply integer, isRemoteJob integer, isResumeRequired integer,
isAdvertLabel integer, isFavorite integer, destination text, similarGroupId integer, impressionId integer, recommendId integer,
alreadyAppliedText text, hasFewApplies integer, isJobLabelsDisabled integer, hasQuestions integer, projectLogoUrl text,
jobType text, isDeleted integer, robots text, id integer, tags text, tagsNew text, highlightTags text, company_isVerified integer,
company_name text, company_link text, company_isContactsVerified integer, company_doesHaveHires integer,
company_doesHaveManyHires integer, company_isActiveResponses integer, company_logoUrl text, location_name text, location_link text,
location_isWalkingDistanceFromAddress integer, location_isShiftJob integer, awayData_domain text, awayData_link text,
hasSimilarJobsLink integer, isDteJob integer)'''
)
tuple_jobs = [tuple(job.values()) for job in jobs]
keys = jobs[0].keys()
keys_str = ', '.join(keys)
placeholders = ', '.join('?'*len(keys))
query = f"INSERT INTO jobs ({keys_str}) VALUES ({placeholders});"
for job in tuple_jobs:
with con:
try:
cur.execute(query, job)
except sqlite3.IntegrityError:
print(f"Job with uid {job[tuple(keys).index('uid')]} is already stored. Ignoring...")
else:
con.commit()
if len(jobs) != con.total_changes:
print()
print("Result storing is complete. \nNumber of results that are stored in this execution:", con.total_changes, end='\n\n')
cur.close()
con.close()
def exclude_already_stored(jobs):
'''Filter jobs by excluding those which are not stored in the database.
:param jobs: list of dictionaries which contain job details.
:return: filtered list of dictionaries which represent new job.
'''
if len(jobs) == 0:
return
# Check if database exists. If it doesn't then the list of jobs should
# be considered as list of new jobs (which are not seen before)
if not os.path.isfile(DB_FILE_PATH):
return jobs
filtered = []
con = sqlite3.connect(DB_FILE_PATH)
cur = con.cursor()
ids = tuple([int(job['uid']) for job in jobs])
placeholders = ', '.join('?'*len(ids))
# Return found jobs which are also present in the database
query = f"SELECT uid from jobs WHERE uid IN ({placeholders});"
cur.execute(query, ids)
stored_ids = list(map(lambda x: x[0], cur.fetchall()))
for job in jobs:
if int(job['uid']) not in stored_ids:
filtered.append(job)
cur.close()
con.close()
return filtered