Thread sites (#40)
cullenwatson authored Sep 6, 2023
1 parent 70e2218 commit fd88317
Showing 7 changed files with 977 additions and 168 deletions.
1,007 changes: 890 additions & 117 deletions JobSpy_Demo.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.0"
version = "1.1.1"
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
authors = ["Zachary Hampton <[email protected]>", "Cullen Watson <[email protected]>"]
readme = "README.md"
91 changes: 65 additions & 26 deletions src/jobspy/__init__.py
@@ -1,20 +1,26 @@
import pandas as pd
-from typing import List, Tuple
+import concurrent.futures
+from concurrent.futures import ThreadPoolExecutor
+from typing import List, Tuple, NamedTuple, Dict

from .jobs import JobType, Location
from .scrapers.indeed import IndeedScraper
from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.linkedin import LinkedInScraper
from .scrapers import ScraperInput, Site, JobResponse, Country


SCRAPER_MAPPING = {
    Site.LINKEDIN: LinkedInScraper,
    Site.INDEED: IndeedScraper,
    Site.ZIP_RECRUITER: ZipRecruiterScraper,
}


+class ScrapeResults(NamedTuple):
+    jobs: pd.DataFrame
+    errors: pd.DataFrame


def _map_str_to_site(site_name: str) -> Site:
    return Site[site_name.upper()]

@@ -28,8 +34,9 @@ def scrape_jobs(
    job_type: JobType = None,
    easy_apply: bool = False,  # linkedin
    results_wanted: int = 15,
-    country: str = "usa",
-) -> pd.DataFrame:
+    country_indeed: str = "usa",
+    hyperlinks: bool = False
+) -> ScrapeResults:
    """
    Asynchronously scrapes job data from multiple job sites.
    :return: results_wanted: pandas dataframe containing job data
@@ -38,7 +45,7 @@
    if type(site_name) == str:
        site_name = _map_str_to_site(site_name)

-    country_enum = Country.from_string(country)
+    country_enum = Country.from_string(country_indeed)

    site_type = [site_name] if type(site_name) == Site else site_name
    scraper_input = ScraperInput(
@@ -54,22 +61,35 @@
    )

    def scrape_site(site: Site) -> Tuple[str, JobResponse]:
-        scraper_class = SCRAPER_MAPPING[site]
-        scraper = scraper_class()
-        scraped_data: JobResponse = scraper.scrape(scraper_input)
-
+        try:
+            scraper_class = SCRAPER_MAPPING[site]
+            scraper = scraper_class()
+            scraped_data: JobResponse = scraper.scrape(scraper_input)
+        except Exception as e:
+            scraped_data = JobResponse(jobs=[], error=str(e), success=False)
        return site.value, scraped_data

-    results = {}
-    for site in scraper_input.site_type:
+    results, errors = {}, {}
+
+    def worker(site):
        site_value, scraped_data = scrape_site(site)
        results[site_value] = scraped_data
+        return site_value, scraped_data
+
+    with ThreadPoolExecutor() as executor:
+        future_to_site = {executor.submit(worker, site): site for site in scraper_input.site_type}
+
+        for future in concurrent.futures.as_completed(future_to_site):
+            site_value, scraped_data = future.result()
+            results[site_value] = scraped_data
+            if scraped_data.error:
+                errors[site_value] = scraped_data.error

    dfs = []

    for site, job_response in results.items():
        for job in job_response.jobs:
            data = job.dict()
+            data["job_url_hyper"] = f'<a href="{data["job_url"]}">{data["job_url"]}</a>'
            data["site"] = site
            data["company"] = data["company_name"]
            if data["job_type"]:
@@ -99,23 +119,42 @@ def scrape_site(site: Site) -> Tuple[str, JobResponse]:
            job_df = pd.DataFrame([data])
            dfs.append(job_df)

+    errors_list = [(key, value) for key, value in errors.items()]
+    errors_df = pd.DataFrame(errors_list, columns=["Site", "Error"])
+
    if dfs:
        df = pd.concat(dfs, ignore_index=True)
-        desired_order = [
-            "site",
-            "title",
-            "company",
-            "location",
-            "job_type",
-            "interval",
-            "min_amount",
-            "max_amount",
-            "currency",
-            "job_url",
-            "description",
-        ]
+        if hyperlinks:
+            desired_order = [
+                "site",
+                "title",
+                "company",
+                "location",
+                "job_type",
+                "interval",
+                "min_amount",
+                "max_amount",
+                "currency",
+                "job_url_hyper",
+                "description",
+            ]
+        else:
+            desired_order = [
+                "site",
+                "title",
+                "company",
+                "location",
+                "job_type",
+                "interval",
+                "min_amount",
+                "max_amount",
+                "currency",
+                "job_url",
+                "description",
+            ]
        df = df[desired_order]
    else:
        df = pd.DataFrame()

-    return df
+    return ScrapeResults(jobs=df, errors=errors_df)
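
With this commit, scrape_jobs returns a ScrapeResults NamedTuple rather than a bare DataFrame, so callers unpack the jobs and the per-site errors separately. A minimal usage sketch (illustrative, not part of this commit; search_term is assumed from the unchanged portion of the signature, which this diff does not show):

from jobspy import scrape_jobs

# jobs is the combined job DataFrame; errors has one row per failed site.
jobs, errors = scrape_jobs(
    site_name=["indeed", "linkedin", "zip_recruiter"],
    search_term="software engineer",  # assumed parameter, not shown in this diff
    results_wanted=15,
    country_indeed="usa",  # renamed from country; it only affects Indeed
    hyperlinks=True,       # order columns around job_url_hyper instead of job_url
)
print(jobs.head())
print(errors)  # columns: Site, Error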
8 changes: 0 additions & 8 deletions src/jobspy/scrapers/__init__.py
@@ -27,14 +27,6 @@ class ScraperInput(BaseModel):
    results_wanted: int = 15


-class CommonResponse(BaseModel):
-    status: Optional[str]
-    error: Optional[str]
-    linkedin: Optional[Any] = None
-    indeed: Optional[Any] = None
-    zip_recruiter: Optional[Any] = None
-
-
class Scraper:
    def __init__(self, site: Site):
        self.site = site
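
Dropping CommonResponse is consistent with the new dispatcher in src/jobspy/__init__.py, which reads each site's outcome from its own JobResponse. A rough sketch of the fields that code relies on (inferred from usage in this commit; the real model lives alongside ScraperInput and is richer):

from typing import List, Optional
from pydantic import BaseModel

class JobResponse(BaseModel):
    success: Optional[bool] = None
    error: Optional[str] = None  # copied into the errors DataFrame, keyed by site
    jobs: List[dict] = []        # JobPost models in the real code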
3 changes: 0 additions & 3 deletions src/jobspy/scrapers/indeed/__init__.py
@@ -197,7 +197,6 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
                error=f"Indeed failed to parse response: {e}",
            )
        except Exception as e:
-            print(f"LinkedIn failed to scrape: {e}\n{traceback.format_exc()}")
            return JobResponse(
                success=False,
                error=f"Indeed failed to scrape: {e}",
@@ -230,11 +229,9 @@ def get_description(self, job_page_url: str, session: tls_client.Session) -> str:
                formatted_url, allow_redirects=True, timeout_seconds=5
            )
        except requests.exceptions.Timeout:
-            print("The request timed out.")
            return None

        if response.status_code not in range(200, 400):
-            print("status code not in range")
            return None

        raw_description = response.json()["body"]["jobInfoWrapperModel"][
32 changes: 21 additions & 11 deletions src/jobspy/scrapers/linkedin/__init__.py
@@ -1,7 +1,9 @@
from typing import Optional, Tuple
from datetime import datetime
+import traceback

import requests
+from requests.exceptions import Timeout
from bs4 import BeautifulSoup
from bs4.element import Tag

@@ -67,9 +69,12 @@ def job_type_code(job_type):
            )

            if response.status_code != 200:
+                reason = ' (too many requests)' if response.status_code == 429 else ''
                return JobResponse(
                    success=False,
-                    error=f"Response returned {response.status_code}",
+                    error=f"LinkedIn returned {response.status_code} {reason}",
+                    jobs=job_list,
+                    total_results=job_count,
                )

            soup = BeautifulSoup(response.text, "html.parser")
@@ -113,7 +118,10 @@ def job_type_code(job_type):
                description, job_type = LinkedInScraper.get_description(job_url)
                if datetime_tag:
                    datetime_str = datetime_tag["datetime"]
-                    date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
+                    try:
+                        date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
+                    except Exception as e:
+                        date_posted = None
                else:
                    date_posted = None
@@ -130,15 +138,13 @@ def job_type_code(job_type):
                    ),
                )
                job_list.append(job_post)
-                if (
-                    len(job_list) >= scraper_input.results_wanted
-                    or processed_jobs >= job_count
-                ):
+                if processed_jobs >= job_count:
                    break
-            if (
-                len(job_list) >= scraper_input.results_wanted
-                or processed_jobs >= job_count
-            ):
+                if len(job_list) >= scraper_input.results_wanted:
                    break
+            if processed_jobs >= job_count:
+                break
+            if len(job_list) >= scraper_input.results_wanted:
+                break

            page += 1
@@ -158,7 +164,11 @@ def get_description(job_page_url: str) -> Optional[str]:
        :param job_page_url:
        :return: description or None
        """
-        response = requests.get(job_page_url, allow_redirects=True)
+        try:
+            response = requests.get(job_page_url, timeout=5)
+        except Timeout:
+            return None, None

        if response.status_code not in range(200, 400):
            return None, None
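
Indeed above, LinkedIn here, and ZipRecruiter below now share the same fail-soft pattern when fetching job descriptions: guard the request with a timeout and return None instead of printing. Condensed into one hypothetical helper (the name fetch_page is illustrative, not from the repo):

from typing import Optional
import requests
from requests.exceptions import Timeout

def fetch_page(url: str, timeout: int = 5) -> Optional[str]:
    # Return the body for a 2xx/3xx response; None on timeout or error status.
    try:
        response = requests.get(url, timeout=timeout)
    except Timeout:
        return None
    if response.status_code not in range(200, 400):
        return None
    return response.text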
2 changes: 0 additions & 2 deletions src/jobspy/scrapers/ziprecruiter/__init__.py
@@ -148,7 +148,6 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
                error=f"ZipRecruiter returned status code {e.status_code}",
            )
        except Exception as e:
-            print(f"ZipRecruiter failed to scrape: {e}\n{traceback.format_exc()}")
            return JobResponse(
                success=False,
                error=f"ZipRecruiter failed to scrape: {e}",
@@ -302,7 +301,6 @@ def get_description(self, job_page_url: str) -> Tuple[Optional[str], Optional[str]]:
                timeout_seconds=5,
            )
        except requests.exceptions.Timeout:
-            print("The request timed out.")
            return None

        html_string = response.content
