-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
executable file
·61 lines (49 loc) · 1.98 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import requests
import urllib.request
import time
import pdb
import csv
from bs4 import BeautifulSoup
def span_filter(span):
children = span.findChildren()
if len(children) > 1:
return False
if "#" in children[0]["href"]:
return False
type_col = span.findParent().findParent().find_all(class_='col3')[0]
if len(type_col.findChildren()) < 1:
return False
elif span.findParent().findParent().find_all(class_='col3')[0].findChildren()[0]['title'] == 'French suited cards':
return True
def complete_links(span):
return span.findChild()['href'].replace('..', 'https://www.pagat.com')
def content_filter(content):
# what do we need to do in terms of preventing a bunch of garbage ending up in our data?
if content.name == 'h1' or content.name == 'ul':
return False
return True
def content_manipulation(content):
if content.name == 'h2':
return f"\n<{kabobify(content.text)}>"
return content.text
def kabobify(text):
return text.lower().replace(' ', '-').replace(',', '').replace('/', '')
url = "https://www.pagat.com/alpha/"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
# pdb.set_trace()
spans = list(filter(span_filter, soup.find_all(class_="lname")))
game_links = list(map(complete_links, spans))
# pdb.set_trace()
for link in game_links:
game_response = requests.get(link)
game_soup = BeautifulSoup(game_response.content, "html.parser")
title = game_soup.findChildren('h1')[0].text.strip()
text = ''
content = game_soup.find_all(class_="mainContent")[0]
content_parts = map(content_manipulation, filter(content_filter, content.findChildren()))
for text_item in content_parts:
text = text + text_item
with open(f"card-games/{kabobify(title)}.txt", mode='w') as card_game:
csv_writer = csv.writer(card_game, delimiter=' ', quotechar='"', quoting=csv.QUOTE_MINIMAL)
csv_writer.writerow([f"<title>{title}</title>", text])