-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathscrape_tokens.py
137 lines (111 loc) · 4.48 KB
/
scrape_tokens.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import concurrent.futures as cf
import json
import requests
import os
from bs4 import BeautifulSoup
from requests.exceptions import HTTPError
# Root of the Blood on the Clocktower wiki. The "{}" placeholder is filled
# with a site-relative path that begins with "/" (a character page path or an
# image "src" value).
WIKI_URL = "https://wiki.bloodontheclocktower.com{}"
def is_offical(entry):
    """Return whether the character described by ``entry`` is official.

    An official character has a wiki page for itself. The page path is the
    character's name with spaces replaced by underscores.

    Args:
        entry: Token record; only ``entry["name"]`` is read.

    Returns:
        True when the wiki serves the character page, False when the wiki
        answers with HTTP 404.

    Raises:
        requests.exceptions.HTTPError: For any error status other than 404;
            those are unintended and must not be mistaken for "unofficial".
        requests.exceptions.RequestException: For connection failures and
            timeouts.
    """
    name = entry["name"]
    character_url = "/" + name.replace(" ", "_")

    # A timeout keeps one stalled connection from blocking a worker forever.
    response = requests.get(WIKI_URL.format(character_url), timeout=30)

    # An unofficial character has no wiki page. Compare the numeric status
    # rather than the reason phrase: the phrase is free text chosen by the
    # server and may be empty or worded differently.
    if response.status_code == 404:
        return False

    # Other errors that may occur are unintended.
    response.raise_for_status()
    return True
def sync_description(entry):
    """Refresh ``entry["description"]`` from the character's wiki summary.

    Fetches the character page, reads the element that follows the "Summary"
    heading, and stores its text in ``entry`` when it differs from the
    current description. A line starting with ``UPDATE`` is printed for every
    change.

    Args:
        entry: Token record. ``entry["name"]`` is read and
            ``entry["description"]`` is read and possibly overwritten.

    Returns:
        None. ``entry`` is modified in place.

    Raises:
        requests.exceptions.HTTPError: If the wiki answers with an error
            status.
        requests.exceptions.RequestException: For connection failures and
            timeouts.
    """
    name = entry["name"]
    character_url = "/" + name.replace(" ", "_")

    # Step 1: Get the HTML for the character.
    # A timeout keeps one stalled connection from blocking a worker forever.
    response = requests.get(WIKI_URL.format(character_url), timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Step 2: Find the "Summary" element.
    summary_header = soup.find(id='Summary')
    if summary_header is None:
        # Report and skip instead of failing with an AttributeError below.
        print(f"No element with id 'Summary' found for {name}.")
        return

    # The wiki is set up in a very predictable manner:
    # <h2>
    #   <div id="Summary"> ... </div>
    # <h2>
    # \n
    # <p> "Each night, something bad happens."\n</p>
    # The first sibling after the heading is the bare newline, the second is
    # the paragraph holding the summary.
    heading = summary_header.parent
    separator = heading.next_sibling if heading is not None else None
    summary_element = separator.next_sibling if separator is not None else None
    if summary_element is None:
        print(f"No summary text found for {name}.")
        return

    # This gets the text and trims the quotes: one leading quote, and the
    # closing quote together with the trailing newline.
    summary = summary_element.text[1:-2]

    # Step 3: determine if an update is necessary.
    # .get() lets an entry that has no description yet receive one.
    if summary == entry.get("description"):
        return
    entry["description"] = summary
    print("UPDATE   " + (name + ": ").ljust(20) + summary)
def download_image(entry):
    """Download the wiki icon for ``entry`` into ``assets/icons/official``.

    The image URL is read from the wiki's ``File:Icon_<id>.png`` page. The
    file on disk is rewritten only when it is missing or its bytes differ
    from the downloaded image; a line starting with ``DOWNLOAD`` is printed
    for every write.

    Args:
        entry: Token record; ``entry["name"]`` and ``entry["id"]`` are read.

    Returns:
        None. When the file page lacks the expected markup a message is
        printed and nothing is written.

    Raises:
        requests.exceptions.HTTPError: If either request gets an error
            status.
        requests.exceptions.RequestException: For connection failures and
            timeouts.
    """
    icon_url = "https://wiki.bloodontheclocktower.com/File:Icon_{}.png"
    name = entry["name"]
    token_id = entry["id"]  # not named "id", to avoid shadowing the builtin

    # Step 1: Get HTML for the image page
    # A timeout keeps one stalled connection from blocking a worker forever.
    response = requests.get(icon_url.format(token_id), timeout=30)
    response.raise_for_status()  # Raise an error for bad responses

    # Step 2: Parse HTML to find the element with id "file"
    soup = BeautifulSoup(response.content, 'html.parser')
    file_element = soup.find(id='file')
    if file_element is None:
        print(f"No element with id 'file' found for ID {token_id}.")
        return

    # Step 3: Get the first <img> tag inside that element
    img_tag = file_element.find('img')
    if not img_tag or 'src' not in img_tag.attrs:
        print(f"No <img> found for ID {token_id}.")
        return

    # Step 4: Get the relative src url
    relative_url = img_tag['src']
    full_image_url = WIKI_URL.format(relative_url)

    # Step 5: Download the image
    img_response = requests.get(full_image_url, timeout=30)
    img_response.raise_for_status()

    # Step 6: Compare to existing image, and see if an edit is necessary.
    file_path = f"assets/icons/official/{token_id}.png"
    if os.path.exists(file_path):
        with open(file_path, "rb") as existing:
            if existing.read() == img_response.content:
                return

    # Save the image. Create the target directory first so a fresh checkout
    # without it does not fail with FileNotFoundError.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'wb') as img_file:
        img_file.write(img_response.content)
    print("DOWNLOAD " + (name + ": ").ljust(20) + f"{token_id}.png")
def check_type(entry, official_keys, homebrew_keys):
    """Append ``entry["id"]`` to exactly one of the two key lists.

    Args:
        entry: Token record; ``entry["id"]`` is read here and the whole
            record is passed to ``is_offical``.
        official_keys: List that receives the id when ``is_offical(entry)``
            is true.
        homebrew_keys: List that receives the id otherwise.

    Returns:
        None. The lists are modified in place.

    Raises:
        Whatever ``is_offical`` raises; in that case neither list is
        modified.
    """
    # Unofficial characters will break the parser if we try to search for
    # them, so they are kept apart from the official ones. The branches are
    # mutually exclusive: an id must never appear in both lists, and a
    # homebrew id must always be recorded.
    if is_offical(entry):
        official_keys.append(entry["id"])
    else:
        homebrew_keys.append(entry["id"])
def main():
    """Synchronise ``data/tokens.json`` and the official icons with the wiki.

    Steps:
      1. Load ``data/tokens.json``.
      2. Classify every entry with ``check_type`` on a thread pool.
      3. For every official id, run ``download_image`` and
         ``sync_description`` on a thread pool.
      4. Rewrite ``data/tokens.json`` with official ids first and the
         remaining ids after them, each group sorted.

    Raises:
        Any exception raised while classifying an entry. It is re-raised
        before the file is rewritten, because an unclassified entry would
        otherwise be missing from the rewritten file.
    """
    with open("data/tokens.json", encoding="utf-8") as f:
        data = json.load(f)

    official_keys = []
    homebrew_keys = []
    with cf.ThreadPoolExecutor(max_workers=16) as executor:
        classifiers = [
            executor.submit(check_type, entry, official_keys, homebrew_keys)
            for entry in data.values()
        ]
        for future in cf.as_completed(classifiers):
            # result() re-raises a worker's exception; cf.wait() alone would
            # hide it and the affected entry would silently vanish.
            future.result()

    with cf.ThreadPoolExecutor(max_workers=16) as executor:
        # NOTE(review): data is indexed by entry["id"] here, which assumes
        # every top-level key of tokens.json equals its entry's "id".
        jobs = {}
        for k in official_keys:
            jobs[executor.submit(download_image, data[k])] = ("download_image", k)
        for k in official_keys:
            jobs[executor.submit(sync_description, data[k])] = ("sync_description", k)
        for future in cf.as_completed(jobs):
            error = future.exception()
            if error is not None:
                # Scraping stays best-effort per entry, but a failure is
                # reported instead of being dropped without a trace.
                task, key = jobs[future]
                print(f"ERROR    {task}({key}): {error!r}")

    # The tokens.json file is intended to be sorted in alphabetical order.
    # However, unofficial characters MUST come after official characters.
    new_data = {k: data[k] for k in sorted(official_keys)}
    for k in sorted(homebrew_keys):
        # setdefault keeps an id that is present in both lists at its
        # official position.
        new_data.setdefault(k, data[k])

    # Put the data back in the box.
    with open("data/tokens.json", "w", encoding="utf-8") as f:
        json.dump(new_data, f, indent=4)


# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()