forked from commoncrawl/cc-mrjob
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathunique_server_analysis.py
53 lines (42 loc) · 2.04 KB
/
unique_server_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import json
from urlparse import urlparse
from mrcc import CCJob
from mrjob.step import MRStep
class ServerAnalysis(CCJob):
def process_record(self, record):
# We're only interested in the JSON responses
if record['Content-Type'] != 'application/json':
return
payload = record.payload.read()
data = json.loads(payload)
# Only interested in 'response, skip the 'metadata' and 'request' entries
if data['Envelope']['WARC-Header-Metadata']['WARC-Type'] != 'response':
return
try:
server = data['Envelope']['Payload-Metadata']['HTTP-Response-Metadata']['Headers']['Server']
url = data['Envelope']['WARC-Header-Metadata']['WARC-Target-URI']
# Extract the domain (aws.amazon.com) from the URL (http://aws.amazon.com/blogs/aws/ec2-maintenance/)
domain = urlparse(url).netloc
# Output the server and the domain
yield server, tuple([domain])
self.increment_counter('commoncrawl', 'processed_server_headers', 1)
except KeyError:
pass
self.increment_counter('commoncrawl', 'processed_pages', 1)
def reducer(self, key, value):
# Take a list of tuples of domains, combine them, find only the unique ones (using a set), return a new tuple
# (example: [('msn.com'), ('wikipedia.org', 'msn.com'), (...)])
# Note: tuple is required as they will be hashed during the MapReduce process
out_val = set(reduce(lambda x, y: x + y, value))
yield key, tuple(out_val)
def reducer_count_total(self, key, value):
# Return the total number of elements in value, which will only be one item in length
# If it was more than one in length, we would turn this into a for loop
yield key, len(value.next())
def steps(self):
return [
MRStep(mapper=self.mapper, combiner=self.reducer, reducer=self.reducer),
MRStep(reducer=self.reducer_count_total)
]
if __name__ == '__main__':
ServerAnalysis.run()