forked from commoncrawl/cc-mrjob
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathserver_analysis.py
29 lines (21 loc) · 885 Bytes
/
server_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import json
from mrcc import CCJob
class ServerAnalysis(CCJob):
def process_record(self, record):
# We're only interested in the JSON responses
if record['Content-Type'] != 'application/json':
return
payload = record.payload.read()
data = json.loads(payload)
# Only interested in 'response, skip the 'metadata' and 'request' entries
if data['Envelope']['WARC-Header-Metadata']['WARC-Type'] != 'response':
return
try:
server = data['Envelope']['Payload-Metadata']['HTTP-Response-Metadata']['Headers']['Server']
yield server, 1
self.increment_counter('commoncrawl', 'processed_server_headers', 1)
except KeyError:
pass
self.increment_counter('commoncrawl', 'processed_pages', 1)
if __name__ == '__main__':
ServerAnalysis.run()