-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathscraper.py
103 lines (74 loc) · 3.08 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import csv
import os
import google.oauth2.credentials
import pickle
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from google_auth_oauthlib.flow import InstalledAppFlow
CLIENT_SECRETS_FILE = "client_secret.json"
SCOPES = ['https://www.googleapis.com/auth/youtube.force-ssl']
API_SERVICE_NAME = 'youtube'
API_VERSION = 'v3'
def get_authenticated_service():
credentials = None
if os.path.exists('token.pickle'):
with open('token.pickle', 'rb') as token:
credentials = pickle.load(token)
if not credentials or not credentials.valid:
if credentials and credentials.expired and credentials.refresh_token:
credentials.refresh(Request())
else:
flow = InstalledAppFlow.from_client_secrets_file(
CLIENT_SECRETS_FILE, SCOPES)
credentials = flow.run_console()
with open('token.pickle', 'wb') as token:
pickle.dump(credentials, token)
return build(API_SERVICE_NAME, API_VERSION, credentials = credentials)
def get_video_comments(service, **kwargs):
comments = []
results = service.commentThreads().list(**kwargs).execute()
while results:
for item in results['items']:
comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
comments.append(comment)
if 'nextPageToken' in results:
kwargs['pageToken'] = results['nextPageToken']
results = service.commentThreads().list(**kwargs).execute()
else:
break
return comments
def write_to_csv(comments):
with open('comments.csv', 'w') as comments_file:
comments_writer = csv.writer(comments_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
comments_writer.writerow(['Video ID', 'Title', 'Comment'])
for row in comments:
comments_writer.writerow(list(row))
def get_videos(service, **kwargs):
final_results = []
results = service.search().list(**kwargs).execute()
i = 0
max_pages = 3
while results and i < max_pages:
final_results.extend(results['items'])
# Check if another page exists
if 'nextPageToken' in results:
kwargs['pageToken'] = results['nextPageToken']
results = service.search().list(**kwargs).execute()
i += 1
else:
break
return final_results
def search_videos_by_keyword(service, **kwargs):
results = get_videos(service, **kwargs)
final_result = []
for item in results:
title = item['snippet']['title']
video_id = item['id']['videoId']
comments = get_video_comments(service, part='snippet', videoId=video_id, textFormat='plainText')
final_result.extend([(video_id, title, comment) for comment in comments])
write_to_csv(final_result)
if __name__ == '__main__':
os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
service = get_authenticated_service()
keyword = input('Enter a keyword: ')
search_videos_by_keyword(service, q=keyword, part='id,snippet', eventType='completed', type='video')