# -*- coding: utf-8 -*-

"""# Setup"""
# Import needed libraries
from google.colab import drive # to mount Google Drive to Colab notebook
import tweepy # Python wrapper around Twitter API
import json
import pandas as pd
import csv
from datetime import date, datetime
import time
import matplotlib.pyplot as plt
# Mounting Google Drive
drive.mount('/content/gdrive')
path = './gdrive/My Drive/datasets/twitter_analysis/'
"""# Twitter Data Collection
## Log into Twitter API
"""
# Load Twitter API secrets from an external file
secrets = json.loads(open(path + 'secrets.json').read())
consumer_key = secrets['consumer_key']
consumer_secret = secrets['consumer_secret']
access_token = secrets['access_token']
access_token_secret = secrets['access_token_secret']
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
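# Optional alternative (not used below, where a custom handler is defined
# instead): tweepy can also pause automatically when the rate limit is hit.
# api = tweepy.API(auth, wait_on_rate_limit=True)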
"""## Helper Functions
### Save JSON file
"""
# Helper function to save data into a JSON file
# file_name: the name of the data file you want to save on your Google Drive
# file_content: the data you want to save
def save_json(file_name, file_content):
    with open(path + file_name, 'w', encoding='utf-8') as f:
        json.dump(file_content, f, ensure_ascii=False, indent=4)
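# Example usage (hypothetical file name and payload, for illustration only):
# save_json('example.json', {'saved_on': str(date.today())})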
"""### Twitter API limit handling"""
# Helper function to handle the Twitter API rate limit
def limit_handled(cursor, list_name):
    while True:
        try:
            yield cursor.next()
        except StopIteration:
            # Cursor exhausted; end the generator cleanly (PEP 479 would
            # otherwise turn the StopIteration into a RuntimeError)
            return
        except tweepy.RateLimitError:
            print("\nCurrent number of data points in list = " + str(len(list_name)))
            print('Hit Twitter API rate limit.')
            for i in range(3, 0, -1):
                print("Wait for {} mins.".format(i * 5))
                time.sleep(5 * 60)
        except tweepy.error.TweepError:
            print('\nCaught TweepError exception')
# Update these for whatever tweet you want to process replies to
name = 'malkassabi'
tweet_id = '1212081182365622273'
replies = []
cursor = tweepy.Cursor(api.search, q='to:' + name, result_type='recent', timeout=999999).items(1000)
for tweet in cursor:
    if hasattr(tweet, 'in_reply_to_status_id_str'):
        if tweet.in_reply_to_status_id_str == tweet_id:
            replies.append(tweet)
# save_json('replies.json', replies)
with open(path + 'replies.csv', 'w', newline='') as f:
    csv_writer = csv.DictWriter(f, fieldnames=('user', 'text'))
    csv_writer.writeheader()
    for tweet in replies:
        row = {'user': tweet.user.screen_name,
               'text': tweet.text.encode('ascii', 'ignore').decode().replace('\n', ' ')}
        csv_writer.writerow(row)
len(replies)
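# Optional preview of what was collected (illustrative; assumes the search
# above returned at least a few replies):
# for reply in replies[:3]:
#     print(reply.user.screen_name, '->', reply.text[:80])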
"""## Data Collection Functions
### Get all tweets
"""
# Helper function to get all tweets for a specified user
# NOTE: This method only allows access to the most recent 3240 tweets
# Source: https://gist.github.com/yanofsky/5436496
def get_all_tweets(screen_name):
    # initialize a list to hold all the tweepy Tweets
    alltweets = []
    # make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)
    # save most recent tweets
    alltweets.extend(new_tweets)
    # save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1
    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        print("getting tweets before %s" % oldest)
        # all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)
        # save most recent tweets
        alltweets.extend(new_tweets)
        # update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1
        print("...%s tweets downloaded so far" % len(alltweets))
    # transform the tweepy tweets into a 2D array that will populate the csv
    outtweets = [[tweet.id_str, tweet.created_at, tweet.text, tweet.favorite_count,
                  tweet.in_reply_to_screen_name, tweet.retweeted] for tweet in alltweets]
    # write the csv
    with open(path + '%s_tweets.csv' % screen_name, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["id", "created_at", "text", "likes", "in reply to", "retweeted"])
        writer.writerows(outtweets)
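# Worked example of the max_id pagination above: if the oldest tweet fetched
# so far has id 1000, the next request passes max_id=999; because max_id is
# inclusive, this returns only tweets with id <= 999 and never re-fetches
# the ones already collected.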
"""### Get today's twitter stats"""
# Helper function to get today's numbers of followers and friends and store
# them into a JSON file
def todays_stats(dict_name):
info = api.me()
followers_cnt = info.followers_count
following_cnt = info.friends_count
today = date.today()
d = today.strftime("%b %d, %Y")
if d not in dict_name:
dict_name[d] = {"followers":followers_cnt, "following":following_cnt}
save_json("follower_history.json", dict_name)
else:
print('Today\'s stats already exist')
"""### Get followers data"""
# Helper function to load follower objects into a list and save it into
# a JSON file.
def get_followers():
    followers = []
    cursor = tweepy.Cursor(api.followers, count=200).pages()
    for i, page in enumerate(limit_handled(cursor, followers)):
        print("\r" + "Loading" + i % 5 * ".", end='')
        followers += page
    followers = [x._json for x in followers]
    save_json('followers_data.json', followers)
"""### Get friends data"""
# Helper function to load friend objects into a list and save it into
# a JSON file.
def get_friends():
    friends = []
    for i, page in enumerate(limit_handled(tweepy.Cursor(api.friends, count=200).pages(), friends)):
        print("\r" + "Loading" + i % 5 * ".", end='')
        friends += page
    friends = [x._json for x in friends]
    save_json('friends_data.json', friends)
"""## Data Collection Main Script"""
if __name__ == '__main__':
    # Pass in the username of the account you want to download
    get_all_tweets("alihilal94")
    with open(path + 'follower_history.json') as json_file:
        history = json.load(json_file)
    todays_stats(history)
    get_followers()
    get_friends()
"""# Analyzing tweets
## Load saved data from Google Drive
"""
# Load all saved tweets from @alihilal94
tweets = pd.read_csv(path + 'alihilal94_tweets.csv')
"""### Classify tweets"""
# Classify the type of each tweet (i.e. Tweet, Reply, or Retweet)
tweets['retweeted'] = tweets.text.apply(lambda x: 'RT @' in x)
tweets['in reply to'] = tweets['in reply to'].fillna('N/A')
tweets['type'] = tweets['in reply to'].apply(lambda x: 'Tweet' if x in ("alihilal94", 'N/A') else 'Reply')
tweets.loc[tweets.retweeted, 'type'] = 'Retweet'
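# Sanity check: after the assignments above, every tweet should carry exactly
# one of the three labels.
assert tweets['type'].isin(['Tweet', 'Reply', 'Retweet']).all()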
tweets.to_csv(path + 'alihilal94_tweets.csv')
# Find the number of tweets in each type
print('Total number of tweets = {}'.format(len(tweets)))
print('\nNumber of tweets by')
tweets.groupby('type').count().id
# Find the average number of likes per tweet type
tweets.groupby('type').mean().likes
# Find and print the top 10 tweets by number of likes
top10 = tweets.sort_values('likes', ascending=False).reset_index(drop=True).head(10)
for i in range(len(top10)):
    print("{}) At {} likes:".format(i + 1, top10.likes[i]))
    print(top10.text[i] + "\n")
# Find tweets with likes > 49 but < 100
# 49 = the average number of likes my tweets get
# 100 was arbitrarily chosen
tweets[((tweets.likes > 49) & (tweets.likes < 100))].sort_values('likes',ascending=False)
plot_data = tweets[tweets.type == "Tweet"][['id', 'created_at', 'likes']].copy()  # .copy() avoids SettingWithCopyWarning
plot_data['time'] = plot_data['created_at'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16,6))
fig.suptitle('Tweets vs. Likes Received')
ax1.scatter(plot_data.time, plot_data.likes)
ax2.scatter(plot_data.time, plot_data.likes)
left = date(2018, 10, 1)
right = date(2020, 1, 1)
ax2.set_xlim(left=left, right=right);
"""#Analyzing followers
##Load Saved data from Google Drive
"""
followers = pd.read_json(path + 'followers_data.json')  # Follower User objects
df = followers
df = df.fillna('Empty')
df = df.replace('','Empty')
# Create list of tags to identify each location
turkey_tags =['turkey', 'Türkiye', 'Ankara', 'İstanbul', 'deprem', 'istanbulhavalimanı', 'Kayseri']
saudi_tags = ['saudi', 'ksa', 'riyadh', 'jeddah', 'makkah', 'madina', 'dammam', 'khobar', 'السعودية', 'السعوديه', 'جدة', 'الرياض', 'الدمام', 'neom', 'eastern', 'riyad', 'riydah', 'med', 'jed',
'jubail', 'المدينة', 'ابها', 'مكة', 'sa', 'k.s.a', 'qatif', 'medina', 'tabuk', 'dhahran', 'abha', 'hail', 'qassim', 'mecca', 'ruh', 'buraydah', 'الخبر', 'الشرقية', 'الحجاز', 'بريدة', 'القصيم', 'المملكة العربية', 'جده', 'مكه',
'الحد_الجنوبي', 'الحرمين', 'نجد', 'الرّس', 'j town', 'جازان', 'unayzah', 'الخُبر', 'الظهران', 'تبوك', 'حائل', 'طيبة', 'المجمعة', 'yanbu', 'taif', 'baljurashi', 'الأحساء', 'jeedah', '966', 'suudi', 'أملج', 'اللهم احفظ بلادي', 'الجبيل',
'سعودية', 'ينبع', 'ryiadh', 'الطائف', 'jdh', 'Alkharj', 'طويق', 'الباحة', 'أم الملح', 'Al Badai', 'البدائع', 'الدرعيه', 'к.ѕ.α', 'المنوره', 'Abu Arish', 'Alkhober', 'rio', '🇸🇦', 'ام الملح', 'فوق هام السحب']
gulf_tags = ['uae', 'kuwait', 'qatar', 'yemen', 'oman', 'bahrain', 'اليمن', 'الإمارات', 'الكويت', 'قطر', 'البحرين', 'عمان', 'abu dhabi', 'q8', 'الجزيره',
'dubai', 'الامارات', 'مسقط', 'شبه الجزيرة', 'جزيرة العرب', 'al ain', 'kwt', 'u.a.e', 'حضرموت', 'alain', 'arabian island']
mena_tags = ['egypt', 'مصر', 'libya', 'ليبيا', 'iraq', 'العراق', 'sudan', 'syria', 'jordan', 'الأردن', 'morocco', 'tunisia',
'algeria', 'mauritania', 'palestine', 'فلسطين', 'cairo', 'gaza', 'الوطن العربي', 'Algiers', 'Marrakech', 'Mansoura', 'Algérie', 'amman', 'rafah', 'الاردن', 'لبنان', 'غزة', 'egy', 'eg', 'سوريا', 'حلب', 'الجزائر',
'khartoum', 'tunis', 'الإسكندرية', 'السودان', 'lebanon', 'irbid', 'Touggourt', 'ramallah', 'سطيف', 'عمّـان', 'الأسكندرية', 'العالم الإسلامي', 'بلد المليون شهيد', 'MENA']
us_tags = ['us', 'united', 'states', 'usa', 'أمريكا', 'los angeles', 'NY', 'IL', 'DC', 'boston', 'stanford', 'philadelphia', 'CT', 'new orleans', 'miami', 'u.s.a', 'omaha',
'new york', 'manhattan', 'mn', 'menlo park', 'wa', 'az', 'sf', 'tx', 'pa', 'or', 'portola valley', 'bay area', 'cambridge', 'va', 'fl', 'ga', 'lincoln', 'irvine', 'detroit', 'halifax',
'ohio', 'nm', 'تكساس', 'co', 'oh', 'nc', 'الامريكية', 'cypress', '92108', 'West Lafayette', 'Utah']
uk_tags = ['uk', 'london', 'leeds', 'new castle', 'manchester', 'liverpool', 'england', 'ireland', 'scotland', 'glasgow']
euro_tags = ['paris', 'france', 'spain', 'munich', 'italy', 'netherlands', 'nederland', 'Deutschland', 'germany', 'berlin', 'finland', 'switzerland', 'sweden', 'stuttgart', 'Malmö', 'Sverige', 'Rotterdam']
canada_tags =['canada', 'british columbia', 'ontario', 'Montréal', 'calgary']
# Create logical masks to clean up data
saudi = df.location.str.contains('|'.join(saudi_tags), case=False)
gulf = df.location.str.contains('|'.join(gulf_tags), case=False)
mena = df.location.str.contains('|'.join(mena_tags), case=False)
us = df.location.str.contains('|'.join(us_tags), case=False)
uk = df.location.str.contains('|'.join(uk_tags), case=False)
euro = df.location.str.contains('|'.join(euro_tags), case=False)
canada = df.location.str.contains('|'.join(canada_tags), case=False)
turkey = df.location.str.contains('|'.join(turkey_tags), case=False)
empty = df.location.str.contains('Empty')
other = ~(saudi | gulf | mena | us | uk | euro | canada | turkey | empty)
# Clean up data
df.loc[gulf, 'location'] = 'Gulf'
df.loc[mena, 'location'] = 'MENA'
df.loc[us, 'location'] = 'US'
df.loc[uk, 'location'] = 'UK'
df.loc[euro, 'location'] = 'Europe'
df.loc[canada, 'location'] = 'Canada'
df.loc[turkey, 'location'] = 'Turkey'
df.loc[saudi, 'location'] = 'Saudi Arabia'
df.loc[empty, 'location'] = 'Empty'
df.loc[other, 'location'] = 'Other'
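# Note: the location labels are applied in sequence, so a location matching
# several tag lists keeps the label of the *last* matching mask (Saudi Arabia
# wins overlaps here because it is assigned after the other regions).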
print('Number of unique locations before clean up: ' + str(len(followers.location.unique())))
print('Number of unique locations after clean up: ' + str(len(df.location.unique())))
print('Total number of followers: {}\n'.format(len(followers)))
print('Followers by')
df.groupby('location').count().sort_values('id',ascending=False).id
# List the unique locations under Other
ids = df[df.location == 'Other'].id
followers[followers.id.isin(ids)].location.unique()
print("Number of followers who have more than 30k followers: {}".format(len(df.loc[df.followers_count >= 3e4])))
print('Number of verified followers: {}'.format(len(df[df.verified == True])))
"""#Analyzing following
## Load Saved Data from Google Drive
"""
following = pd.read_json(path + 'friends_data.json')  # Friend User objects
df = following
df = df.fillna('Empty')
df = df.replace('','Empty')
# Create list of tags to categorize following
gov_url_tags =['.gov', 'neom', 'qiddiya', '.sa', '.com.sa', '.tr', '.state', '.iq', '.ae']
gov_bio_tags = ['وزير', 'نائب مدير', 'وكيل', 'وزارة', 'محافظ', 'هيئة', 'مدير', 'minster', 'bakan', 'member']
academic_url_tags =['.edu']
academic_bio_tags =['prof', 'professor', 'research', 'أستاذ']
vc_tags = ['fund', 'vc', 'partner', 'investor', 'venture', 'gp']
ai_tags = ['ai', 'ml', 'deep learning', 'machine learning', 'vision', 'keras', 'tensorflow', 'Data Scientist', 'Data Science', 'nlp', 'xgboost', 'علم البيانات']
product_tags = ['product', 'cpo', 'design', 'PM']
fitness_tags = ['fitness', 'muscle', 'nba']
ceo_tags = ['ceo', 'الرئيس التنفيذي']
magazine_tags = ['forbes', 'techcrunch', 'inc', 'nat geo', 'harvardbiz', 'engadget', 'freakonomics', 'tedtalks', 'reutersbiz', 'huffpost',
'wsj', 'business', 'theeconomist', 'bw', 'businessinsider', 'cnn', 'time', 'popmech', 'Entrepreneur', 'fastcompany', 'alarabiya']
writer_tags = ['writer', 'editor', 'author', 'journalist', 'skinny canadian', 'news', 'report', 'cover', 'كاتب', 'صحفي', 'أكتب', 'writing', 'correspondent']
energy_tags = ['energy', 'exxon', 'أرامكو']
startup_tags = ['founder', 'yc', 'ycw', 'co-founder', 'مؤسس']
programmer_tags = ['Python','Swift', 'Android','iOS','Java', 'developer', 'software', 'linux', 'hacker', 'هاكر', 'مطور', 'مبرمج', '.net', 'php', 'laravel', 'gnu', 'engineer', 'cto']
# Create logical masks to clean up data
# Based on URL
gov_url = df['url'].str.contains('|'.join(gov_url_tags), case=False)
academic_url = df['url'].str.contains('|'.join(academic_url_tags), case=False)
# Based on twitter profile bio
gov_bio = df['description'].str.contains('|'.join(gov_bio_tags), case=False)
academic_bio = df['description'].str.contains('|'.join(academic_bio_tags), case=False)
vc = df['description'].str.contains('|'.join(vc_tags), case=False)
ai = df['description'].str.contains('|'.join(ai_tags), case=False)
product = df['description'].str.contains('|'.join(product_tags), case=False)
writer = df['description'].str.contains('|'.join(writer_tags), case=False)
fitness = df['description'].str.contains('|'.join(fitness_tags), case=False)
energy = df['description'].str.contains('|'.join(energy_tags), case=False)
startup = df['description'].str.contains('|'.join(startup_tags), case=False)
ceo = df['description'].str.contains('|'.join(ceo_tags), case=False)
programmer = df['description'].str.contains('|'.join(programmer_tags), case=False)
# Based on twitter ID
magazine = df['screen_name'].str.contains('|'.join(magazine_tags), case=False)
# Based on a combination
gov = gov_url | gov_bio
academic = academic_url | academic_bio
media = writer | magazine
# Clean up data
df.loc[gov, 'Category'] = 'Gov'
df.loc[academic, 'Category'] = 'Academic'
df.loc[vc, 'Category'] = 'VC'
df.loc[ai, 'Category'] = 'AI'
df.loc[product, 'Category'] = 'Product'
df.loc[media, 'Category'] = 'Media (writer/magazine)'
df.loc[fitness, 'Category'] = 'Fitness'
df.loc[energy, 'Category'] = 'Energy'
df.loc[startup, 'Category'] = 'Startup'
df.loc[ceo, 'Category'] = 'CEO'
df.loc[programmer, 'Category'] = 'Programmer'
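# Note: the Category assignments above run in order, so a bio matching several
# tag lists keeps the *last* matching label (e.g. a founder whose bio also
# says 'cto' ends up as 'Programmer', since that mask is assigned last).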
df['Category'] = df['Category'].fillna('Other')  # users matching no mask
print('Total number of users I\'m following: {}\n'.format(len(following)))
print('Following by')
df.groupby('Category').count().id.sort_values(ascending=False)
# List the users under Other
ids = df[df.Category == 'Other'].id
following[following.id.isin(ids)][['screen_name','description']]
# Unfollow all users in the fitness category
fitness_df = df[df.Category == 'Fitness']
for _, row in fitness_df.iterrows():  # iterate over rows in the fitness data
    api.destroy_friendship(row['id'])  # use the Twitter API to unfollow based on id
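# Safer variant (optional): preview the accounts first before unfollowing.
# for _, row in fitness_df.iterrows():
#     print('Would unfollow:', row['screen_name'])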
"""# Analyzing History
## Load Saved Data from Google Drive
"""
history = pd.read_json(path + 'follower_history.json').transpose() # Follower history dictionary
dates = history.index.strftime('%b %d')
# Plot the number of my followers over time
plt.figure(figsize=(10, 6))
plt.bar(dates, history.followers, width=0.4)
plt.ylim(8000, 1e4)
plt.title('Followers over time in 2019')
# Plot the number of users I'm following over time
plt.figure(figsize=(10, 6))
plt.bar(dates, history.following, width=0.4)
plt.ylim(1.35e3, 1.48e3)
plt.title('Following over time in 2019')