forked from aszhang95/123proj
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathentity.py
125 lines (101 loc) · 3.84 KB
/
entity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import json
import requests as req
import spacy
import re
from textblob.en.sentiments import PatternAnalyzer
import numpy as np
import jellyfish as jf
# Load the spaCy English pipeline once at import time; sentiment() calls it
# on every comment to extract named entities.
# NOTE(review): the 'en' shortcut name is presumably only resolvable by older
# spaCy releases -- confirm against the installed spaCy version.
nlp = spacy.load('en')
# Lowercase substrings that mark a Knowledge Graph description as political.
# Entries are tested with `in`, so stems such as 'democr' or 'senat' match
# several word forms. The empty string must never be listed here: '' is a
# substring of every description, which would make every entity count as
# political.
words = {
    'politic',
    'govern',
    'party',
    'secretary of',
    'united states',
    'president',
    'election',
    'democr',
    'republic',
    'senat',
    'representative',
    'congress',
    'court',
    'campaign',
}
# Knowledge Graph '@type' names (keys) and the spaCy entity labels (values)
# that are considered when looking for political entities.
types = {'Person': 'PERSON', 'Organization': 'ORG'}
def sentiment(comment, searches, subjectivity = False):
    """Score the sentiment of sentences mentioning people or organizations.

    Args:
        comment: Text to analyse.
        searches: Dict used as a running cache. It maps an entity's text to
            the ``(name, political_entity)`` pair produced by
            ``is_political`` and is updated in place, so passing the same
            dict to successive calls avoids repeated Knowledge Graph queries.
        subjectivity: When true, each polarity is multiplied by the
            subjectivity of the sentence.

    Returns:
        A list of ``(entity_text, name, political_entity, score)`` tuples,
        one for every PERSON/ORG entity whose lookup produced a name.
    """
    # jellyfish renamed jaro_winkler to jaro_winkler_similarity in later
    # releases; use whichever one the installed version provides.
    similarity = getattr(jf, 'jaro_winkler_similarity', None) or jf.jaro_winkler
    # One analyzer instance is enough; analyze() keeps no per-call state.
    analyzer = PatternAnalyzer()
    wanted_labels = set(types.values())

    outs = []
    for entity in nlp(comment).ents:
        if entity.label_ not in wanted_labels:
            continue

        mention = entity.text
        if mention not in searches:
            # Reuse the result of a near-identical cached spelling (typos)
            # before falling back to a network lookup.
            similar = next(
                (known for known in searches
                 if similarity(known, mention) > 0.9),
                None,
            )
            if similar is not None:
                searches[mention] = searches[similar]
            else:
                # Remember fresh lookups too; otherwise the cache never fills
                # and every repeated mention triggers another API request.
                searches[mention] = is_political(mention)
        iden, political_entity = searches[mention]

        if not iden:
            continue

        # Sentiment is computed on the sentence that contains the entity.
        analyzed = analyzer.analyze(str(entity.sent))
        if subjectivity:
            score = analyzed.polarity * analyzed.subjectivity
        else:
            score = analyzed.polarity
        outs.append((mention, iden, political_entity, score))
    return outs
def is_political(ent_text):
    """Look an entity up in the Google Knowledge Graph and judge if it is political.

    The top search results are scored by how many entries of the module-level
    ``words`` set occur in their description and name. Only results whose
    ``@type`` includes one of the keys of ``types`` are eligible; the eligible
    result with the highest score wins.

    Args:
        ent_text: Entity text to search for.

    Returns:
        A ``(name, political_entity)`` tuple. ``name`` is the name of the best
        political match, or of the first search result when nothing political
        was found, or ``None`` when no named result exists.
        ``political_entity`` is ``True`` only when a political match was found.
    """
    import os

    # NOTE(review): a credential committed to source control should be rotated
    # and removed. The environment variable takes precedence; the literal is
    # kept only so existing deployments keep working.
    api_key = os.environ.get(
        'GOOGLE_KG_API_KEY', 'AIzaSyAMSkyNxAUbhtlvfWOKGJAO8w1hbj2WXC0')
    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    # Strip punctuation before querying. Inside a character class the
    # parentheses are literal, so '(' and ')' are kept along with word
    # characters and whitespace.
    ent_text = re.sub(r"[^(\w\s)]", "", ent_text)
    params = [
        ('query', ent_text),
        ('limit', 5),
        ('indent', True),
        ('key', api_key),
    ]
    page = req.get(url=service_url, params=params)
    data = json.loads(page.text)

    # A response without 'itemListElement' simply has no results to score.
    items = data.get('itemListElement') or []

    out = None
    score = 0
    political_entity = False
    for item in items:
        result = item.get('result') if isinstance(item, dict) else None
        if not isinstance(result, dict):
            continue

        name = result.get('name')

        # Prefer the long article body, then the short description.
        detailed = result.get('detailedDescription')
        description = None
        if isinstance(detailed, dict):
            description = detailed.get('articleBody')
        if not isinstance(description, str):
            description = result.get('description')
        if not isinstance(description, str):
            description = ''
        if isinstance(name, str):
            description += ' ' + name

        # The keywords are lowercase, so compare against lowercased text.
        # Empty keywords are skipped because '' is a substring of everything.
        haystack = description.lower()
        political = sum(1 for word in words if word and word in haystack)

        # Evaluated from scratch for every item so a result without '@type'
        # neither raises nor inherits the previous item's value.
        entity_types = result.get('@type') or []
        if isinstance(entity_types, str):
            entity_types = [entity_types]
        right_type = any(t in types for t in entity_types)

        # score starts at 0, so political > score also means political != 0.
        if right_type and name and political > score:
            out = name
            score = political
            political_entity = True

    # out only has a value if a political entity was found; otherwise assume
    # the first search result is the intended entity.
    if not out:
        try:
            out = items[0]['result']['name']
        except (IndexError, KeyError, TypeError):
            # No results, or the first result has no name.
            pass
    return (out, political_entity)