-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathtfidf.py
69 lines (49 loc) · 2.12 KB
/
tfidf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import pickle, nltk
from stopwords import stopwords, is_stopword
from nltk import TextCollection, Text
from pprint import pprint
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
def valid(word):
tag = unigram_tagger.tag([word])[0][1]
return tag and tag.startswith("NN")
with open("l2.pickle") as f:
lyrics = pickle.load(f)
i = 0
def add_info(lyric, text_col, i=0):
def prep_word(w):
return w.lower()
a = dict(lyric)
a["lyrics"] = Text(w.lower() for w in nltk.word_tokenize(" ".join(a["lyrics"]).encode("utf-8")))
a["terms"] = set(w for w in a["lyrics"] if not is_stopword(w) and valid(w))
a["tf_idf"] = dict(sorted( ( (t, text_col.tf_idf(t, a["lyrics"])) for t in a["terms"] ), key=lambda x : x[1]) )
i += 1
print i
return a
ts = TextCollection([Text((w.lower()
for w in nltk.word_tokenize(" ".join(l["lyrics"]).encode('utf-8')) if not is_stopword(w) and valid(w) ) ,
name = l["id"])
for l in lyrics if l.get("id")])
#lyrics = map(lambda x : add_info(x[1], ts, x[0]), ((i, l) for i, l in enumerate(lyrics) if l.get("id")))
#with open("withtfidf.pickle", "w") as f:
# pickle.dump(lyrics, f)
with open("withtfidf.pickle", "r") as f:
lyrics = pickle.load(f)
lyrics = dict((l["id"], l) for l in lyrics)
def similar_lyrics(lyric):
for l in lyrics.values():
if l["lyrics"] != lyric["lyrics"]:
total_terms = lyric["terms"].union(l["terms"])
a, b = [lyric["tf_idf"].get(t) or 0 for t in total_terms], [l["tf_idf"].get(t) or 0 for t in total_terms]
yield nltk.cosine_distance(a, b), l["id"]
def give_me_similar(id):
p, s_id = min(sorted(list(similar_lyrics(lyrics[id]))))
return lyrics[s_id]["title"], p, s_id
def print_l(id):
print " ".join(lyrics[id]["lyrics"])
def most_similar(id):
return min(give_me_similar(id))
for v in lyrics.values()[:10]:
print v["id"], v["lyrics"]