-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfrequentWordFinder.py
149 lines (124 loc) · 4.05 KB
/
frequentWordFinder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""
Name: João Pedro Simões Alegria
NMEC: 85048
Name: Filipe Neto Pires
NMEC: 85122
Main script that counts the words of every book in a given folder.
Uses exact counting and count-min sketch estimation, and reports the sketch's error against the exact counts.
"""
import random
import math
import os
import sys
import re
import string
import getopt
from stop_words import get_stop_words
from langdetect import detect
from count_min_sketch import CountMinSketch
def countAll(wordCount, word):
    """Add one occurrence of ``word`` to the ``wordCount`` mapping.

    A word seen for the first time is stored with a count of 1; the mapping
    is modified in place and nothing is returned.
    """
    wordCount[word] = wordCount.get(word, 0) + 1
def detectLang(filename):
    """Guess the language of a text file by majority vote over its first lines.

    Each usable line is stripped of punctuation and underscores, lower-cased
    and passed to ``langdetect.detect``. At most ``maxTries`` successful
    detections are collected, and the most frequent result wins.

    Args:
        filename: path of the text file to inspect.

    Returns:
        The winning language code exactly as returned by ``detect``, or an
        empty string when no line could be classified (e.g. an empty file).
        On a tie, the language that was detected first is returned.
    """
    # Local import: only this function needs the exception type, and
    # langdetect is already a dependency of this file.
    from langdetect.lang_detect_exception import LangDetectException

    maxTries = 10
    votes = []
    # The context manager guarantees the handle is closed, even on error.
    with open(filename) as book:
        for line in book:
            # Stop before running another (costly) detection once we have
            # enough votes.
            if len(votes) >= maxTries:
                break
            cleaned = re.sub(r"[^\w\s]|_", " ", line.strip().lower()).strip()
            if not cleaned:
                # Blank lines, or lines made only of punctuation, carry no
                # signal and would make detect() raise.
                continue
            try:
                votes.append(detect(cleaned))
            except LangDetectException:
                # Raised when the text has no usable features (for example
                # digits only); such a line simply does not vote.
                continue

    if not votes:
        return ""
    # max() returns the first maximal element, so ties are resolved
    # deterministically in favour of the earliest detected language.
    return max(votes, key=votes.count)
def findFrequentWords(filename, outputfile, numHash, numColumns):
    """Count the words of one book exactly and with a count-min sketch.

    The book's language is detected first so that the matching stop words
    can be ignored. Every remaining token longer than one character is
    counted in a plain dictionary (exact count) and fed to a
    ``CountMinSketch``. A CSV report with one row per distinct word is then
    written to ``outputfile`` with the columns
    ``Key,All,Sketch,SketchMeanAbsEr,SketchMeanRelEr``.

    Args:
        filename: path of the text file to process.
        outputfile: path of the CSV file to create (overwritten if present).
        numHash: number of hash functions handed to ``CountMinSketch``.
        numColumns: number of columns handed to ``CountMinSketch``.

    Returns:
        None. The result is the CSV file written to ``outputfile``.
    """
    lang = detectLang(filename)
    # A set gives constant-time membership tests; the stop words are
    # consulted once for every token of the book.
    stopWords = set(get_stop_words(lang))

    allCount = {}
    # NOTE(review): argument order (columns, hashes) is kept as in the
    # original call - confirm against the CountMinSketch constructor.
    countMinSketch = CountMinSketch(numColumns, numHash)

    with open(filename) as book:
        for line in book:
            # Punctuation and underscores become separators.
            cleaned = re.sub(r"[^\w\s]|_", " ", line.lower())
            # split() breaks on any whitespace run, so words separated by
            # tabs are no longer glued together into a single token.
            for word in cleaned.split():
                if len(word) > 1 and word not in stopWords:
                    countAll(allCount, word)
                    countMinSketch.update(word)

    with open(outputfile, "w") as out:
        out.write(
            "Key,All,Sketch,SketchMeanAbsEr,SketchMeanRelEr\n")
        for key, exact in allCount.items():
            sketch = float(countMinSketch.query(key))
            # Signed difference between the sketch estimate and the exact
            # count, and the same difference as a percentage of the exact
            # count (exact >= 1 for every key, so no division by zero).
            sketchAbsEr = sketch - exact
            sketchRelEr = (sketchAbsEr / exact) * 100
            out.write("{},{},{},{},{}\n".format(
                key, exact, round(sketch, 2),
                round(sketchAbsEr, 2), round(sketchRelEr, 2)))
def main(argv):
    """Command-line entry point: process every book found in a folder.

    Args:
        argv: command-line arguments without the program name, i.e.
            ``[-h] [-o outputFolder] [-d numHash] [-m numColumns] inputFolder``.

    Exits with status 2 on an unknown option or a non-integer ``-d``/``-m``
    value, 3 after printing the help requested with ``-h`` and 1 when the
    input folder is missing. Otherwise one ``<name>Out.csv`` report is
    written to the output folder for each regular file of the input folder.
    """
    HELP = """
USAGE:
python3 frequentWordFinder.py [-h] [-o outputFolder] [-d numHash] [-m numColumns] <inputFolder>
ARGUMENTS:
inputFolder - folder where the books to be processed should be present
OPTIONS:
outputFolder - defines the folder where the resulting files should be stores(default -> out)
numHash - number of hash functions used in the count-min sketch strategy
numColumns - number of columns used in the count-min sketch strategy
"""
    try:
        # Parse the arguments that were actually handed to this function
        # rather than reaching back into sys.argv.
        opts, args = getopt.getopt(argv, "hd:m:o:")
    except getopt.GetoptError:
        print(HELP)
        sys.exit(2)

    outfolder = "out/"
    numHash = 5
    numColumns = 1000
    for o, a in opts:
        if o == "-o":
            outfolder = a
        elif o in ("-d", "-m"):
            try:
                value = int(a)
            except ValueError:
                # A non-numeric size is a usage error, not a crash.
                print(HELP)
                sys.exit(2)
            if o == "-d":
                numHash = value
            else:
                numColumns = value
        elif o == "-h":
            print(HELP)
            sys.exit(3)

    if len(args) < 1:
        print(HELP)
        sys.exit(1)

    infolder = args[0]
    os.makedirs(outfolder, exist_ok=True)

    # Sorted so that the books are processed in a reproducible order.
    for name in sorted(os.listdir(infolder)):
        # os.path.join works whether or not the folder was given with a
        # trailing separator.
        source = os.path.join(infolder, name)
        if not os.path.isfile(source):
            # Sub-directories cannot be opened as books; skip them.
            continue
        findFrequentWords(
            source, os.path.join(outfolder, name + "Out.csv"),
            numHash, numColumns)
if __name__ == "__main__":
    # Run as a script: hand over the command-line arguments, dropping the
    # program name in position 0.
    cli_args = sys.argv[1:]
    main(cli_args)