-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcorpus_tools.rb
361 lines (321 loc) · 10.9 KB
/
corpus_tools.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
#PATH = "C:\\Sasha\\D\\DGU\\CassandraMy\\KorpApi\\"
# Directory prefix (including the trailing separator) that read_corpus_label
# and read_in_variable prepend to the names of the files they open.
PATH = "C:\\Sasha\\D\\DGU\\Repos\\Cassandra\\"
# NOTE(review): get_years_from_api uses URI, Net::HTTP and JSON, but the
# requires below are commented out -- presumably the script that loads this
# file requires them itself; confirm before running this file on its own.
#require 'uri'
#require 'net/http'
#require 'json'
# Returns every year covered by +corpus+ as an ascending array of integers.
#
# corpus  - a single corpus name, or (when +nolabel+ is false) a label that
#           read_corpus_label expands into one or more corpora.
# nolabel - true to treat +corpus+ as a single corpus name.
#
# Year spans come from get_years_from_file; any corpus missing there is looked
# up with get_years_from_api. For a label, the result runs from the earliest
# first year to the latest last year over all of its corpora.
def get_years(corpus, nolabel=false)
  if nolabel
    firstyear, lastyear = get_years_from_file(corpus, nolabel)
    if firstyear.nil? or lastyear.nil?
      firstyear, lastyear = get_years_from_api(corpus)
    end
  else
    cached = get_years_from_file(corpus, nolabel)
    corpora = read_corpus_label(corpus, "array")
    # One [first, last] pair per corpus, preferring the cached value.
    spans = corpora.map { |corpus1| cached[corpus1] || get_years_from_api(corpus1) }
    # The previous running comparison never updated its maximum, so the last
    # year of whichever corpus came last in the list won; take min/max instead.
    firstyear = spans.map { |span| span[0] }.min
    lastyear = spans.map { |span| span[1] }.max
  end
  (firstyear..lastyear).to_a
end
# Asks the Korp corpus_info endpoint for the first and last year of +corpus+,
# appends the result to years.tsv (tab-separated: corpus, first, last) and
# returns it.
#
# corpus - corpus name; it is upcased to find its entry in the response.
#
# Returns [firstyear, lastyear] as integers, taken from the first four
# characters of the "FirstDate" and "LastDate" fields.
# Raises RuntimeError when the HTTP request does not succeed.
def get_years_from_api(corpus)
  # URI.escape was removed in Ruby 3.0; RFC2396_Parser#escape is the same
  # escaping routine it used to delegate to.
  safe_uri = URI::RFC2396_Parser.new.escape("https://ws.spraakbanken.gu.se/ws/korp/v8/corpus_info?corpus=#{corpus}")
  safe_uri = safe_uri.gsub("+&+", "+%26+")
  res = Net::HTTP.get_response(URI(safe_uri))
  unless res.is_a?(Net::HTTPSuccess)
    raise "Korp corpus_info request for #{corpus} failed (HTTP #{res.code})"
  end
  # Parse the body directly instead of round-tripping it through temp.json.
  info = JSON.parse(res.body)["corpora"][corpus.upcase]["info"]
  firstyear = info["FirstDate"][0..3].to_i
  lastyear = info["LastDate"][0..3].to_i
  # Block form closes the cache file even if the write raises.
  File.open("years.tsv", "a:utf-8") do |f|
    f.puts "#{corpus}\t#{firstyear}\t#{lastyear}"
  end
  [firstyear, lastyear]
end
# Looks up cached year spans in years.tsv (tab-separated: corpus, first year,
# last year; the first line is skipped as a header).
#
# corpus  - a corpus name (nolabel true) or a label that read_corpus_label
#           expands into corpora (nolabel false).
# nolabel - selects the lookup mode and the shape of the return value.
#
# Returns [firstyear, lastyear] for the first matching row when +nolabel+ is
# true ([nil, nil] if there is none); otherwise a hash mapping each corpus of
# the label that has a row to [firstyear, lastyear] (a later row overrides an
# earlier one). A missing years.tsv is treated as an empty cache, so callers
# fall through to their API lookup instead of crashing.
def get_years_from_file(corpus, nolabel)
  wanted = nolabel ? [corpus] : read_corpus_label(corpus, "array")
  corphash = {}
  if File.exist?("years.tsv")
    # Block form guarantees the handle is closed, also on errors.
    File.open("years.tsv", "r:utf-8") do |f|
      f.each_line.with_index do |line, index|
        next if index == 0 # header row
        name, first, last = line.strip.split("\t")
        next unless wanted.include?(name)
        corphash[name] = [first.to_i, last.to_i]
        break if nolabel # a single corpus only needs its first row
      end
    end
  end
  return corphash unless nolabel
  corphash.fetch(corpus, [nil, nil])
end
# Deprecated hard-coded year table.
#
# maincorpus   - main corpus name, e.g. "flashback" or "press2".
# corpus       - the concrete (sub)corpus; defaults to +maincorpus+.
# yearlycorpus - when true and the main corpus is one of svt, gp, bloggmix,
#                press or webbnyheter, the year is read from the last four
#                characters of +corpus+.
#
# Returns an array of years. A main corpus that is not in the table has no
# start/finish and fails when the range is expanded.
def get_years_deprecated(maincorpus,corpus=maincorpus,yearlycorpus=false)
  if yearlycorpus && %w[svt gp bloggmix press webbnyheter].include?(maincorpus)
    return [corpus[-4..-1].to_i]
  end

  # Corpora that cover an explicit list of years rather than a span.
  case maincorpus
  when "press" then return [1965, 1995, 1996, 1997, 1998]
  when "press2" then return [1976]
  when "dn" then return [1987]
  end

  whole_corpus = (corpus == maincorpus)
  start, finish =
    case maincorpus
    when "flashback", "forum", "kubord" then [2000, 2021]
    when "familjeliv" then [2003, 2021]
    when "svt" then [2004, 2021]
    when "gp", "webbnyheter" then [2001, 2013]
    when "rd" then [2003, 2019]
    when "da" then [2007, 2021]
    when "news" then [2001, 2021]
    when "twitter"
      # Subcorpora carry their year after the first hyphen.
      whole_corpus ? [2006, 2019] : [corpus.split("-")[1].to_i] * 2
    when "bloggmix"
      # Subcorpora carry their year after the 8-character "bloggmix" prefix.
      whole_corpus ? [1998, 2017] : [corpus[8..-1].to_i] * 2
    end
  (start..finish).to_a
end
# Returns the lowercased part of +label+ that precedes its first hyphen.
def get_maincorpus_from_label(label)
  head, = label.split("-")
  head.downcase
end
# Maps a corpus name to its main corpus.
#
# Names starting (case-insensitively) with gp, bloggmix, press, dn or webb are
# recognised by prefix; "press" followed by "76" becomes "press2". Any other
# name yields the lowercased part before its first hyphen.
def get_maincorpus(corpus)
  lead = ->(length) { corpus[0, length].downcase }

  if lead.call(2) == "gp"
    "gp"
  elsif lead.call(8) == "bloggmix"
    "bloggmix"
  elsif lead.call(5) == "press"
    corpus[5..6] == "76" ? "press2" : "press"
  elsif lead.call(2) == "dn"
    "dn"
  elsif lead.call(4) == "webb"
    "webbnyheter"
  else
    corpus.split("-")[0].downcase
  end
end
# Returns the genre of +maincorpus+ ("socmedia", "official" or "news"), or nil
# when the main corpus is not listed.
def get_genre(maincorpus)#, getmain=true)
  case maincorpus
  when "familjeliv", "flashback", "twitter", "bloggmix"
    "socmedia"
  when "rd"
    "official"
  when "svt", "gp", "da", "podiet", "press", "press2", "dn", "webbnyheter"
    "news"
  end
end
# Returns the comma-separated list of structural attribute names associated
# with +maincorpus+, or nil when the main corpus is not listed.
def get_structs(maincorpus)
  #maincorpus = get_maincorpus(corpus)
  structs_by_corpus = {
    "familjeliv" => "text_date,text_username,text_id,thread_id",
    "flashback" => "text_date,text_username,text_id,thread_id,text_userid",
    "svt" => "text_date,text_authors,text_id,text_section",
    "gp" => "text_date,text_author,text_section,text_sectionshort,text_title",
    "da" => "text_date,text_author,text_id",
    "twitter" => "text_datetime,user_username,text_id",
    "bloggmix" => "text_date,text_title,text_url,blog_title,blog_age,blog_city",
    "press" => "text_date",
    "press2" => "text_year",
    "dn" => "text_date",
    "webbnyheter" => "text_date,text_url,text_newspaper"
  }
  structs_by_corpus[maincorpus]
end
# Expands a corpus label into the corpora listed for it in
# "#{PATH}subforum_labels.tsv".
#
# The file is tab-separated: label, comma-separated sub-corpora, mode. When
# the mode is "merge", every sub-corpus is prefixed with the part of
# +corpus_and_label+ that precedes its first hyphen. Only the first row whose
# label matches is used.
#
# corpus_and_label - label to look up.
# outputmode       - "array" for an array of names; anything else returns a
#                    comma-separated string.
#
# Returns the expanded corpora, or +corpus_and_label+ itself (wrapped in an
# array for "array" mode) when no row matches or the row lists no corpora.
def read_corpus_label(corpus_and_label, outputmode="string")
  maincorpus = corpus_and_label.split("-")[0]
  corpus = ""
  # Block form closes the label file; it used to be left open on every call.
  File.open("#{PATH}subforum_labels.tsv", "r:utf-8") do |labelfile|
    labelfile.each_line do |line|
      name, subcorpora, mode = line.strip.split("\t")
      next unless name == corpus_and_label
      prefix = mode == "merge" ? maincorpus : ""
      # to_s: a matching row without a sub-corpus column counts as "no corpora".
      corpus = subcorpora.to_s.split(",").map { |subcorp| "#{prefix}#{subcorp}" }.join(",")
      break
    end
  end

  if corpus.empty?
    outputmode == "array" ? [corpus_and_label] : corpus_and_label
  else
    outputmode == "array" ? corpus.split(",") : corpus
  end
end
# Reads the query variant(s) stored under a label in "#{PATH}#{source}" and
# splices +useradd+ into each of them just before its first "]".
#
# The file is scanned line by line until "__END__". After the line
# "#label = <varname>", the first "variant1 = ..." line (and, when
# +nvariants+ is greater than 1, the first "variant2 = ..." line) is taken.
#
# varname   - label to look for.
# useradd   - text inserted before the first "]" of each variant.
# nvariants - number of variants to read (1, or more than 1 for two).
# source    - file name relative to PATH.
#
# Returns [variant1, variant2]; variant2 is "" when nvariants is 1.
# Aborts the program when the label/variant is not found or a variant that
# has to be extended contains no "]".
def read_in_variable(varname, useradd, nvariants, source="korp_queries.rb")
  variant1 = ""
  variant2 = ""
  in_label = false
  found = false
  # Block form closes the file on every exit path, including abort.
  File.open("#{PATH}#{source}", "r:utf-8") do |f_input|
    f_input.each_line do |line|
      line1 = line.strip
      break if line1 == "__END__"
      if line1 == "#label = #{varname}"
        in_label = true
        next
      end
      next unless in_label
      key, *value = line1.split(" = ")
      if key == "variant1"
        variant1 = value.join(" = ")
        if nvariants == 1
          found = true
          break
        end
      elsif key == "variant2"
        variant2 = value.join(" = ")
        found = true
        break
      end
    end
  end
  if !found
    abort "Cassandra says: No variable matches this label, check korp_queries.rb"
  end

  add_user_part = lambda do |variant|
    bracket_index = variant.index("]")
    if bracket_index.nil?
      abort "Cassandra says: The variant for label #{varname} contains no \"]\", check #{source}"
    end
    # Exclusive range: an inclusive 0..bracket_index-1 turns into 0..-1 (the
    # whole string) when "]" is the very first character.
    "#{variant[0...bracket_index]}#{useradd}#{variant[bracket_index..-1]}"
  end

  variant1 = add_user_part.call(variant1)
  variant2 = add_user_part.call(variant2) if nvariants > 1
  return variant1, variant2
end
# Reads per-author frequencies from KorpApi/authors/<maincorpus>/all.tsv
# (tab-separated: author, frequency; the first line is skipped as a header).
#
# maincorpus      - main corpus name, used as the directory name.
# exclude_anonyms - when true, skips flashback rows with an empty author and
#                   familjeliv rows whose author contains "Anonym".
#
# Returns a hash (default value 0) mapping author to frequency as a Float.
def read_author_freqs(maincorpus, exclude_anonyms=true)
  hash = Hash.new(0)
  # File.join keeps the path usable on Windows and elsewhere.
  path = File.join("KorpApi", "authors", maincorpus, "all.tsv")
  # The file has to be opened: calling each_line on the path string only
  # iterated over the path text, so the result was always empty.
  File.open(path, "r:utf-8") do |f|
    f.each_line.with_index do |line, index|
      next if index == 0 # header row
      # chomp rather than strip: strip would eat the leading tab of a row with
      # an empty author and shift the frequency into the author column.
      author, freq = line.chomp.split("\t")
      next if author.nil? # blank line
      anonymous = (maincorpus == "flashback" && author == "") ||
                  (maincorpus == "familjeliv" && author.include?("Anonym"))
      next if exclude_anonyms && anonymous
      hash[author] = freq.to_f
    end
  end
  hash
end
# Encodes or decodes spaces in +argument+.
#
# direction - "code" replaces every space with "_SPACEENCODED_", "decode"
#             reverses that; any other value returns +argument+ unchanged.
def code_space(argument,direction)
  case direction
  when "code" then argument.gsub(" ", "_SPACEENCODED_")
  when "decode" then argument.gsub("_SPACEENCODED_", " ")
  else argument
  end
end