-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalyze_age.rb
140 lines (120 loc) · 4.24 KB
/
analyze_age.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# Script setup: command-line arguments, output files and accumulators.
#
# Arguments:
#   ARGV[0] - corpus identifier; the text before the first "-" is the main
#             corpus, everything after it is the subcorpus.
#   ARGV[1] - variable name, used as a directory name under "variables".
corpus = ARGV[0]
variable = ARGV[1]
# Fail with a usage message instead of a NoMethodError on nil further down.
if corpus.nil? || corpus.empty? || variable.nil? || variable.empty?
  abort "Usage: ruby #{File.basename($PROGRAM_NAME)} CORPUS VARIABLE"
end

# NOTE: maincorpus and subcorpus are not referenced again in this script.
maincorpus, *subcorpus_parts = corpus.split("-")
subcorpus = subcorpus_parts.join("-")

# Directory holding both the per-year input tables and the output tables.
# Built with File.join (forward slashes work on Windows and on Unix-like
# systems); the trailing "" keeps the trailing separator that later
# "#{path}#{corpus}..." interpolations rely on.
path = File.join("variables", variable, "age_corr", "")

# Opens "<path><corpus><suffix>.tsv" for writing and emits its header row.
open_tsv = lambda do |suffix, header|
  handle = File.open("#{path}#{corpus}#{suffix}.tsv", "w:utf-8")
  handle.puts header
  handle
end

# One row per author and year.
o = open_tsv.call("", "author\tage\tv2rel\tyear\tbin\tbinlabel")
# One row per author and pair of successive recorded years.
o2 = open_tsv.call("_change", "author\tage\tchange\tstart_year\tbin\tbinlabel")
# Per-age mean and IQR of v2rel.
o_mean = open_tsv.call("_mean", "age\tv2rel\tiqr\tbin\tbinlabel")
# Per-age mean and IQR of the change values.
o2_mean = open_tsv.call("_change_mean", "age\tchange\tiqr\tbin\tbinlabel")
# One row per author with more than one recorded year.
o3 = open_tsv.call("_change_range", "author\tage_at_start\tspeed\trange\tbin\tbinlabel")

# Close every output explicitly when the script ends.
at_exit do
  [o, o2, o_mean, o2_mean, o3].each { |io| io.close unless io.closed? }
end

# author => { year => v2rel }
author_year = Hash.new { |hash, key| hash[key] = {} }
# age => [v2rel, ...]
age_v2rel = Hash.new { |hash, key| hash[key] = [] }
# age => [difference between successive v2rel values, ...]
age_change = Hash.new { |hash, key| hash[key] = [] }
# author => { year => age }
author_age = Hash.new { |hash, key| hash[key] = {} }
# author => true once a computed age outside 15..80 has been seen
blacklist = {}
# Maps an age to its age-group bin.
#
# @param age [Numeric] age to classify
# @return [Array] two elements: the bin number (1..6) and its label
def bin(age)
  # Exclusive upper bound, bin number, label; checked in ascending order.
  groups = [
    [20, 1, "<20"],
    [30, 2, "20--29"],
    [40, 3, "30--39"],
    [50, 4, "40--49"],
    [60, 5, "50--59"]
  ]
  match = groups.find { |upper, _number, _name| age < upper }
  # Anything not below 60 falls into the open-ended last bin.
  _upper, number, name = match || [nil, 6, "60+"]
  [number, name]
end
# Computes the mean and the population standard deviation (divisor is the
# number of values, not n - 1) of a collection of numbers.
#
# @param input [Hash, Array] the numbers; for a Hash only its values are used
# @param type [String] "hash" when input is a Hash, "array" when it is an Array
# @return [Array] two Floats: mean and standard deviation; both are NaN when
#   input is empty
# @raise [ArgumentError] if type is neither "hash" nor "array"
def stats(input, type)
  values =
    case type
    when "hash" then input.values
    when "array" then input
    else
      # Previously this fell through and failed later with a NoMethodError on nil.
      raise ArgumentError, "unsupported type #{type.inspect} (expected \"hash\" or \"array\")"
    end

  count = values.length
  # The 0.0 seed forces float arithmetic even for all-integer input.
  mean = values.sum(0.0) / count
  squared_deviations = values.sum(0.0) do |value|
    deviation = mean - value
    deviation * deviation
  end
  sd = Math.sqrt(squared_deviations / count)
  [mean, sd]
end
require "rinruby"
# Reads "<path><corpus>_<year>.tsv" for every year from 2003 to 2022 and
# fills author_year, author_age and age_v2rel, writing one row per accepted
# input row to the per-author output (o).
#
# Column use, as visible here: column 0 is the author key, column 1 is
# subtracted from the year to obtain the age (so presumably a birth year --
# TODO confirm), column 2 is parsed as the float v2rel value.
(2003..2022).each do |year|
  # Block form so each input file is closed as soon as it has been read.
  File.open("#{path}#{corpus}_#{year}.tsv", "r:utf-8") do |f|
    f.each_line.with_index do |line, index|
      # The first line is skipped (presumably a header row).
      next if index.zero?

      author, birth_year, v2rel = line.strip.split("\t")
      next if blacklist[author]

      age = year - birth_year.to_i
      if age < 15 || age > 80
        # An age outside 15..80 excludes the author from this and all later
        # years. NOTE(review): rows already recorded for earlier years stay
        # in the tables and in the output -- confirm this is intended.
        blacklist[author] = true
        next
      end

      author_year[author][year] = v2rel.to_f
      author_age[author][year] = age
      age_v2rel[age] << v2rel.to_f
      agebin, label = bin(age)
      # The v2rel column is echoed exactly as it appeared in the input.
      o.puts "#{author}\t#{age}\t#{v2rel}\t#{year}\t#{agebin}\t#{label}"
    end
  end
end
# For every author with more than one recorded year:
#  * write the difference between each pair of successive recorded v2rel
#    values to o2 and collect it in age_change under the author's age in the
#    later year of the pair;
#  * write one summary row to o3 with the difference between the last and
#    the first year's value divided by the number of years spanned
#    (inclusive), keyed by the author's age in the first year.
# Only the pairwise differences feed age_change; the summary value is
# written to o3 only.
author_year.each_pair do |author, yearhash|
  next unless yearhash.keys.length > 1

  ages = author_age[author]

  # Walk successive (year, value) entries in the hash's insertion order.
  yearhash.to_a.each_cons(2) do |(_, earlier), (year, later)|
    age = ages[year]
    age_change[age] << later - earlier
    agebin, label = bin(age)
    o2.puts "#{author}\t#{age}\t#{later - earlier}\t#{year}\t#{agebin}\t#{label}"
  end

  first_year, last_year = yearhash.keys.minmax
  span = last_year - first_year + 1
  ave_delta = (yearhash[last_year] - yearhash[first_year]) / span
  start_age = ages[first_year]
  agebin, label = bin(start_age)
  o3.puts "#{author}\t#{start_age}\t#{ave_delta}\t#{span}\t#{agebin}\t#{label}"
end
# Writes one row per age with the mean (from stats) and the interquartile
# range (computed by R's IQR through rinruby) of the collected values:
# first for the v2rel values (to o_mean), then for the change values
# (to o2_mean).
[[age_v2rel, o_mean], [age_change, o2_mean]].each do |values_by_age, out|
  values_by_age.each_pair do |age, array|
    # Hand the values to R under the name "array" and pull the IQR back.
    R.assign "array", array
    iqr = R.pull "IQR(array)"
    mean = stats(array, "array").first
    agebin, label = bin(age)
    out.puts "#{age}\t#{mean}\t#{iqr}\t#{agebin}\t#{label}"
  end
end