-
Notifications
You must be signed in to change notification settings - Fork 39
/
Copy pathgenerate-weightings.ts
128 lines (109 loc) · 4.06 KB
/
generate-weightings.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import fs from 'fs/promises';
import path from 'path';
import { XMLParser } from 'fast-xml-parser';
import sortKeys from 'sort-keys';
/**
 * Shape this script expects from a parsed WikiNews abstracts XML file.
 * Only the fields read below are described: a `feed` root holding a list of
 * `doc` entries, each with an `abstract` text.
 */
type WikiNewsFeed = {
  feed: {
    doc: Array<{
      abstract: string;
    }>;
  };
};
/*
Steps to add support for new subsets:
1. Identify source from WikiNews data:
- Previously sourced from mirror: https://wikidata.aerotechnet.com/
- Find folder `<lang>wikinews`, e.g. `enwikinews`
- Find latest date folder, e.g. `20231201`
2. Download & extract `abstracts` data file:
- Filename should be `<lang>wikinews-<date>-abstract.xml`, e.g. `enwikinews-20231201-abstract.xml`
3. Add extracted XML file to `abstracts` folder
4. Add subset name to `Subset` type below (see https://www.utf8icons.com/subsets for reference on Unicode Range names)
5. Add mapping to `languageToSubset` below, for resolving the language from the filename to the correct subset
6. Add Unicode Range to `unicodeRanges` below, for filtering characters to count their frequency
7. Run `pnpm dev` to generate the weightings, local dev refs and metrics
*/
/** Unicode subsets that weightings are generated for. */
type Subset = 'latin' | 'thai';

/** Resolves the two-letter language code taken from an abstracts filename to its subset. */
const languageToSubset: Record<string, Subset> = {
  en: 'latin',
  th: 'thai',
};

/**
 * Character ranges per subset, written as regex character-class fragments
 * (they are joined and wrapped in `[...]` to test individual characters).
 */
const unicodeRanges: Record<Subset, string[]> = {
  // latin and latin-1 supplement (excluding control characters);
  // the first range stops at U+007E because U+007F is the DELETE control character
  latin: ['\u0020-\u007E', '\u00A0-\u00FF'],
  // thai (excluding diacritic marks/combining characters)
  thai: ['\u0E01-\u0E30', '\u0E32-\u0E33', '\u0E3F-\u0E46', '\u0E4F-\u0E5B'],
};
/**
 * Expected name of an extracted abstracts file: `<lang>wikinews-<date>-abstract.xml`.
 * Capture group 1 is the two-letter language code. The pattern is anchored so
 * that un-extracted archives (e.g. `.xml.gz`) or otherwise decorated names are
 * rejected instead of being parsed as XML.
 */
const abstractFileNamePattern = /^([a-z]{2})wikinews-\d+-abstract\.xml$/;

/** End index passed to `slice` when sampling `doc` entries from each feed. */
const SAMPLE_SIZE = 5000;
/**
 * Builds the text sample for one feed: takes the sampled `doc` entries,
 * rewrites bracketed link markup in each abstract, drops abstracts that are
 * nothing but a wiki layout marker, and concatenates the rest.
 */
function extractSampleText(feed: WikiNewsFeed): string {
  return (
    feed.feed.doc
      // NOTE(review): the slice starts at index 1, so the first doc is skipped and
      // at most SAMPLE_SIZE - 1 abstracts are used — kept as-is, confirm intent
      .slice(1, SAMPLE_SIZE)
      // replace bracketed wiki link markup with its first capture group
      // (the text before any `|`)
      .map(({ abstract }) =>
        abstract.replace(/\[(.[^\|]+)(?:\|.[^\]]+)?\]/g, '$1'),
      )
      // remove abstracts that consist only of common wiki layout syntax
      .filter((a) => a !== '__NOTOC__' && a !== '----' && a !== '-----')
      .join('')
  );
}

/**
 * Counts how often each character of `text` that falls inside `ranges` occurs
 * and converts the counts to relative weightings rounded to 4 decimal places.
 *
 * @param text   Concatenated sample text.
 * @param ranges Regex character-class fragments describing the subset.
 * @returns Map of character to weighting; characters whose share of the raw
 *          total rounds to 0 are left out, and the remaining weightings are
 *          calculated against the total of the kept characters only.
 */
function calculateWeightings(
  text: string,
  ranges: string[],
): Record<string, number> {
  const inSubset = new RegExp(`[${ranges.join('')}]`);
  const counts: Record<string, number> = {};
  let rawTotal = 0;

  for (const char of text) {
    if (inSubset.test(char)) {
      counts[char] = (counts[char] ?? 0) + 1;
      rawTotal += 1;
    }
  }

  const toRatio = (count: number, total: number): number =>
    parseFloat((count / total).toFixed(4));

  // Filter out weightings that are zero at 4 decimal places
  const significant = Object.entries(counts).filter(
    ([, count]) => toRatio(count, rawTotal) > 0,
  );

  // Updated weightings depend on the filtered total, so it is summed first
  const filteredTotal = significant.reduce((sum, [, count]) => sum + count, 0);

  return Object.fromEntries(
    significant.map(([char, count]): [string, number] => [
      char,
      toRatio(count, filteredTotal),
    ]),
  );
}

/**
 * Script entry point: reads every file in the `abstracts` folder next to this
 * script, generates character weightings for the subset each file maps to,
 * and writes the result to `../src/weightings.ts`.
 *
 * Throws when a filename does not match `abstractFileNamePattern` or when its
 * language has no entry in `languageToSubset`. Any failure is logged and
 * results in a non-zero exit code.
 */
(async () => {
  const directoryPath = path.join(__dirname, 'abstracts');
  const filenames = await fs.readdir(directoryPath);
  const weightings: Record<string, Record<string, number>> = {};

  await Promise.all(
    filenames.map(async (filename) => {
      const languageMatch = filename.match(abstractFileNamePattern);
      if (!languageMatch) {
        throw new Error(
          `Filename did not match expected source pattern: ${filename}`,
        );
      }

      const language = languageMatch[1];
      const subset = languageToSubset[language];
      if (!subset) {
        throw new Error(`Could not find subset for language: ${language}`);
      }

      const content = await fs.readFile(
        path.join(directoryPath, filename),
        'utf-8',
      );

      // Keep tag values as strings: by default the parser converts text that
      // looks like a number or boolean, which would break `abstract.replace`
      const parser = new XMLParser({ parseTagValue: false });
      const json: WikiNewsFeed = parser.parse(content);

      // NOTE(review): if two languages map to the same subset, the one that
      // finishes last overwrites the other — confirm before adding such a mapping
      weightings[subset] = calculateWeightings(
        extractSampleText(json),
        unicodeRanges[subset],
      );
    }),
  );

  await fs.writeFile(
    path.join(__dirname, '../src/weightings.ts'),
    `export default ${JSON.stringify(sortKeys(weightings), null, 2)}\n`,
    'utf-8',
  );
})().catch((error: unknown) => {
  // Surface the failure explicitly instead of leaving the rejection unhandled
  console.error(error);
  process.exitCode = 1;
});