-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathwordcount.py
82 lines (69 loc) · 2.5 KB
/
wordcount.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
'''
Computes the number of characters and keys in the translation file(s) provided
@author Ph4r05
'''
import re
import os
import sys
import argparse
import traceback
import datetime
import unicodedata
import codecs
import locale
# pip install chardet
import chardet
def computeFile(fname):
numChars = 0
numKeys = 0
keys=[]
text=[]
pattern = re.compile(ur"""[^\"“]*\s*[\"“](.*?)[\"“]\s*=\s*[\"“](.*?)[\"“]\s*;[\s\n\r\t]*""", re.UNICODE | re.IGNORECASE | re.DOTALL)
detection = chardet.detect(open(fname).read())
with codecs.open(fname, 'r', encoding=detection['encoding']) as f:
content = f.readlines()
for line in content:
line = line.strip().rstrip()
if len(line) == 0 or line.startswith("#"):
continue
match = pattern.match(line)
if match is None:
print u"Does not match!! [%s]" % line
continue
numKeys+=1
curKey = match.group(1)
curTxt = match.group(2)
numChars+=len(curTxt)
keys.append(curKey)
text.append(curTxt)
return (numChars, numKeys, keys, text, detection['encoding'])
# Main executable code
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Word count compute', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('file', metavar='file', nargs='+', help='file to compute')
parser.add_argument('--text', help='Dump only text values', default=0, type=int, required=False)
parser.add_argument('--keys', help='Dump only key values', default=0, type=int, required=False)
args = parser.parse_args()
if args.file is None or len(args.file) == 0:
parser.print_help()
sys.exit(-1)
dispText = args.text > 0
dispKeys = args.keys > 0
# Wrap sys.stdout into a StreamWriter to allow writing unicode.
sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout)
for curfile in args.file:
(numChars, numKeys, keys, text, encoding) = computeFile(curfile)
if not dispKeys and not dispText:
print "Character compute for file %s" % (curfile)
print " Char count: %s = %s NS" % (numChars, numChars/1800.0)
print " Number of keys: %s" % (numKeys)
print ""
if dispKeys:
for k in keys:
print k
if dispText:
print text[2]
for t in text:
print t