-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathunum_t.pl
executable file
·3271 lines (3110 loc) · 105 KB
/
unum_t.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#! /usr/bin/perl -CA
# ^^^ This allows Unicode command-line arguments to be
# accepted if the underlying system supports it.
# If it causes an error, your version of Perl does
# not support this feature. You can remove the option
# and continue to use the program with all other forms
# of arguments.
use utf8;
=head1 NAME
unum - Interconvert numbers, Unicode, and HTML/XHTML characters
=head1 SYNOPSIS
B<unum> I<argument...>
=head1 DESCRIPTION
The B<unum> program is a command line utility which allows you to
convert decimal, octal, hexadecimal, and binary numbers; Unicode
character and block names; and HTML/XHTML character reference names and
numbers into one another. It can be used as an on-line special
character reference for Web authors.
=head2 Arguments
The command line may contain any number of the following
forms of I<argument>.
=over 10
=item S<>123
Decimal number.
=item S<>0371
Octal number preceded by a zero.
=item S<>0x1D351
Hexadecimal number preceded by C<0x>. Letters may be upper or
lower case, but the C<x> must be lower case.
=item S<>0b110101
Binary number.
=item b=I<block>
Unicode character blocks matching I<block> are listed.
The I<block> specification may be a regular expression.
For example, C<b=greek> lists all Greek character blocks
in Unicode.
=item c=I<char>...
The Unicode character codes for the characters I<char>... are listed.
If the first character is not a decimal digit and the second not an
equal sign, the C<c=> may be omitted.
=item h=I<entity>
List all HTML/XHTML character references matching I<entity>, which may
be a regular expression. Matching is case-insensitive, so C<h=alpha>
finds both C<&Alpha;> and C<&alpha;>. If the reference is composed of
multiple Unicode code points, the components are printed after the name
of the composed character reference.
=item '&#I<number>;&#xI<hexnum>;...'
List the characters corresponding to the specified HTML/XHTML
character entities, which may be given in either decimal or
hexadecimal. Note that the "x" in XHTML entities must be lower case.
On most Unix-like operating systems, you'll need to quote the argument
so the ampersand, octothorpe, and semicolon aren't interpreted by the
shell.
=item l=I<block>
List all Unicode blocks matching I<block> and all characters
within each block; C<l=goth> lists the C<Gothic> block
and the 32 characters it contains.
=item n=I<name>
List all Unicode characters whose names match I<name>, which may be
a regular expression. For example, C<n=telephone> finds the twelve
Unicode characters for telephone symbols.
=item utf8=I<number>
Treating the number (which may be specified as either decimal,
octal, hexadecimal, or binary, as for numeric arguments) as a
stream of from one to four bytes, decode the bytes as the
UTF-8 representation of a character. For example, the
specification "utf8=0xE298A2" decodes to Unicode code
point 0x2622, the radioactive sign.
=back
=head2 Options
=over 10
=item --nent
When showing an HTML character reference, always show its numerical
form (for example, &#8212;), even if it has a named character
reference.
=item --utf8
Show UTF-8 encoding of characters as a byte sequence in a
hexadecimal number. This is the same format as is accepted
by the utf8= argument. The option applies to the display of
all arguments which follow on the command line.
=back
=head2 Output
For number or character arguments, the value(s) are listed in
all of the input formats, save binary.
Octal Decimal Hex HTML Character Unicode
056 46 0x2E . "." FULL STOP
If the terminal font cannot display the character being listed,
the "Character" field will contain whatever default is shown in
such circumstances. Control characters are shown as a Perl
hexadecimal escape. If multiple HTML named character references
map to the same Unicode code point, all are shown separated by
commas.
Unicode blocks are listed as follows:
Start End Unicode Block
U+2460 - U+24FF Enclosed Alphanumerics
U+1D400 - U+1D7FF Mathematical Alphanumeric Symbols
=head1 VERSION
This is B<unum> version 3.6-15.1.0, released on October 21st, 2023.
The current version of this program is always posted at
http://www.fourmilab.ch/webtools/unum/.
=head1 AUTHOR
John Walker
http://www.fourmilab.ch/
=head1 BUGS
Specification of Unicode characters on the command line requires
an operating system and shell which support that feature and a
version of Perl with the B<-CA> command line option
(v5.8.5 has it, but v5.8.0 does not; I don't know in which
intermediate release it was introduced). If your version of
Perl does not implement this switch, you'll have to remove it
from the C<#!> statement at the top of the program, and Unicode
characters on the command line will not be interpreted correctly.
If you specify a regular expression, be sure to quote the argument
if it contains any characters the shell would otherwise interpret.
If you run B<perldoc> on the compressed version of the program,
a large amount of gibberish will be displayed after the end of the
embedded documentation. B<perldoc> gets confused by sequences in
the compressed data table and tries to interpret it as documentation.
This doesn't happen with the uncompressed version.
Please report any bugs to [email protected].
=head1 COPYRIGHT
This is free software; you can redistribute it and/or modify it under
the same terms as Perl itself.
=cut
use strict;
use warnings;
# Print the command-line help summary to standard output.
# Takes no arguments and returns the result of print(). The text is a
# double-quoted here document, so it must not contain unescaped "$" or
# "@" characters. The numeric-entity example is spelled with literal
# entity syntax so that it shows what the user actually has to type.
sub usage {
    print <<"EOD";
usage: unum arg...
Arguments:
147 Decimal number
0371 Octal number
0xfa75 Hexadecimal number (letters may be A-F or a-f)
0b11010011 Binary number
'&#8747;&#960;' One or more XHTML numeric entities (hex or decimal)
utf8=0xc397 Character encoded as UTF-8 byte stream (any number format)
xyz The characters xyz (non-digit)
c=7Y The characters 7Y (any Unicode characters)
b=cherokee List Unicode blocks containing "CHEROKEE" (b=. to list all)
h=alpha List XHTML entities containing "alpha" (h=. to list all)
n=aggravation Unicode characters with "AGGRAVATION" in the name
n=^greek.*rho Unicode characters beginning with "GREEK" and containing "RHO"
l=gothic List all characters in matching Unicode blocks
Options:
--nent Always show HTML character entities as numeric
--utf8 Show UTF-8 encoding of characters
All name queries are case-insensitive and accept regular
expressions. Be sure to quote regular expressions if they
contain characters with meaning to the shell.
Run perldoc on this program or visit:
http://www.fourmilab.ch/webtools/unum/
for additional information.
Version 3.6-15.1.0, October 2023
EOD
}
# File-level state shared by the subroutines below.
my $debug_decompress = 0;       # Set non-zero to trace table decompression

# Lookup tables; init_names() is called below to fill them in.
my @HTML_CHARACTER_REFERENCES;
my @HTML_COMPOSED_CHARACTER_REFERENCES;
my %UNICODE_NAMES;
my @UNICODE_BLOCKS;

# Everything we print may contain non-ASCII characters.
binmode(STDOUT, ":utf8");

# With no arguments at all there is nothing to look up: show the help
# text and leave before paying the cost of loading the tables.
unless (@ARGV) {
    usage();
    exit(0);
}

init_names();

my $utf8o = 0;                  # Show UTF-8 encoding ?
my $numHTMLent = 0;             # Always generate numeric HTML entities ?
my $chartitle = 0;              # Character column heading already printed ?
my $blocktitle = 0;             # Block column heading already printed ?
my $arg = 0;                    # Count of arguments taken off the command line
# Main loop: take arguments off @ARGV one at a time and dispatch on
# their form. Branches which print their own results end with "next";
# the numeric and utf8= branches leave a code point in $n, which is
# displayed by the showchar() call at the bottom of the loop.
while ($#ARGV >= 0) {
    my $n = shift();
    $arg++;

    # Process options
    if ($n eq "--utf8") {               # --utf8  Show UTF-8 encoding
        $utf8o = 1;
        next;
    } elsif ($n eq "--nent") {          # --nent  Always generate numeric HTML entities
        $numHTMLent = 1;
        next;
    }

=begin test_UTF8

    elsif ($n eq "--test8") {
        test_UTF8();
        next;
    }

=end test_UTF8

=cut

    if ($n =~ m/^\d/) {

        #   Number                  List numeric and character representations

        # Argument is a number: use oct() to convert to binary
        $n = oct($n) if ($n =~ m/^0/);

    } elsif ($n =~ m/^(b|l)=(.+)/) {

        #   b=<block name>          List Unicode blocks matching name
        #   l=<block name>          Also list every character in each block

        my $bl = $1;
        my $cpat = qr/$2/i;
        my $listall = $bl =~ m/l/i;

        my $k;
        for $k (@UNICODE_BLOCKS) {
            if ($k->[2] =~ m/$cpat/) {
                if (!$blocktitle) {
                    $chartitle = 0;
                    $blocktitle = 1;
                    print(" Start End Unicode Block\n");
                }
                printf("%8s - %8s %s\n",
                    sprintf("U+%04X", $k->[0]),
                    sprintf("U+%04X", $k->[1]),
                    $k->[2]);

                if ($listall) {
                    for (my $i = $k->[0]; $i <= $k->[1]; $i++) {
                        showchar($i);
                    }
                }
            }
        }
        next;

    } elsif ($n =~ m/^h=(.+)/) {

        #   h=<character name>      List HTML character entities matching name

        my $cpat = qr/$1/i;

        # Scan through the table of names and build a hash of all
        # the code points that matches map to. Then sort those
        # code points in ascending order and display them,
        # counting on showchar() to show all of the character
        # reference names which mapped from the code points
        # displayed.

        my %htmlCodePoints;
        for (my $i = 0; $i < scalar(@HTML_CHARACTER_REFERENCES); $i += 2) {
            if ($HTML_CHARACTER_REFERENCES[$i] =~ m/$cpat/) {
                $htmlCodePoints{$HTML_CHARACTER_REFERENCES[$i + 1]} = 1;
            }
        }

        my $k;
        for $k (sort {$a <=> $b} keys(%htmlCodePoints)) {
            showchar($k);
        }

        # Now we must scan through the table of composed character
        # references. These are logical characters which are made
        # up by combining multiple code points.

        for (my $i = 0; $i < scalar(@HTML_COMPOSED_CHARACTER_REFERENCES); $i += 2) {
            if ($HTML_COMPOSED_CHARACTER_REFERENCES[$i] =~ m/$cpat/) {
                my $hcp = $HTML_COMPOSED_CHARACTER_REFERENCES[$i + 1];
                print(" &$HTML_COMPOSED_CHARACTER_REFERENCES[$i]; =\n");
                # Force a fresh column heading for the components
                $chartitle = 0;
                while ($hcp =~ s/\s*(\d+)(?:,\s*)?//) {
                    $k = $1;
                    showchar($k);
                }
                $chartitle = 0;
            }
        }
        next;

    } elsif ($n =~ m/^n=(.+)/) {

        #   n=<character name>      List Unicode characters matching name

        my $cpat = qr/$1/i;

        # The following would be faster if we selected matching
        # characters into an auxiliary array and then sorted
        # the selected ones before printing. In fact, the time it
        # takes to sort the entire list is less than that consumed
        # in init_names() loading it, so there's little point bothering
        # with this refinement.
        my $k;
        for $k (sort {oct("0x$a") <=> oct("0x$b")} keys(%UNICODE_NAMES)) {
            if ($UNICODE_NAMES{$k} =~ m/$cpat/) {
                showchar(oct("0x$k"));
            }
        }
        next;

    } elsif ($n =~ m/^utf8=(.+)/) {

        #   utf8=<number>           UTF-8 character encoded as number

        my $u = $1;
        # Argument is a number: use oct() to convert to binary if leading 0
        $u = oct($u) if ($u =~ m/^0/);

        $n = decode_UTF8($u);

    } elsif ($n =~ m/^&#/) {

        #   '&#NNN;&#xNNNN;...'     One or more XHTML numeric entities

        # Each entity is converted to a plain decimal number and pushed
        # back on the argument list, to be displayed by the numeric
        # branch above. The value must be normalised here: a decimal
        # entity with leading zeros (&#065;) would otherwise be taken
        # for an octal number when it is re-parsed. Hexadecimal digits
        # are accepted only after the "x" prefix.
        my @htmlent;
        while ($n =~ s/&#(x[0-9A-Fa-f]+|[0-9]+);//) {
            my $hch = $1;
            if ($hch =~ s/^x//) {
                $hch = hex($hch);
            } else {
                $hch =~ s/^0+(?=[0-9])//;
            }
            push(@htmlent, $hch);
        }
        unshift(@ARGV, @htmlent);
        next;

    } else {

        #   =<char>... or c=<char>...   List code for one or more characters

        # If argument is an equal sign followed by a single
        # character, take the second character as the argument.
        # This allows treating digits as characters to be looked
        # up.
        $n =~ s/^c?=(.+)$/$1/i;

        while ($n =~ s/^(.)//) {
            showchar(ord($1));
        }
        next;
    }

    showchar($n);
}
# Print one table row describing code point $n: octal, decimal and
# hexadecimal values, HTML character reference(s), optionally the UTF-8
# encoding, the character itself, and its Unicode name. The column
# heading is printed first if it is not already showing.
sub showchar {
    my ($n) = @_;

    # Printable ASCII and everything above U+00A0 is shown as the
    # character itself; anything else is shown as a Perl hex escape.
    my $literal = (($n >= 32) && ($n < 128)) || ($n > 160);
    my $ch = $literal ? chr($n) : sprintf("\\x{%X}", $n);

    # Describe the code point: its name if we have one, otherwise the
    # block containing it, otherwise "Undefined".
    my $u = uname($n);
    unless (defined($u)) {
        my $blockname = ublock($n);
        $u = defined($blockname)
            ? sprintf("%s U+%05X", $blockname, $n)
            : sprintf("Undefined U+%05X", $n);
    }

    my $ut8l = $utf8o ? " UTF-8 " : "";

    if (!$chartitle) {
        $blocktitle = 0;
        $chartitle = 1;
        print(" Octal Decimal Hex HTML$ut8l Character Unicode\n");
    }

    # In HTML5 the mapping between named character references and
    # code points is many-to-many, so collect every name which maps
    # directly to this code point and show them all, comma separated,
    # in table order. Composed references, of which the code point is
    # merely one component, are not considered here.
    my $htmlcr;
    if (!$numHTMLent) {
        my @refs;
        for (my $slot = 0; $slot < scalar(@HTML_CHARACTER_REFERENCES); $slot += 2) {
            next unless $HTML_CHARACTER_REFERENCES[$slot + 1] == $n;
            push(@refs, "&" . $HTML_CHARACTER_REFERENCES[$slot] . ";");
        }
        $htmlcr = join(",", @refs) if @refs;
    }
    # No name found, or numeric form requested
    $htmlcr = sprintf("&#%d;", $n) if !$htmlcr;

    my $u8e = $utf8o
        ? sprintf(" %10s ", sprintf("0x%X", encode_UTF8($n)))
        : "";

    my @columns = (
        sprintf("0%lo", $n),
        $n,
        sprintf("0x%X", $n),
        $htmlcr,
        $u8e,
        sprintf("\"%s\"", $ch),
        $u,
    );
    printf("%8s %8d %8s %11s%s %-8s %s\n", @columns);
}
# Decode a stream of bytes, stored in an integer, into a
# single UTF-8 character. Leading zero bytes are
# ignored. The following validations are performed
# and warning messages issued in case of violations of
# the UTF-8 standard.
#
# 1. No extraneous bytes following UTF-8 character
# 2. No continuation code in first byte
# 3. All continuation bytes have 0b10 as the two
#    highest bits
# 4. No bytes forbidden or undefined in UTF-8
#    (0xC0, 0xC1, 0xF5-0xFF)
# 5. No "overlong" encoding of code points into
#    more bytes than necessary.
# 6. No missing bytes: the first byte must be followed
#    by as many continuation bytes as it announces.
#
# Violations of rules 2, 3, 4, and 6 are errors, and code
# point 0 is returned. Violations of rules 1 and 5 are
# reported, but the decoded code point is still returned.
#
# Argument: $u, the byte stream as an integer of up to 32 bits.
#
# The code point of the UTF-8 character is returned as
# an integer. Messages are printed to standard output.
#
# Test cases:
#   0x0         NULL
#   0x4B        LATIN CAPITAL LETTER K
#   0xC397      MULTIPLICATION SIGN
#   0xE298A2    RADIOACTIVE SIGN
#   0xF09F918C  OK HAND SIGN
sub decode_UTF8 {
    my ($u) = @_;

    # Now we run the gauntlet of our very exacting UTF-8
    # decoder. Many UTF-8 decoders are tolerant of
    # sloppiness, but we are not. That would compromise
    # our mission of accepting only well-formed input and
    # diagnosing errors.

    my $err = 0;
    my $n;

    # Split the integer into bytes, most significant first. Only
    # leading zero bytes are dropped: once the first byte has been
    # seen, a zero byte is kept so that it is diagnosed below as a
    # bad continuation byte instead of silently vanishing. The least
    # significant byte is always kept, so that 0 decodes to NULL.
    my @bytes;
    for (my $i = 0; $i < 4; $i++) {
        my $byte = ($u >> (8 * (3 - $i))) & 0xFF;
        if (($byte != 0) || scalar(@bytes) || ($i == 3)) {
            push(@bytes, $byte);
        }
    }

    # Verify that the first byte is not a continuation
    # code.
    if (($bytes[0] & 0b1100_0000) == 0b1000_0000) {
        printf("First byte is a continuation code " .
            "in UTF-8 code 0x%X\n", $u);
        $err++;
    }

    # If there is more than a single byte of UTF-8
    # code, validate that all continuation bytes
    # have the correct 0b10xx_xxxx high bits.
    if (scalar(@bytes) > 1) {
        for (my $i = 1; $i < scalar(@bytes); $i++) {
            if (($bytes[$i] & 0b1100_0000) != 0b1000_0000) {
                printf("Incorrect continuation code in byte $i " .
                    "of UTF-8 code 0x%X\n", $u);
                $err++;
            }
        }
    }

    # Verify that no byte contains a value forbidden in
    # a valid UTF-8 stream.
    for (my $i = 0; $i < scalar(@bytes); $i++) {
        my $byte = $bytes[$i];
        if (($byte == 0xC0) || ($byte == 0xC1) ||
            ($byte >= 0xF5)) {
            printf("Byte $i contains invalid UTF-8 code 0x%X\n", $byte);
            $err++;
        }
    }

    # Verify that the sequence is as long as its first byte says
    # it must be. Without this check the decoder below would read
    # bytes which do not exist. The check is meaningful only if
    # the first byte has passed the tests above.
    if ($err == 0) {
        my $lead = $bytes[0];
        my $expected = (($lead & 0b1000_0000) == 0)           ? 1 :
                       (($lead & 0b1110_0000) == 0b1100_0000) ? 2 :
                       (($lead & 0b1111_0000) == 0b1110_0000) ? 3 : 4;
        if (scalar(@bytes) < $expected) {
            printf("Truncated UTF-8 code 0x%X: %d bytes expected, %d specified\n",
                $u, $expected, scalar(@bytes));
            $err++;
        }
    }

    # Bail out on gross encoding errors. This avoids blundering
    # into undefined variable references and other horrors in
    # the following decoder.
    if ($err > 0) {
        printf("Erroneous UTF-8 encoding: returning code point 0\n");
        @bytes = ( 0 );
    }

    # Decode the bytes according to the length specified
    # by the high-order bits in the first byte.

    if (($bytes[0] & 0b1000_0000) == 0) {                   # Code points 0000 - 007F
        $n = $bytes[0];
        if (scalar(@bytes) > 1) {
            printf("Excess byte(s) in UTF-8 code 0x%X: 1 byte expected, %d specified\n",
                $u, scalar(@bytes));
        }

    } elsif (($bytes[0] & 0b1110_0000) == 0b1100_0000) {    # Code points 0080 - 07FF
        $n = (($bytes[0] & 0b1_1111) << 6) | ($bytes[1] & 0b11_1111);
        if (($bytes[0] & 0b1_1111) == 0) {
            printf("Overlong 2 byte UTF-8 code 0x%X for code point 0x%X\n", $u, $n);
        }
        if (scalar(@bytes) > 2) {
            printf("Excess byte(s) in UTF-8 code 0x%X: 2 bytes expected, %d specified\n",
                $u, scalar(@bytes));
        }

    } elsif (($bytes[0] & 0b1111_0000) == 0b1110_0000) {    # Code points 0800 - FFFF
        $n = (($bytes[0] & 0b1111) << 12) |
             (($bytes[1] & 0b11_1111) << 6) |
              ($bytes[2] & 0b11_1111);
        # The code point is below 0x800, and hence fits in two bytes,
        # when the four payload bits of the first byte and the top
        # payload bit (0x20) of the second byte are all zero. Bit 7
        # of a continuation byte is always set and says nothing.
        if ((($bytes[0] & 0b1111) == 0) &&
            (($bytes[1] & 0b0010_0000) == 0)) {
            printf("Overlong 3 byte UTF-8 code 0x%X for code point 0x%X\n", $u, $n);
        }
        if (scalar(@bytes) > 3) {
            printf("Excess byte(s) in UTF-8 code 0x%X: 3 bytes expected, %d specified\n",
                $u, scalar(@bytes));
        }

    } elsif (($bytes[0] & 0b1111_1000) == 0b1111_0000) {    # Code points 10000 - 10FFFF
        $n = (($bytes[0] & 0b0111) << 18) |
             (($bytes[1] & 0b11_1111) << 12) |
             (($bytes[2] & 0b11_1111) << 6) |
              ($bytes[3] & 0b11_1111);
        # Below 0x10000 when the first byte's payload and the top two
        # payload bits of the second byte are all zero.
        if ((($bytes[0] & 0b0111) == 0) &&
            (($bytes[1] & 0b0011_0000) == 0)) {
            printf("Overlong 4 byte UTF-8 code 0x%X for code point 0x%X\n", $u, $n);
        }
    }

    return $n;
}
# Pack the UTF-8 encoding of code point $n into an integer, with the
# first byte of the encoding in the most significant position. The
# result occupies from one to four bytes, depending on the size of
# the code point.
sub encode_UTF8 {
    my ($n) = @_;

    # One byte: the code point is its own encoding
    return $n if $n < 0x80;

    # Continuation byte carrying the six bits of $n which start at
    # the given bit position.
    my $cont = sub {
        my ($shift) = @_;
        return 0x80 | (($n >> $shift) & 0x3F);
    };

    # Two bytes: 110xxxxx 10xxxxxx
    if ($n < 0x800) {
        return ((0xC0 | ($n >> 6)) << 8) | $cont->(0);
    }

    # Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    if ($n < 0x10000) {
        return ((0xE0 | ($n >> 12)) << 16) |
               ($cont->(6) << 8) |
               $cont->(0);
    }

    # Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    return ((0xF0 | ($n >> 18)) << 24) |
           ($cont->(12) << 16) |
           ($cont->(6) << 8) |
           $cont->(0);
}
=begin test_UTF8
# Test UTF-8 encoding and decoding
sub test_UTF8 {
for (my $c = 0; $c <= 0x10FFFF; $c++) {
my $n = encode_UTF8($c);
my $u = decode_UTF8($n);
if ($c != $u) {
printf("UTF-8 encode/decode failure for code point 0x%X: encoded 0x%X decoded 0x%X\n",
$c, $n, $u);
}
}
}
=end test_UTF8
=cut
=pod
The Unicode character tables are based upon the Unicode 15.1.0
(2023) standard.
The control characters in this B<unum> version have been annotated
with their Unicode abbreviations, names, and for U+0000 to U+001F,
the Ctrl-key code which generates them.
The HTML named character references are from the World Wide Web
Consortium HTML standard. Some browsers may not support all of
these references.
=cut
# Return the name of the code point $code. Code points in the
# surrogate range (U+D800 - U+DFFF) and the private use range
# (U+E000 - U+F8FF) get fixed placeholder names. Everything else is
# looked up in %UNICODE_NAMES, whose keys are upper case hexadecimal
# padded to at least four digits. The result is undefined if the
# table has no entry for the code point.
sub uname {
    my ($code) = @_;

    return "<surrogate>" if ($code >= 0xD800) && ($code <= 0xDFFF);
    return "<private>"   if ($code >= 0xE000) && ($code <= 0xF8FF);

    my $key = sprintf("%04X", $code);
    return $UNICODE_NAMES{$key};
}
# Return the name of the Unicode block which contains code point
# $code, or undef if no block in @UNICODE_BLOCKS covers it. Each
# table entry is [first code point, last code point, block name].
sub ublock {
    my ($code) = @_;

    # XXX: a linear scan; a binary search could be used instead.
    foreach my $entry (@UNICODE_BLOCKS) {
        next if $code < $entry->[0];
        next if $code > $entry->[1];
        return $entry->[2];
    }

    # Deliberately an explicit undef, matching what callers have
    # always received in any context.
    return undef;
}
sub init_names {
# Pre-dimension array and hash bucket sizes to reduce overhead
# in dynamic allocation as they are built below.
$#UNICODE_BLOCKS = 328;
$#HTML_CHARACTER_REFERENCES = 2032;
$#HTML_COMPOSED_CHARACTER_REFERENCES = 93;
keys %UNICODE_NAMES = 149878;
# The following code allows us to build two versions of the program
# from the same template file. The table of Unicode code points
# is enormous (8.1 Mb as of Unicode 15.0.0), and we'd prefer not
# to carry it around within this program. We read the table from
# a __DATA__ block appended to the program. Following this can
# either be the table itself, appended from a separate file when
# the program is built, or the table compressed with bzip2,
# preceded by a single text line that says "COMPRESSED". If
# that sentinel is read, we switch the data stream to binary and
# feed it through bunzip, creating a temporary file. Then we
# read the temporary file to load the data. Otherwise, we
# continue to read and process the embedded uncompressed table.
my $l;
while ($l = <DATA>) {
if ($l =~ m/^COMPRESSED/) {
# The code point table is compressed. There are two ways
# we can approach uncompressing and loading it. The first
# is to use the system's bunzip2 utility, decompressing to
# a temporary file which we then read. The second is to use
# Perl's IO::Uncompress::Bunzip2, which is a core module in
# recent releases of Perl. The first approach will only work
# on Unix-like systems, while the second should work on any
# implementation of Perl which supports all of the core
# modules. The choice should be simple: use the module if
# it's present and otherwise fall back to the utility if
# we're on a system which provides it.
#
# As with most things, what should be simple is actually more
# complicated. The Perl module is very slow compared to the
# utility: almost four times slower. This results in a
# noticeable pause on each invocation of unum, due solely to
# the decompression of the table. There is no clean solution
# to this, so here's what I'm going to do. If the file is
# compressed, we test for the existence of an executable of
# bunzip2 in the library locations where it's likely to be
# found on Unix-like systems. If it's not found (which will
# be the case on legacy systems) and the IO::Uncompress::Bunzip2
# module exists, we use it, slow as it may be. Otherwise,
# we try using bunzip2, whether or not we found it. This
# will fail only if the system doesn't support the module
# and doesn't have an executable bunzip2. In all other cases,
# the most efficient available alternative will be used.
my $decomp_start = times() if $debug_decompress;
my $cmd_bunzip2 = (-x "/bin/bunzip2") || (-x "/usr/bin/bunzip2") ||
(-x "/usr/local/bin/bunzip2");
if ((!$cmd_bunzip2) && eval { require IO::Uncompress::Bunzip2; }) {
print(STDERR "Using IO::Uncompress::Bunzip2 module\n") if $debug_decompress;
my $bz = IO::Uncompress::Bunzip2->new(\*DATA);
while ($l = <$bz>) {
chop($l);
my ($code, $name) = split(' ', $l, 2);
$UNICODE_NAMES{$code} = $name;
}
} else {
print(STDERR "Using system bunzip2\n") if $debug_decompress;
use File::Temp qw(tempfile);
my ($fh, $filename) = tempfile("unumXXXXXX", DIR => "/tmp",
SUFFIX => ".tmp", UNLINK => 1);
{
local $/ = undef; # Set to read entire file at once
binmode(DATA);
open(UCHARS, "| bunzip2 -c >$filename") ||
die("Unable to open pipe to bunzip2 code point database");
print(UCHARS <DATA>);
close(UCHARS);
}
while ($l = <$fh>) {
chop($l);
my ($code, $name) = split(' ', $l, 2);
$UNICODE_NAMES{$code} = $name;
}
close($fh);
}
printf(STDERR "Decompression time: %.4f seconds.\n",
times() - $decomp_start) if $debug_decompress;
last;
} else {
# Code point table is uncompressed: read text directly
do {
chop($l);
my ($code, $name) = split(' ', $l, 2);
$UNICODE_NAMES{$code} = $name;
} while ($l = <DATA>);
last;
}
}
close(DATA);
# Remember to update $#UNICODE_BLOCKS pre-dimension above!
@UNICODE_BLOCKS = (
# start end block name
[0x0000, 0x007F => 'Basic Latin'],
[0x0080, 0x00FF => 'Latin-1 Supplement'],
[0x0100, 0x017F => 'Latin Extended-A'],
[0x0180, 0x024F => 'Latin Extended-B'],
[0x0250, 0x02AF => 'IPA Extensions'],
[0x02B0, 0x02FF => 'Spacing Modifier Letters'],
[0x0300, 0x036F => 'Combining Diacritical Marks'],
[0x0370, 0x03FF => 'Greek and Coptic'],
[0x0400, 0x04FF => 'Cyrillic'],
[0x0500, 0x052F => 'Cyrillic Supplement'],
[0x0530, 0x058F => 'Armenian'],
[0x0590, 0x05FF => 'Hebrew'],
[0x0600, 0x06FF => 'Arabic'],
[0x0700, 0x074F => 'Syriac'],
[0x0750, 0x077F => 'Arabic Supplement'],
[0x0780, 0x07BF => 'Thaana'],
[0x07C0, 0x07FF => 'NKo'],
[0x0800, 0x083F => 'Samaritan'],
[0x0840, 0x085F => 'Mandaic'],
[0x0860, 0x086F => 'Syriac Supplement'],
[0x0870, 0x089F => 'Arabic Extended-B'],
[0x08A0, 0x08FF => 'Arabic Extended-A'],
[0x0900, 0x097F => 'Devanagari'],
[0x0980, 0x09FF => 'Bengali'],
[0x0A00, 0x0A7F => 'Gurmukhi'],
[0x0A80, 0x0AFF => 'Gujarati'],
[0x0B00, 0x0B7F => 'Oriya'],
[0x0B80, 0x0BFF => 'Tamil'],
[0x0C00, 0x0C7F => 'Telugu'],
[0x0C80, 0x0CFF => 'Kannada'],
[0x0D00, 0x0D7F => 'Malayalam'],
[0x0D80, 0x0DFF => 'Sinhala'],
[0x0E00, 0x0E7F => 'Thai'],
[0x0E80, 0x0EFF => 'Lao'],
[0x0F00, 0x0FFF => 'Tibetan'],
[0x1000, 0x109F => 'Myanmar'],
[0x10A0, 0x10FF => 'Georgian'],
[0x1100, 0x11FF => 'Hangul Jamo'],
[0x1200, 0x137F => 'Ethiopic'],
[0x1380, 0x139F => 'Ethiopic Supplement'],
[0x13A0, 0x13FF => 'Cherokee'],
[0x1400, 0x167F => 'Unified Canadian Aboriginal Syllabics'],
[0x1680, 0x169F => 'Ogham'],
[0x16A0, 0x16FF => 'Runic'],
[0x1700, 0x171F => 'Tagalog'],
[0x1720, 0x173F => 'Hanunoo'],
[0x1740, 0x175F => 'Buhid'],
[0x1760, 0x177F => 'Tagbanwa'],
[0x1780, 0x17FF => 'Khmer'],
[0x1800, 0x18AF => 'Mongolian'],
[0x18B0, 0x18FF => 'Unified Canadian Aboriginal Syllabics Extended'],
[0x1900, 0x194F => 'Limbu'],
[0x1950, 0x197F => 'Tai Le'],
[0x1980, 0x19DF => 'New Tai Lue'],
[0x19E0, 0x19FF => 'Khmer Symbols'],
[0x1A00, 0x1A1F => 'Buginese'],
[0x1A20, 0x1AAF => 'Tai Tham'],
[0x1AB0, 0x1AFF => 'Combining Diacritical Marks Extended'],
[0x1B00, 0x1B7F => 'Balinese'],
[0x1B80, 0x1BBF => 'Sundanese'],
[0x1BC0, 0x1BFF => 'Batak'],
[0x1C00, 0x1C4F => 'Lepcha'],
[0x1C50, 0x1C7F => 'Ol Chiki'],
[0x1C80, 0x1C8F => 'Cyrillic Extended-C'],
[0x1C90, 0x1CBF => 'Georgian Extended'],
[0x1CC0, 0x1CCF => 'Sundanese Supplement'],
[0x1CD0, 0x1CFF => 'Vedic Extensions'],
[0x1D00, 0x1D7F => 'Phonetic Extensions'],
[0x1D80, 0x1DBF => 'Phonetic Extensions Supplement'],
[0x1DC0, 0x1DFF => 'Combining Diacritical Marks Supplement'],
[0x1E00, 0x1EFF => 'Latin Extended Additional'],
[0x1F00, 0x1FFF => 'Greek Extended'],
[0x2000, 0x206F => 'General Punctuation'],
[0x2070, 0x209F => 'Superscripts and Subscripts'],
[0x20A0, 0x20CF => 'Currency Symbols'],
[0x20D0, 0x20FF => 'Combining Diacritical Marks for Symbols'],
[0x2100, 0x214F => 'Letterlike Symbols'],
[0x2150, 0x218F => 'Number Forms'],
[0x2190, 0x21FF => 'Arrows'],
[0x2200, 0x22FF => 'Mathematical Operators'],
[0x2300, 0x23FF => 'Miscellaneous Technical'],
[0x2400, 0x243F => 'Control Pictures'],
[0x2440, 0x245F => 'Optical Character Recognition'],
[0x2460, 0x24FF => 'Enclosed Alphanumerics'],
[0x2500, 0x257F => 'Box Drawing'],
[0x2580, 0x259F => 'Block Elements'],
[0x25A0, 0x25FF => 'Geometric Shapes'],
[0x2600, 0x26FF => 'Miscellaneous Symbols'],
[0x2700, 0x27BF => 'Dingbats'],
[0x27C0, 0x27EF => 'Miscellaneous Mathematical Symbols-A'],
[0x27F0, 0x27FF => 'Supplemental Arrows-A'],
[0x2800, 0x28FF => 'Braille Patterns'],
[0x2900, 0x297F => 'Supplemental Arrows-B'],
[0x2980, 0x29FF => 'Miscellaneous Mathematical Symbols-B'],
[0x2A00, 0x2AFF => 'Supplemental Mathematical Operators'],
[0x2B00, 0x2BFF => 'Miscellaneous Symbols and Arrows'],
[0x2C00, 0x2C5F => 'Glagolitic'],
[0x2C60, 0x2C7F => 'Latin Extended-C'],
[0x2C80, 0x2CFF => 'Coptic'],
[0x2D00, 0x2D2F => 'Georgian Supplement'],
[0x2D30, 0x2D7F => 'Tifinagh'],
[0x2D80, 0x2DDF => 'Ethiopic Extended'],
[0x2DE0, 0x2DFF => 'Cyrillic Extended-A'],
[0x2E00, 0x2E7F => 'Supplemental Punctuation'],
[0x2E80, 0x2EFF => 'CJK Radicals Supplement'],
[0x2F00, 0x2FDF => 'Kangxi Radicals'],
[0x2FF0, 0x2FFF => 'Ideographic Description Characters'],
[0x3000, 0x303F => 'CJK Symbols and Punctuation'],
[0x3040, 0x309F => 'Hiragana'],
[0x30A0, 0x30FF => 'Katakana'],
[0x3100, 0x312F => 'Bopomofo'],
[0x3130, 0x318F => 'Hangul Compatibility Jamo'],
[0x3190, 0x319F => 'Kanbun'],
[0x31A0, 0x31BF => 'Bopomofo Extended'],
[0x31C0, 0x31EF => 'CJK Strokes'],
[0x31F0, 0x31FF => 'Katakana Phonetic Extensions'],
[0x3200, 0x32FF => 'Enclosed CJK Letters and Months'],
[0x3300, 0x33FF => 'CJK Compatibility'],
[0x3400, 0x4DBF => 'CJK Unified Ideographs Extension A'],
[0x4DC0, 0x4DFF => 'Yijing Hexagram Symbols'],
[0x4E00, 0x9FFF => 'CJK Unified Ideographs'],
[0xA000, 0xA48F => 'Yi Syllables'],
[0xA490, 0xA4CF => 'Yi Radicals'],
[0xA4D0, 0xA4FF => 'Lisu'],
[0xA500, 0xA63F => 'Vai'],
[0xA640, 0xA69F => 'Cyrillic Extended-B'],
[0xA6A0, 0xA6FF => 'Bamum'],
[0xA700, 0xA71F => 'Modifier Tone Letters'],
[0xA720, 0xA7FF => 'Latin Extended-D'],
[0xA800, 0xA82F => 'Syloti Nagri'],
[0xA830, 0xA83F => 'Common Indic Number Forms'],
[0xA840, 0xA87F => 'Phags-pa'],
[0xA880, 0xA8DF => 'Saurashtra'],
[0xA8E0, 0xA8FF => 'Devanagari Extended'],
[0xA900, 0xA92F => 'Kayah Li'],
[0xA930, 0xA95F => 'Rejang'],
[0xA960, 0xA97F => 'Hangul Jamo Extended-A'],
[0xA980, 0xA9DF => 'Javanese'],
[0xA9E0, 0xA9FF => 'Myanmar Extended-B'],
[0xAA00, 0xAA5F => 'Cham'],
[0xAA60, 0xAA7F => 'Myanmar Extended-A'],
[0xAA80, 0xAADF => 'Tai Viet'],
[0xAAE0, 0xAAFF => 'Meetei Mayek Extensions'],
[0xAB00, 0xAB2F => 'Ethiopic Extended-A'],
[0xAB30, 0xAB6F => 'Latin Extended-E'],
[0xAB70, 0xABBF => 'Cherokee Supplement'],
[0xABC0, 0xABFF => 'Meetei Mayek'],
[0xAC00, 0xD7AF => 'Hangul Syllables'],
[0xD7B0, 0xD7FF => 'Hangul Jamo Extended-B'],
[0xD800, 0xDB7F => 'High Surrogates'],
[0xDB80, 0xDBFF => 'High Private Use Surrogates'],
[0xDC00, 0xDFFF => 'Low Surrogates'],
[0xE000, 0xF8FF => 'Private Use Area'],
[0xF900, 0xFAFF => 'CJK Compatibility Ideographs'],
[0xFB00, 0xFB4F => 'Alphabetic Presentation Forms'],
[0xFB50, 0xFDFF => 'Arabic Presentation Forms-A'],
[0xFE00, 0xFE0F => 'Variation Selectors'],
[0xFE10, 0xFE1F => 'Vertical Forms'],
[0xFE20, 0xFE2F => 'Combining Half Marks'],
[0xFE30, 0xFE4F => 'CJK Compatibility Forms'],
[0xFE50, 0xFE6F => 'Small Form Variants'],
[0xFE70, 0xFEFF => 'Arabic Presentation Forms-B'],
[0xFF00, 0xFFEF => 'Halfwidth and Fullwidth Forms'],
[0xFFF0, 0xFFFF => 'Specials'],
[0x10000, 0x1007F => 'Linear B Syllabary'],
[0x10080, 0x100FF => 'Linear B Ideograms'],
[0x10100, 0x1013F => 'Aegean Numbers'],
[0x10140, 0x1018F => 'Ancient Greek Numbers'],
[0x10190, 0x101CF => 'Ancient Symbols'],
[0x101D0, 0x101FF => 'Phaistos Disc'],
[0x10280, 0x1029F => 'Lycian'],
[0x102A0, 0x102DF => 'Carian'],
[0x102E0, 0x102FF => 'Coptic Epact Numbers'],
[0x10300, 0x1032F => 'Old Italic'],
[0x10330, 0x1034F => 'Gothic'],
[0x10350, 0x1037F => 'Old Permic'],
[0x10380, 0x1039F => 'Ugaritic'],
[0x103A0, 0x103DF => 'Old Persian'],
[0x10400, 0x1044F => 'Deseret'],
[0x10450, 0x1047F => 'Shavian'],
[0x10480, 0x104AF => 'Osmanya'],
[0x104B0, 0x104FF => 'Osage'],
[0x10500, 0x1052F => 'Elbasan'],
[0x10530, 0x1056F => 'Caucasian Albanian'],
[0x10570, 0x105BF => 'Vithkuqi'],
[0x10600, 0x1077F => 'Linear A'],
[0x10780, 0x107BF => 'Latin Extended-F'],
[0x10800, 0x1083F => 'Cypriot Syllabary'],
[0x10840, 0x1085F => 'Imperial Aramaic'],
[0x10860, 0x1087F => 'Palmyrene'],
[0x10880, 0x108AF => 'Nabataean'],
[0x108E0, 0x108FF => 'Hatran'],
[0x10900, 0x1091F => 'Phoenician'],
[0x10920, 0x1093F => 'Lydian'],
[0x10980, 0x1099F => 'Meroitic Hieroglyphs'],
[0x109A0, 0x109FF => 'Meroitic Cursive'],
[0x10A00, 0x10A5F => 'Kharoshthi'],
[0x10A60, 0x10A7F => 'Old South Arabian'],
[0x10A80, 0x10A9F => 'Old North Arabian'],
[0x10AC0, 0x10AFF => 'Manichaean'],
[0x10B00, 0x10B3F => 'Avestan'],
[0x10B40, 0x10B5F => 'Inscriptional Parthian'],
[0x10B60, 0x10B7F => 'Inscriptional Pahlavi'],
[0x10B80, 0x10BAF => 'Psalter Pahlavi'],