3 # Author: Petter Reinholdtsen <pere@td.org.uit.no>
6 # Check GNU libc charmaps for consistent symbolic naming and Unicode
7 # encoding in comments.
9 # I run it like this: 'charmaps-check ISO_10646 ISO-IR-197' to
10 # check if ISO-IR-197 use the same names as ISO-10646 (Unicode)
13 use vars qw($filename $error %unicodes %codes
14 $code_set_name $comment_char $escape_char);
17 while ($filename = shift) {
19 # *** Default values ***
23 open(CHARMAP, "<$filename") || warn "Unable to open $filename";
25 if (/<code_set_name>\s*(.*)$/) {
29 if (/<comment_char>\s*(\S+)/) {
34 if (/<escape_char>\s*(\S+)/) {
38 parse_charmap() if (/^CHARMAP/);
44 # Convert hex, octal and decimal string values to numbers
45 # Must handle /x0f => 15 /x0f/x0f => 3855
54 if ( $input =~ m/${escape_char}x([0-9a-fA-F]{2})/ ) {
56 my $number = oct("0x".$hex);
58 $input =~ s/${escape_char}x$hex//; # Remove this part of the string
66 # Receive param with this format, and return them as array
67 # <SO> /x0E <U000E> SHIFT OUT (SO)
68 # @retval = ('SO', 14, '<U000U> SHIFT OUT');
69 sub parse_charmap_line {
73 return if ($comment_char && /^%comment_char/);
76 return if ($line =~ /^\s*$/);
78 my ($code, $encoding, $comment) =
79 $line =~ m/^<(\S+)>\s+(\S+)\s+(.+)$/;
80 $encoding = parse_encoding($encoding);
81 return ($code, $encoding, $comment);
86 if (/^END CHARMAP$/) {
87 $code_set_name = "[unknown]";
94 my ($code, $encoding, $comment) = parse_charmap_line($line);
97 print "Unable to parse line: $line\n" if ( ! $comment );
99 if ( $comment =~ m/<U([0-9a-fA-Fx]{4})>\s+(.*)/ ) {
100 ($unicode, $name) = (oct("0x".$1), $2);
102 $unicode = $encoding;
108 # ************ Check UNICODE value<->name *****************
109 # Don't test U0000, as it is named both 'NULL' and 'NUL'
110 if (0 != $unicode && exists $unicodes{$unicode}) {
111 unless ($unicodes{$unicode} eq $name) {
112 print "$filename\[$code_set_name\]: Same unicode ($unicode) but different name\n \"$name\"\n != \"$unicodes{$unicode}\"\n";
116 $unicodes{$unicode} = $name;
118 # *********** Check UNICODE value<->code *****************
119 # XXX Hm, don't seem to work right. Don't handle multiple
120 # codes for the same name
121 if (exists $codes{$code}) {
122 unless ($codes{$code} eq $unicode) {
123 print "$filename\[$code_set_name\]: Same code ($code) but different unicode \"$unicode\" != \"$codes{$code}\"\n";
127 $codes{$code} = $unicode;