]> pere.pagekite.me Git - homepage.git/blob - linux/charmaps-check
Switched blog to hungry.com for now. Updated all links.
[homepage.git] / linux / charmaps-check
1 #!/store/bin/perl5 -w
2 #
3 # Author: Petter Reinholdtsen <pere@td.org.uit.no>
4 # Date: 1998-08-31
5 #
6 # Check GNU libc charmaps for consistent symbolic naming and Unicode
7 # encoding in comments.
8 #
9 # I run it like this: 'charmaps-check ISO_10646 ISO-IR-197' to
10 # check if ISO-IR-197 use the same names as ISO-10646 (Unicode)
11
12 use strict;
13 use vars qw($filename $error %unicodes %codes
14 $code_set_name $comment_char $escape_char);
15
16 $error = 0;
17 while ($filename = shift) {
18
19 # *** Default values ***
20 $escape_char = '\\';
21 $comment_char = '#';
22
23 open(CHARMAP, "<$filename") || warn "Unable to open $filename";
24 while (<CHARMAP>) {
25 if (/<code_set_name>\s*(.*)$/) {
26 $code_set_name = $1;
27 next;
28 }
29 if (/<comment_char>\s*(\S+)/) {
30 $comment_char = $1;
31 next;
32 }
33
34 if (/<escape_char>\s*(\S+)/) {
35 $escape_char = $1;
36 next;
37 }
38 parse_charmap() if (/^CHARMAP/);
39 }
40 close CHARMAP;
41 }
42 exit 1 if ($error);
43
44 # Convert hex, octal and decimal string values to numbers
45 # Must handle /x0f => 15 /x0f/x0f => 3855
46 sub parse_encoding {
47 my $input = shift;
48
49 my $base = 1;
50 my $value = 0;
51 while ( $input ) {
52 $value *= $base;
53
54 if ( $input =~ m/${escape_char}x([0-9a-fA-F]{2})/ ) {
55 my $hex = $1;
56 my $number = oct("0x".$hex);
57 $value += $number;
58 $input =~ s/${escape_char}x$hex//; # Remove this part of the string
59 $base = 16*16;
60 }
61 }
62
63 return $value;
64 }
65
66 # Receive param with this format, and return them as array
67 # <SO> /x0E <U000E> SHIFT OUT (SO)
68 # @retval = ('SO', 14, '<U000U> SHIFT OUT');
69 sub parse_charmap_line {
70 my $line = shift;
71
72 # Skip comments
73 return if ($comment_char && /^%comment_char/);
74
75 # Skip blank lines
76 return if ($line =~ /^\s*$/);
77
78 my ($code, $encoding, $comment) =
79 $line =~ m/^<(\S+)>\s+(\S+)\s+(.+)$/;
80 $encoding = parse_encoding($encoding);
81 return ($code, $encoding, $comment);
82 }
83
84 sub parse_charmap {
85 while (<CHARMAP>) {
86 if (/^END CHARMAP$/) {
87 $code_set_name = "[unknown]";
88 return;
89 }
90
91 chomp;
92 my $line = $_;
93
94 my ($code, $encoding, $comment) = parse_charmap_line($line);
95 my ($unicode, $name);
96
97 print "Unable to parse line: $line\n" if ( ! $comment );
98
99 if ( $comment =~ m/<U([0-9a-fA-Fx]{4})>\s+(.*)/ ) {
100 ($unicode, $name) = (oct("0x".$1), $2);
101 } else {
102 $unicode = $encoding;
103 $name = $comment;
104 }
105
106 if ($code) {
107
108 # ************ Check UNICODE value<->name *****************
109 # Don't test U0000, as it is named both 'NULL' and 'NUL'
110 if (0 != $unicode && exists $unicodes{$unicode}) {
111 unless ($unicodes{$unicode} eq $name) {
112 print "$filename\[$code_set_name\]: Same unicode ($unicode) but different name\n \"$name\"\n != \"$unicodes{$unicode}\"\n";
113 $error = 1;
114 }
115 } else {
116 $unicodes{$unicode} = $name;
117 }
118 # *********** Check UNICODE value<->code *****************
119 # XXX Hm, don't seem to work right. Don't handle multiple
120 # codes for the same name
121 if (exists $codes{$code}) {
122 unless ($codes{$code} eq $unicode) {
123 print "$filename\[$code_set_name\]: Same code ($code) but different unicode \"$unicode\" != \"$codes{$code}\"\n";
124 $error = 1;
125 }
126 } else {
127 $codes{$code} = $unicode;
128 }
129 }
130 }
131 }