1package Encode::Alias;
2use strict;
3use warnings;
4our $VERSION = do { my @r = ( q$Revision: 2.25 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
5use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
6
7use Exporter 'import';
8
9# Public, encouraged API is exported by default
10
11our @EXPORT =
12  qw (
13  define_alias
14  find_alias
15);
16
17our @Alias;    # ordered matching list
18our %Alias;    # cached known aliases
19
20sub find_alias {
21    my $class = shift;
22    my $find  = shift;
23    unless ( exists $Alias{$find} ) {
24        $Alias{$find} = undef;    # Recursion guard
25        for ( my $i = 0 ; $i < @Alias ; $i += 2 ) {
26            my $alias = $Alias[$i];
27            my $val   = $Alias[ $i + 1 ];
28            my $new;
29            if ( ref($alias) eq 'Regexp' && $find =~ $alias ) {
30                DEBUG and warn "eval $val";
31                $new = eval $val;
32                DEBUG and $@ and warn "$val, $@";
33            }
34            elsif ( ref($alias) eq 'CODE' ) {
35                DEBUG and warn "$alias", "->", "($find)";
36                $new = $alias->($find);
37            }
38            elsif ( lc($find) eq lc($alias) ) {
39                $new = $val;
40            }
41            if ( defined($new) ) {
42                next if $new eq $find;    # avoid (direct) recursion on bugs
43                DEBUG and warn "$alias, $new";
44                my $enc =
45                  ( ref($new) ) ? $new : Encode::find_encoding($new);
46                if ($enc) {
47                    $Alias{$find} = $enc;
48                    last;
49                }
50            }
51        }
52
53        # case insensitive search when canonical is not in all lowercase
54        # RT ticket #7835
55        unless ( $Alias{$find} ) {
56            my $lcfind = lc($find);
57            for my $name ( keys %Encode::Encoding, keys %Encode::ExtModule )
58            {
59                $lcfind eq lc($name) or next;
60                $Alias{$find} = Encode::find_encoding($name);
61                DEBUG and warn "$find => $name";
62            }
63        }
64    }
65    if (DEBUG) {
66        my $name;
67        if ( my $e = $Alias{$find} ) {
68            $name = $e->name;
69        }
70        else {
71            $name = "";
72        }
73        warn "find_alias($class, $find)->name = $name";
74    }
75    return $Alias{$find};
76}
77
78sub define_alias {
79    while (@_) {
80        my $alias = shift;
81        my $name = shift;
82        unshift( @Alias, $alias => $name )    # newer one has precedence
83            if defined $alias;
84        if ( ref($alias) ) {
85
86            # clear %Alias cache to allow overrides
87            my @a = keys %Alias;
88            for my $k (@a) {
89                if ( ref($alias) eq 'Regexp' && $k =~ $alias ) {
90                    DEBUG and warn "delete \$Alias\{$k\}";
91                    delete $Alias{$k};
92                }
93                elsif ( ref($alias) eq 'CODE' && $alias->($k) ) {
94                    DEBUG and warn "delete \$Alias\{$k\}";
95                    delete $Alias{$k};
96                }
97            }
98        }
99        elsif (defined $alias) {
100            DEBUG and warn "delete \$Alias\{$alias\}";
101            delete $Alias{$alias};
102        }
103        elsif (DEBUG) {
104            require Carp;
105            Carp::croak("undef \$alias");
106        }
107    }
108}
109
110# HACK: Encode must be used after define_alias is declarated as Encode calls define_alias
111use Encode ();
112
113# Allow latin-1 style names as well
114# 0  1  2  3  4  5   6   7   8   9  10
115our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
116
117# Allow winlatin1 style names as well
118our %Winlatin2cp = (
119    'latin1'     => 1252,
120    'latin2'     => 1250,
121    'cyrillic'   => 1251,
122    'greek'      => 1253,
123    'turkish'    => 1254,
124    'hebrew'     => 1255,
125    'arabic'     => 1256,
126    'baltic'     => 1257,
127    'vietnamese' => 1258,
128);
129
130init_aliases();
131
132sub undef_aliases {
133    @Alias = ();
134    %Alias = ();
135}
136
137sub init_aliases {
138    undef_aliases();
139
140    # Try all-lower-case version should all else fails
141    define_alias( qr/^(.*)$/ => '"\L$1"' );
142
143    # UTF/UCS stuff
144    define_alias( qr/^(unicode-1-1-)?UTF-?7$/i     => '"UTF-7"' );
145    define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
146    define_alias(
147        qr/^UCS-?2-?(BE)?$/i    => '"UCS-2BE"',
148        qr/^UCS-?4-?(BE|LE|)?$/i => 'uc("UTF-32$1")',
149        qr/^iso-10646-1$/i      => '"UCS-2BE"'
150    );
151    define_alias(
152        qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"',
153        qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"',
154        qr/^UTF-?(16|32)$/i     => '"UTF-$1"',
155    );
156
157    # ASCII
158    define_alias( qr/^(?:US-?)ascii$/i       => '"ascii"' );
159    define_alias( 'C'                        => 'ascii' );
160    define_alias( qr/\b(?:ISO[-_]?)?646(?:[-_]?US)?$/i => '"ascii"' );
161
162    # Allow variants of iso-8859-1 etc.
163    define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
164
165    # ISO-8859-8-I => ISO-8859-8
166    # https://en.wikipedia.org/wiki/ISO-8859-8-I
167    define_alias( qr/\biso[-_]8859[-_]8[-_]I$/i => '"iso-8859-8"' );
168
169    # At least HP-UX has these.
170    define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
171
172    # More HP stuff.
173    define_alias(
174        qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i =>
175          '"${1}8"' );
176
177    # The Official name of ASCII.
178    define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
179
180    # This is a font issue, not an encoding issue.
181    # (The currency symbol of the Latin 1 upper half
182    #  has been redefined as the euro symbol.)
183    define_alias( qr/^(.+)\@euro$/i => '"$1"' );
184
185    define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i =>
186'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef'
187    );
188
189    define_alias(
190        qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
191             hebrew|arabic|baltic|vietnamese)$/ix =>
192          '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}'
193    );
194
195    # Common names for non-latin preferred MIME names
196    define_alias(
197        'ascii'    => 'US-ascii',
198        'cyrillic' => 'iso-8859-5',
199        'arabic'   => 'iso-8859-6',
200        'greek'    => 'iso-8859-7',
201        'hebrew'   => 'iso-8859-8',
202        'thai'     => 'iso-8859-11',
203    );
204    # RT #20781
205    define_alias(qr/\btis-?620\b/i  => '"iso-8859-11"');
206
207    # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
208    # And Microsoft has their own naming (again, surprisingly).
209    # And windows-* is registered in IANA!
210    define_alias(
211        qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' );
212
213    # Sometimes seen with a leading zero.
214    # define_alias( qr/\bcp037\b/i => '"cp37"');
215
216    # Mac Mappings
217    # predefined in *.ucm; unneeded
218    # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
219    define_alias( qr/^(?:x[_-])?mac[_-](.*)$/i => '"mac$1"' );
220    # http://rt.cpan.org/Ticket/Display.html?id=36326
221    define_alias( qr/^macintosh$/i => '"MacRoman"' );
222    # https://rt.cpan.org/Ticket/Display.html?id=78125
223    define_alias( qr/^macce$/i => '"MacCentralEurRoman"' );
224    # Ououououou. gone.  They are different!
225    # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
226
227    # Standardize on the dashed versions.
228    define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' );
229
230    unless ($Encode::ON_EBCDIC) {
231
232        # for Encode::CN
233        define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
234        define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
235
236        # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
237        # CP936 doesn't have vendor-addon for GBK, so they're identical.
238        define_alias( qr/^gbk$/i => '"cp936"' );
239
240        # This fixes gb2312 vs. euc-cn confusion, practically
241        define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );
242
243        # for Encode::JP
244        define_alias( qr/\bjis$/i         => '"7bit-jis"' );
245        define_alias( qr/\beuc.*jp$/i     => '"euc-jp"' );
246        define_alias( qr/\bjp.*euc$/i     => '"euc-jp"' );
247        define_alias( qr/\bujis$/i        => '"euc-jp"' );
248        define_alias( qr/\bshift.*jis$/i  => '"shiftjis"' );
249        define_alias( qr/\bsjis$/i        => '"shiftjis"' );
250        define_alias( qr/\bwindows-31j$/i => '"cp932"' );
251
252        # for Encode::KR
253        define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
254        define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
255
256        # This fixes ksc5601 vs. euc-kr confusion, practically
257        define_alias( qr/(?:x-)?uhc$/i         => '"cp949"' );
258        define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
259        define_alias( qr/\bks_c_5601-1987$/i   => '"cp949"' );
260
261        # for Encode::TW
262        define_alias( qr/\bbig-?5$/i              => '"big5-eten"' );
263        define_alias( qr/\bbig5-?et(?:en)?$/i     => '"big5-eten"' );
264        define_alias( qr/\btca[-_]?big5$/i        => '"big5-eten"' );
265        define_alias( qr/\bbig5-?hk(?:scs)?$/i    => '"big5-hkscs"' );
266        define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
267    }
268
269    # https://github.com/dankogai/p5-encode/issues/37
270    define_alias(qr/cp65000/i => '"UTF-7"');
271    define_alias(qr/cp65001/i => '"utf-8-strict"');
272
273    # utf8 is blessed :)
274    define_alias( qr/\bUTF-8$/i => '"utf-8-strict"' );
275
276    # At last, Map white space and _ to '-'
277    define_alias( qr/^([^\s_]+)[\s_]+([^\s_]*)$/i => '"$1-$2"' );
278}
279
2801;
281__END__
282
283# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
284# TODO: HP-UX '15' encodings japanese15 korean15 roi15
285# TODO: Cyrillic encoding ISO-IR-111 (useful?)
286# TODO: Armenian encoding ARMSCII-8
287# TODO: Hebrew encoding ISO-8859-8-1
288# TODO: Thai encoding TCVN
289# TODO: Vietnamese encodings VPS
290# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
291#       ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
292#       Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
293#       Kannada Khmer Korean Laotian Malayalam Mongolian
294#       Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
295
296=head1 NAME
297
298Encode::Alias - alias definitions to encodings
299
300=head1 SYNOPSIS
301
302  use Encode;
303  use Encode::Alias;
304  define_alias( "newName" => ENCODING);
305  define_alias( qr/.../ => ENCODING);
306  define_alias( sub { return ENCODING if ...; } );
307
308=head1 DESCRIPTION
309
310Allows newName to be used as an alias for ENCODING. ENCODING may be
311either the name of an encoding or an encoding object (as described
312in L<Encode>).
313
314Currently the first argument to define_alias() can be specified in the
315following ways:
316
317=over 4
318
319=item As a simple string.
320
321=item As a qr// compiled regular expression, e.g.:
322
323  define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
324
325In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
326in order to allow C<$1> etc. to be substituted.  The example is one
327way to alias names as used in X11 fonts to the MIME names for the
328iso-8859-* family.  Note the double quotes inside the single quotes.
329
330(or, you don't have to do this yourself because this example is predefined)
331
332If you are using a regex here, you have to use the quotes as shown or
333it won't work.  Also note that regex handling is tricky even for the
334experienced.  Use this feature with caution.
335
336=item As a code reference, e.g.:
337
338  define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
339
340The same effect as the example above in a different way.  The coderef
341takes the alias name as an argument and returns a canonical name on
342success or undef if not.  Note the second argument is ignored if provided.
343Use this with even more caution than the regex version.
344
345=back
346
347=head3 Changes in code reference aliasing
348
349As of Encode 1.87, the older form
350
351  define_alias( sub { return  /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
352
353no longer works.
354
355Encode up to 1.86 internally used "local $_" to implement this older
356form.  But consider the code below;
357
358  use Encode;
359  $_ = "eeeee" ;
360  while (/(e)/g) {
361    my $utf = decode('aliased-encoding-name', $1);
362    print "position:",pos,"\n";
363  }
364
365Prior to Encode 1.86 this fails because of "local $_".
366
367=head2 Alias overloading
368
369You can override predefined aliases by simply applying define_alias().
370The new alias is always evaluated first, and when necessary,
371define_alias() flushes the internal cache to make the new definition
372available.
373
374  # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
375  # superset of SHIFT_JIS
376
377  define_alias( qr/shift.*jis$/i  => '"cp932"' );
378  define_alias( qr/sjis$/i        => '"cp932"' );
379
380If you want to zap all predefined aliases, you can use
381
382  Encode::Alias->undef_aliases;
383
384to do so.  And
385
386  Encode::Alias->init_aliases;
387
388gets the factory settings back.
389
390Note that define_alias() will not be able to override the canonical name
391of encodings. Encodings are first looked up by canonical name before
392potential aliases are tried.
393
394=head1 SEE ALSO
395
396L<Encode>, L<Encode::Supported>
397
398=cut
399
400