119370Spstpackage Unicode::Collate;
219370Spst
3130803Smarceluse 5.006;
4130803Smarceluse strict;
5130803Smarceluse warnings;
698944Sobrienuse Carp;
719370Spstuse File::Spec;
898944Sobrien
998944Sobrienno warnings 'utf8';
1098944Sobrien
1198944Sobrienour $VERSION = '1.31';
1219370Spstour $PACKAGE = __PACKAGE__;
1398944Sobrien
1498944Sobrien### begin XS only ###
1598944Sobrienuse XSLoader ();
1698944SobrienXSLoader::load('Unicode::Collate', $VERSION);
1719370Spst### end XS only ###
1898944Sobrien
1998944Sobrienmy @Path = qw(Unicode Collate);
2098944Sobrienmy $KeyFile = 'allkeys.txt';
2198944Sobrien
2219370Spst# Perl's boolean
2319370Spstuse constant TRUE  => 1;
2419370Spstuse constant FALSE => "";
2519370Spstuse constant NOMATCHPOS => -1;
2619370Spst
2798944Sobrien# A coderef to get combining class imported from Unicode::Normalize
2898944Sobrien# (i.e. \&Unicode::Normalize::getCombinClass).
2946283Sdfr# This is also used as a HAS_UNICODE_NORMALIZE flag.
3046283Sdfrmy $CVgetCombinClass;
3119370Spst
3219370Spst# Supported Levels
3319370Spstuse constant MinLevel => 1;
3419370Spstuse constant MaxLevel => 4;
3519370Spst
3619370Spst# Minimum weights at level 2 and 3, respectively
3719370Spstuse constant Min2Wt => 0x20;
3819370Spstuse constant Min3Wt => 0x02;
3919370Spst
4019370Spst# Shifted weight at 4th level
4119370Spstuse constant Shift4Wt => 0xFFFF;
4219370Spst
4319370Spst# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
4419370Spstuse constant VCE_TEMPLATE => 'Cn4';
4519370Spst
4619370Spst# A sort key: 16-bit weights
4798944Sobrienuse constant KEY_TEMPLATE => 'n*';
4898944Sobrien
4998944Sobrien# The tie-breaking: 32-bit weights
5098944Sobrienuse constant TIE_TEMPLATE => 'N*';
5119370Spst
5219370Spst# Level separator in a sort key:
5319370Spst# i.e. pack(KEY_TEMPLATE, 0)
5419370Spstuse constant LEVEL_SEP => "\0\0";
5598944Sobrien
5698944Sobrien# As Unicode code point separator for hash keys.
5798944Sobrien# A joined code point string (denoted by JCPS below)
5898944Sobrien# like "65;768" is used for internal processing
5998944Sobrien# instead of Perl's Unicode string like "\x41\x{300}",
6098944Sobrien# as the native code point is different from the Unicode code point
6198944Sobrien# on EBCDIC platform.
6298944Sobrien# This character must not be included in any stringified
6398944Sobrien# representation of an integer.
6498944Sobrienuse constant CODE_SEP => ';';
6598944Sobrien	# NOTE: in regex /;/ is used for $jcps!
6698944Sobrien
6798944Sobrien# boolean values of variable weights
6898944Sobrienuse constant NON_VAR => 0; # Non-Variable character
6998944Sobrienuse constant VAR     => 1; # Variable character
7098944Sobrien
7198944Sobrien# specific code points
7298944Sobrienuse constant Hangul_SIni   => 0xAC00;
7398944Sobrienuse constant Hangul_SFin   => 0xD7A3;
7498944Sobrien
7598944Sobrien# Logical_Order_Exception in PropList.txt
7698944Sobrienmy $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
7798944Sobrien
7898944Sobrien# for highestFFFF and minimalFFFE
7919370Spstmy $HighestVCE = pack(VCE_TEMPLATE, 0, 0xFFFE, 0x20, 0x5, 0xFFFF);
8098944Sobrienmy $minimalVCE = pack(VCE_TEMPLATE, 0,      1, 0x20, 0x5, 0xFFFE);
8119370Spst
8298944Sobriensub UCA_Version { '43' }
8398944Sobrien
8419370Spstsub Base_Unicode_Version { '13.0.0' }
8598944Sobrien
8698944Sobrien######
8798944Sobrien
8819370Spstmy $native_to_unicode = ($::IS_ASCII || $] < 5.008)
8998944Sobrien	? sub { return shift }
9098944Sobrien	: sub { utf8::native_to_unicode(shift) };
9198944Sobrien
9219370Spstmy $unicode_to_native = ($::IS_ASCII || $] < 5.008)
9398944Sobrien	? sub { return shift }
9419370Spst	: sub { utf8::unicode_to_native(shift) };
9598944Sobrien
9698944Sobrien# pack_U() should get Unicode code points.
9719370Spstsub pack_U {
9898944Sobrien    return pack('U*', map $unicode_to_native->($_), @_);
9998944Sobrien}
10098944Sobrien
10198944Sobrien# unpack_U() should return Unicode code points.
10298944Sobriensub unpack_U {
10319370Spst    return map $native_to_unicode->($_), unpack('U*', shift(@_).pack('U*'));
10498944Sobrien}
10598944Sobrien# for older perl version, pack('U*') generates empty string with utf8 flag.
10698944Sobrien
10798944Sobrien######
10819370Spst
10998944Sobrienmy (%VariableOK);
11098944Sobrien@VariableOK{ qw/
11198944Sobrien    blanked  non-ignorable  shifted  shift-trimmed
11298944Sobrien  / } = (); # keys lowercased
11319370Spst
114130803Smarcelour @ChangeOK = qw/
11598944Sobrien    alternate backwards level normalization rearrange
11619370Spst    katakana_before_hiragana upper_before_lower ignore_level2
117130803Smarcel    overrideCJK overrideHangul overrideOut preprocess UCA_Version
118130803Smarcel    hangul_terminator variable identical highestFFFF minimalFFFE
11998944Sobrien    long_contraction
12019370Spst  /;
12198944Sobrien
12298944Sobrienour @ChangeNG = qw/
12319370Spst    entry mapping table maxlength contraction
12498944Sobrien    ignoreChar ignoreName undefChar undefName rewrite
12598944Sobrien    versionTable alternateTable backwardsTable forwardsTable
12619370Spst    rearrangeTable variableTable
127130803Smarcel    derivCode normCode rearrangeHash backwardsFlag
128130803Smarcel    suppress suppressHash
12998944Sobrien    __useXS /; ### XS only
13019370Spst# The hash key 'ignored' was deleted at v 0.21.
131130803Smarcel# The hash key 'isShift' was deleted at v 0.23.
132130803Smarcel# The hash key 'combining' was deleted at v 0.24.
133130803Smarcel# The hash key 'entries' was deleted at v 0.30.
13498944Sobrien# The hash key 'L3_ignorable' was deleted at v 0.40.
13519370Spst
13698944Sobriensub version {
13798944Sobrien    my $self = shift;
13898944Sobrien    return $self->{versionTable} || 'unknown';
13998944Sobrien}
14098944Sobrien
14198944Sobrienmy (%ChangeOK, %ChangeNG);
14219370Spst@ChangeOK{ @ChangeOK } = ();
14398944Sobrien@ChangeNG{ @ChangeNG } = ();
14498944Sobrien
14598944Sobriensub change {
14619370Spst    my $self = shift;
14798944Sobrien    my %hash = @_;
14898944Sobrien    my %old;
14998944Sobrien    if (exists $hash{alternate}) {
15098944Sobrien	if (exists $hash{variable}) {
15198944Sobrien	    delete $hash{alternate};
15219370Spst	} else {
15398944Sobrien	    $hash{variable} = $hash{alternate};
15498944Sobrien	}
15519370Spst    }
15698944Sobrien    foreach my $k (keys %hash) {
15798944Sobrien	if (exists $ChangeOK{$k}) {
15898944Sobrien	    $old{$k} = $self->{$k};
15998944Sobrien	    $self->{$k} = $hash{$k};
16098944Sobrien	} elsif (exists $ChangeNG{$k}) {
16198944Sobrien	    croak "change of $k via change() is not allowed!";
16219370Spst	}
16398944Sobrien	# else => ignored
16498944Sobrien    }
16598944Sobrien    $self->checkCollator();
16698944Sobrien    return wantarray ? %old : $self;
16798944Sobrien}
16819370Spst
16998944Sobriensub _checkLevel {
17098944Sobrien    my $level = shift;
17198944Sobrien    my $key   = shift; # 'level' or 'backwards'
17298944Sobrien    MinLevel <= $level or croak sprintf
17319370Spst	"Illegal level %d (in value for key '%s') lower than %d.",
17498944Sobrien	    $level, $key, MinLevel;
17598944Sobrien    $level <= MaxLevel or croak sprintf
17698944Sobrien	"Unsupported level %d (in value for key '%s') higher than %d.",
17719370Spst	    $level, $key, MaxLevel;
17898944Sobrien}
17998944Sobrien
18098944Sobrienmy %DerivCode = (
18198944Sobrien    8 => \&_derivCE_8,
18298944Sobrien    9 => \&_derivCE_9,
18398944Sobrien   11 => \&_derivCE_9, # 11 == 9
18498944Sobrien   14 => \&_derivCE_14,
18519370Spst   16 => \&_derivCE_14, # 16 == 14
186130803Smarcel   18 => \&_derivCE_18,
187130803Smarcel   20 => \&_derivCE_20,
188130803Smarcel   22 => \&_derivCE_22,
189130803Smarcel   24 => \&_derivCE_24,
190130803Smarcel   26 => \&_derivCE_24, # 26 == 24
191130803Smarcel   28 => \&_derivCE_24, # 28 == 24
19298944Sobrien   30 => \&_derivCE_24, # 30 == 24
19398944Sobrien   32 => \&_derivCE_32,
19498944Sobrien   34 => \&_derivCE_34,
19598944Sobrien   36 => \&_derivCE_36,
19698944Sobrien   38 => \&_derivCE_38,
19798944Sobrien   40 => \&_derivCE_40,
19819370Spst   41 => \&_derivCE_40, # 41 == 40
19998944Sobrien   43 => \&_derivCE_43,
20098944Sobrien);
20198944Sobrien
20219370Spstsub checkCollator {
20398944Sobrien    my $self = shift;
20498944Sobrien    _checkLevel($self->{level}, 'level');
20598944Sobrien
20698944Sobrien    $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
20798944Sobrien	or croak "Illegal UCA version (passed $self->{UCA_Version}).";
20819370Spst
20998944Sobrien    $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
21098944Sobrien				$self->{alternateTable} || 'shifted';
21198944Sobrien    $self->{variable} = $self->{alternate} = lc($self->{variable});
21298944Sobrien    exists $VariableOK{ $self->{variable} }
21398944Sobrien	or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
21419370Spst
21598944Sobrien    if (! defined $self->{backwards}) {
21698944Sobrien	$self->{backwardsFlag} = 0;
21798944Sobrien    } elsif (! ref $self->{backwards}) {
21898944Sobrien	_checkLevel($self->{backwards}, 'backwards');
21998944Sobrien	$self->{backwardsFlag} = 1 << $self->{backwards};
22098944Sobrien    } else {
22198944Sobrien	my %level;
22298944Sobrien	$self->{backwardsFlag} = 0;
22398944Sobrien	for my $b (@{ $self->{backwards} }) {
22498944Sobrien	    _checkLevel($b, 'backwards');
22519370Spst	    $level{$b} = 1;
22698944Sobrien	}
22798944Sobrien	for my $v (sort keys %level) {
22898944Sobrien	    $self->{backwardsFlag} += 1 << $v;
22998944Sobrien	}
23019370Spst    }
23198944Sobrien
23298944Sobrien    defined $self->{rearrange} or $self->{rearrange} = [];
23398944Sobrien    ref $self->{rearrange}
23498944Sobrien	or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
23598944Sobrien
23619370Spst    # keys of $self->{rearrangeHash} are $self->{rearrange}.
23798944Sobrien    $self->{rearrangeHash} = undef;
23898944Sobrien
23919370Spst    if (@{ $self->{rearrange} }) {
24098944Sobrien	@{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
24198944Sobrien    }
24298944Sobrien
24398944Sobrien    $self->{normCode} = undef;
24498944Sobrien
24598944Sobrien    if (defined $self->{normalization}) {
24698944Sobrien	eval { require Unicode::Normalize };
24798944Sobrien	$@ and croak "Unicode::Normalize is required to normalize strings";
24898944Sobrien
24998944Sobrien	$CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
25019370Spst
25198944Sobrien	if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
25219370Spst	    $self->{normCode} = \&Unicode::Normalize::NFD;
25398944Sobrien	}
25498944Sobrien	elsif ($self->{normalization} ne 'prenormalized') {
25598944Sobrien	    my $norm = $self->{normalization};
25698944Sobrien	    $self->{normCode} = sub {
25798944Sobrien		Unicode::Normalize::normalize($norm, shift);
25898944Sobrien	    };
25998944Sobrien	    eval { $self->{normCode}->("") }; # try
26098944Sobrien	    $@ and croak "$PACKAGE unknown normalization form name: $norm";
26198944Sobrien	}
26298944Sobrien    }
26319370Spst    return;
264130803Smarcel}
26598944Sobrien
26619370Spstsub new
26798944Sobrien{
26898944Sobrien    my $class = shift;
26919370Spst    my $self = bless { @_ }, $class;
27098944Sobrien
27198944Sobrien### begin XS only ###
27298944Sobrien    if (! exists $self->{table}     && !defined $self->{rewrite} &&
27398944Sobrien	!defined $self->{undefName} && !defined $self->{ignoreName} &&
27498944Sobrien	!defined $self->{undefChar} && !defined $self->{ignoreChar}) {
27598944Sobrien	$self->{__useXS} = \&_fetch_simple;
27619370Spst    } else {
27798944Sobrien	$self->{__useXS} = undef;
27898944Sobrien    }
27998944Sobrien### end XS only ###
28019370Spst
28198944Sobrien    # keys of $self->{suppressHash} are $self->{suppress}.
28298944Sobrien    if ($self->{suppress} && @{ $self->{suppress} }) {
28319370Spst	@{ $self->{suppressHash} }{ @{ $self->{suppress} } } = ();
284130803Smarcel    } # before read_table()
28598944Sobrien
28698944Sobrien    # If undef is passed explicitly, no file is read.
28719370Spst    $self->{table} = $KeyFile if ! exists $self->{table};
288130803Smarcel    $self->read_table() if defined $self->{table};
289130803Smarcel
290130803Smarcel    if ($self->{entry}) {
291130803Smarcel	while ($self->{entry} =~ /([^\n]+)/g) {
292130803Smarcel	    $self->parseEntry($1, TRUE);
293130803Smarcel	}
294130803Smarcel    }
29598944Sobrien
29698944Sobrien    # only in new(), not in change()
29798944Sobrien    $self->{level} ||= MaxLevel;
29898944Sobrien    $self->{UCA_Version} ||= UCA_Version();
29919370Spst
300130803Smarcel    $self->{overrideHangul} = FALSE
301130803Smarcel	if ! exists $self->{overrideHangul};
302130803Smarcel    $self->{overrideCJK} = FALSE
30319370Spst	if ! exists $self->{overrideCJK};
304130803Smarcel    $self->{normalization} = 'NFD'
305130803Smarcel	if ! exists $self->{normalization};
306130803Smarcel    $self->{rearrange} = $self->{rearrangeTable} ||
307130803Smarcel	($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
308130803Smarcel	if ! exists $self->{rearrange};
309130803Smarcel    $self->{backwards} = $self->{backwardsTable}
31098944Sobrien	if ! exists $self->{backwards};
31198944Sobrien    exists $self->{long_contraction} or $self->{long_contraction}
31219370Spst	= 22 <= $self->{UCA_Version} && $self->{UCA_Version} <= 24;
31398944Sobrien
31498944Sobrien    # checkCollator() will be called in change()
31598944Sobrien    $self->checkCollator();
31698944Sobrien
31719370Spst    return $self;
31898944Sobrien}
31998944Sobrien
32019370Spstsub parseAtmark {
32198944Sobrien    my $self = shift;
322130803Smarcel    my $line = shift; # after s/^\s*\@//
323130803Smarcel
324130803Smarcel    if ($line =~ /^version\s*(\S*)/) {
325130803Smarcel	$self->{versionTable} ||= $1;
326130803Smarcel    }
327130803Smarcel    elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
328130803Smarcel	$self->{variableTable} ||= $1;
329130803Smarcel    }
330130803Smarcel    elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
331130803Smarcel	$self->{alternateTable} ||= $1;
332130803Smarcel    }
333130803Smarcel    elsif ($line =~ /^backwards\s+(\S*)/) {
334130803Smarcel	push @{ $self->{backwardsTable} }, $1;
335130803Smarcel    }
336130803Smarcel    elsif ($line =~ /^forwards\s+(\S*)/) { # perhaps no use
337130803Smarcel	push @{ $self->{forwardsTable} }, $1;
338130803Smarcel    }
339130803Smarcel    elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
340130803Smarcel	push @{ $self->{rearrangeTable} }, _getHexArray($1);
341130803Smarcel    }
342130803Smarcel}
343130803Smarcel
344130803Smarcelsub read_table {
345130803Smarcel    my $self = shift;
34698944Sobrien
34798944Sobrien### begin XS only ###
34819370Spst    if ($self->{__useXS}) {
34998944Sobrien	my @rest = _fetch_rest(); # complex matter need to parse
35098944Sobrien	for my $line (@rest) {
35198944Sobrien	    next if $line =~ /^\s*#/;
35298944Sobrien
35398944Sobrien	    if ($line =~ s/^\s*\@//) {
35498944Sobrien		$self->parseAtmark($line);
35598944Sobrien	    } else {
35698944Sobrien		$self->parseEntry($line);
35798944Sobrien	    }
35898944Sobrien	}
35998944Sobrien	return;
36098944Sobrien    }
36119370Spst### end XS only ###
36219370Spst
36398944Sobrien    my($f, $fh);
36498944Sobrien    foreach my $d (@INC) {
36598944Sobrien	$f = File::Spec->catfile($d, @Path, $self->{table});
36698944Sobrien	last if open($fh, $f);
36798944Sobrien	$f = undef;
36819370Spst    }
36919370Spst    if (!defined $f) {
37019370Spst	$f = File::Spec->catfile(@Path, $self->{table});
37119370Spst	croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
37219370Spst    }
37319370Spst
37419370Spst    while (my $line = <$fh>) {
37519370Spst	next if $line =~ /^\s*#/;
37619370Spst
37719370Spst	if ($line =~ s/^\s*\@//) {
37819370Spst	    $self->parseAtmark($line);
37998944Sobrien	} else {
38019370Spst	    $self->parseEntry($line);
38198944Sobrien	}
38219370Spst    }
38319370Spst    close $fh;
38419370Spst}
38519370Spst
38619370Spst
38719370Spst##
38819370Spst## get $line, parse it, and write an entry in $self
38919370Spst##
39019370Spstsub parseEntry
39119370Spst{
39219370Spst    my $self = shift;
39398944Sobrien    my $line = shift;
39498944Sobrien    my $tailoring = shift;
39598944Sobrien    my($name, $entry, @uv, @key);
39698944Sobrien
39719370Spst    if (defined $self->{rewrite}) {
39819370Spst	$line = $self->{rewrite}->($line);
39919370Spst    }
40019370Spst
40119370Spst    return if $line !~ /^\s*[0-9A-Fa-f]/;
40219370Spst
40319370Spst    # removes comment and gets name
40419370Spst    $name = $1
40598944Sobrien	if $line =~ s/[#%]\s*(.*)//;
40619370Spst    return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
40798944Sobrien
40898944Sobrien    # gets element
40919370Spst    my($e, $k) = split /;/, $line;
41019370Spst    croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
41119370Spst	if ! $k;
41298944Sobrien
41319370Spst    @uv = _getHexArray($e);
41498944Sobrien    return if !@uv;
41519370Spst    return if @uv > 1 && $self->{suppressHash} && !$tailoring &&
416130803Smarcel		  exists $self->{suppressHash}{$uv[0]};
417130803Smarcel    $entry = join(CODE_SEP, @uv); # in JCPS
41819370Spst
41998944Sobrien    if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
420	my $ele = pack_U(@uv);
421
422	# regarded as if it were not stored in the table
423	return
424	    if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
425
426	# replaced as completely ignorable
427	$k = '[.0000.0000.0000.0000]'
428	    if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
429    }
430
431    # replaced as completely ignorable
432    $k = '[.0000.0000.0000.0000]'
433	if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
434
435    my $is_L3_ignorable = TRUE;
436
437    foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
438	my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
439	my @wt = _getHexArray($arr);
440	push @key, pack(VCE_TEMPLATE, $var, @wt);
441	$is_L3_ignorable = FALSE
442	    if $wt[0] || $wt[1] || $wt[2];
443	# Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
444	# is completely ignorable.
445	# For expansion, an entry $is_L3_ignorable
446	# if and only if "all" CEs are [.0000.0000.0000].
447    }
448
449    # mapping: be an array ref or not exists (any false value is disallowed)
450    $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
451
452    # maxlength: be more than 1 or not exists (any false value is disallowed)
453    if (@uv > 1) {
454	if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) {
455	    $self->{maxlength}{$uv[0]} = @uv;
456	}
457    }
458
459    # contraction: be 1 or not exists (any false value is disallowed)
460    while (@uv > 2) {
461	pop @uv;
462	my $fake_entry = join(CODE_SEP, @uv); # in JCPS
463	$self->{contraction}{$fake_entry} = 1;
464    }
465}
466
467
468sub viewSortKey
469{
470    my $self = shift;
471    my $str  = shift;
472    $self->visualizeSortKey($self->getSortKey($str));
473}
474
475
476sub process
477{
478    my $self = shift;
479    my $str  = shift;
480    my $prep = $self->{preprocess};
481    my $norm = $self->{normCode};
482
483    $str = &$prep($str) if ref $prep;
484    $str = &$norm($str) if ref $norm;
485    return $str;
486}
487
488##
489## arrayref of JCPS   = splitEnt(string to be collated)
490## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE)
491##
492sub splitEnt
493{
494    my $self = shift;
495    my $str  = shift;
496    my $wLen = shift; # with Length
497
498    my $map  = $self->{mapping};
499    my $max  = $self->{maxlength};
500    my $reH  = $self->{rearrangeHash};
501    my $vers = $self->{UCA_Version};
502    my $ver9 = $vers >= 9 && $vers <= 11;
503    my $long = $self->{long_contraction};
504    my $uXS  = $self->{__useXS}; ### XS only
505
506    my @buf;
507
508    # get array of Unicode code point of string.
509    my @src = unpack_U($str);
510
511    # rearrangement:
512    # Character positions are not kept if rearranged,
513    # then neglected if $wLen is true.
514    if ($reH && ! $wLen) {
515	for (my $i = 0; $i < @src; $i++) {
516	    if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
517		($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
518		$i++;
519	    }
520	}
521    }
522
523    # remove a code point marked as a completely ignorable.
524    for (my $i = 0; $i < @src; $i++) {
525	if ($vers <= 20 && _isIllegal($src[$i])) {
526	    $src[$i] = undef;
527	} elsif ($ver9) {
528	    $src[$i] = undef if exists $map->{ $src[$i] }
529			   ? @{ $map->{ $src[$i] } } == 0
530			   : $uXS && _ignorable_simple($src[$i]); ### XS only
531	}
532    }
533
534    for (my $i = 0; $i < @src; $i++) {
535	my $jcps = $src[$i];
536
537	# skip removed code point
538	if (! defined $jcps) {
539	    if ($wLen && @buf) {
540		$buf[-1][2] = $i + 1;
541	    }
542	    next;
543	}
544
545	my $i_orig = $i;
546
547	# find contraction
548	if (exists $max->{$jcps}) {
549	    my $temp_jcps = $jcps;
550	    my $jcpsLen = 1;
551	    my $maxLen = $max->{$jcps};
552
553	    for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
554		next if ! defined $src[$p];
555		$temp_jcps .= CODE_SEP . $src[$p];
556		$jcpsLen++;
557		if (exists $map->{$temp_jcps}) {
558		    $jcps = $temp_jcps;
559		    $i = $p;
560		}
561	    }
562
563	# discontiguous contraction with Combining Char (cf. UTS#10, S2.1).
564	# This process requires Unicode::Normalize.
565	# If "normalization" is undef, here should be skipped *always*
566	# (in spite of bool value of $CVgetCombinClass),
567	# since canonical ordering cannot be expected.
568	# Blocked combining character should not be contracted.
569
570	    # $self->{normCode} is false in the case of "prenormalized".
571	    if ($self->{normalization}) {
572		my $cont = $self->{contraction};
573		my $preCC = 0;
574		my $preCC_uc = 0;
575		my $jcps_uc = $jcps;
576		my(@out, @out_uc);
577
578		for (my $p = $i + 1; $p < @src; $p++) {
579		    next if ! defined $src[$p];
580		    my $curCC = $CVgetCombinClass->($src[$p]);
581		    last unless $curCC;
582		    my $tail = CODE_SEP . $src[$p];
583
584		    if ($preCC != $curCC && exists $map->{$jcps.$tail}) {
585			$jcps .= $tail;
586			push @out, $p;
587		    } else {
588			$preCC = $curCC;
589		    }
590
591		    next if !$long;
592
593		    if ($preCC_uc != $curCC &&
594			    (exists $map->{$jcps_uc.$tail} ||
595			    exists $cont->{$jcps_uc.$tail})) {
596			$jcps_uc .= $tail;
597			push @out_uc, $p;
598		    } else {
599			$preCC_uc = $curCC;
600		    }
601		}
602
603		if (@out_uc && exists $map->{$jcps_uc}) {
604		    $jcps = $jcps_uc;
605		    $src[$_] = undef for @out_uc;
606		} else {
607		    $src[$_] = undef for @out;
608		}
609	    }
610	}
611
612	# skip completely ignorable
613	if (exists $map->{$jcps} ? @{ $map->{$jcps} } == 0 :
614	    $uXS && $jcps !~ /;/ && _ignorable_simple($jcps)) { ### XS only
615	    if ($wLen && @buf) {
616		$buf[-1][2] = $i + 1;
617	    }
618	    next;
619	}
620
621	push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
622    }
623    return \@buf;
624}
625
626##
627## VCE = _pack_override(input, codepoint, derivCode)
628##
629sub _pack_override ($$$) {
630    my $r = shift;
631    my $u = shift;
632    my $der = shift;
633
634    if (ref $r) {
635	return pack(VCE_TEMPLATE, NON_VAR, @$r);
636    } elsif (defined $r) {
637	return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u);
638    } else {
639	$u = 0xFFFD if 0x10FFFF < $u;
640	return $der->($u);
641    }
642}
643
644##
645## list of VCE = getWt(JCPS)
646##
647sub getWt
648{
649    my $self = shift;
650    my $u    = shift;
651    my $map  = $self->{mapping};
652    my $der  = $self->{derivCode};
653    my $out  = $self->{overrideOut};
654    my $uXS  = $self->{__useXS}; ### XS only
655
656    return if !defined $u;
657    return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF};
658    return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE};
659    $u = 0xFFFD if $u !~ /;/ && 0x10FFFF < $u && !$out;
660
661    my @ce;
662    if (exists $map->{$u}) {
663	@ce = @{ $map->{$u} }; # $u may be a contraction
664### begin XS only ###
665    } elsif ($uXS && _exists_simple($u)) {
666	@ce = _fetch_simple($u);
667### end XS only ###
668    } elsif (Hangul_SIni <= $u && $u <= Hangul_SFin) {
669	my $hang = $self->{overrideHangul};
670	if ($hang) {
671	    @ce = map _pack_override($_, $u, $der), $hang->($u);
672	} elsif (!defined $hang) {
673	    @ce = $der->($u);
674	} else {
675	    my $max  = $self->{maxlength};
676	    my @decH = _decompHangul($u);
677
678	    if (@decH == 2) {
679		my $contract = join(CODE_SEP, @decH);
680		@decH = ($contract) if exists $map->{$contract};
681	    } else { # must be <@decH == 3>
682		if (exists $max->{$decH[0]}) {
683		    my $contract = join(CODE_SEP, @decH);
684		    if (exists $map->{$contract}) {
685			@decH = ($contract);
686		    } else {
687			$contract = join(CODE_SEP, @decH[0,1]);
688			exists $map->{$contract} and @decH = ($contract, $decH[2]);
689		    }
690		    # even if V's ignorable, LT contraction is not supported.
691		    # If such a situation were required, NFD should be used.
692		}
693		if (@decH == 3 && exists $max->{$decH[1]}) {
694		    my $contract = join(CODE_SEP, @decH[1,2]);
695		    exists $map->{$contract} and @decH = ($decH[0], $contract);
696		}
697	    }
698
699	    @ce = map({
700		    exists $map->{$_} ? @{ $map->{$_} } :
701		$uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only
702		    $der->($_);
703		} @decH);
704	}
705    } elsif ($out && 0x10FFFF < $u) {
706	@ce = map _pack_override($_, $u, $der), $out->($u);
707    } else {
708	my $cjk  = $self->{overrideCJK};
709	my $vers = $self->{UCA_Version};
710	if ($cjk && _isUIdeo($u, $vers)) {
711	    @ce = map _pack_override($_, $u, $der), $cjk->($u);
712	} elsif ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
713	    @ce = _uideoCE_8($u);
714	} else {
715	    @ce = $der->($u);
716	}
717    }
718    return map $self->varCE($_), @ce;
719}
720
721
722##
723## string sortkey = getSortKey(string arg)
724##
725sub getSortKey
726{
727    my $self = shift;
728    my $orig = shift;
729    my $str  = $self->process($orig);
730    my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS
731    my $vers = $self->{UCA_Version};
732    my $term = $self->{hangul_terminator};
733    my $lev  = $self->{level};
734    my $iden = $self->{identical};
735
736    my @buf; # weight arrays
737    if ($term) {
738	my $preHST = '';
739	my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0));
740	foreach my $jcps (@$rEnt) {
741	    # weird things like VL, TL-contraction are not considered!
742	    my $curHST = join '', map getHST($_, $vers), split /;/, $jcps;
743	    if ($preHST && !$curHST || # hangul before non-hangul
744		$preHST =~ /L\z/ && $curHST =~ /^T/ ||
745		$preHST =~ /V\z/ && $curHST =~ /^L/ ||
746		$preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
747		push @buf, $termCE;
748	    }
749	    $preHST = $curHST;
750	    push @buf, $self->getWt($jcps);
751	}
752	push @buf, $termCE if $preHST; # end at hangul
753    } else {
754	foreach my $jcps (@$rEnt) {
755	    push @buf, $self->getWt($jcps);
756	}
757    }
758
759    my $rkey = $self->mk_SortKey(\@buf); ### XS only
760
761    if ($iden || $vers >= 26 && $lev == MaxLevel) {
762	$rkey .= LEVEL_SEP;
763	$rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden;
764    }
765    return $rkey;
766}
767
768
769##
770## int compare = cmp(string a, string b)
771##
772sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
773sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
774sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
775sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
776sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
777sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
778sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
779
780##
781## list[strings] sorted = sort(list[strings] arg)
782##
783sub sort {
784    my $obj = shift;
785    return
786	map { $_->[1] }
787	    sort{ $a->[0] cmp $b->[0] }
788		map [ $obj->getSortKey($_), $_ ], @_;
789}
790
791
792##
793## bool _nonIgnorAtLevel(arrayref weights, int level)
794##
795sub _nonIgnorAtLevel($$)
796{
797    my $wt = shift;
798    return if ! defined $wt;
799    my $lv = shift;
800    return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
801}
802
803##
804## bool _eqArray(
805##    arrayref of arrayref[weights] source,
806##    arrayref of arrayref[weights] substr,
807##    int level)
808## * comparison of graphemes vs graphemes.
809##   @$source >= @$substr must be true (check it before call this);
810##
811sub _eqArray($$$)
812{
813    my $source = shift;
814    my $substr = shift;
815    my $lev = shift;
816
817    for my $g (0..@$substr-1){
818	# Do the $g'th graphemes have the same number of AV weights?
819	return if @{ $source->[$g] } != @{ $substr->[$g] };
820
821	for my $w (0..@{ $substr->[$g] }-1) {
822	    for my $v (0..$lev-1) {
823		return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
824	    }
825	}
826    }
827    return 1;
828}
829
830##
831## (int position, int length)
832## int position = index(string, substring, position, [undoc'ed global])
833##
834## With "global" (only for the list context),
835##  returns list of arrayref[position, length].
836##
837sub index
838{
839    my $self = shift;
840    $self->{preprocess} and
841	croak "Don't use Preprocess with index(), match(), etc.";
842    $self->{normCode} and
843	croak "Don't use Normalization with index(), match(), etc.";
844
845    my $str  = shift;
846    my $len  = length($str);
847    my $sub  = shift;
848    my $subE = $self->splitEnt($sub);
849    my $pos  = @_ ? shift : 0;
850       $pos  = 0 if $pos < 0;
851    my $glob = shift;
852
853    my $lev  = $self->{level};
854    my $v2i  = $self->{UCA_Version} >= 9 &&
855		$self->{variable} ne 'non-ignorable';
856
857    if (! @$subE) {
858	my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
859	return $glob
860	    ? map([$_, 0], $temp..$len)
861	    : wantarray ? ($temp,0) : $temp;
862    }
863    $len < $pos
864	and return wantarray ? () : NOMATCHPOS;
865    my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
866    @$strE
867	or return wantarray ? () : NOMATCHPOS;
868
869    my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
870
871    my $last_is_variable;
872    for my $vwt (map $self->getWt($_), @$subE) {
873	my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
874	my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
875
876	# "Ignorable (L1, L2) after Variable" since track. v. 9
877	if ($v2i) {
878	    if ($var) {
879		$last_is_variable = TRUE;
880	    }
881	    elsif (!$wt[0]) { # ignorable
882		$to_be_pushed = FALSE if $last_is_variable;
883	    }
884	    else {
885		$last_is_variable = FALSE;
886	    }
887	}
888
889	if (@subWt && !$var && !$wt[0]) {
890	    push @{ $subWt[-1] }, \@wt if $to_be_pushed;
891	} elsif ($to_be_pushed) {
892	    push @subWt, [ \@wt ];
893	}
894	# else ===> skipped
895    }
896
897    my $count = 0;
898    my $end = @$strE - 1;
899
900    $last_is_variable = FALSE; # reuse
901    for (my $i = 0; $i <= $end; ) { # no $i++
902	my $found_base = 0;
903
904	# fetch a grapheme
905	while ($i <= $end && $found_base == 0) {
906	    for my $vwt ($self->getWt($strE->[$i][0])) {
907		my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
908		my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
909
910		# "Ignorable (L1, L2) after Variable" since track. v. 9
911		if ($v2i) {
912		    if ($var) {
913			$last_is_variable = TRUE;
914		    }
915		    elsif (!$wt[0]) { # ignorable
916			$to_be_pushed = FALSE if $last_is_variable;
917		    }
918		    else {
919			$last_is_variable = FALSE;
920		    }
921		}
922
923		if (@strWt && !$var && !$wt[0]) {
924		    push @{ $strWt[-1] }, \@wt if $to_be_pushed;
925		    $finPos[-1] = $strE->[$i][2];
926		} elsif ($to_be_pushed) {
927		    push @strWt, [ \@wt ];
928		    push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
929		    $finPos[-1] = NOMATCHPOS if $found_base;
930		    push @finPos, $strE->[$i][2];
931		    $found_base++;
932		}
933		# else ===> no-op
934	    }
935	    $i++;
936	}
937
938	# try to match
939	while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
940	    if ($iniPos[0] != NOMATCHPOS &&
941		    $finPos[$#subWt] != NOMATCHPOS &&
942			_eqArray(\@strWt, \@subWt, $lev)) {
943		my $temp = $iniPos[0] + $pos;
944
945		if ($glob) {
946		    push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
947		    splice @strWt,  0, $#subWt;
948		    splice @iniPos, 0, $#subWt;
949		    splice @finPos, 0, $#subWt;
950		}
951		else {
952		    return wantarray
953			? ($temp, $finPos[$#subWt] - $iniPos[0])
954			:  $temp;
955		}
956	    }
957	    shift @strWt;
958	    shift @iniPos;
959	    shift @finPos;
960	}
961    }
962
963    return $glob
964	? @g_ret
965	: wantarray ? () : NOMATCHPOS;
966}
967
968##
969## scalarref to matching part = match(string, substring)
970##
971sub match
972{
973    my $self = shift;
974    if (my($pos,$len) = $self->index($_[0], $_[1])) {
975	my $temp = substr($_[0], $pos, $len);
976	return wantarray ? $temp : \$temp;
977	# An lvalue ref \substr should be avoided,
978	# since its value is affected by modification of its referent.
979    }
980    else {
981	return;
982    }
983}
984
985##
986## arrayref matching parts = gmatch(string, substring)
987##
988sub gmatch
989{
990    my $self = shift;
991    my $str  = shift;
992    my $sub  = shift;
993    return map substr($str, $_->[0], $_->[1]),
994		$self->index($str, $sub, 0, 'g');
995}
996
997##
998## bool subst'ed = subst(string, substring, replace)
999##
1000sub subst
1001{
1002    my $self = shift;
1003    my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1004
1005    if (my($pos,$len) = $self->index($_[0], $_[1])) {
1006	if ($code) {
1007	    my $mat = substr($_[0], $pos, $len);
1008	    substr($_[0], $pos, $len, $code->($mat));
1009	} else {
1010	    substr($_[0], $pos, $len, $_[2]);
1011	}
1012	return TRUE;
1013    }
1014    else {
1015	return FALSE;
1016    }
1017}
1018
1019##
1020## int count = gsubst(string, substring, replace)
1021##
1022sub gsubst
1023{
1024    my $self = shift;
1025    my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1026    my $cnt = 0;
1027
1028    # Replacement is carried out from the end, then use reverse.
1029    for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1030	if ($code) {
1031	    my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1032	    substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1033	} else {
1034	    substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1035	}
1036	$cnt++;
1037    }
1038    return $cnt;
1039}
1040
10411;
1042__END__
1043
1044=head1 NAME
1045
1046Unicode::Collate - Unicode Collation Algorithm
1047
1048=head1 SYNOPSIS
1049
1050  use Unicode::Collate;
1051
1052  #construct
1053  $Collator = Unicode::Collate->new(%tailoring);
1054
1055  #sort
1056  @sorted = $Collator->sort(@not_sorted);
1057
1058  #compare
1059  $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1060
1061B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
1062according to Perl's Unicode support. See L<perlunicode>,
1063L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1064Otherwise you can use C<preprocess> or should decode them before.
1065
1066=head1 DESCRIPTION
1067
1068This module is an implementation of Unicode Technical Standard #10
1069(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
1070
1071=head2 Constructor and Tailoring
1072
1073The C<new> method returns a collator object. If new() is called
1074with no parameters, the collator should do the default collation.
1075
1076   $Collator = Unicode::Collate->new(
1077      UCA_Version => $UCA_Version,
1078      alternate => $alternate, # alias for 'variable'
1079      backwards => $levelNumber, # or \@levelNumbers
1080      entry => $element,
1081      hangul_terminator => $term_primary_weight,
1082      highestFFFF => $bool,
1083      identical => $bool,
1084      ignoreName => qr/$ignoreName/,
1085      ignoreChar => qr/$ignoreChar/,
1086      ignore_level2 => $bool,
1087      katakana_before_hiragana => $bool,
1088      level => $collationLevel,
1089      long_contraction => $bool,
1090      minimalFFFE => $bool,
1091      normalization  => $normalization_form,
1092      overrideCJK => \&overrideCJK,
1093      overrideHangul => \&overrideHangul,
1094      preprocess => \&preprocess,
1095      rearrange => \@charList,
1096      rewrite => \&rewrite,
1097      suppress => \@charList,
1098      table => $filename,
1099      undefName => qr/$undefName/,
1100      undefChar => qr/$undefChar/,
1101      upper_before_lower => $bool,
1102      variable => $variable,
1103   );
1104
1105=over 4
1106
1107=item UCA_Version
1108
1109If the revision (previously "tracking version") number of UCA is given,
1110behavior of that revision is emulated on collating.
1111If omitted, the return value of C<UCA_Version()> is used.
1112
1113The following revisions are supported.  The default is 43.
1114
1115     UCA       Unicode Standard         DUCET (@version)
1116   -------------------------------------------------------
1117      8              3.1                3.0.1 (3.0.1d9)
1118      9     3.1 with Corrigendum 3      3.1.1
1119     11             4.0.0
1120     14             4.1.0
1121     16             5.0.0
1122     18             5.1.0
1123     20             5.2.0
1124     22             6.0.0
1125     24             6.1.0
1126     26             6.2.0
1127     28             6.3.0
1128     30             7.0.0
1129     32             8.0.0
1130     34             9.0.0
1131     36            10.0.0
1132     38            11.0.0
1133     40            12.0.0
1134     41            12.1.0
1135     43            13.0.0
1136
1137* See below for C<long_contraction> with C<UCA_Version> 22 and 24.
1138
1139* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
1140since C<UCA_Version> 22.
1141
1142* Out-of-range codepoints (greater than U+10FFFF) are not ignored,
1143and can be overridden since C<UCA_Version> 22.
1144
1145* Fully ignorable characters were ignored, and would not interrupt
1146contractions with C<UCA_Version> 9 and 11.
1147
1148* Treatment of ignorables after variables and some behaviors
1149were changed at C<UCA_Version> 9.
1150
1151* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>)
1152depend on C<UCA_Version>.
1153
1154* Many hangul jamo are assigned at C<UCA_Version> 20, that will affect
1155C<hangul_terminator>.
1156
1157=item alternate
1158
1159-- see 3.2.2 Alternate Weighting, version 8 of UTS #10
1160
1161For backward compatibility, C<alternate> (old name) can be used
1162as an alias for C<variable>.
1163
1164=item backwards
1165
1166-- see 3.4 Backward Accents, UTS #10.
1167
1168     backwards => $levelNumber or \@levelNumbers
1169
1170Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1171If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>),
1172forwards at all the levels.
1173
1174=item entry
1175
1176-- see 5 Tailoring; 9.1 Allkeys File Format, UTS #10.
1177
1178If the same character (or a sequence of characters) exists
1179in the collation element table through C<table>,
1180mapping to collation elements is overridden.
1181If it does not exist, the mapping is defined additionally.
1182
1183    entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
11840063 0068 ; [.0E6A.0020.0002.0063] # ch
11850043 0068 ; [.0E6A.0020.0007.0043] # Ch
11860043 0048 ; [.0E6A.0020.0008.0043] # CH
1187006C 006C ; [.0F4C.0020.0002.006C] # ll
1188004C 006C ; [.0F4C.0020.0007.004C] # Ll
1189004C 004C ; [.0F4C.0020.0008.004C] # LL
119000F1      ; [.0F7B.0020.0002.00F1] # n-tilde
1191006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
119200D1      ; [.0F7B.0020.0008.00D1] # N-tilde
1193004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1194ENTRY
1195
1196    entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
119700E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
119800C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1199ENTRY
1200
1201B<NOTE:> The code point in the UCA file format (before C<';'>)
1202B<must> be a Unicode code point (defined as hexadecimal),
1203but not a native code point.
1204So C<0063> must always denote C<U+0063>,
1205but not a character of C<"\x63">.
1206
1207Weighting may vary depending on collation element table.
1208So ensure the weights defined in C<entry> will be consistent with
1209those in the collation element table loaded via C<table>.
1210
1211In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1212and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1213(as a value between C<0E60> and C<0E6D>)
1214makes ordering as C<C E<lt> CH E<lt> D>.
1215Exactly speaking DUCET already has some characters between C<C> and C<D>:
1216C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1217C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1218and C<c-curl> (C<U+0255>) with C<0E69>.
1219Then primary weight C<0E6A> for C<CH> makes C<CH>
1220ordered between C<c-curl> and C<D>.
1221
1222=item hangul_terminator
1223
1224-- see 7.1.4 Trailing Weights, UTS #10.
1225
1226If a true value is given (non-zero but should be positive),
1227it will be added as a terminator primary weight to the end of
1228every standard Hangul syllable. Secondary and any higher weights
1229for terminator are set to zero.
1230If the value is false or C<hangul_terminator> key does not exist,
1231insertion of terminator weights will not be performed.
1232
1233Boundaries of Hangul syllables are determined
1234according to conjoining Jamo behavior in F<the Unicode Standard>
1235and F<HangulSyllableType.txt>.
1236
1237B<Implementation Note:>
1238(1) For expansion mapping (Unicode character mapped
1239to a sequence of collation elements), a terminator will not be added
1240between collation elements, even if Hangul syllable boundary exists there.
1241Addition of terminator is restricted to the next position
1242to the last collation element.
1243
1244(2) Non-conjoining Hangul letters
1245(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1246automatically terminated with a terminator primary weight.
1247These characters may need terminator included in a collation element
1248table beforehand.
1249
1250=item highestFFFF
1251
1252-- see 2.4 Tailored noncharacter weights, UTS #35 (LDML) Part 5: Collation.
1253
1254If the parameter is made true, C<U+FFFF> has a highest primary weight.
1255When a boolean of C<$coll-E<gt>ge($str, "abc")> and
1256C<$coll-E<gt>le($str, "abc\x{FFFF}")> is true, it is expected that C<$str>
1257begins with C<"abc">, or another primary equivalent.
1258C<$str> may be C<"abcd">, C<"abc012">, but should not include C<U+FFFF>
1259such as C<"abc\x{FFFF}xyz">.
1260
1261C<$coll-E<gt>le($str, "abc\x{FFFF}")> works like C<$coll-E<gt>lt($str, "abd")>
1262almost, but the latter has a problem that you should know which letter is
1263next to C<c>. For a certain language where C<ch> as the next letter,
1264C<"abch"> is greater than C<"abc\x{FFFF}">, but less than C<"abd">.
1265
1266Note:
1267This is equivalent to C<(entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]')>.
1268Any other character than C<U+FFFF> can be tailored by C<entry>.
1269
1270=item identical
1271
1272-- see A.3 Deterministic Comparison, UTS #10.
1273
1274By default, strings whose weights are equal should be equal,
1275even though their code points are not equal.
1276Completely ignorable characters are ignored.
1277
1278If the parameter is made true, a final, tie-breaking level is used.
1279If no difference of weights is found after the comparison through
1280all the level specified by C<level>, the comparison with code points
1281will be performed.
1282For the tie-breaking comparison, the sort key has code points
1283of the original string appended.
1284Completely ignorable characters are not ignored.
1285
1286If C<preprocess> and/or C<normalization> is applied, the code points
1287of the string after them (in NFD by default) are used.
1288
1289=item ignoreChar
1290
1291=item ignoreName
1292
1293-- see 3.6 Variable Weighting, UTS #10.
1294
1295Makes the entry in the table completely ignorable;
1296i.e. as if the weights were zero at all level.
1297
1298Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1299will be ignored. Through C<ignoreName>, any character whose name
1300(given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1301will be ignored.
1302
1303E.g. when 'a' and 'e' are ignorable,
1304'element' is equal to 'lament' (or 'lmnt').
1305
1306=item ignore_level2
1307
1308-- see 5.1 Parametric Tailoring, UTS #10.
1309
1310By default, case-sensitive comparison (that is level 3 difference)
1311won't ignore accents (that is level 2 difference).
1312
1313If the parameter is made true, accents (and other primary ignorable
1314characters) are ignored, even though cases are taken into account.
1315
1316B<NOTE>: C<level> should be 3 or greater.
1317
1318=item katakana_before_hiragana
1319
1320-- see 7.2 Tertiary Weight Table, UTS #10.
1321
1322By default, hiragana is before katakana.
1323If the parameter is made true, this is reversed.
1324
1325B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1326distinctions must occur in level 3, and their weights at level 3 must be
1327same as those mentioned in 7.3.1, UTS #10.
1328If you define your collation elements which violate this requirement,
1329this parameter does not work validly.
1330
1331=item level
1332
1333-- see 4.3 Form Sort Key, UTS #10.
1334
1335Set the maximum level.
1336Any higher levels than the specified one are ignored.
1337
1338  Level 1: alphabetic ordering
1339  Level 2: diacritic ordering
1340  Level 3: case ordering
1341  Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1342
1343  ex.level => 2,
1344
1345If omitted, the maximum is the 4th.
1346
1347B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level.
1348But this module only uses weights within 0xFFFF.
1349When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted'
1350and 'shift-trimmed'), the level 4 may be unreliable.
1351
1352See also C<identical>.
1353
1354=item long_contraction
1355
1356-- see 3.8.2 Well-Formedness of the DUCET, 4.2 Produce Array, UTS #10.
1357
1358If the parameter is made true, for a contraction with three or more
1359characters (here nicknamed "long contraction"), initial substrings
1360will be handled.
1361For example, a contraction ABC, where A is a starter, and B and C
1362are non-starters (character with non-zero combining character class),
1363will be detected even if there is not AB as a contraction.
1364
1365B<Default:> Usually false.
1366If C<UCA_Version> is 22 or 24, and the value of C<long_contraction>
1367is not specified in C<new()>, a true value is set implicitly.
1368This is a workaround to pass Conformance Tests for Unicode 6.0.0 and 6.1.0.
1369
1370C<change()> handles C<long_contraction> explicitly only.
1371If C<long_contraction> is not specified in C<change()>, even though
1372C<UCA_Version> is changed, C<long_contraction> will not be changed.
1373
1374B<Limitation:> Scanning non-starters is one-way (no back tracking).
1375If AB is found but not ABC is not found, other long contraction where
1376the first character is A and the second is not B may not be found.
1377
1378Under C<(normalization =E<gt> undef)>, detection step of discontiguous
1379contractions will be skipped.
1380
1381B<Note:> The following contractions in DUCET are not considered
1382in steps S2.1.1 to S2.1.3, where they are discontiguous.
1383
1384    0FB2 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC RR)
1385    0FB3 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC LL)
1386
1387For example C<TIBETAN VOWEL SIGN VOCALIC RR> with C<COMBINING TILDE OVERLAY>
1388(C<U+0344>) is C<0FB2 0344 0F71 0F80> in NFD.
1389In this case C<0FB2 0F80> (C<TIBETAN VOWEL SIGN VOCALIC R>) is detected,
1390instead of C<0FB2 0F71 0F80>.
1391Inserted C<0344> makes C<0FB2 0F71 0F80> discontiguous and lack of
1392contraction C<0FB2 0F71> prohibits C<0FB2 0F71 0F80> from being detected.
1393
1394=item minimalFFFE
1395
1396-- see 1.1.1 U+FFFE, UTS #35 (LDML) Part 5: Collation.
1397
1398If the parameter is made true, C<U+FFFE> has a minimal primary weight.
1399The comparison between C<"$a1\x{FFFE}$a2"> and C<"$b1\x{FFFE}$b2">
1400first compares C<$a1> and C<$b1> at level 1, and
1401then C<$a2> and C<$b2> at level 1, as followed.
1402
1403        "ab\x{FFFE}a"
1404        "Ab\x{FFFE}a"
1405        "ab\x{FFFE}c"
1406        "Ab\x{FFFE}c"
1407        "ab\x{FFFE}xyz"
1408        "abc\x{FFFE}def"
1409        "abc\x{FFFE}xYz"
1410        "aBc\x{FFFE}xyz"
1411        "abcX\x{FFFE}def"
1412        "abcx\x{FFFE}xyz"
1413        "b\x{FFFE}aaa"
1414        "bbb\x{FFFE}a"
1415
1416Note:
1417This is equivalent to C<(entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]')>.
1418Any other character than C<U+FFFE> can be tailored by C<entry>.
1419
1420=item normalization
1421
1422-- see 4.1 Normalize, UTS #10.
1423
1424If specified, strings are normalized before preparation of sort keys
1425(the normalization is executed after preprocess).
1426
1427A form name C<Unicode::Normalize::normalize()> accepts will be applied
1428as C<$normalization_form>.
1429Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1430See C<Unicode::Normalize::normalize()> for detail.
1431If omitted, C<'NFD'> is used.
1432
1433C<normalization> is performed after C<preprocess> (if defined).
1434
1435Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1436though they are not concerned with C<Unicode::Normalize::normalize()>.
1437
1438If C<undef> (not a string C<"undef">) is passed explicitly
1439as the value for this key,
1440any normalization is not carried out (this may make tailoring easier
1441if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1442only contiguous contractions are resolved;
1443e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1444C<A-cedilla-ring> would be primary equal to C<A>.
1445In this point,
1446C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1447B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1448
1449In the case of C<(normalization =E<gt> "prenormalized")>,
1450any normalization is not performed, but
1451discontiguous contractions with combining characters are performed.
1452Therefore
1453C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1454B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1455If source strings are finely prenormalized,
1456C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1457
1458Except C<(normalization =E<gt> undef)>,
1459B<Unicode::Normalize> is required (see also B<CAVEAT>).
1460
1461=item overrideCJK
1462
1463-- see 7.1 Derived Collation Elements, UTS #10.
1464
1465By default, CJK unified ideographs are ordered in Unicode codepoint
1466order, but those in the CJK Unified Ideographs block are less than
1467those in the CJK Unified Ideographs Extension A etc.
1468
1469    In the CJK Unified Ideographs block:
1470    U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11.
1471    U+4E00..U+9FBB if UCA_Version is 14 or 16.
1472    U+4E00..U+9FC3 if UCA_Version is 18.
1473    U+4E00..U+9FCB if UCA_Version is 20 or 22.
1474    U+4E00..U+9FCC if UCA_Version is 24 to 30.
1475    U+4E00..U+9FD5 if UCA_Version is 32 or 34.
1476    U+4E00..U+9FEA if UCA_Version is 36.
1477    U+4E00..U+9FEF if UCA_Version is 38, 40 or 41.
1478    U+4E00..U+9FFC if UCA_Version is 43.
1479
1480    In the CJK Unified Ideographs Extension blocks:
1481    Ext.A (U+3400..U+4DB5)   if UCA_Version is  8 to 41.
1482    Ext.A (U+3400..U+4DBF)   if UCA_Version is 43.
1483    Ext.B (U+20000..U+2A6D6) if UCA_Version is  8 to 41.
1484    Ext.B (U+20000..U+2A6DD) if UCA_Version is 43.
1485    Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or later.
1486    Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or later.
1487    Ext.E (U+2B820..U+2CEA1) if UCA_Version is 32 or later.
1488    Ext.F (U+2CEB0..U+2EBE0) if UCA_Version is 36 or later.
1489    Ext.G (U+30000..U+3134A) if UCA_Version is 43.
1490
1491Through C<overrideCJK>, ordering of CJK unified ideographs (including
1492extensions) can be overridden.
1493
1494ex. CJK unified ideographs in the JIS code point order.
1495
1496  overrideCJK => sub {
1497      my $u = shift;             # get a Unicode codepoint
1498      my $b = pack('n', $u);     # to UTF-16BE
1499      my $s = your_unicode_to_sjis_converter($b); # convert
1500      my $n = unpack('n', $s);   # convert sjis to short
1501      [ $n, 0x20, 0x2, $u ];     # return the collation element
1502  },
1503
1504The return value may be an arrayref of 1st to 4th weights as shown
1505above. The return value may be an integer as the primary weight
1506as shown below.  If C<undef> is returned, the default derived
1507collation element will be used.
1508
1509  overrideCJK => sub {
1510      my $u = shift;             # get a Unicode codepoint
1511      my $b = pack('n', $u);     # to UTF-16BE
1512      my $s = your_unicode_to_sjis_converter($b); # convert
1513      my $n = unpack('n', $s);   # convert sjis to short
1514      return $n;                 # return the primary weight
1515  },
1516
1517The return value may be a list containing zero or more of
1518an arrayref, an integer, or C<undef>.
1519
1520ex. ignores all CJK unified ideographs.
1521
1522  overrideCJK => sub {()}, # CODEREF returning empty list
1523
1524   # where ->eq("Pe\x{4E00}rl", "Perl") is true
1525   # as U+4E00 is a CJK unified ideograph and to be ignorable.
1526
1527If a false value (including C<undef>) is passed, C<overrideCJK>
1528has no effect.
1529C<$Collator-E<gt>change(overrideCJK =E<gt> 0)> resets the old one.
1530
1531But assignment of weight for CJK unified ideographs
1532in C<table> or C<entry> is still valid.
1533If C<undef> is passed explicitly as the value for this key,
1534weights for CJK unified ideographs are treated as undefined.
1535However when C<UCA_Version> E<gt> 8, C<(overrideCJK =E<gt> undef)>
1536has no special meaning.
1537
1538B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>,
1539C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>,
1540C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified
1541ideographs. But they can't be overridden via C<overrideCJK> when you use
1542DUCET, as the table includes weights for them. C<table> or C<entry> has
1543priority over C<overrideCJK>.
1544
1545=item overrideHangul
1546
1547-- see 7.1 Derived Collation Elements, UTS #10.
1548
1549By default, Hangul syllables are decomposed into Hangul Jamo,
1550even if C<(normalization =E<gt> undef)>.
1551But the mapping of Hangul syllables may be overridden.
1552
1553This parameter works like C<overrideCJK>, so see there for examples.
1554
1555If you want to override the mapping of Hangul syllables,
1556NFD and NFKD are not appropriate, since NFD and NFKD will decompose
1557Hangul syllables before overriding. FCD may decompose Hangul syllables
1558as the case may be.
1559
1560If a false value (but not C<undef>) is passed, C<overrideHangul>
1561has no effect.
1562C<$Collator-E<gt>change(overrideHangul =E<gt> 0)> resets the old one.
1563
1564If C<undef> is passed explicitly as the value for this key,
1565weight for Hangul syllables is treated as undefined
1566without decomposition into Hangul Jamo.
1567But definition of weight for Hangul syllables
1568in C<table> or C<entry> is still valid.
1569
1570=item overrideOut
1571
1572-- see 7.1.1 Handling Ill-Formed Code Unit Sequences, UTS #10.
1573
1574Perl seems to allow out-of-range values (greater than 0x10FFFF).
1575By default, out-of-range values are replaced with C<U+FFFD>
1576(REPLACEMENT CHARACTER) when C<UCA_Version> E<gt>= 22,
1577or ignored when C<UCA_Version> E<lt>= 20.
1578
1579When C<UCA_Version> E<gt>= 22, the weights of out-of-range values
1580can be overridden. Though C<table> or C<entry> are available for them,
1581out-of-range values are too many.
1582
1583C<overrideOut> can perform it algorithmically.
1584This parameter works like C<overrideCJK>, so see there for examples.
1585
1586ex. ignores all out-of-range values.
1587
1588  overrideOut => sub {()}, # CODEREF returning empty list
1589
1590If a false value (including C<undef>) is passed, C<overrideOut>
1591has no effect.
1592C<$Collator-E<gt>change(overrideOut =E<gt> 0)> resets the old one.
1593
1594B<NOTE ABOUT U+FFFD:>
1595
1596UCA recommends that out-of-range values should not be ignored for security
1597reasons. Say, C<"pe\x{110000}rl"> should not be equal to C<"perl">.
1598However, C<U+FFFD> is wrongly mapped to a variable collation element
1599in DUCET for Unicode 6.0.0 to 6.2.0, that means out-of-range values will be
1600ignored when C<variable> isn't C<Non-ignorable>.
1601
1602The mapping of C<U+FFFD> is corrected in Unicode 6.3.0.
1603see L<http://www.unicode.org/reports/tr10/tr10-28.html#Trailing_Weights>
1604(7.1.4 Trailing Weights). Such a correction is reproduced by this.
1605
1606  overrideOut => sub { 0xFFFD }, # CODEREF returning a very large integer
1607
1608This workaround is unnecessary since Unicode 6.3.0.
1609
1610=item preprocess
1611
1612-- see 5.4 Preprocessing, UTS #10.
1613
1614If specified, the coderef is used to preprocess each string
1615before the formation of sort keys.
1616
1617ex. dropping English articles, such as "a" or "the".
1618Then, "the pen" is before "a pencil".
1619
1620     preprocess => sub {
1621           my $str = shift;
1622           $str =~ s/\b(?:an?|the)\s+//gi;
1623           return $str;
1624        },
1625
1626C<preprocess> is performed before C<normalization> (if defined).
1627
1628ex. decoding strings in a legacy encoding such as shift-jis:
1629
1630    $sjis_collator = Unicode::Collate->new(
1631        preprocess => \&your_shiftjis_to_unicode_decoder,
1632    );
1633    @result = $sjis_collator->sort(@shiftjis_strings);
1634
1635B<Note:> Strings returned from the coderef will be interpreted
1636according to Perl's Unicode support. See L<perlunicode>,
1637L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1638
1639=item rearrange
1640
1641-- see 3.5 Rearrangement, UTS #10.
1642
1643Characters that are not coded in logical order and to be rearranged.
1644If C<UCA_Version> is equal to or less than 11, default is:
1645
1646    rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1647
1648If you want to disallow any rearrangement, pass C<undef> or C<[]>
1649(a reference to empty list) as the value for this key.
1650
1651If C<UCA_Version> is equal to or greater than 14, default is C<[]>
1652(i.e. no rearrangement).
1653
1654B<According to the version 9 of UCA, this parameter shall not be used;
1655but it is not warned at present.>
1656
1657=item rewrite
1658
1659If specified, the coderef is used to rewrite lines in C<table> or C<entry>.
1660The coderef will get each line, and then should return a rewritten line
1661according to the UCA file format.
1662If the coderef returns an empty line, the line will be skipped.
1663
1664e.g. any primary ignorable characters into tertiary ignorable:
1665
1666    rewrite => sub {
1667        my $line = shift;
1668        $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g;
1669        return $line;
1670    },
1671
1672This example shows rewriting weights. C<rewrite> is allowed to
1673affect code points, weights, and the name.
1674
1675B<NOTE>: C<table> is available to use another table file;
1676preparing a modified table once would be more efficient than
1677rewriting lines on reading an unmodified table every time.
1678
1679=item suppress
1680
1681-- see 3.12 Special-Purpose Commands, UTS #35 (LDML) Part 5: Collation.
1682
1683Contractions beginning with the specified characters are suppressed,
1684even if those contractions are defined in C<table>.
1685
1686An example for Russian and some languages using the Cyrillic script:
1687
1688    suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F],
1689
1690where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE.
1691
1692B<NOTE>: Contractions via C<entry> will not be suppressed.
1693
1694=item table
1695
1696-- see 3.8 Default Unicode Collation Element Table, UTS #10.
1697
1698You can use another collation element table if desired.
1699
1700The table file should locate in the F<Unicode/Collate> directory
1701on C<@INC>. Say, if the filename is F<Foo.txt>,
1702the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
1703
1704By default, F<allkeys.txt> (as the filename of DUCET) is used.
1705If you will prepare your own table file, any name other than F<allkeys.txt>
1706may be better to avoid namespace conflict.
1707
1708B<NOTE>: When XSUB is used, the DUCET is compiled on building this
1709module, and it may save time at the run time.
1710Explicit saying C<(table =E<gt> 'allkeys.txt')>, or using another table,
1711or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or
1712C<rewrite> will prevent this module from using the compiled DUCET.
1713
1714If C<undef> is passed explicitly as the value for this key,
1715no file is read (but you can define collation elements via C<entry>).
1716
1717A typical way to define a collation element table
1718without any file of table:
1719
1720   $onlyABC = Unicode::Collate->new(
1721       table => undef,
1722       entry => << 'ENTRIES',
17230061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
17240041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
17250062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
17260042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
17270063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
17280043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1729ENTRIES
1730    );
1731
1732If C<ignoreName> or C<undefName> is used, character names should be
1733specified as a comment (following C<#>) on each line.
1734
1735=item undefChar
1736
1737=item undefName
1738
1739-- see 6.3.3 Reducing the Repertoire, UTS #10.
1740
1741Undefines the collation element as if it were unassigned in the C<table>.
1742This reduces the size of the table.
1743If an unassigned character appears in the string to be collated,
1744the sort key is made from its codepoint
1745as a single-character collation element,
1746as it is greater than any other assigned collation elements
1747(in the codepoint order among the unassigned characters).
1748But, it'd be better to ignore characters
1749unfamiliar to you and maybe never used.
1750
1751Through C<undefChar>, any character matching C<qr/$undefChar/>
1752will be undefined. Through C<undefName>, any character whose name
1753(given in the C<table> file as a comment) matches C<qr/$undefName/>
1754will be undefined.
1755
1756ex. Collation weights for beyond-BMP characters are not stored in object:
1757
1758    undefChar => qr/[^\0-\x{fffd}]/,
1759
1760=item upper_before_lower
1761
1762-- see 6.6 Case Comparisons, UTS #10.
1763
1764By default, lowercase is before uppercase.
1765If the parameter is made true, this is reversed.
1766
1767B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1768distinctions must occur in level 3, and their weights at level 3 must be
1769same as those mentioned in 7.3.1, UTS #10.
1770If you define your collation elements which differs from this requirement,
1771this parameter doesn't work validly.
1772
1773=item variable
1774
1775-- see 3.6 Variable Weighting, UTS #10.
1776
1777This key allows for variable weighting of variable collation elements,
1778which are marked with an ASTERISK in the table
1779(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
1780
1781   variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1782
1783These names are case-insensitive.
1784By default (if specification is omitted), 'shifted' is adopted.
1785
1786   'Blanked'        Variable elements are made ignorable at levels 1 through 3;
1787                    considered at the 4th level.
1788
1789   'Non-Ignorable'  Variable elements are not reset to ignorable.
1790
1791   'Shifted'        Variable elements are made ignorable at levels 1 through 3
1792                    their level 4 weight is replaced by the old level 1 weight.
1793                    Level 4 weight for Non-Variable elements is 0xFFFF.
1794
1795   'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
1796                    are trimmed.
1797
1798=back
1799
1800=head2 Methods for Collation
1801
1802=over 4
1803
1804=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1805
1806Sorts a list of strings.
1807
1808=item C<$result = $Collator-E<gt>cmp($a, $b)>
1809
1810Returns 1 (when C<$a> is greater than C<$b>)
1811or 0 (when C<$a> is equal to C<$b>)
1812or -1 (when C<$a> is less than C<$b>).
1813
1814=item C<$result = $Collator-E<gt>eq($a, $b)>
1815
1816=item C<$result = $Collator-E<gt>ne($a, $b)>
1817
1818=item C<$result = $Collator-E<gt>lt($a, $b)>
1819
1820=item C<$result = $Collator-E<gt>le($a, $b)>
1821
1822=item C<$result = $Collator-E<gt>gt($a, $b)>
1823
1824=item C<$result = $Collator-E<gt>ge($a, $b)>
1825
1826They works like the same name operators as theirs.
1827
1828   eq : whether $a is equal to $b.
1829   ne : whether $a is not equal to $b.
1830   lt : whether $a is less than $b.
1831   le : whether $a is less than $b or equal to $b.
1832   gt : whether $a is greater than $b.
1833   ge : whether $a is greater than $b or equal to $b.
1834
1835=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1836
1837-- see 4.3 Form Sort Key, UTS #10.
1838
1839Returns a sort key.
1840
1841You compare the sort keys using a binary comparison
1842and get the result of the comparison of the strings using UCA.
1843
1844   $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1845
1846      is equivalent to
1847
1848   $Collator->cmp($a, $b)
1849
1850=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1851
1852Converts a sorting key into its representation form.
1853If C<UCA_Version> is 8, the output is slightly different.
1854
1855   use Unicode::Collate;
1856   my $c = Unicode::Collate->new();
1857   print $c->viewSortKey("Perl"),"\n";
1858
1859   # output:
1860   # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1861   #  Level 1               Level 2               Level 3               Level 4
1862
1863=back
1864
1865=head2 Methods for Searching
1866
1867The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1868like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1869but they are not aware of any pattern, but only a literal substring.
1870
1871B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1872for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1873C<subst>, C<gsubst>) is croaked, as the position and the length might
1874differ from those on the specified string.
1875
1876C<rearrange> and C<hangul_terminator> parameters are neglected.
1877C<katakana_before_hiragana> and C<upper_before_lower> don't affect
1878matching and searching, as it doesn't matter whether greater or less.
1879
1880=over 4
1881
1882=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1883
1884=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1885
1886If C<$substring> matches a part of C<$string>, returns
1887the position of the first occurrence of the matching part in scalar context;
1888in list context, returns a two-element list of
1889the position and the length of the matching part.
1890
1891If C<$substring> does not match any part of C<$string>,
1892returns C<-1> in scalar context and
1893an empty list in list context.
1894
1895e.g. when the content of C<$str> is C<"Ich mu>E<szlig>C< studieren Perl.">,
1896you say the following where C<$sub> is C<"M>E<uuml>C<SS">,
1897
1898  my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1899                                     # (normalization => undef) is REQUIRED.
1900  my $match;
1901  if (my($pos,$len) = $Collator->index($str, $sub)) {
1902      $match = substr($str, $pos, $len);
1903  }
1904
1905and get C<"mu>E<szlig>C<"> in C<$match>, since C<"mu>E<szlig>C<">
1906is primary equal to C<"M>E<uuml>C<SS">.
1907
1908=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1909
1910=item C<($match)   = $Collator-E<gt>match($string, $substring)>
1911
1912If C<$substring> matches a part of C<$string>, in scalar context, returns
1913B<a reference to> the first occurrence of the matching part
1914(C<$match_ref> is always true if matches,
1915since every reference is B<true>);
1916in list context, returns the first occurrence of the matching part.
1917
1918If C<$substring> does not match any part of C<$string>,
1919returns C<undef> in scalar context and
1920an empty list in list context.
1921
1922e.g.
1923
1924    if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1925	print "matches [$$match_ref].\n";
1926    } else {
1927	print "doesn't match.\n";
1928    }
1929
1930     or
1931
1932    if (($match) = $Collator->match($str, $sub)) { # list context
1933	print "matches [$match].\n";
1934    } else {
1935	print "doesn't match.\n";
1936    }
1937
1938=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1939
1940If C<$substring> matches a part of C<$string>, returns
1941all the matching parts (or matching count in scalar context).
1942
1943If C<$substring> does not match any part of C<$string>,
1944returns an empty list.
1945
1946=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1947
1948If C<$substring> matches a part of C<$string>,
1949the first occurrence of the matching part is replaced by C<$replacement>
1950(C<$string> is modified) and C<$count> (always equals to C<1>) is returned.
1951
1952C<$replacement> can be a C<CODEREF>,
1953taking the matching part as an argument,
1954and returning a string to replace the matching part
1955(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1956
1957=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1958
1959If C<$substring> matches a part of C<$string>,
1960all the occurrences of the matching part are replaced by C<$replacement>
1961(C<$string> is modified) and C<$count> is returned.
1962
1963C<$replacement> can be a C<CODEREF>,
1964taking the matching part as an argument,
1965and returning a string to replace the matching part
1966(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1967
1968e.g.
1969
1970  my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1971                                     # (normalization => undef) is REQUIRED.
1972  my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l...";
1973  $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1974
1975  # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>...";
1976  # i.e., all the camels are made bold-faced.
1977
1978   Examples: levels and ignore_level2 - what does camel match?
1979  ---------------------------------------------------------------------------
1980   level  ignore_level2  |  camel  Camel  came\x{301}l  c-a-m-e-l  cam\0e\0l
1981  -----------------------|---------------------------------------------------
1982     1        false      |   yes    yes      yes          yes        yes
1983     2        false      |   yes    yes      no           yes        yes
1984     3        false      |   yes    no       no           yes        yes
1985     4        false      |   yes    no       no           no         yes
1986  -----------------------|---------------------------------------------------
1987     1        true       |   yes    yes      yes          yes        yes
1988     2        true       |   yes    yes      yes          yes        yes
1989     3        true       |   yes    no       yes          yes        yes
1990     4        true       |   yes    no       yes          no         yes
1991  ---------------------------------------------------------------------------
1992   note: if variable => non-ignorable, camel doesn't match c-a-m-e-l
1993         at any level.
1994
1995=back
1996
1997=head2 Other Methods
1998
1999=over 4
2000
2001=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
2002
2003=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)>
2004
2005Changes the value of specified keys and returns the changed part.
2006
2007    $Collator = Unicode::Collate->new(level => 4);
2008
2009    $Collator->eq("perl", "PERL"); # false
2010
2011    %old = $Collator->change(level => 2); # returns (level => 4).
2012
2013    $Collator->eq("perl", "PERL"); # true
2014
2015    $Collator->change(%old); # returns (level => 2).
2016
2017    $Collator->eq("perl", "PERL"); # false
2018
2019Not all C<(key,value)>s are allowed to be changed.
2020See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
2021
2022In the scalar context, returns the modified collator
2023(but it is B<not> a clone from the original).
2024
2025    $Collator->change(level => 2)->eq("perl", "PERL"); # true
2026
2027    $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
2028
2029    $Collator->change(level => 4)->eq("perl", "PERL"); # false
2030
2031=item C<$version = $Collator-E<gt>version()>
2032
2033Returns the version number (a string) of the Unicode Standard
2034which the C<table> file used by the collator object is based on.
2035If the table does not include a version line (starting with C<@version>),
2036returns C<"unknown">.
2037
2038=item C<UCA_Version()>
2039
2040Returns the revision number of UTS #10 this module consults,
2041that should correspond with the DUCET incorporated.
2042
2043=item C<Base_Unicode_Version()>
2044
2045Returns the version number of UTS #10 this module consults,
2046that should correspond with the DUCET incorporated.
2047
2048=back
2049
2050=head1 EXPORT
2051
2052No method will be exported.
2053
2054=head1 INSTALL
2055
2056Though this module can be used without any C<table> file,
2057to use this module easily, it is recommended to install a table file
2058in the UCA format, by copying it under the directory
2059<a place in @INC>/Unicode/Collate.
2060
2061The most preferable one is "The Default Unicode Collation Element Table"
2062(aka DUCET), available from the Unicode Consortium's website:
2063
2064   http://www.unicode.org/Public/UCA/
2065
2066   http://www.unicode.org/Public/UCA/latest/allkeys.txt
2067   (latest version)
2068
2069If DUCET is not installed, it is recommended to copy the file
2070from http://www.unicode.org/Public/UCA/latest/allkeys.txt
2071to <a place in @INC>/Unicode/Collate/allkeys.txt
2072manually.
2073
2074=head1 CAVEATS
2075
2076=over 4
2077
2078=item Normalization
2079
2080Use of the C<normalization> parameter requires the B<Unicode::Normalize>
2081module (see L<Unicode::Normalize>).
2082
2083If you need not it (say, in the case when you need not
2084handle any combining characters),
2085assign C<(normalization =E<gt> undef)> explicitly.
2086
2087-- see 6.5 Avoiding Normalization, UTS #10.
2088
2089=item Conformance Test
2090
2091The Conformance Test for the UCA is available
2092under L<http://www.unicode.org/Public/UCA/>.
2093
2094For F<CollationTest_SHIFTED.txt>,
2095a collator via C<Unicode::Collate-E<gt>new( )> should be used;
2096for F<CollationTest_NON_IGNORABLE.txt>, a collator via
2097C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
2098
2099If C<UCA_Version> is 26 or later, the C<identical> level is preferred;
2100C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and
2101C<Unicode::Collate-E<gt>new(identical =E<gt> 1,>
2102C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used.
2103
2104B<Unicode::Normalize is required to try The Conformance Test.>
2105
2106B<EBCDIC-SUPPORT IS EXPERIMENTAL.>
2107
2108=back
2109
2110=head1 AUTHOR, COPYRIGHT AND LICENSE
2111
2112The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
2113<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2021,
2114SADAHIRO Tomoyuki. Japan. All rights reserved.
2115
2116This module is free software; you can redistribute it and/or
2117modify it under the same terms as Perl itself.
2118
2119The file Unicode/Collate/allkeys.txt was copied verbatim
2120from L<http://www.unicode.org/Public/UCA/13.0.0/allkeys.txt>.
2121For this file, Copyright (c) 2020 Unicode, Inc.; distributed
2122under the Terms of Use in L<http://www.unicode.org/terms_of_use.html>
2123
2124=head1 SEE ALSO
2125
2126=over 4
2127
2128=item Unicode Collation Algorithm - UTS #10
2129
2130L<http://www.unicode.org/reports/tr10/>
2131
2132=item The Default Unicode Collation Element Table (DUCET)
2133
2134L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
2135
2136=item The conformance test for the UCA
2137
2138L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
2139
2140L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
2141
2142=item Hangul Syllable Type
2143
2144L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
2145
2146=item Unicode Normalization Forms - UAX #15
2147
2148L<http://www.unicode.org/reports/tr15/>
2149
2150=item Unicode Locale Data Markup Language (LDML) - UTS #35
2151
2152L<http://www.unicode.org/reports/tr35/>
2153
2154=back
2155
2156=cut
2157