1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright 2014 Garrett D'Amore <garrett@damore.org>
5 * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
6 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
7 *		at Electronni Visti IA, Kiev, Ukraine.
8 *			All rights reserved.
9 *
10 * Copyright (c) 2011 The FreeBSD Foundation
11 *
12 * Portions of this software were developed by David Chisnall
13 * under sponsorship from the FreeBSD Foundation.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 *    notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 *    notice, this list of conditions and the following disclaimer in the
22 *    documentation and/or other materials provided with the distribution.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 * Adapted to xlocale by John Marino <draco@marino.st>
37 */
38
39#include "namespace.h"
40
41#include <sys/types.h>
42#include <sys/stat.h>
43#include <sys/mman.h>
44
45#include <assert.h>
46#include <stdio.h>
47#include <stdlib.h>
48#include <string.h>
49#include <wchar.h>
50#include <errno.h>
51#include <unistd.h>
52#include <fcntl.h>
53#include "un-namespace.h"
54
55#include "collate.h"
56#include "setlocale.h"
57#include "ldpart.h"
58#include "libc_private.h"
59
60struct xlocale_collate __xlocale_global_collate = {
61	{{0}, "C"}, 1, 0, 0, 0
62};
63
64struct xlocale_collate __xlocale_C_collate = {
65	{{0}, "C"}, 1, 0, 0, 0
66};
67
68struct xlocale_collate __xlocale_POSIX_collate = {
69	{{0}, "POSIX"}, 1, 0, 0, 0
70};
71
72struct xlocale_collate __xlocale_CUTF8_collate = {
73	{{0}, "C.UTF-8"}, 1, 0, 0, 0
74};
75
76static int
77__collate_load_tables_l(const char *encoding, struct xlocale_collate *table);
78
79static void
80destruct_collate(void *t)
81{
82	struct xlocale_collate *table = t;
83	if (table->map && (table->maplen > 0)) {
84		(void) munmap(table->map, table->maplen);
85	}
86	free(t);
87}
88
89void *
90__collate_load(const char *encoding, __unused locale_t unused)
91{
92	if (strcmp(encoding, "C") == 0)
93		return (&__xlocale_C_collate);
94	else if (strcmp(encoding, "POSIX") == 0)
95		return (&__xlocale_POSIX_collate);
96	else if (strcmp(encoding, "C.UTF-8") == 0)
97		return (&__xlocale_CUTF8_collate);
98
99	struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate),
100	    1);
101	if (table == NULL)
102		return (NULL);
103	table->header.header.destructor = destruct_collate;
104
105	/*
106	 * FIXME: Make sure that _LDP_CACHE is never returned.  We
107	 * should be doing the caching outside of this section.
108	 */
109	if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) {
110		xlocale_release(table);
111		return (NULL);
112	}
113	return (table);
114}
115
116/**
117 * Load the collation tables for the specified encoding into the global table.
118 */
119int
120__collate_load_tables(const char *encoding)
121{
122
123	return (__collate_load_tables_l(encoding, &__xlocale_global_collate));
124}
125
126static int
127__collate_load_tables_l(const char *encoding, struct xlocale_collate *table)
128{
129	int i, chains, z;
130	char *buf;
131	char *TMP;
132	char *map;
133	collate_info_t *info;
134	struct stat sbuf;
135	int fd;
136
137	table->__collate_load_error = 1;
138
139	/* 'encoding' must be already checked. */
140	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0 ||
141	    strncmp(encoding, "C.", 2) == 0) {
142		return (_LDP_CACHE);
143	}
144
145	if (asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding) == -1)
146		return (_LDP_ERROR);
147
148	if ((fd = _open(buf, O_RDONLY | O_CLOEXEC)) < 0) {
149		free(buf);
150		return (_LDP_ERROR);
151	}
152	free(buf);
153	if (_fstat(fd, &sbuf) < 0) {
154		(void) _close(fd);
155		return (_LDP_ERROR);
156	}
157	if (sbuf.st_size < (COLLATE_FMT_VERSION_LEN +
158			    XLOCALE_DEF_VERSION_LEN +
159			    sizeof (*info))) {
160		(void) _close(fd);
161		errno = EINVAL;
162		return (_LDP_ERROR);
163	}
164	map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
165	(void) _close(fd);
166	if ((TMP = map) == MAP_FAILED) {
167		return (_LDP_ERROR);
168	}
169
170	if (strncmp(TMP, COLLATE_FMT_VERSION, COLLATE_FMT_VERSION_LEN) != 0) {
171		(void) munmap(map, sbuf.st_size);
172		errno = EINVAL;
173		return (_LDP_ERROR);
174	}
175	TMP += COLLATE_FMT_VERSION_LEN;
176	strlcat(table->header.version, TMP, sizeof (table->header.version));
177	TMP += XLOCALE_DEF_VERSION_LEN;
178
179	info = (void *)TMP;
180	TMP += sizeof (*info);
181
182	if ((info->directive_count < 1) ||
183	    (info->directive_count >= COLL_WEIGHTS_MAX) ||
184	    ((chains = info->chain_count) < 0)) {
185		(void) munmap(map, sbuf.st_size);
186		errno = EINVAL;
187		return (_LDP_ERROR);
188	}
189
190	i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
191	    (sizeof (collate_chain_t) * chains) +
192	    (sizeof (collate_large_t) * info->large_count);
193	for (z = 0; z < info->directive_count; z++) {
194		i += sizeof (collate_subst_t) * info->subst_count[z];
195	}
196	if (i != (sbuf.st_size - (TMP - map))) {
197		(void) munmap(map, sbuf.st_size);
198		errno = EINVAL;
199		return (_LDP_ERROR);
200	}
201
202	if (table->map && (table->maplen > 0)) {
203		(void) munmap(table->map, table->maplen);
204	}
205	table->map = map;
206	table->maplen = sbuf.st_size;
207	table->info = info;
208	table->char_pri_table = (void *)TMP;
209	TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
210
211	for (z = 0; z < info->directive_count; z++) {
212		if (info->subst_count[z] > 0) {
213			table->subst_table[z] = (void *)TMP;
214			TMP += info->subst_count[z] * sizeof (collate_subst_t);
215		} else {
216			table->subst_table[z] = NULL;
217		}
218	}
219
220	if (chains > 0) {
221		table->chain_pri_table = (void *)TMP;
222		TMP += chains * sizeof (collate_chain_t);
223	} else
224		table->chain_pri_table = NULL;
225	if (info->large_count > 0)
226		table->large_pri_table = (void *)TMP;
227	else
228		table->large_pri_table = NULL;
229
230	table->__collate_load_error = 0;
231	return (_LDP_LOADED);
232}
233
234static const int32_t *
235substsearch(struct xlocale_collate *table, const wchar_t key, int pass)
236{
237	const collate_subst_t *p;
238	int n = table->info->subst_count[pass];
239
240	if (n == 0)
241		return (NULL);
242
243	if (pass >= table->info->directive_count)
244		return (NULL);
245
246	if (!(key & COLLATE_SUBST_PRIORITY))
247		return (NULL);
248
249	p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
250	assert(p->key == key);
251
252	return (p->pri);
253}
254
255static collate_chain_t *
256chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len)
257{
258	int low = 0;
259	int high = table->info->chain_count - 1;
260	int next, compar, l;
261	collate_chain_t *p;
262	collate_chain_t *tab = table->chain_pri_table;
263
264	if (high < 0)
265		return (NULL);
266
267	while (low <= high) {
268		next = (low + high) / 2;
269		p = tab + next;
270		compar = *key - *p->str;
271		if (compar == 0) {
272			l = wcsnlen(p->str, COLLATE_STR_LEN);
273			compar = wcsncmp(key, p->str, l);
274			if (compar == 0) {
275				*len = l;
276				return (p);
277			}
278		}
279		if (compar > 0)
280			low = next + 1;
281		else
282			high = next - 1;
283	}
284	return (NULL);
285}
286
287static collate_large_t *
288largesearch(struct xlocale_collate *table, const wchar_t key)
289{
290	int low = 0;
291	int high = table->info->large_count - 1;
292	int next, compar;
293	collate_large_t *p;
294	collate_large_t *tab = table->large_pri_table;
295
296	if (high < 0)
297		return (NULL);
298
299	while (low <= high) {
300		next = (low + high) / 2;
301		p = tab + next;
302		compar = key - p->val;
303		if (compar == 0)
304			return (p);
305		if (compar > 0)
306			low = next + 1;
307		else
308			high = next - 1;
309	}
310	return (NULL);
311}
312
313void
314_collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
315    int *pri, int which, const int **state)
316{
317	collate_chain_t *p2;
318	collate_large_t *match;
319	int p, l;
320	const int *sptr;
321
322	/*
323	 * If this is the "last" pass for the UNDEFINED, then
324	 * we just return the priority itself.
325	 */
326	if (which >= table->info->directive_count) {
327		*pri = *t;
328		*len = 1;
329		*state = NULL;
330		return;
331	}
332
333	/*
334	 * If we have remaining substitution data from a previous
335	 * call, consume it first.
336	 */
337	if ((sptr = *state) != NULL) {
338		*pri = *sptr;
339		sptr++;
340		if ((sptr == *state) || (sptr == NULL))
341			*state = NULL;
342		else
343			*state = sptr;
344		*len = 0;
345		return;
346	}
347
348	/* No active substitutions */
349	*len = 1;
350
351	/*
352	 * Check for composites such as diphthongs that collate as a
353	 * single element (aka chains or collating-elements).
354	 */
355	if (((p2 = chainsearch(table, t, &l)) != NULL) &&
356	    ((p = p2->pri[which]) >= 0)) {
357
358		*len = l;
359		*pri = p;
360
361	} else if (*t <= UCHAR_MAX) {
362
363		/*
364		 * Character is a small (8-bit) character.
365		 * We just look these up directly for speed.
366		 */
367		*pri = table->char_pri_table[*t].pri[which];
368
369	} else if ((table->info->large_count > 0) &&
370	    ((match = largesearch(table, *t)) != NULL)) {
371
372		/*
373		 * Character was found in the extended table.
374		 */
375		*pri = match->pri.pri[which];
376
377	} else {
378		/*
379		 * Character lacks a specific definition.
380		 */
381		if (table->info->directive[which] & DIRECTIVE_UNDEFINED) {
382			/* Mask off sign bit to prevent ordering confusion. */
383			*pri = (*t & COLLATE_MAX_PRIORITY);
384		} else {
385			*pri = table->info->undef_pri[which];
386		}
387		/* No substitutions for undefined characters! */
388		return;
389	}
390
391	/*
392	 * Try substituting (expanding) the character.  We are
393	 * currently doing this *after* the chain compression.  I
394	 * think it should not matter, but this way might be slightly
395	 * faster.
396	 *
397	 * We do this after the priority search, as this will help us
398	 * to identify a single key value.  In order for this to work,
399	 * its important that the priority assigned to a given element
400	 * to be substituted be unique for that level.  The localedef
401	 * code ensures this for us.
402	 */
403	if ((sptr = substsearch(table, *pri, which)) != NULL) {
404		if ((*pri = *sptr) > 0) {
405			sptr++;
406			*state = *sptr ? sptr : NULL;
407		}
408	}
409
410}
411
412/*
413 * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
414 * NOT NULL terminate.  That is left to the caller.
415 */
416size_t
417_collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf,
418    size_t room)
419{
420	int		pri;
421	int		len;
422	const wchar_t	*t;
423	wchar_t		*tr = NULL;
424	int		direc;
425	int		pass;
426	const int32_t 	*state;
427	size_t		want = 0;
428	size_t		need = 0;
429	int		ndir = table->info->directive_count;
430
431	assert(src);
432
433	for (pass = 0; pass <= ndir; pass++) {
434
435		state = NULL;
436
437		if (pass != 0) {
438			/* insert level separator from the previous pass */
439			if (room) {
440				*xf++ = 1;
441				room--;
442			}
443			want++;
444		}
445
446		/* special pass for undefined */
447		if (pass == ndir) {
448			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
449		} else {
450			direc = table->info->directive[pass];
451		}
452
453		t = src;
454
455		if (direc & DIRECTIVE_BACKWARD) {
456			wchar_t *bp, *fp, c;
457			free(tr);
458			if ((tr = wcsdup(t)) == NULL) {
459				errno = ENOMEM;
460				goto fail;
461			}
462			bp = tr;
463			fp = tr + wcslen(tr) - 1;
464			while (bp < fp) {
465				c = *bp;
466				*bp++ = *fp;
467				*fp-- = c;
468			}
469			t = (const wchar_t *)tr;
470		}
471
472		if (direc & DIRECTIVE_POSITION) {
473			while (*t || state) {
474				_collate_lookup(table, t, &len, &pri, pass, &state);
475				t += len;
476				if (pri <= 0) {
477					if (pri < 0) {
478						errno = EINVAL;
479						goto fail;
480					}
481					state = NULL;
482					pri = COLLATE_MAX_PRIORITY;
483				}
484				if (room) {
485					*xf++ = pri;
486					room--;
487				}
488				want++;
489				need = want;
490			}
491		} else {
492			while (*t || state) {
493				_collate_lookup(table, t, &len, &pri, pass, &state);
494				t += len;
495				if (pri <= 0) {
496					if (pri < 0) {
497						errno = EINVAL;
498						goto fail;
499					}
500					state = NULL;
501					continue;
502				}
503				if (room) {
504					*xf++ = pri;
505					room--;
506				}
507				want++;
508				need = want;
509			}
510		}
511	}
512	free(tr);
513	return (need);
514
515fail:
516	free(tr);
517	return ((size_t)(-1));
518}
519
520/*
521 * In the non-POSIX case, we transform each character into a string of
522 * characters representing the character's priority.  Since char is usually
523 * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
524 * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
525 * bits per byte.
526 *
527 * It turns out that we sometimes have real priorities that are
528 * 31-bits wide.  (But: be careful using priorities where the high
529 * order bit is set -- i.e. the priority is negative.  The sort order
530 * may be surprising!)
531 *
532 * TODO: This would be a good area to optimize somewhat.  It turns out
533 * that real prioririties *except for the last UNDEFINED pass* are generally
534 * very small.  We need the localedef code to precalculate the max
535 * priority for us, and ideally also give us a mask, and then we could
536 * severely limit what we expand to.
537 */
538#define	XFRM_BYTES	6
539#define	XFRM_OFFSET	('0')	/* make all printable characters */
540#define	XFRM_SHIFT	6
541#define	XFRM_MASK	((1 << XFRM_SHIFT) - 1)
542#define	XFRM_SEP	('.')	/* chosen to be less than XFRM_OFFSET */
543
544static int
545xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass)
546{
547	/* we use unsigned to ensure zero fill on right shift */
548	uint32_t val = (uint32_t)table->info->pri_count[pass];
549	int nc = 0;
550
551	while (val) {
552		*p = (pri & XFRM_MASK) + XFRM_OFFSET;
553		pri >>= XFRM_SHIFT;
554		val >>= XFRM_SHIFT;
555		p++;
556		nc++;
557	}
558	return (nc);
559}
560
561size_t
562_collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf,
563    size_t room)
564{
565	int		pri;
566	int		len;
567	const wchar_t	*t;
568	wchar_t		*tr = NULL;
569	int		direc;
570	int		pass;
571	const int32_t 	*state;
572	size_t		want = 0;
573	size_t		need = 0;
574	int		b;
575	uint8_t		buf[XFRM_BYTES];
576	int		ndir = table->info->directive_count;
577
578	assert(src);
579
580	for (pass = 0; pass <= ndir; pass++) {
581
582		state = NULL;
583
584		if (pass != 0) {
585			/* insert level separator from the previous pass */
586			if (room) {
587				*xf++ = XFRM_SEP;
588				room--;
589			}
590			want++;
591		}
592
593		/* special pass for undefined */
594		if (pass == ndir) {
595			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
596		} else {
597			direc = table->info->directive[pass];
598		}
599
600		t = src;
601
602		if (direc & DIRECTIVE_BACKWARD) {
603			wchar_t *bp, *fp, c;
604			free(tr);
605			if ((tr = wcsdup(t)) == NULL) {
606				errno = ENOMEM;
607				goto fail;
608			}
609			bp = tr;
610			fp = tr + wcslen(tr) - 1;
611			while (bp < fp) {
612				c = *bp;
613				*bp++ = *fp;
614				*fp-- = c;
615			}
616			t = (const wchar_t *)tr;
617		}
618
619		if (direc & DIRECTIVE_POSITION) {
620			while (*t || state) {
621
622				_collate_lookup(table, t, &len, &pri, pass, &state);
623				t += len;
624				if (pri <= 0) {
625					if (pri < 0) {
626						errno = EINVAL;
627						goto fail;
628					}
629					state = NULL;
630					pri = COLLATE_MAX_PRIORITY;
631				}
632
633				b = xfrm(table, buf, pri, pass);
634				want += b;
635				if (room) {
636					while (b) {
637						b--;
638						if (room) {
639							*xf++ = buf[b];
640							room--;
641						}
642					}
643				}
644				need = want;
645			}
646		} else {
647			while (*t || state) {
648				_collate_lookup(table, t, &len, &pri, pass, &state);
649				t += len;
650				if (pri <= 0) {
651					if (pri < 0) {
652						errno = EINVAL;
653						goto fail;
654					}
655					state = NULL;
656					continue;
657				}
658
659				b = xfrm(table, buf, pri, pass);
660				want += b;
661				if (room) {
662
663					while (b) {
664						b--;
665						if (room) {
666							*xf++ = buf[b];
667							room--;
668						}
669					}
670				}
671				need = want;
672			}
673		}
674	}
675	free(tr);
676	return (need);
677
678fail:
679	free(tr);
680	return ((size_t)(-1));
681}
682
683/*
684 * __collate_equiv_value returns the primary collation value for the given
685 * collating symbol specified by str and len.  Zero or negative is returned
686 * if the collating symbol was not found.  This function is used by bracket
687 * code in the TRE regex library.
688 */
689int
690__collate_equiv_value(locale_t locale, const wchar_t *str, size_t len)
691{
692	int32_t e;
693
694	if (len < 1 || len >= COLLATE_STR_LEN)
695		return (-1);
696
697	FIX_LOCALE(locale);
698	struct xlocale_collate *table =
699		(struct xlocale_collate*)locale->components[XLC_COLLATE];
700
701	if (table->__collate_load_error)
702		return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1);
703
704	if (len == 1) {
705		e = -1;
706		if (*str <= UCHAR_MAX)
707			e = table->char_pri_table[*str].pri[0];
708		else if (table->info->large_count > 0) {
709			collate_large_t *match_large;
710			match_large = largesearch(table, *str);
711			if (match_large)
712				e = match_large->pri.pri[0];
713		}
714		if (e == 0)
715			return (1);
716		return (e > 0 ? e : 0);
717	}
718	if (table->info->chain_count > 0) {
719		wchar_t name[COLLATE_STR_LEN];
720		collate_chain_t *match_chain;
721		int clen;
722
723		wcsncpy (name, str, len);
724		name[len] = 0;
725		match_chain = chainsearch(table, name, &clen);
726		if (match_chain) {
727			e = match_chain->pri[0];
728			if (e == 0)
729				return (1);
730			return (e < 0 ? -e : e);
731		}
732	}
733	return (0);
734}
735