1/*-
2 * Copyright 2018 Nexenta Systems, Inc.
3 * Copyright 2012 Garrett D'Amore <garrett@damore.org>  All rights reserved.
4 * Copyright 2015 John Marino <draco@marino.st>
5 *
6 * This source code is derived from the illumos localedef command, and
7 * provided under BSD-style license terms by Nexenta Systems, Inc.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32/*
33 * LC_CTYPE database generation routines for localedef.
34 */
35#include <sys/cdefs.h>
36#include <sys/tree.h>
37
38#include <stdio.h>
39#include <stdlib.h>
40#include <stddef.h>
41#include <string.h>
42#include <sys/types.h>
43#include <wchar.h>
44#include <unistd.h>
45#include "localedef.h"
46#include "parser.h"
47
48/* Always include the defines for the target: */
49#define _DONT_USE_CTYPE_INLINE_ /* Avoid dependencies on runetype.h */
50#include "_ctype.h"
51#include "runefile.h"
52
53
/* Needed for bootstrapping, _CTYPE_N */
#ifndef _CTYPE_N
#define _CTYPE_N       0x00400000L
#endif

/*
 * Short local names for the _CTYPE_* class bits used throughout this
 * file.  _E1 through _E5 are the bits that add_ctype_impl() applies for
 * the T_ISPHONOGRAM, T_ISIDEOGRAM, T_ISENGLISH, T_ISNUMBER and
 * T_ISSPECIAL keywords respectively; _E3 is 0, so T_ISENGLISH
 * contributes no dedicated bit of its own.
 */
#define _ISUPPER	_CTYPE_U
#define _ISLOWER	_CTYPE_L
#define	_ISDIGIT	_CTYPE_D
#define	_ISXDIGIT	_CTYPE_X
#define	_ISSPACE	_CTYPE_S
#define	_ISBLANK	_CTYPE_B
#define	_ISALPHA	_CTYPE_A
#define	_ISPUNCT	_CTYPE_P
#define	_ISGRAPH	_CTYPE_G
#define	_ISPRINT	_CTYPE_R
#define	_ISCNTRL	_CTYPE_C
#define	_E1		_CTYPE_Q
#define	_E2		_CTYPE_I
#define	_E3		0
#define	_E4		_CTYPE_N
#define	_E5		_CTYPE_T
74#define	_E5		_CTYPE_T
75
/*
 * Code point most recently handled by add_ctype() or add_ctype_range();
 * add_ctype_range() resumes from the code point after this one.
 */
static wchar_t		last_ctype;
static int ctype_compare(const void *n1, const void *n2);

/*
 * Per-character record accumulated while the locale source is parsed.
 * Records live in the "ctypes" red-black tree, ordered by code point
 * (see ctype_compare()).
 */
typedef struct ctype_node {
	wchar_t wc;		/* code point; the tree's sort key */
	int32_t	ctype;		/* class bits plus the width bits set by add_width() */
	int32_t	toupper;	/* upper-case mapping; dump_ctype() treats 0 as "none" */
	int32_t	tolower;	/* lower-case mapping; dump_ctype() treats 0 as "none" */
	RB_ENTRY(ctype_node) entry;
} ctype_node_t;

/* The tree of all characters seen so far, and its generated accessors. */
static RB_HEAD(ctypes, ctype_node) ctypes;
RB_GENERATE_STATIC(ctypes, ctype_node, entry, ctype_compare);
89
90static int
91ctype_compare(const void *n1, const void *n2)
92{
93	const ctype_node_t *c1 = n1;
94	const ctype_node_t *c2 = n2;
95
96	return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
97}
98
/*
 * Reset the ctypes tree head to the empty state.  Nodes that may
 * already be linked into the tree are not freed here.
 */
void
init_ctype(void)
{
	RB_INIT(&ctypes);
}
104
105
106static void
107add_ctype_impl(ctype_node_t *ctn)
108{
109	switch (last_kw) {
110	case T_ISUPPER:
111		ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
112		break;
113	case T_ISLOWER:
114		ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
115		break;
116	case T_ISALPHA:
117		ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
118		break;
119	case T_ISDIGIT:
120		ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT | _E4);
121		break;
122	case T_ISSPACE:
123		/*
124		 * This can be troublesome as <form-feed>, <newline>,
125		 * <carriage-return>, <tab>, and <vertical-tab> are defined both
126		 * as space and cntrl, and POSIX doesn't allow cntrl/print
127		 * combination.  We will take care of this in dump_ctype().
128		 */
129		ctn->ctype |= (_ISSPACE | _ISPRINT);
130		break;
131	case T_ISCNTRL:
132		ctn->ctype |= _ISCNTRL;
133		break;
134	case T_ISGRAPH:
135		ctn->ctype |= (_ISGRAPH | _ISPRINT);
136		break;
137	case T_ISPRINT:
138		ctn->ctype |= _ISPRINT;
139		break;
140	case T_ISPUNCT:
141		ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
142		break;
143	case T_ISXDIGIT:
144		ctn->ctype |= (_ISXDIGIT | _ISPRINT);
145		break;
146	case T_ISBLANK:
147		ctn->ctype |= (_ISBLANK | _ISSPACE);
148		break;
149	case T_ISPHONOGRAM:
150		ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
151		break;
152	case T_ISIDEOGRAM:
153		ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
154		break;
155	case T_ISENGLISH:
156		ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
157		break;
158	case T_ISNUMBER:
159		ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
160		break;
161	case T_ISSPECIAL:
162		ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
163		break;
164	case T_ISALNUM:
165		/*
166		 * We can't do anything with this.  The character
167		 * should already be specified as a digit or alpha.
168		 */
169		break;
170	default:
171		errf("not a valid character class");
172	}
173}
174
175static ctype_node_t *
176get_ctype(wchar_t wc)
177{
178	ctype_node_t	srch;
179	ctype_node_t	*ctn;
180
181	srch.wc = wc;
182	if ((ctn = RB_FIND(ctypes, &ctypes, &srch)) == NULL) {
183		if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
184			errf("out of memory");
185			return (NULL);
186		}
187		ctn->wc = wc;
188
189		RB_INSERT(ctypes, &ctypes, ctn);
190	}
191	return (ctn);
192}
193
194void
195add_ctype(int val)
196{
197	ctype_node_t	*ctn;
198
199	if ((ctn = get_ctype(val)) == NULL) {
200		INTERR;
201		return;
202	}
203	add_ctype_impl(ctn);
204	last_ctype = ctn->wc;
205}
206
207void
208add_ctype_range(wchar_t end)
209{
210	ctype_node_t	*ctn;
211	wchar_t		cur;
212
213	if (end < last_ctype) {
214		errf("malformed character range (%u ... %u))",
215		    last_ctype, end);
216		return;
217	}
218	for (cur = last_ctype + 1; cur <= end; cur++) {
219		if ((ctn = get_ctype(cur)) == NULL) {
220			INTERR;
221			return;
222		}
223		add_ctype_impl(ctn);
224	}
225	last_ctype = end;
226
227}
228
229/*
230 * A word about widths: if the width mask is specified, then libc
231 * unconditionally honors it.  Otherwise, it assumes printable
232 * characters have width 1, and non-printable characters have width
233 * -1 (except for NULL which is special with width 0).  Hence, we have
234 * no need to inject defaults here -- the "default" unset value of 0
235 * indicates that libc should use its own logic in wcwidth as described.
236 */
237void
238add_width(int wc, int width)
239{
240	ctype_node_t	*ctn;
241
242	if ((ctn = get_ctype(wc)) == NULL) {
243		INTERR;
244		return;
245	}
246	ctn->ctype &= ~(_CTYPE_SWM);
247	switch (width) {
248	case 0:
249		ctn->ctype |= _CTYPE_SW0;
250		break;
251	case 1:
252		ctn->ctype |= _CTYPE_SW1;
253		break;
254	case 2:
255		ctn->ctype |= _CTYPE_SW2;
256		break;
257	case 3:
258		ctn->ctype |= _CTYPE_SW3;
259		break;
260	}
261}
262
/*
 * Record the same display width for every code point from start to end
 * inclusive, by calling add_width() on each.  Does nothing when start
 * is greater than end.
 */
void
add_width_range(int start, int end, int width)
{
	int	wc = start;

	while (wc <= end) {
		add_width(wc, width);
		wc++;
	}
}
270
271void
272add_caseconv(int val, int wc)
273{
274	ctype_node_t	*ctn;
275
276	ctn = get_ctype(val);
277	if (ctn == NULL) {
278		INTERR;
279		return;
280	}
281
282	switch (last_kw) {
283	case T_TOUPPER:
284		ctn->toupper = wc;
285		break;
286	case T_TOLOWER:
287		ctn->tolower = wc;
288		break;
289	default:
290		INTERR;
291		break;
292	}
293}
294
295void
296dump_ctype(void)
297{
298	FILE		*f;
299	_FileRuneLocale	rl;
300	ctype_node_t	*ctn, *last_ct, *last_lo, *last_up;
301	_FileRuneEntry	*ct = NULL;
302	_FileRuneEntry	*lo = NULL;
303	_FileRuneEntry	*up = NULL;
304	wchar_t		wc;
305	uint32_t	runetype_ext_nranges;
306	uint32_t	maplower_ext_nranges;
307	uint32_t	mapupper_ext_nranges;
308
309	(void) memset(&rl, 0, sizeof (rl));
310	runetype_ext_nranges = 0;
311	last_ct = NULL;
312	maplower_ext_nranges = 0;
313	last_lo = NULL;
314	mapupper_ext_nranges = 0;
315	last_up = NULL;
316
317	if ((f = open_category()) == NULL)
318		return;
319
320	(void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
321	(void) strlcpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
322
323	/*
324	 * Initialize the identity map.
325	 */
326	for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) {
327		rl.maplower[wc] = htote(wc);
328		rl.mapupper[wc] = htote(wc);
329	}
330
331	RB_FOREACH(ctn, ctypes, &ctypes) {
332		int conflict = 0;
333
334		wc = ctn->wc;
335
336		/*
337		 * POSIX requires certain portable characters have
338		 * certain types.  Add them if they are missing.
339		 */
340		if ((wc >= 1) && (wc <= 127)) {
341			if ((wc >= 'A') && (wc <= 'Z'))
342				ctn->ctype |= _ISUPPER;
343			if ((wc >= 'a') && (wc <= 'z'))
344				ctn->ctype |= _ISLOWER;
345			if ((wc >= '0') && (wc <= '9'))
346				ctn->ctype |= _ISDIGIT;
347			if (wc == ' ')
348				ctn->ctype |= _ISPRINT;
349			if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
350				ctn->ctype |= _ISSPACE;
351			if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
352				ctn->ctype |= _ISXDIGIT;
353			if (strchr(" \t", (char)wc))
354				ctn->ctype |= _ISBLANK;
355
356			/*
357			 * Technically these settings are only
358			 * required for the C locale.  However, it
359			 * turns out that because of the historical
360			 * version of isprint(), we need them for all
361			 * locales as well.  Note that these are not
362			 * necessarily valid punctation characters in
363			 * the current language, but ispunct() needs
364			 * to return TRUE for them.
365			 */
366			if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
367			    (char)wc))
368				ctn->ctype |= _ISPUNCT;
369		}
370
371		/*
372		 * POSIX also requires that certain types imply
373		 * others.  Add any inferred types here.
374		 */
375		if (ctn->ctype & (_ISUPPER |_ISLOWER))
376			ctn->ctype |= _ISALPHA;
377		if (ctn->ctype & _ISDIGIT)
378			ctn->ctype |= _ISXDIGIT;
379		if (ctn->ctype & _ISBLANK)
380			ctn->ctype |= _ISSPACE;
381		if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
382			ctn->ctype |= _ISGRAPH;
383		if (ctn->ctype & _ISGRAPH)
384			ctn->ctype |= _ISPRINT;
385
386		/*
387		 * POSIX requires that certain combinations are invalid.
388		 * Try fixing the cases we know about (see add_ctype_impl()).
389		 */
390		if ((ctn->ctype & (_ISSPACE|_ISCNTRL)) == (_ISSPACE|_ISCNTRL))
391			ctn->ctype &= ~_ISPRINT;
392
393		/*
394		 * Finally, don't flag remaining cases as a fatal error,
395		 * and just warn about them.
396		 */
397		if ((ctn->ctype & _ISALPHA) &&
398		    (ctn->ctype & (_ISPUNCT|_ISDIGIT)))
399			conflict++;
400		if ((ctn->ctype & _ISPUNCT) &&
401		    (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
402			conflict++;
403		if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
404			conflict++;
405		if ((ctn->ctype & _ISCNTRL) && (ctn->ctype & _ISPRINT))
406			conflict++;
407		if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
408			conflict++;
409
410		if (conflict) {
411			warn("conflicting classes for character 0x%x (%x)",
412			    wc, ctn->ctype);
413		}
414		/*
415		 * Handle the lower 256 characters using the simple
416		 * optimization.  Note that if we have not defined the
417		 * upper/lower case, then we identity map it.
418		 */
419		if ((unsigned)wc < _CACHED_RUNES) {
420			rl.runetype[wc] = htote(ctn->ctype);
421			if (ctn->tolower)
422				rl.maplower[wc] = htote(ctn->tolower);
423			if (ctn->toupper)
424				rl.mapupper[wc] = htote(ctn->toupper);
425			continue;
426		}
427
428		if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype) &&
429		    (last_ct->wc + 1 == wc)) {
430			ct[runetype_ext_nranges - 1].max = htote(wc);
431		} else {
432			runetype_ext_nranges++;
433			ct = realloc(ct, sizeof (*ct) * runetype_ext_nranges);
434			ct[runetype_ext_nranges - 1].min = htote(wc);
435			ct[runetype_ext_nranges - 1].max = htote(wc);
436			ct[runetype_ext_nranges - 1].map =
437			    htote(ctn->ctype);
438		}
439		last_ct = ctn;
440		if (ctn->tolower == 0) {
441			last_lo = NULL;
442		} else if ((last_lo != NULL) &&
443		    (last_lo->tolower + 1 == ctn->tolower)) {
444			lo[maplower_ext_nranges - 1].max = htote(wc);
445			last_lo = ctn;
446		} else {
447			maplower_ext_nranges++;
448			lo = realloc(lo, sizeof (*lo) * maplower_ext_nranges);
449			lo[maplower_ext_nranges - 1].min = htote(wc);
450			lo[maplower_ext_nranges - 1].max = htote(wc);
451			lo[maplower_ext_nranges - 1].map =
452			    htote(ctn->tolower);
453			last_lo = ctn;
454		}
455
456		if (ctn->toupper == 0) {
457			last_up = NULL;
458		} else if ((last_up != NULL) &&
459		    (last_up->toupper + 1 == ctn->toupper)) {
460			up[mapupper_ext_nranges-1].max = htote(wc);
461			last_up = ctn;
462		} else {
463			mapupper_ext_nranges++;
464			up = realloc(up, sizeof (*up) * mapupper_ext_nranges);
465			up[mapupper_ext_nranges - 1].min = htote(wc);
466			up[mapupper_ext_nranges - 1].max = htote(wc);
467			up[mapupper_ext_nranges - 1].map =
468			    htote(ctn->toupper);
469			last_up = ctn;
470		}
471	}
472
473	rl.runetype_ext_nranges = htote(runetype_ext_nranges);
474	rl.maplower_ext_nranges = htote(maplower_ext_nranges);
475	rl.mapupper_ext_nranges = htote(mapupper_ext_nranges);
476	if ((wr_category(&rl, sizeof (rl), f) < 0) ||
477	    (wr_category(ct, sizeof (*ct) * runetype_ext_nranges, f) < 0) ||
478	    (wr_category(lo, sizeof (*lo) * maplower_ext_nranges, f) < 0) ||
479	    (wr_category(up, sizeof (*up) * mapupper_ext_nranges, f) < 0)) {
480		return;
481	}
482
483	close_category(f);
484}
485