1/* Make ucnid.h from various sources.
2   Copyright (C) 2005-2015 Free Software Foundation, Inc.
3
4This program is free software; you can redistribute it and/or modify it
5under the terms of the GNU General Public License as published by the
6Free Software Foundation; either version 3, or (at your option) any
7later version.
8
9This program is distributed in the hope that it will be useful,
10but WITHOUT ANY WARRANTY; without even the implied warranty of
11MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12GNU General Public License for more details.
13
14You should have received a copy of the GNU General Public License
15along with this program; see the file COPYING3.  If not see
16<http://www.gnu.org/licenses/>.  */
17
18/* Run this program as
19   ./makeucnid ucnid.tab UnicodeData.txt DerivedNormalizationProps.txt \
20       > ucnid.h
21*/
22
23#include <stdio.h>
24#include <string.h>
25#include <ctype.h>
26#include <stdbool.h>
27#include <stdlib.h>
28
/* Bits kept for each code point in flags[].

   C99, CXX and C11 are set by the ucnid.tab sections of the same name;
   N99 accompanies C99 for the [C99DIG] section and N11 accompanies C11
   for the [C11NOSTART] section.  all_languages is the union of the
   three language bits.  The remaining bits are set from
   DerivedNormalizationProps.txt.  */
enum {
  C99 = 1 << 0,
  CXX = 1 << 1,
  N99 = 1 << 2,
  C11 = 1 << 3,
  N11 = 1 << 4,
  all_languages = C99 | CXX | C11,
  not_NFC = 1 << 5,		/* NFC_QC is N.  */
  not_NFKC = 1 << 6,		/* NFKC_QC is N.  */
  maybe_not_NFC = 1 << 7	/* NFC_QC is M.  */
};
40
/* Code points run from 0 through MAX_CODE_POINT inclusive, so the
   tables below need one more entry than the largest code point.  */
#define MAX_CODE_POINT 0x10ffff
#define NUM_CODE_POINTS (MAX_CODE_POINT + 1)

/* All of the following tables are indexed by code point.  */

/* Language-version and normalization bits.  */
static unsigned int flags[NUM_CODE_POINTS];

/* Canonical decomposition (at most two code points, zero meaning
   absent) of every character that has one.  */
static unsigned int all_decomp[NUM_CODE_POINTS][2];

/* Like all_decomp, but only filled in when the character and all the
   code points of its decomposition are valid for some supported
   language version.  */
static unsigned int decomp[NUM_CODE_POINTS][2];

/* Canonical combining class.  */
static unsigned char combining_value[NUM_CODE_POINTS];
48
/* Print the message S and a newline on stderr, then terminate the
   program with exit status 1.  Does not return.  */

static void
fail (const char *s)
{
  fputs (s, stderr);
  fputc ('\n', stderr);
  exit (1);
}
57
58/* Read ucnid.tab and set the flags for language versions in header[].  */
59
60static void
61read_ucnid (const char *fname)
62{
63  FILE *f = fopen (fname, "r");
64  unsigned fl = 0;
65
66  if (!f)
67    fail ("opening ucnid.tab");
68  for (;;)
69    {
70      char line[256];
71
72      if (!fgets (line, sizeof (line), f))
73	break;
74      if (strcmp (line, "[C99]\n") == 0)
75	fl = C99;
76      else if (strcmp (line, "[C99DIG]\n") == 0)
77	fl = C99|N99;
78      else if (strcmp (line, "[CXX]\n") == 0)
79	fl = CXX;
80      else if (strcmp (line, "[C11]\n") == 0)
81	fl = C11;
82      else if (strcmp (line, "[C11NOSTART]\n") == 0)
83	fl = C11|N11;
84      else if (isxdigit (line[0]))
85	{
86	  char *l = line;
87	  while (*l)
88	    {
89	      unsigned long start, end;
90	      char *endptr;
91	      start = strtoul (l, &endptr, 16);
92	      if (endptr == l || (*endptr != '-' && ! isspace (*endptr)))
93		fail ("parsing ucnid.tab [1]");
94	      l = endptr;
95	      if (*l != '-')
96		end = start;
97	      else
98		{
99		  end = strtoul (l + 1, &endptr, 16);
100		  if (end < start)
101		    fail ("parsing ucnid.tab, end before start");
102		  l = endptr;
103		  if (! isspace (*l))
104		    fail ("parsing ucnid.tab, junk after range");
105		}
106	      while (isspace (*l))
107		l++;
108	      if (end > MAX_CODE_POINT)
109		fail ("parsing ucnid.tab, end too large");
110	      while (start <= end)
111		flags[start++] |= fl;
112	    }
113	}
114    }
115  if (ferror (f))
116    fail ("reading ucnid.tab");
117  fclose (f);
118}
119
120/* Read UnicodeData.txt and fill in the 'decomp' table to be the
121   decompositions of characters for which both the character
122   decomposed and all the code points in the decomposition are valid
123   for some supported language version, and the 'all_decomp' table to
124   be the decompositions of all characters without those
125   constraints.  */
126
127static void
128read_table (char *fname)
129{
130  FILE * f = fopen (fname, "r");
131
132  if (!f)
133    fail ("opening UnicodeData.txt");
134  for (;;)
135    {
136      char line[256];
137      unsigned long codepoint, this_decomp[4];
138      char *l;
139      int i, j;
140      int decomp_useful;
141
142      if (!fgets (line, sizeof (line), f))
143	break;
144      codepoint = strtoul (line, &l, 16);
145      if (l == line || *l != ';')
146	fail ("parsing UnicodeData.txt, reading code point");
147      if (codepoint > MAX_CODE_POINT)
148	fail ("parsing UnicodeData.txt, code point too large");
149
150      do {
151	l++;
152      } while (*l != ';');
153      /* Category value.  */
154      do {
155	l++;
156      } while (*l != ';');
157      /* Canonical combining class; in NFC/NFKC, they must be increasing
158	 (or zero).  */
159      if (! isdigit (*++l))
160	fail ("parsing UnicodeData.txt, combining class not number");
161      combining_value[codepoint] = strtoul (l, &l, 10);
162      if (*l++ != ';')
163	fail ("parsing UnicodeData.txt, junk after combining class");
164
165      /* Skip over bidi value.  */
166      do {
167	l++;
168      } while (*l != ';');
169
170      /* Decomposition mapping.  */
171      decomp_useful = flags[codepoint];
172      if (*++l == '<')  /* Compatibility mapping. */
173	continue;
174      for (i = 0; i < 4; i++)
175	{
176	  if (*l == ';')
177	    break;
178	  if (!isxdigit (*l))
179	    fail ("parsing UnicodeData.txt, decomposition format");
180	  this_decomp[i] = strtoul (l, &l, 16);
181	  decomp_useful &= flags[this_decomp[i]];
182	  while (isspace (*l))
183	    l++;
184	}
185      if (i > 2)  /* Decomposition too long.  */
186	fail ("parsing UnicodeData.txt, decomposition too long");
187      for (j = 0; j < i; j++)
188	all_decomp[codepoint][j] = this_decomp[j];
189      if ((flags[codepoint] & all_languages) && decomp_useful)
190	while (--i >= 0)
191	  decomp[codepoint][i] = this_decomp[i];
192    }
193  if (ferror (f))
194    fail ("reading UnicodeData.txt");
195  fclose (f);
196}
197
198/* Read DerivedNormalizationProps.txt and set the flags that say whether
199   a character is in NFC, NFKC, or is context-dependent.  */
200
201static void
202read_derived (const char *fname)
203{
204  FILE * f = fopen (fname, "r");
205
206  if (!f)
207    fail ("opening DerivedNormalizationProps.txt");
208  for (;;)
209    {
210      char line[256];
211      unsigned long start, end;
212      char *l;
213      bool not_NFC_p, not_NFKC_p, maybe_not_NFC_p;
214
215      if (!fgets (line, sizeof (line), f))
216	break;
217      not_NFC_p = (strstr (line, "; NFC_QC; N") != NULL);
218      not_NFKC_p = (strstr (line, "; NFKC_QC; N") != NULL);
219      maybe_not_NFC_p = (strstr (line, "; NFC_QC; M") != NULL);
220      if (! not_NFC_p && ! not_NFKC_p && ! maybe_not_NFC_p)
221	continue;
222
223      start = strtoul (line, &l, 16);
224      if (l == line)
225	fail ("parsing DerivedNormalizationProps.txt, reading start");
226      if (start > MAX_CODE_POINT)
227	fail ("parsing DerivedNormalizationProps.txt, code point too large");
228      if (*l == '.' && l[1] == '.')
229	end = strtoul (l + 2, &l, 16);
230      else
231	end = start;
232
233      while (start <= end)
234	flags[start++] |= ((not_NFC_p ? not_NFC : 0)
235			   | (not_NFKC_p ? not_NFKC : 0)
236			   | (maybe_not_NFC_p ? maybe_not_NFC : 0)
237			   );
238    }
239  if (ferror (f))
240    fail ("reading DerivedNormalizationProps.txt");
241  fclose (f);
242}
243
244/* Write out the table.
245   The table consists of two words per entry.  The first word is the flags
246   for the unicode code points up to and including the second word.  */
247
248static void
249write_table (void)
250{
251  unsigned i;
252  unsigned last_flag = flags[0];
253  bool really_safe = decomp[0][0] == 0;
254  unsigned char last_combine = combining_value[0];
255
256  printf ("static const struct ucnrange ucnranges[] = {\n");
257
258  for (i = 1; i <= NUM_CODE_POINTS; i++)
259    if (i == NUM_CODE_POINTS
260	|| (flags[i] != last_flag && ((flags[i] | last_flag) & all_languages))
261	|| really_safe != (decomp[i][0] == 0)
262	|| combining_value[i] != last_combine)
263      {
264	printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
265		last_flag & C99 ? "C99" : "  0",
266		last_flag & N99 ? "N99" : "  0",
267		last_flag & CXX ? "CXX" : "  0",
268		last_flag & C11 ? "C11" : "  0",
269		last_flag & N11 ? "N11" : "  0",
270		really_safe ? "CID" : "  0",
271		last_flag & not_NFC ? "  0" : "NFC",
272		last_flag & not_NFKC ? "  0" : "NKC",
273		last_flag & maybe_not_NFC ? "CTX" : "  0",
274		combining_value[i - 1],
275		i - 1);
276	last_flag = flags[i];
277	last_combine = combining_value[0];
278	really_safe = decomp[i][0] == 0;
279      }
280
281  printf ("};\n");
282}
283
284/* Return whether a given character is valid in an identifier for some
285   supported language, either as itself or as a UCN.  */
286
287static bool
288char_id_valid (unsigned int c)
289{
290  return ((flags[c] & all_languages)
291	  || (c == 0x24)
292	  || (c >= 0x30 && c <= 0x39)
293	  || (c >= 0x41 && c <= 0x5a)
294	  || (c >= 0x61 && c <= 0x7a));
295}
296
297/* Write out the switch statement over characters for which it is
298   context-dependent whether they are in NFC.  */
299
300static void
301write_context_switch (void)
302{
303  unsigned i;
304  printf ("static bool\n"
305	  "check_nfc (cpp_reader *pfile, cppchar_t c, cppchar_t p)\n"
306	  "{\n"
307	  "  switch (c)\n"
308	  "    {\n");
309  for (i = 0; i < NUM_CODE_POINTS; i++)
310    {
311      bool found_case = false;
312      unsigned j;
313      if (!(flags[i] & all_languages) || !(flags[i] & maybe_not_NFC))
314	continue;
315      if ((i >= 0x1161 && i <= 0x1175) || (i >= 0x11A8 && i <= 0x11C2))
316	continue; /* Hangul handled algorithmically.  */
317      printf ("    case %#06x:\n"
318	      "      switch (p)\n"
319	      "\t{\n", i);
320      /* If an NFC starter character decomposes with this character I
321	 as the second character and an NFC starter character S as the
322	 first character, that latter character as a previous
323	 character means this character is not NFC.  Furthermore, any
324	 NFC starter character K made by a series of compositions of S
325	 with combining characters whose combining class is greater
326	 than that of I also means this character is not NFC.  */
327      for (j = 0; j < NUM_CODE_POINTS; j++)
328	{
329	  unsigned s, k;
330	  if (all_decomp[j][1] != i)
331	    continue;
332	  s = all_decomp[j][0];
333	  if (combining_value[s] != 0 || (flags[s] & not_NFC) != 0)
334	    continue;
335	  if (char_id_valid (s))
336	    {
337	      found_case = true;
338	      printf ("\tcase %#06x:\n", s);
339	    }
340	  for (k = 0; k < NUM_CODE_POINTS; k++)
341	    {
342	      unsigned t = k;
343	      if (k == s || !char_id_valid (k))
344		continue;
345	      while (all_decomp[t][1] != 0
346		     && combining_value[all_decomp[t][1]] > combining_value[i])
347		{
348		  if (combining_value[t] != 0 || (flags[t] & not_NFC) != 0)
349		    break;
350		  t = all_decomp[t][0];
351		}
352	      if (t == s)
353		{
354		  found_case = true;
355		  printf ("\tcase %#06x:\n", k);
356		}
357	    }
358	}
359      if (found_case)
360	printf ("\t  return false;\n");
361      else
362	printf ("\t/* Non-NFC cases not applicable to C/C++.  */\n");
363      printf ("\tdefault:\n"
364	      "\t  return true;\n"
365	      "\t}\n\n");
366    }
367  printf ("    default:\n"
368	  "      cpp_error (pfile, CPP_DL_ICE, \"Character %%x might not be NFKC\", c);\n"
369	  "      return true;\n"
370	  "  }\n"
371	  "}\n");
372}
373
/* Emit, on stdout, the comment that opens the generated header: the
   GPL notice followed by the Unicode data-file licence, then one
   blank line.  */

static void
write_copyright (void)
{
  static const char copyright[] =
    "/* Unicode characters and various properties.\n"
    "   Copyright (C) 2003-2015 Free Software Foundation, Inc.\n"
    "\n"
    "   This program is free software; you can redistribute it and/or modify it\n"
    "   under the terms of the GNU General Public License as published by the\n"
    "   Free Software Foundation; either version 3, or (at your option) any\n"
    "   later version.\n"
    "\n"
    "   This program is distributed in the hope that it will be useful,\n"
    "   but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
    "   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n"
    "   GNU General Public License for more details.\n"
    "\n"
    "   You should have received a copy of the GNU General Public License\n"
    "   along with this program; see the file COPYING3.  If not see\n"
    "   <http://www.gnu.org/licenses/>.\n"
    "\n"
    "\n"
    "   Copyright (C) 1991-2005 Unicode, Inc.  All rights reserved.\n"
    "   Distributed under the Terms of Use in\n"
    "   http://www.unicode.org/copyright.html.\n"
    "\n"
    "   Permission is hereby granted, free of charge, to any person\n"
    "   obtaining a copy of the Unicode data files and any associated\n"
    "   documentation (the \"Data Files\") or Unicode software and any\n"
    "   associated documentation (the \"Software\") to deal in the Data Files\n"
    "   or Software without restriction, including without limitation the\n"
    "   rights to use, copy, modify, merge, publish, distribute, and/or\n"
    "   sell copies of the Data Files or Software, and to permit persons to\n"
    "   whom the Data Files or Software are furnished to do so, provided\n"
    "   that (a) the above copyright notice(s) and this permission notice\n"
    "   appear with all copies of the Data Files or Software, (b) both the\n"
    "   above copyright notice(s) and this permission notice appear in\n"
    "   associated documentation, and (c) there is clear notice in each\n"
    "   modified Data File or in the Software as well as in the\n"
    "   documentation associated with the Data File(s) or Software that the\n"
    "   data or software has been modified.\n"
    "\n"
    "   THE DATA FILES AND SOFTWARE ARE PROVIDED \"AS IS\", WITHOUT WARRANTY\n"
    "   OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE\n"
    "   WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n"
    "   NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE\n"
    "   COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR\n"
    "   ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY\n"
    "   DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,\n"
    "   WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS\n"
    "   ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE\n"
    "   OF THE DATA FILES OR SOFTWARE.\n"
    "\n"
    "   Except as contained in this notice, the name of a copyright holder\n"
    "   shall not be used in advertising or otherwise to promote the sale,\n"
    "   use or other dealings in these Data Files or Software without prior\n"
    "   written authorization of the copyright holder.  */\n";

  /* The notice, then the blank line that separates it from the
     table.  */
  fputs (copyright, stdout);
  putchar ('\n');
}
436
/* Main program.  Expects exactly three arguments: the paths of
   ucnid.tab, UnicodeData.txt and DerivedNormalizationProps.txt, in
   that order.  The generated header is written to stdout.  Exits with
   status 0 on success; every failure goes through fail.  */

int
main (int argc, char **argv)
{
  if (argc != 4)
    fail ("usage: makeucnid ucnid.tab UnicodeData.txt"
	  " DerivedNormalizationProps.txt > ucnid.h");

  /* The order matters: read_table consults the language bits set by
     read_ucnid.  */
  read_ucnid (argv[1]);
  read_table (argv[2]);
  read_derived (argv[3]);

  write_copyright ();
  write_table ();
  write_context_switch ();

  /* stdout is normally redirected into ucnid.h; do not report success
     if any of the output could not be written.  */
  if (fflush (stdout) != 0 || ferror (stdout))
    fail ("writing ucnid.h");
  return 0;
}
453