/* Make ucnid.h from various sources.
   Copyright (C) 2005 Free Software Foundation, Inc.

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2, or (at your option) any
later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
17261287Sdes
/* Run this program as
   ./makeucnid ucnid.tab UnicodeData.txt DerivedNormalizationProps.txt \
       > ucnid.h
*/
22261287Sdes
23261287Sdes#include <stdio.h>
24261287Sdes#include <string.h>
25261287Sdes#include <ctype.h>
26261287Sdes#include <stdbool.h>
27261287Sdes#include <stdlib.h>
28261287Sdes
/* Bits stored per code point in flags[].  */
enum {
  /* Listed in the [C99] section of ucnid.tab.  */
  C99 = 1 << 0,
  /* Listed in the [CXX] section of ucnid.tab.  */
  CXX = 1 << 1,
  /* General category in UnicodeData.txt starts with 'N'.  */
  digit = 1 << 2,
  /* DerivedNormalizationProps.txt says "NFC_QC; N".  */
  not_NFC = 1 << 3,
  /* DerivedNormalizationProps.txt says "NFKC_QC; N".  */
  not_NFKC = 1 << 4,
  /* DerivedNormalizationProps.txt says "NFC_QC; M".  */
  maybe_not_NFC = 1 << 5
};
37261287Sdes
/* Property bits (C99, CXX, digit, not_NFC, ...) for each code point in
   the range 0..0xFFFF.  */
static unsigned int flags[65536];

/* Canonical decomposition of each code point, at most two code points
   long.  A zero in decomp[c][0] means no decomposition was recorded.  */
static unsigned short decomp[65536][2];

/* Canonical combining class of each code point, as read from
   UnicodeData.txt.  */
static unsigned char combining_value[65536];
41261287Sdes
/* Print the message S and a newline on stderr, then terminate the
   program with exit status 1.  Does not return.  */

static void
fail (const char *s)
{
  fputs (s, stderr);
  fputc ('\n', stderr);
  exit (1);
}
50261287Sdes
51261287Sdes/* Read ucnid.tab and set the C99 and CXX flags in header[].  */
52261287Sdes
53261287Sdesstatic void
54261287Sdesread_ucnid (const char *fname)
55261287Sdes{
56261287Sdes  FILE *f = fopen (fname, "r");
57261287Sdes  unsigned fl = 0;
58261287Sdes
59261287Sdes  if (!f)
60261287Sdes    fail ("opening ucnid.tab");
61261287Sdes  for (;;)
62261287Sdes    {
63261287Sdes      char line[256];
64261287Sdes
65261287Sdes      if (!fgets (line, sizeof (line), f))
66261287Sdes	break;
67261287Sdes      if (strcmp (line, "[C99]\n") == 0)
68261287Sdes	fl = C99;
69261287Sdes      else if (strcmp (line, "[CXX]\n") == 0)
70261287Sdes	fl = CXX;
71261287Sdes      else if (isxdigit (line[0]))
72261287Sdes	{
73261287Sdes	  char *l = line;
74261287Sdes	  while (*l)
75261287Sdes	    {
76261287Sdes	      unsigned long start, end;
77261287Sdes	      char *endptr;
78261287Sdes	      start = strtoul (l, &endptr, 16);
79261287Sdes	      if (endptr == l || (*endptr != '-' && ! isspace (*endptr)))
80261287Sdes		fail ("parsing ucnid.tab [1]");
81261287Sdes	      l = endptr;
82261287Sdes	      if (*l != '-')
83261287Sdes		end = start;
84261287Sdes	      else
85261287Sdes		{
86261287Sdes		  end = strtoul (l + 1, &endptr, 16);
87261287Sdes		  if (end < start)
88261287Sdes		    fail ("parsing ucnid.tab, end before start");
89261287Sdes		  l = endptr;
90261287Sdes		  if (! isspace (*l))
91261287Sdes		    fail ("parsing ucnid.tab, junk after range");
92261287Sdes		}
93261287Sdes	      while (isspace (*l))
94261287Sdes		l++;
95261287Sdes	      if (end > 0xFFFF)
96261287Sdes		fail ("parsing ucnid.tab, end too large");
97261287Sdes	      while (start <= end)
98261287Sdes		flags[start++] |= fl;
99261287Sdes	    }
100261287Sdes	}
101261287Sdes    }
102261287Sdes  if (ferror (f))
103261287Sdes    fail ("reading ucnid.tab");
104261287Sdes  fclose (f);
105261287Sdes}
106261287Sdes
107261287Sdes/* Read UnicodeData.txt and set the 'digit' flag, and
108261287Sdes   also fill in the 'decomp' table to be the decompositions of
109261287Sdes   characters for which both the character decomposed and all the code
110261287Sdes   points in the decomposition are either C99 or CXX.  */
111261287Sdes
112261287Sdesstatic void
113261287Sdesread_table (char *fname)
114261287Sdes{
115261287Sdes  FILE * f = fopen (fname, "r");
116261287Sdes
117261287Sdes  if (!f)
118261287Sdes    fail ("opening UnicodeData.txt");
119261287Sdes  for (;;)
120261287Sdes    {
121261287Sdes      char line[256];
122261287Sdes      unsigned long codepoint, this_decomp[4];
123261287Sdes      char *l;
124261287Sdes      int i;
125261287Sdes      int decomp_useful;
126261287Sdes
127      if (!fgets (line, sizeof (line), f))
128	break;
129      codepoint = strtoul (line, &l, 16);
130      if (l == line || *l != ';')
131	fail ("parsing UnicodeData.txt, reading code point");
132      if (codepoint > 0xffff || ! (flags[codepoint] & (C99 | CXX)))
133	continue;
134
135      do {
136	l++;
137      } while (*l != ';');
138      /* Category value; things starting with 'N' are numbers of some
139	 kind.  */
140      if (*++l == 'N')
141	flags[codepoint] |= digit;
142
143      do {
144	l++;
145      } while (*l != ';');
146      /* Canonical combining class; in NFC/NFKC, they must be increasing
147	 (or zero).  */
148      if (! isdigit (*++l))
149	fail ("parsing UnicodeData.txt, combining class not number");
150      combining_value[codepoint] = strtoul (l, &l, 10);
151      if (*l++ != ';')
152	fail ("parsing UnicodeData.txt, junk after combining class");
153
154      /* Skip over bidi value.  */
155      do {
156	l++;
157      } while (*l != ';');
158
159      /* Decomposition mapping.  */
160      decomp_useful = flags[codepoint];
161      if (*++l == '<')  /* Compatibility mapping. */
162	continue;
163      for (i = 0; i < 4; i++)
164	{
165	  if (*l == ';')
166	    break;
167	  if (!isxdigit (*l))
168	    fail ("parsing UnicodeData.txt, decomposition format");
169	  this_decomp[i] = strtoul (l, &l, 16);
170	  decomp_useful &= flags[this_decomp[i]];
171	  while (isspace (*l))
172	    l++;
173	}
174      if (i > 2)  /* Decomposition too long.  */
175	fail ("parsing UnicodeData.txt, decomposition too long");
176      if (decomp_useful)
177	while (--i >= 0)
178	  decomp[codepoint][i] = this_decomp[i];
179    }
180  if (ferror (f))
181    fail ("reading UnicodeData.txt");
182  fclose (f);
183}
184
185/* Read DerivedNormalizationProps.txt and set the flags that say whether
186   a character is in NFC, NFKC, or is context-dependent.  */
187
188static void
189read_derived (const char *fname)
190{
191  FILE * f = fopen (fname, "r");
192
193  if (!f)
194    fail ("opening DerivedNormalizationProps.txt");
195  for (;;)
196    {
197      char line[256];
198      unsigned long start, end;
199      char *l;
200      bool not_NFC_p, not_NFKC_p, maybe_not_NFC_p;
201
202      if (!fgets (line, sizeof (line), f))
203	break;
204      not_NFC_p = (strstr (line, "; NFC_QC; N") != NULL);
205      not_NFKC_p = (strstr (line, "; NFKC_QC; N") != NULL);
206      maybe_not_NFC_p = (strstr (line, "; NFC_QC; M") != NULL);
207      if (! not_NFC_p && ! not_NFKC_p && ! maybe_not_NFC_p)
208	continue;
209
210      start = strtoul (line, &l, 16);
211      if (l == line)
212	fail ("parsing DerivedNormalizationProps.txt, reading start");
213      if (start > 0xffff)
214	continue;
215      if (*l == '.' && l[1] == '.')
216	end = strtoul (l + 2, &l, 16);
217      else
218	end = start;
219
220      while (start <= end)
221	flags[start++] |= ((not_NFC_p ? not_NFC : 0)
222			   | (not_NFKC_p ? not_NFKC : 0)
223			   | (maybe_not_NFC_p ? maybe_not_NFC : 0)
224			   );
225    }
226  if (ferror (f))
227    fail ("reading DerivedNormalizationProps.txt");
228  fclose (f);
229}
230
231/* Write out the table.
232   The table consists of two words per entry.  The first word is the flags
233   for the unicode code points up to and including the second word.  */
234
235static void
236write_table (void)
237{
238  unsigned i;
239  unsigned last_flag = flags[0];
240  bool really_safe = decomp[0][0] == 0;
241  unsigned char last_combine = combining_value[0];
242
243  for (i = 1; i <= 65536; i++)
244    if (i == 65536
245	|| (flags[i] != last_flag && ((flags[i] | last_flag) & (C99 | CXX)))
246	|| really_safe != (decomp[i][0] == 0)
247	|| combining_value[i] != last_combine)
248      {
249	printf ("{ %s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
250		last_flag & C99 ? "C99" : "  0",
251		last_flag & digit ? "DIG" : "  0",
252		last_flag & CXX ? "CXX" : "  0",
253		really_safe ? "CID" : "  0",
254		last_flag & not_NFC ? "  0" : "NFC",
255		last_flag & not_NFKC ? "  0" : "NKC",
256		last_flag & maybe_not_NFC ? "CTX" : "  0",
257		combining_value[i - 1],
258		i - 1);
259	last_flag = flags[i];
260	last_combine = combining_value[0];
261	really_safe = decomp[i][0] == 0;
262      }
263}
264
/* Emit the copyright notice that heads the generated file.  The text
   is a single C comment holding the GPL notice followed by the Unicode
   data-file terms; puts appends one more newline after it.  */

static void
write_copyright (void)
{
  static const char copyright[] =
    "/* Unicode characters and various properties.\n"
    "   Copyright (C) 2003, 2005 Free Software Foundation, Inc.\n"
    "\n"
    "   This program is free software; you can redistribute it and/or modify it\n"
    "   under the terms of the GNU General Public License as published by the\n"
    "   Free Software Foundation; either version 2, or (at your option) any\n"
    "   later version.\n"
    "\n"
    "   This program is distributed in the hope that it will be useful,\n"
    "   but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
    "   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n"
    "   GNU General Public License for more details.\n"
    "\n"
    "   You should have received a copy of the GNU General Public License\n"
    "   along with this program; if not, write to the Free Software\n"
    "   Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n"
    "\n"
    "\n"
    "   Copyright (C) 1991-2005 Unicode, Inc.  All rights reserved.\n"
    "   Distributed under the Terms of Use in\n"
    "   http://www.unicode.org/copyright.html.\n"
    "\n"
    "   Permission is hereby granted, free of charge, to any person\n"
    "   obtaining a copy of the Unicode data files and any associated\n"
    "   documentation (the \"Data Files\") or Unicode software and any\n"
    "   associated documentation (the \"Software\") to deal in the Data Files\n"
    "   or Software without restriction, including without limitation the\n"
    "   rights to use, copy, modify, merge, publish, distribute, and/or\n"
    "   sell copies of the Data Files or Software, and to permit persons to\n"
    "   whom the Data Files or Software are furnished to do so, provided\n"
    "   that (a) the above copyright notice(s) and this permission notice\n"
    "   appear with all copies of the Data Files or Software, (b) both the\n"
    "   above copyright notice(s) and this permission notice appear in\n"
    "   associated documentation, and (c) there is clear notice in each\n"
    "   modified Data File or in the Software as well as in the\n"
    "   documentation associated with the Data File(s) or Software that the\n"
    "   data or software has been modified.\n"
    "\n"
    "   THE DATA FILES AND SOFTWARE ARE PROVIDED \"AS IS\", WITHOUT WARRANTY\n"
    "   OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE\n"
    "   WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n"
    "   NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE\n"
    "   COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR\n"
    "   ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY\n"
    "   DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,\n"
    "   WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS\n"
    "   ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE\n"
    "   OF THE DATA FILES OR SOFTWARE.\n"
    "\n"
    "   Except as contained in this notice, the name of a copyright holder\n"
    "   shall not be used in advertising or otherwise to promote the sale,\n"
    "   use or other dealings in these Data Files or Software without prior\n"
    "   written authorization of the copyright holder.  */\n";

  puts (copyright);
}
327
/* Main program.  Expects exactly three arguments: the paths of
   ucnid.tab, UnicodeData.txt and DerivedNormalizationProps.txt, in that
   order.  Reads them in that order (the later readers depend on the
   flags set by the earlier ones), then writes the copyright notice and
   the table on stdout.  Returns 0 on success; every error goes through
   fail, which exits with status 1.  */

int
main (int argc, char **argv)
{
  /* The count can be wrong in either direction, so show the expected
     command line rather than claiming there were too few.  */
  if (argc != 4)
    fail ("usage: makeucnid ucnid.tab UnicodeData.txt"
          " DerivedNormalizationProps.txt > ucnid.h");
  read_ucnid (argv[1]);
  read_table (argv[2]);
  read_derived (argv[3]);

  write_copyright ();
  write_table ();

  /* stdout is normally redirected into the generated header; do not
     report success if it could not be written completely.  */
  if (fflush (stdout) != 0 || ferror (stdout))
    fail ("writing output");
  return 0;
}
343