/* Make ucnid.h from various sources.
   Copyright (C) 2005-2015 Free Software Foundation, Inc.

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

/* Run this program as
   ./makeucnid ucnid.tab UnicodeData.txt DerivedNormalizationProps.txt
       > ucnid.h
   (all on one command line).  */

#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdbool.h>
#include <stdlib.h>

/* Bits stored in flags[].  The first five are set from ucnid.tab by
   read_ucnid; the not_NFC/not_NFKC/maybe_not_NFC bits are set from
   DerivedNormalizationProps.txt by read_derived.  */
enum {
  C99 = 1,
  CXX = 2,
  N99 = 4,
  C11 = 8,
  N11 = 16,
  all_languages = C99 | CXX | C11,
  not_NFC = 32,
  not_NFKC = 64,
  maybe_not_NFC = 128
};

#define NUM_CODE_POINTS 0x110000
#define MAX_CODE_POINT 0x10ffff

/* Per-code-point tables, all indexed by code point.  */
static unsigned flags[NUM_CODE_POINTS];
static unsigned int all_decomp[NUM_CODE_POINTS][2];
static unsigned int decomp[NUM_CODE_POINTS][2];
static unsigned char combining_value[NUM_CODE_POINTS];

/* Die!  Print S on stderr and exit with status 1.  */

static void
fail (const char *s)
{
  fprintf (stderr, "%s\n", s);
  exit (1);
}

/* Read ucnid.tab (named by FNAME) and set the flags for language
   versions in flags[].

   A section header line such as "[C99]" selects the flag bits that
   are OR-ed into every code point listed on the following lines.  A
   data line starts with a hex digit and holds whitespace-separated
   code points or START-END ranges.  Any other line is ignored.  */

static void
read_ucnid (const char *fname)
{
  FILE *f = fopen (fname, "r");
  unsigned fl = 0;

  if (!f)
    fail ("opening ucnid.tab");
  for (;;)
    {
      char line[256];

      if (!fgets (line, sizeof (line), f))
        break;
      if (strcmp (line, "[C99]\n") == 0)
        fl = C99;
      else if (strcmp (line, "[C99DIG]\n") == 0)
        fl = C99|N99;
      else if (strcmp (line, "[CXX]\n") == 0)
        fl = CXX;
      else if (strcmp (line, "[C11]\n") == 0)
        fl = C11;
      else if (strcmp (line, "[C11NOSTART]\n") == 0)
        fl = C11|N11;
      /* The <ctype.h> functions require an unsigned char value.  */
      else if (isxdigit ((unsigned char) line[0]))
        {
          char *l = line;
          while (*l)
            {
              unsigned long start, end;
              char *endptr;
              start = strtoul (l, &endptr, 16);
              if (endptr == l
                  || (*endptr != '-' && ! isspace ((unsigned char) *endptr)))
                fail ("parsing ucnid.tab [1]");
              l = endptr;
              if (*l != '-')
                end = start;
              else
                {
                  end = strtoul (l + 1, &endptr, 16);
                  /* A '-' with no digits after it would otherwise be
                     read as an end of zero.  */
                  if (endptr == l + 1)
                    fail ("parsing ucnid.tab, missing range end");
                  if (end < start)
                    fail ("parsing ucnid.tab, end before start");
                  l = endptr;
                  if (! isspace ((unsigned char) *l))
                    fail ("parsing ucnid.tab, junk after range");
                }
              while (isspace ((unsigned char) *l))
                l++;
              /* Checked before the loop below so flags[] is never
                 indexed out of range.  */
              if (end > MAX_CODE_POINT)
                fail ("parsing ucnid.tab, end too large");
              while (start <= end)
                flags[start++] |= fl;
            }
        }
    }
  if (ferror (f))
    fail ("reading ucnid.tab");
  fclose (f);
}

/* Helper for read_table.  Advance P by at least one character and
   return a pointer to the next ';' found.  Fails instead of running
   off the end of the buffer if the line ends first.  */

static char *
skip_to_semicolon (char *p)
{
  do
    {
      p++;
      if (*p == '\0')
        fail ("parsing UnicodeData.txt, truncated line");
    }
  while (*p != ';');
  return p;
}

/* Read UnicodeData.txt (named by FNAME) and fill in the 'decomp'
   table to be the decompositions of characters for which both the
   character decomposed and all the code points in the decomposition
   are valid for some supported language version, and the 'all_decomp'
   table to be the decompositions of all characters without those
   constraints.  Also records each character's canonical combining
   class in 'combining_value'.

   Must run after read_ucnid, because it consults the language bits in
   flags[].  */

static void
read_table (const char *fname)
{
  FILE * f = fopen (fname, "r");

  if (!f)
    fail ("opening UnicodeData.txt");
  for (;;)
    {
      char line[256];
      unsigned long codepoint, this_decomp[4];
      char *l;
      int i, j;
      unsigned decomp_useful;

      if (!fgets (line, sizeof (line), f))
        break;
      codepoint = strtoul (line, &l, 16);
      if (l == line || *l != ';')
        fail ("parsing UnicodeData.txt, reading code point");
      if (codepoint > MAX_CODE_POINT)
        fail ("parsing UnicodeData.txt, code point too large");

      /* Skip over the name.  */
      l = skip_to_semicolon (l);
      /* Category value.  */
      l = skip_to_semicolon (l);
      /* Canonical combining class; in NFC/NFKC, they must be increasing
         (or zero).  */
      if (! isdigit ((unsigned char) *++l))
        fail ("parsing UnicodeData.txt, combining class not number");
      combining_value[codepoint] = strtoul (l, &l, 10);
      if (*l++ != ';')
        fail ("parsing UnicodeData.txt, junk after combining class");

      /* Skip over bidi value.  */
      l = skip_to_semicolon (l);

      /* Decomposition mapping.  */
      decomp_useful = flags[codepoint];
      if (*++l == '<')  /* Compatibility mapping.  */
        continue;
      for (i = 0; i < 4; i++)
        {
          if (*l == ';')
            break;
          if (!isxdigit ((unsigned char) *l))
            fail ("parsing UnicodeData.txt, decomposition format");
          this_decomp[i] = strtoul (l, &l, 16);
          /* The value indexes flags[] below, so it must be in range.  */
          if (this_decomp[i] > MAX_CODE_POINT)
            fail ("parsing UnicodeData.txt, decomposition code point"
                  " too large");
          decomp_useful &= flags[this_decomp[i]];
          while (isspace ((unsigned char) *l))
            l++;
        }
      if (i > 2)  /* Decomposition too long.  */
        fail ("parsing UnicodeData.txt, decomposition too long");
      for (j = 0; j < i; j++)
        all_decomp[codepoint][j] = this_decomp[j];
      if ((flags[codepoint] & all_languages) && decomp_useful)
        while (--i >= 0)
          decomp[codepoint][i] = this_decomp[i];
    }
  if (ferror (f))
    fail ("reading UnicodeData.txt");
  fclose (f);
}

/* Read DerivedNormalizationProps.txt (named by FNAME) and set the
   flags that say whether a character is in NFC, NFKC, or is
   context-dependent.  Only lines containing "; NFC_QC; N",
   "; NFKC_QC; N" or "; NFC_QC; M" are used; each starts with a code
   point or a START..END range.  */

static void
read_derived (const char *fname)
{
  FILE * f = fopen (fname, "r");

  if (!f)
    fail ("opening DerivedNormalizationProps.txt");
  for (;;)
    {
      char line[256];
      unsigned long start, end;
      char *l;
      bool not_NFC_p, not_NFKC_p, maybe_not_NFC_p;

      if (!fgets (line, sizeof (line), f))
        break;
      not_NFC_p = (strstr (line, "; NFC_QC; N") != NULL);
      not_NFKC_p = (strstr (line, "; NFKC_QC; N") != NULL);
      maybe_not_NFC_p = (strstr (line, "; NFC_QC; M") != NULL);
      if (! not_NFC_p && ! not_NFKC_p && ! maybe_not_NFC_p)
        continue;

      start = strtoul (line, &l, 16);
      if (l == line)
        fail ("parsing DerivedNormalizationProps.txt, reading start");
      if (start > MAX_CODE_POINT)
        fail ("parsing DerivedNormalizationProps.txt, code point too large");
      if (*l == '.' && l[1] == '.')
        {
          char *end_text = l + 2;

          end = strtoul (end_text, &l, 16);
          if (l == end_text)
            fail ("parsing DerivedNormalizationProps.txt, reading end");
          /* The end of the range is used as an index into flags[] too,
             so it needs the same bound as the start.  */
          if (end > MAX_CODE_POINT)
            fail ("parsing DerivedNormalizationProps.txt,"
                  " code point too large");
        }
      else
        end = start;

      while (start <= end)
        flags[start++] |= ((not_NFC_p ? not_NFC : 0)
                           | (not_NFKC_p ? not_NFKC : 0)
                           | (maybe_not_NFC_p ? maybe_not_NFC : 0)
                           );
    }
  if (ferror (f))
    fail ("reading DerivedNormalizationProps.txt");
  fclose (f);
}

/* Write out the table.
   Each entry describes a run of consecutive code points and holds
   three fields: the flags for the run, its combining class, and the
   last code point of the run (the run extends up to and including
   that code point).

   A new run starts when the combining class changes, when the "has no
   recorded decomposition" state changes, or when the flags change and
   at least one side of the change is valid in some language.  Flag
   changes between code points valid in no language do not split a
   run; such a run is printed with the flags of its first code
   point.  */

static void
write_table (void)
{
  unsigned i;
  unsigned last_flag = flags[0];
  bool really_safe = decomp[0][0] == 0;
  unsigned char last_combine = combining_value[0];

  printf ("static const struct ucnrange ucnranges[] = {\n");

  for (i = 1; i <= NUM_CODE_POINTS; i++)
    if (i == NUM_CODE_POINTS
        || (flags[i] != last_flag && ((flags[i] | last_flag) & all_languages))
        || really_safe != (decomp[i][0] == 0)
        || combining_value[i] != last_combine)
      {
        printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
                last_flag & C99 ? "C99" : " 0",
                last_flag & N99 ? "N99" : " 0",
                last_flag & CXX ? "CXX" : " 0",
                last_flag & C11 ? "C11" : " 0",
                last_flag & N11 ? "N11" : " 0",
                really_safe ? "CID" : " 0",
                last_flag & not_NFC ? " 0" : "NFC",
                last_flag & not_NFKC ? " 0" : "NKC",
                last_flag & maybe_not_NFC ? "CTX" : " 0",
                last_combine,
                i - 1);
        /* The final run has just been flushed; index I is one past the
           end of every table, so it must not be read.  */
        if (i == NUM_CODE_POINTS)
          break;
        /* Start the next run from code point I.  The combining class
           must come from I as well, otherwise a character with a
           non-zero class could be merged into a following class-0
           run.  */
        last_flag = flags[i];
        last_combine = combining_value[i];
        really_safe = decomp[i][0] == 0;
      }

  printf ("};\n");
}

/* Return whether a given character is valid in an identifier for some
   supported language, either as itself or as a UCN.  Besides the code
   points flagged from ucnid.tab this accepts '$' (0x24), the ASCII
   digits and the ASCII letters.  */

static bool
char_id_valid (unsigned int c)
{
  return ((flags[c] & all_languages)
          || (c == 0x24)
          || (c >= 0x30 && c <= 0x39)
          || (c >= 0x41 && c <= 0x5a)
          || (c >= 0x61 && c <= 0x7a));
}

/* Write out the switch statement over characters for which it is
   context-dependent whether they are in NFC.

   The emitted function check_nfc (pfile, c, p) has one outer case per
   code point C that is valid in some language and flagged
   maybe_not_NFC, and an inner switch over the previous character P
   that returns false for each P found below and true otherwise.  */

static void
write_context_switch (void)
{
  unsigned i;
  printf ("static bool\n"
          "check_nfc (cpp_reader *pfile, cppchar_t c, cppchar_t p)\n"
          "{\n"
          " switch (c)\n"
          " {\n");
  for (i = 0; i < NUM_CODE_POINTS; i++)
    {
      bool found_case = false;
      unsigned j;
      if (!(flags[i] & all_languages) || !(flags[i] & maybe_not_NFC))
        continue;
      if ((i >= 0x1161 && i <= 0x1175) || (i >= 0x11A8 && i <= 0x11C2))
        continue; /* Hangul handled algorithmically.  */
      printf (" case %#06x:\n"
              " switch (p)\n"
              "\t{\n", i);
      /* If an NFC starter character decomposes with this character I
         as the second character and an NFC starter character S as the
         first character, that latter character as a previous
         character means this character is not NFC.  Furthermore, any
         NFC starter character K made by a series of compositions of S
         with combining characters whose combining class is greater
         than that of I also means this character is not NFC.  */
      for (j = 0; j < NUM_CODE_POINTS; j++)
        {
          unsigned s, k;
          if (all_decomp[j][1] != i)
            continue;
          s = all_decomp[j][0];
          if (combining_value[s] != 0 || (flags[s] & not_NFC) != 0)
            continue;
          if (char_id_valid (s))
            {
              found_case = true;
              printf ("\tcase %#06x:\n", s);
            }
          for (k = 0; k < NUM_CODE_POINTS; k++)
            {
              /* Walk K's chain of first decomposition elements for as
                 long as each second element has a higher combining
                 class than I, and see whether the walk reaches S.  */
              unsigned t = k;
              if (k == s || !char_id_valid (k))
                continue;
              while (all_decomp[t][1] != 0
                     && combining_value[all_decomp[t][1]] > combining_value[i])
                {
                  if (combining_value[t] != 0 || (flags[t] & not_NFC) != 0)
                    break;
                  t = all_decomp[t][0];
                }
              if (t == s)
                {
                  found_case = true;
                  printf ("\tcase %#06x:\n", k);
                }
            }
        }
      if (found_case)
        printf ("\t return false;\n");
      else
        printf ("\t/* Non-NFC cases not applicable to C/C++. */\n");
      printf ("\tdefault:\n"
              "\t return true;\n"
              "\t}\n\n");
    }
  printf (" default:\n"
          " cpp_error (pfile, CPP_DL_ICE, \"Character %%x might not be NFKC\", c);\n"
          " return true;\n"
          " }\n"
          "}\n");
}

/* Print out the huge copyright notice.  */

static void
write_copyright (void)
{
  static const char copyright[] = "\
/* Unicode characters and various properties.\n\
 Copyright (C) 2003-2015 Free Software Foundation, Inc.\n\
\n\
 This program is free software; you can redistribute it and/or modify it\n\
 under the terms of the GNU General Public License as published by the\n\
 Free Software Foundation; either version 3, or (at your option) any\n\
 later version.\n\
\n\
 This program is distributed in the hope that it will be useful,\n\
 but WITHOUT ANY WARRANTY; without even the implied warranty of\n\
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n\
 GNU General Public License for more details.\n\
\n\
 You should have received a copy of the GNU General Public License\n\
 along with this program; see the file COPYING3. If not see\n\
 <http://www.gnu.org/licenses/>.\n\
\n\
\n\
 Copyright (C) 1991-2005 Unicode, Inc. All rights reserved.\n\
 Distributed under the Terms of Use in\n\
 http://www.unicode.org/copyright.html.\n\
\n\
 Permission is hereby granted, free of charge, to any person\n\
 obtaining a copy of the Unicode data files and any associated\n\
 documentation (the \"Data Files\") or Unicode software and any\n\
 associated documentation (the \"Software\") to deal in the Data Files\n\
 or Software without restriction, including without limitation the\n\
 rights to use, copy, modify, merge, publish, distribute, and/or\n\
 sell copies of the Data Files or Software, and to permit persons to\n\
 whom the Data Files or Software are furnished to do so, provided\n\
 that (a) the above copyright notice(s) and this permission notice\n\
 appear with all copies of the Data Files or Software, (b) both the\n\
 above copyright notice(s) and this permission notice appear in\n\
 associated documentation, and (c) there is clear notice in each\n\
 modified Data File or in the Software as well as in the\n\
 documentation associated with the Data File(s) or Software that the\n\
 data or software has been modified.\n\
\n\
 THE DATA FILES AND SOFTWARE ARE PROVIDED \"AS IS\", WITHOUT WARRANTY\n\
 OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE\n\
 WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n\
 NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE\n\
 COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR\n\
 ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY\n\
 DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,\n\
 WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS\n\
 ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE\n\
 OF THE DATA FILES OR SOFTWARE.\n\
\n\
 Except as contained in this notice, the name of a copyright holder\n\
 shall not be used in advertising or otherwise to promote the sale,\n\
 use or other dealings in these Data Files or Software without prior\n\
 written authorization of the copyright holder. */\n";

  puts (copyright);
}

/* Main program.  Expects exactly three arguments: the paths of
   ucnid.tab, UnicodeData.txt and DerivedNormalizationProps.txt, in
   that order.  The generated header is written to stdout.  */

int
main (int argc, char ** argv)
{
  if (argc != 4)
    fail ("usage: makeucnid ucnid.tab UnicodeData.txt"
          " DerivedNormalizationProps.txt");
  /* Order matters: read_table consults the language flags that
     read_ucnid sets.  */
  read_ucnid (argv[1]);
  read_table (argv[2]);
  read_derived (argv[3]);

  write_copyright ();
  write_table ();
  write_context_switch ();

  /* The output is normally redirected into ucnid.h; do not report
     success if it could not be written completely.  */
  if (fflush (stdout) != 0 || ferror (stdout))
    fail ("writing output");
  return 0;
}