1228072Sbapt/* ccl - routines for character classes */ 2228072Sbapt 3228072Sbapt/* Copyright (c) 1990 The Regents of the University of California. */ 4228072Sbapt/* All rights reserved. */ 5228072Sbapt 6228072Sbapt/* This code is derived from software contributed to Berkeley by */ 7228072Sbapt/* Vern Paxson. */ 8228072Sbapt 9228072Sbapt/* The United States Government has rights in this work pursuant */ 10228072Sbapt/* to contract no. DE-AC03-76SF00098 between the United States */ 11228072Sbapt /* Department of Energy and the University of California. */ 12228072Sbapt 13228072Sbapt/* This file is part of flex. */ 14228072Sbapt 15228072Sbapt/* Redistribution and use in source and binary forms, with or without */ 16228072Sbapt/* modification, are permitted provided that the following conditions */ 17228072Sbapt/* are met: */ 18228072Sbapt 19228072Sbapt/* 1. Redistributions of source code must retain the above copyright */ 20228072Sbapt/* notice, this list of conditions and the following disclaimer. */ 21228072Sbapt/* 2. Redistributions in binary form must reproduce the above copyright */ 22228072Sbapt/* notice, this list of conditions and the following disclaimer in the */ 23228072Sbapt/* documentation and/or other materials provided with the distribution. */ 24228072Sbapt 25228072Sbapt/* Neither the name of the University nor the names of its contributors */ 26228072Sbapt/* may be used to endorse or promote products derived from this software */ 27228072Sbapt/* without specific prior written permission. */ 28228072Sbapt 29228072Sbapt/* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR */ 30228072Sbapt/* IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED */ 31228072Sbapt/* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */ 32228072Sbapt/* PURPOSE. */ 33228072Sbapt 34228072Sbapt#include "flexdef.h" 35228072Sbapt 36228072Sbapt/* return true if the chr is in the ccl. Takes negation into account. */ 37228072Sbaptstatic bool 38228072Sbaptccl_contains (const int cclp, const int ch) 39228072Sbapt{ 40228072Sbapt int ind, len, i; 41228072Sbapt 42228072Sbapt len = ccllen[cclp]; 43228072Sbapt ind = cclmap[cclp]; 44228072Sbapt 45228072Sbapt for (i = 0; i < len; ++i) 46228072Sbapt if (ccltbl[ind + i] == ch) 47228072Sbapt return !cclng[cclp]; 48228072Sbapt 49228072Sbapt return cclng[cclp]; 50228072Sbapt} 51228072Sbapt 52228072Sbapt 53228072Sbapt/* ccladd - add a single character to a ccl */ 54228072Sbapt 55228072Sbaptvoid ccladd (cclp, ch) 56228072Sbapt int cclp; 57228072Sbapt int ch; 58228072Sbapt{ 59228072Sbapt int ind, len, newpos, i; 60228072Sbapt 61228072Sbapt check_char (ch); 62228072Sbapt 63228072Sbapt len = ccllen[cclp]; 64228072Sbapt ind = cclmap[cclp]; 65228072Sbapt 66228072Sbapt /* check to see if the character is already in the ccl */ 67228072Sbapt 68228072Sbapt for (i = 0; i < len; ++i) 69228072Sbapt if (ccltbl[ind + i] == ch) 70228072Sbapt return; 71228072Sbapt 72228072Sbapt /* mark newlines */ 73228072Sbapt if (ch == nlch) 74228072Sbapt ccl_has_nl[cclp] = true; 75228072Sbapt 76228072Sbapt newpos = ind + len; 77228072Sbapt 78228072Sbapt if (newpos >= current_max_ccl_tbl_size) { 79228072Sbapt current_max_ccl_tbl_size += MAX_CCL_TBL_SIZE_INCREMENT; 80228072Sbapt 81228072Sbapt ++num_reallocs; 82228072Sbapt 83228072Sbapt ccltbl = reallocate_Character_array (ccltbl, 84228072Sbapt current_max_ccl_tbl_size); 85228072Sbapt } 86228072Sbapt 87228072Sbapt ccllen[cclp] = len + 1; 88228072Sbapt ccltbl[newpos] = ch; 89228072Sbapt} 90228072Sbapt 91228072Sbapt/* dump_cclp - same thing as list_character_set, but for cclps. */ 92228072Sbapt 93228072Sbaptstatic void dump_cclp (FILE* file, int cclp) 94228072Sbapt{ 95250874Sjkim int i; 96228072Sbapt 97228072Sbapt putc ('[', file); 98228072Sbapt 99228072Sbapt for (i = 0; i < csize; ++i) { 100228072Sbapt if (ccl_contains(cclp, i)){ 101250874Sjkim int start_char = i; 102228072Sbapt 103228072Sbapt putc (' ', file); 104228072Sbapt 105228072Sbapt fputs (readable_form (i), file); 106228072Sbapt 107228072Sbapt while (++i < csize && ccl_contains(cclp,i)) ; 108228072Sbapt 109228072Sbapt if (i - 1 > start_char) 110228072Sbapt /* this was a run */ 111228072Sbapt fprintf (file, "-%s", 112228072Sbapt readable_form (i - 1)); 113228072Sbapt 114228072Sbapt putc (' ', file); 115228072Sbapt } 116228072Sbapt } 117228072Sbapt 118228072Sbapt putc (']', file); 119228072Sbapt} 120228072Sbapt 121228072Sbapt 122228072Sbapt 123228072Sbapt/* ccl_set_diff - create a new ccl as the set difference of the two given ccls. */ 124228072Sbaptint 125228072Sbaptccl_set_diff (int a, int b) 126228072Sbapt{ 127228072Sbapt int d, ch; 128228072Sbapt 129228072Sbapt /* create new class */ 130228072Sbapt d = cclinit(); 131228072Sbapt 132228072Sbapt /* In order to handle negation, we spin through all possible chars, 133228072Sbapt * addding each char in a that is not in b. 134228072Sbapt * (This could be O(n^2), but n is small and bounded.) 135228072Sbapt */ 136228072Sbapt for ( ch = 0; ch < csize; ++ch ) 137228072Sbapt if (ccl_contains (a, ch) && !ccl_contains(b, ch)) 138228072Sbapt ccladd (d, ch); 139228072Sbapt 140228072Sbapt /* debug */ 141228072Sbapt if (0){ 142228072Sbapt fprintf(stderr, "ccl_set_diff ("); 143228072Sbapt fprintf(stderr, "\n "); 144228072Sbapt dump_cclp (stderr, a); 145228072Sbapt fprintf(stderr, "\n "); 146228072Sbapt dump_cclp (stderr, b); 147228072Sbapt fprintf(stderr, "\n "); 148228072Sbapt dump_cclp (stderr, d); 149228072Sbapt fprintf(stderr, "\n)\n"); 150228072Sbapt } 151228072Sbapt return d; 152228072Sbapt} 153228072Sbapt 154228072Sbapt/* ccl_set_union - create a new ccl as the set union of the two given ccls. */ 155228072Sbaptint 156228072Sbaptccl_set_union (int a, int b) 157228072Sbapt{ 158228072Sbapt int d, i; 159228072Sbapt 160228072Sbapt /* create new class */ 161228072Sbapt d = cclinit(); 162228072Sbapt 163228072Sbapt /* Add all of a */ 164228072Sbapt for (i = 0; i < ccllen[a]; ++i) 165228072Sbapt ccladd (d, ccltbl[cclmap[a] + i]); 166228072Sbapt 167228072Sbapt /* Add all of b */ 168228072Sbapt for (i = 0; i < ccllen[b]; ++i) 169228072Sbapt ccladd (d, ccltbl[cclmap[b] + i]); 170228072Sbapt 171228072Sbapt /* debug */ 172228072Sbapt if (0){ 173228072Sbapt fprintf(stderr, "ccl_set_union (%d + %d = %d", a, b, d); 174228072Sbapt fprintf(stderr, "\n "); 175228072Sbapt dump_cclp (stderr, a); 176228072Sbapt fprintf(stderr, "\n "); 177228072Sbapt dump_cclp (stderr, b); 178228072Sbapt fprintf(stderr, "\n "); 179228072Sbapt dump_cclp (stderr, d); 180228072Sbapt fprintf(stderr, "\n)\n"); 181228072Sbapt } 182228072Sbapt return d; 183228072Sbapt} 184228072Sbapt 185228072Sbapt 186228072Sbapt/* cclinit - return an empty ccl */ 187228072Sbapt 188228072Sbaptint cclinit () 189228072Sbapt{ 190228072Sbapt if (++lastccl >= current_maxccls) { 191228072Sbapt current_maxccls += MAX_CCLS_INCREMENT; 192228072Sbapt 193228072Sbapt ++num_reallocs; 194228072Sbapt 195228072Sbapt cclmap = 196228072Sbapt reallocate_integer_array (cclmap, current_maxccls); 197228072Sbapt ccllen = 198228072Sbapt reallocate_integer_array (ccllen, current_maxccls); 199228072Sbapt cclng = reallocate_integer_array (cclng, current_maxccls); 200228072Sbapt ccl_has_nl = 201228072Sbapt reallocate_bool_array (ccl_has_nl, 202228072Sbapt current_maxccls); 203228072Sbapt } 204228072Sbapt 205228072Sbapt if (lastccl == 1) 206228072Sbapt /* we're making the first ccl */ 207228072Sbapt cclmap[lastccl] = 0; 208228072Sbapt 209228072Sbapt else 210228072Sbapt /* The new pointer is just past the end of the last ccl. 211228072Sbapt * Since the cclmap points to the \first/ character of a 212228072Sbapt * ccl, adding the length of the ccl to the cclmap pointer 213228072Sbapt * will produce a cursor to the first free space. 214228072Sbapt */ 215228072Sbapt cclmap[lastccl] = 216228072Sbapt cclmap[lastccl - 1] + ccllen[lastccl - 1]; 217228072Sbapt 218228072Sbapt ccllen[lastccl] = 0; 219228072Sbapt cclng[lastccl] = 0; /* ccl's start out life un-negated */ 220228072Sbapt ccl_has_nl[lastccl] = false; 221228072Sbapt 222228072Sbapt return lastccl; 223228072Sbapt} 224228072Sbapt 225228072Sbapt 226228072Sbapt/* cclnegate - negate the given ccl */ 227228072Sbapt 228228072Sbaptvoid cclnegate (cclp) 229228072Sbapt int cclp; 230228072Sbapt{ 231228072Sbapt cclng[cclp] = 1; 232228072Sbapt ccl_has_nl[cclp] = !ccl_has_nl[cclp]; 233228072Sbapt} 234228072Sbapt 235228072Sbapt 236228072Sbapt/* list_character_set - list the members of a set of characters in CCL form 237228072Sbapt * 238228072Sbapt * Writes to the given file a character-class representation of those 239228072Sbapt * characters present in the given CCL. A character is present if it 240228072Sbapt * has a non-zero value in the cset array. 241228072Sbapt */ 242228072Sbapt 243228072Sbaptvoid list_character_set (file, cset) 244228072Sbapt FILE *file; 245228072Sbapt int cset[]; 246228072Sbapt{ 247250874Sjkim int i; 248228072Sbapt 249228072Sbapt putc ('[', file); 250228072Sbapt 251228072Sbapt for (i = 0; i < csize; ++i) { 252228072Sbapt if (cset[i]) { 253250874Sjkim int start_char = i; 254228072Sbapt 255228072Sbapt putc (' ', file); 256228072Sbapt 257228072Sbapt fputs (readable_form (i), file); 258228072Sbapt 259228072Sbapt while (++i < csize && cset[i]) ; 260228072Sbapt 261228072Sbapt if (i - 1 > start_char) 262228072Sbapt /* this was a run */ 263228072Sbapt fprintf (file, "-%s", 264228072Sbapt readable_form (i - 1)); 265228072Sbapt 266228072Sbapt putc (' ', file); 267228072Sbapt } 268228072Sbapt } 269228072Sbapt 270228072Sbapt putc (']', file); 271228072Sbapt} 272228072Sbapt 273228072Sbapt/** Determines if the range [c1-c2] is unambiguous in a case-insensitive 274228072Sbapt * scanner. Specifically, if a lowercase or uppercase character, x, is in the 275228072Sbapt * range [c1-c2], then we require that UPPERCASE(x) and LOWERCASE(x) must also 276228072Sbapt * be in the range. If not, then this range is ambiguous, and the function 277228072Sbapt * returns false. For example, [@-_] spans [a-z] but not [A-Z]. Beware that 278228072Sbapt * [a-z] will be labeled ambiguous because it does not include [A-Z]. 279228072Sbapt * 280228072Sbapt * @param c1 the lower end of the range 281228072Sbapt * @param c2 the upper end of the range 282228072Sbapt * @return true if [c1-c2] is not ambiguous for a caseless scanner. 283228072Sbapt */ 284228072Sbaptbool range_covers_case (int c1, int c2) 285228072Sbapt{ 286228072Sbapt int i, o; 287228072Sbapt 288228072Sbapt for (i = c1; i <= c2; i++) { 289228072Sbapt if (has_case (i)) { 290228072Sbapt o = reverse_case (i); 291228072Sbapt if (o < c1 || c2 < o) 292228072Sbapt return false; 293228072Sbapt } 294228072Sbapt } 295228072Sbapt return true; 296228072Sbapt} 297228072Sbapt 298228072Sbapt/** Reverse the case of a character, if possible. 299228072Sbapt * @return c if case-reversal does not apply. 300228072Sbapt */ 301228072Sbaptint reverse_case (int c) 302228072Sbapt{ 303228072Sbapt return isupper (c) ? tolower (c) : (islower (c) ? toupper (c) : c); 304228072Sbapt} 305228072Sbapt 306228072Sbapt/** Return true if c is uppercase or lowercase. */ 307228072Sbaptbool has_case (int c) 308228072Sbapt{ 309228072Sbapt return (isupper (c) || islower (c)) ? true : false; 310228072Sbapt} 311