utf_validate.c revision 299742
1/* 2 * utf_validate.c: Validate a UTF-8 string 3 * 4 * ==================================================================== 5 * Licensed to the Apache Software Foundation (ASF) under one 6 * or more contributor license agreements. See the NOTICE file 7 * distributed with this work for additional information 8 * regarding copyright ownership. The ASF licenses this file 9 * to you under the Apache License, Version 2.0 (the 10 * "License"); you may not use this file except in compliance 11 * with the License. You may obtain a copy of the License at 12 * 13 * http://www.apache.org/licenses/LICENSE-2.0 14 * 15 * Unless required by applicable law or agreed to in writing, 16 * software distributed under the License is distributed on an 17 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 18 * KIND, either express or implied. See the License for the 19 * specific language governing permissions and limitations 20 * under the License. 21 * ==================================================================== 22 */ 23 24/* Validate a UTF-8 string according to the rules in 25 * 26 * Table 3-6. Well-Formed UTF-8 Bytes Sequences 27 * 28 * in 29 * 30 * The Unicode Standard, Version 4.0 31 * 32 * which is available at 33 * 34 * http://www.unicode.org/ 35 * 36 * UTF-8 was originally defined in RFC-2279, Unicode's "well-formed UTF-8" 37 * is a subset of that enconding. The Unicode enconding prohibits things 38 * like non-shortest encodings (some characters can be represented by more 39 * than one multi-byte encoding) and the encodings for the surrogate code 40 * points. RFC-3629 superceeds RFC-2279 and adopts the same well-formed 41 * rules as Unicode. This is the ABNF in RFC-3629 that describes 42 * well-formed UTF-8 rules: 43 * 44 * UTF8-octets = *( UTF8-char ) 45 * UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4 46 * UTF8-1 = %x00-7F 47 * UTF8-2 = %xC2-DF UTF8-tail 48 * UTF8-3 = %xE0 %xA0-BF UTF8-tail / 49 * %xE1-EC 2( UTF8-tail ) / 50 * %xED %x80-9F UTF8-tail / 51 * %xEE-EF 2( UTF8-tail ) 52 * UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / 53 * %xF1-F3 3( UTF8-tail ) / 54 * %xF4 %x80-8F 2( UTF8-tail ) 55 * UTF8-tail = %x80-BF 56 * 57 */ 58 59#include "private/svn_utf_private.h" 60#include "private/svn_eol_private.h" 61#include "private/svn_dep_compat.h" 62 63/* Lookup table to categorise each octet in the string. */ 64static const char octet_category[256] = { 65 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00-0x7f */ 66 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 67 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 68 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 69 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 70 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 71 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 72 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 73 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80-0x8f */ 74 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x90-0x9f */ 75 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xa0-0xbf */ 76 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 77 4, 4, /* 0xc0-0xc1 */ 78 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, /* 0xc2-0xdf */ 79 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 80 6, /* 0xe0 */ 81 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 0xe1-0xec */ 82 8, /* 0xed */ 83 9, 9, /* 0xee-0xef */ 84 10, /* 0xf0 */ 85 11, 11, 11, /* 0xf1-0xf3 */ 86 12, /* 0xf4 */ 87 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13 /* 0xf5-0xff */ 88}; 89 90/* Machine states */ 91#define FSM_START 0 92#define FSM_80BF 1 93#define FSM_A0BF 2 94#define FSM_80BF80BF 3 95#define FSM_809F 4 96#define FSM_90BF 5 97#define FSM_80BF80BF80BF 6 98#define FSM_808F 7 99#define FSM_ERROR 8 100 101/* In the FSM it appears that categories 0xc0-0xc1 and 0xf5-0xff make the 102 same transitions, as do categories 0xe1-0xec and 0xee-0xef. I wonder if 103 there is any great benefit in combining categories? It would reduce the 104 memory footprint of the transition table by 16 bytes, but might it be 105 harder to understand? */ 106 107/* Machine transition table */ 108static const char machine [9][14] = { 109 /* FSM_START */ 110 {FSM_START, /* 0x00-0x7f */ 111 FSM_ERROR, /* 0x80-0x8f */ 112 FSM_ERROR, /* 0x90-0x9f */ 113 FSM_ERROR, /* 0xa0-0xbf */ 114 FSM_ERROR, /* 0xc0-0xc1 */ 115 FSM_80BF, /* 0xc2-0xdf */ 116 FSM_A0BF, /* 0xe0 */ 117 FSM_80BF80BF, /* 0xe1-0xec */ 118 FSM_809F, /* 0xed */ 119 FSM_80BF80BF, /* 0xee-0xef */ 120 FSM_90BF, /* 0xf0 */ 121 FSM_80BF80BF80BF, /* 0xf1-0xf3 */ 122 FSM_808F, /* 0xf4 */ 123 FSM_ERROR}, /* 0xf5-0xff */ 124 125 /* FSM_80BF */ 126 {FSM_ERROR, /* 0x00-0x7f */ 127 FSM_START, /* 0x80-0x8f */ 128 FSM_START, /* 0x90-0x9f */ 129 FSM_START, /* 0xa0-0xbf */ 130 FSM_ERROR, /* 0xc0-0xc1 */ 131 FSM_ERROR, /* 0xc2-0xdf */ 132 FSM_ERROR, /* 0xe0 */ 133 FSM_ERROR, /* 0xe1-0xec */ 134 FSM_ERROR, /* 0xed */ 135 FSM_ERROR, /* 0xee-0xef */ 136 FSM_ERROR, /* 0xf0 */ 137 FSM_ERROR, /* 0xf1-0xf3 */ 138 FSM_ERROR, /* 0xf4 */ 139 FSM_ERROR}, /* 0xf5-0xff */ 140 141 /* FSM_A0BF */ 142 {FSM_ERROR, /* 0x00-0x7f */ 143 FSM_ERROR, /* 0x80-0x8f */ 144 FSM_ERROR, /* 0x90-0x9f */ 145 FSM_80BF, /* 0xa0-0xbf */ 146 FSM_ERROR, /* 0xc0-0xc1 */ 147 FSM_ERROR, /* 0xc2-0xdf */ 148 FSM_ERROR, /* 0xe0 */ 149 FSM_ERROR, /* 0xe1-0xec */ 150 FSM_ERROR, /* 0xed */ 151 FSM_ERROR, /* 0xee-0xef */ 152 FSM_ERROR, /* 0xf0 */ 153 FSM_ERROR, /* 0xf1-0xf3 */ 154 FSM_ERROR, /* 0xf4 */ 155 FSM_ERROR}, /* 0xf5-0xff */ 156 157 /* FSM_80BF80BF */ 158 {FSM_ERROR, /* 0x00-0x7f */ 159 FSM_80BF, /* 0x80-0x8f */ 160 FSM_80BF, /* 0x90-0x9f */ 161 FSM_80BF, /* 0xa0-0xbf */ 162 FSM_ERROR, /* 0xc0-0xc1 */ 163 FSM_ERROR, /* 0xc2-0xdf */ 164 FSM_ERROR, /* 0xe0 */ 165 FSM_ERROR, /* 0xe1-0xec */ 166 FSM_ERROR, /* 0xed */ 167 FSM_ERROR, /* 0xee-0xef */ 168 FSM_ERROR, /* 0xf0 */ 169 FSM_ERROR, /* 0xf1-0xf3 */ 170 FSM_ERROR, /* 0xf4 */ 171 FSM_ERROR}, /* 0xf5-0xff */ 172 173 /* FSM_809F */ 174 {FSM_ERROR, /* 0x00-0x7f */ 175 FSM_80BF, /* 0x80-0x8f */ 176 FSM_80BF, /* 0x90-0x9f */ 177 FSM_ERROR, /* 0xa0-0xbf */ 178 FSM_ERROR, /* 0xc0-0xc1 */ 179 FSM_ERROR, /* 0xc2-0xdf */ 180 FSM_ERROR, /* 0xe0 */ 181 FSM_ERROR, /* 0xe1-0xec */ 182 FSM_ERROR, /* 0xed */ 183 FSM_ERROR, /* 0xee-0xef */ 184 FSM_ERROR, /* 0xf0 */ 185 FSM_ERROR, /* 0xf1-0xf3 */ 186 FSM_ERROR, /* 0xf4 */ 187 FSM_ERROR}, /* 0xf5-0xff */ 188 189 /* FSM_90BF */ 190 {FSM_ERROR, /* 0x00-0x7f */ 191 FSM_ERROR, /* 0x80-0x8f */ 192 FSM_80BF80BF, /* 0x90-0x9f */ 193 FSM_80BF80BF, /* 0xa0-0xbf */ 194 FSM_ERROR, /* 0xc0-0xc1 */ 195 FSM_ERROR, /* 0xc2-0xdf */ 196 FSM_ERROR, /* 0xe0 */ 197 FSM_ERROR, /* 0xe1-0xec */ 198 FSM_ERROR, /* 0xed */ 199 FSM_ERROR, /* 0xee-0xef */ 200 FSM_ERROR, /* 0xf0 */ 201 FSM_ERROR, /* 0xf1-0xf3 */ 202 FSM_ERROR, /* 0xf4 */ 203 FSM_ERROR}, /* 0xf5-0xff */ 204 205 /* FSM_80BF80BF80BF */ 206 {FSM_ERROR, /* 0x00-0x7f */ 207 FSM_80BF80BF, /* 0x80-0x8f */ 208 FSM_80BF80BF, /* 0x90-0x9f */ 209 FSM_80BF80BF, /* 0xa0-0xbf */ 210 FSM_ERROR, /* 0xc0-0xc1 */ 211 FSM_ERROR, /* 0xc2-0xdf */ 212 FSM_ERROR, /* 0xe0 */ 213 FSM_ERROR, /* 0xe1-0xec */ 214 FSM_ERROR, /* 0xed */ 215 FSM_ERROR, /* 0xee-0xef */ 216 FSM_ERROR, /* 0xf0 */ 217 FSM_ERROR, /* 0xf1-0xf3 */ 218 FSM_ERROR, /* 0xf4 */ 219 FSM_ERROR}, /* 0xf5-0xff */ 220 221 /* FSM_808F */ 222 {FSM_ERROR, /* 0x00-0x7f */ 223 FSM_80BF80BF, /* 0x80-0x8f */ 224 FSM_ERROR, /* 0x90-0x9f */ 225 FSM_ERROR, /* 0xa0-0xbf */ 226 FSM_ERROR, /* 0xc0-0xc1 */ 227 FSM_ERROR, /* 0xc2-0xdf */ 228 FSM_ERROR, /* 0xe0 */ 229 FSM_ERROR, /* 0xe1-0xec */ 230 FSM_ERROR, /* 0xed */ 231 FSM_ERROR, /* 0xee-0xef */ 232 FSM_ERROR, /* 0xf0 */ 233 FSM_ERROR, /* 0xf1-0xf3 */ 234 FSM_ERROR, /* 0xf4 */ 235 FSM_ERROR}, /* 0xf5-0xff */ 236 237 /* FSM_ERROR */ 238 {FSM_ERROR, /* 0x00-0x7f */ 239 FSM_ERROR, /* 0x80-0x8f */ 240 FSM_ERROR, /* 0x90-0x9f */ 241 FSM_ERROR, /* 0xa0-0xbf */ 242 FSM_ERROR, /* 0xc0-0xc1 */ 243 FSM_ERROR, /* 0xc2-0xdf */ 244 FSM_ERROR, /* 0xe0 */ 245 FSM_ERROR, /* 0xe1-0xec */ 246 FSM_ERROR, /* 0xed */ 247 FSM_ERROR, /* 0xee-0xef */ 248 FSM_ERROR, /* 0xf0 */ 249 FSM_ERROR, /* 0xf1-0xf3 */ 250 FSM_ERROR, /* 0xf4 */ 251 FSM_ERROR}, /* 0xf5-0xff */ 252}; 253 254/* Scan MAX_LEN bytes in *DATA for chars that are not in the octet 255 * category 0 (FSM_START). Return the position of the first such char 256 * or DATA + MAX_LEN if all were cat 0. 257 */ 258static const char * 259first_non_fsm_start_char(const char *data, apr_size_t max_len) 260{ 261#if SVN_UNALIGNED_ACCESS_IS_OK 262 263 /* Scan the input one machine word at a time. */ 264 for (; max_len > sizeof(apr_uintptr_t) 265 ; data += sizeof(apr_uintptr_t), max_len -= sizeof(apr_uintptr_t)) 266 if (*(const apr_uintptr_t *)data & SVN__BIT_7_SET) 267 break; 268 269#endif 270 271 /* The remaining odd bytes will be examined the naive way: */ 272 for (; max_len > 0; ++data, --max_len) 273 if ((unsigned char)*data >= 0x80) 274 break; 275 276 return data; 277} 278 279const char * 280svn_utf__last_valid(const char *data, apr_size_t len) 281{ 282 const char *start = first_non_fsm_start_char(data, len); 283 const char *end = data + len; 284 int state = FSM_START; 285 286 data = start; 287 while (data < end) 288 { 289 unsigned char octet = *data++; 290 int category = octet_category[octet]; 291 state = machine[state][category]; 292 if (state == FSM_START) 293 start = data; 294 } 295 return start; 296} 297 298svn_boolean_t 299svn_utf__cstring_is_valid(const char *data) 300{ 301 if (!data) 302 return FALSE; 303 304 return svn_utf__is_valid(data, strlen(data)); 305} 306 307svn_boolean_t 308svn_utf__is_valid(const char *data, apr_size_t len) 309{ 310 const char *end = data + len; 311 int state = FSM_START; 312 313 if (!data) 314 return FALSE; 315 316 data = first_non_fsm_start_char(data, len); 317 318 while (data < end) 319 { 320 unsigned char octet = *data++; 321 int category = octet_category[octet]; 322 state = machine[state][category]; 323 } 324 return state == FSM_START; 325} 326 327const char * 328svn_utf__last_valid2(const char *data, apr_size_t len) 329{ 330 const char *start = first_non_fsm_start_char(data, len); 331 const char *end = data + len; 332 int state = FSM_START; 333 334 data = start; 335 while (data < end) 336 { 337 unsigned char octet = *data++; 338 switch (state) 339 { 340 case FSM_START: 341 if (octet <= 0x7F) 342 break; 343 else if (octet <= 0xC1) 344 state = FSM_ERROR; 345 else if (octet <= 0xDF) 346 state = FSM_80BF; 347 else if (octet == 0xE0) 348 state = FSM_A0BF; 349 else if (octet <= 0xEC) 350 state = FSM_80BF80BF; 351 else if (octet == 0xED) 352 state = FSM_809F; 353 else if (octet <= 0xEF) 354 state = FSM_80BF80BF; 355 else if (octet == 0xF0) 356 state = FSM_90BF; 357 else if (octet <= 0xF3) 358 state = FSM_80BF80BF80BF; 359 else if (octet <= 0xF4) 360 state = FSM_808F; 361 else 362 state = FSM_ERROR; 363 break; 364 case FSM_80BF: 365 if (octet >= 0x80 && octet <= 0xBF) 366 state = FSM_START; 367 else 368 state = FSM_ERROR; 369 break; 370 case FSM_A0BF: 371 if (octet >= 0xA0 && octet <= 0xBF) 372 state = FSM_80BF; 373 else 374 state = FSM_ERROR; 375 break; 376 case FSM_80BF80BF: 377 if (octet >= 0x80 && octet <= 0xBF) 378 state = FSM_80BF; 379 else 380 state = FSM_ERROR; 381 break; 382 case FSM_809F: 383 if (octet >= 0x80 && octet <= 0x9F) 384 state = FSM_80BF; 385 else 386 state = FSM_ERROR; 387 break; 388 case FSM_90BF: 389 if (octet >= 0x90 && octet <= 0xBF) 390 state = FSM_80BF80BF; 391 else 392 state = FSM_ERROR; 393 break; 394 case FSM_80BF80BF80BF: 395 if (octet >= 0x80 && octet <= 0xBF) 396 state = FSM_80BF80BF; 397 else 398 state = FSM_ERROR; 399 break; 400 case FSM_808F: 401 if (octet >= 0x80 && octet <= 0x8F) 402 state = FSM_80BF80BF; 403 else 404 state = FSM_ERROR; 405 break; 406 default: 407 case FSM_ERROR: 408 return start; 409 } 410 if (state == FSM_START) 411 start = data; 412 } 413 return start; 414} 415