1/* 2 * utf_validate.c: Validate a UTF-8 string 3 * 4 * ==================================================================== 5 * Licensed to the Apache Software Foundation (ASF) under one 6 * or more contributor license agreements. See the NOTICE file 7 * distributed with this work for additional information 8 * regarding copyright ownership. The ASF licenses this file 9 * to you under the Apache License, Version 2.0 (the 10 * "License"); you may not use this file except in compliance 11 * with the License. You may obtain a copy of the License at 12 * 13 * http://www.apache.org/licenses/LICENSE-2.0 14 * 15 * Unless required by applicable law or agreed to in writing, 16 * software distributed under the License is distributed on an 17 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 18 * KIND, either express or implied. See the License for the 19 * specific language governing permissions and limitations 20 * under the License. 21 * ==================================================================== 22 */ 23 24/* Validate a UTF-8 string according to the rules in 25 * 26 * Table 3-6. Well-Formed UTF-8 Bytes Sequences 27 * 28 * in 29 * 30 * The Unicode Standard, Version 4.0 31 * 32 * which is available at 33 * 34 * http://www.unicode.org/ 35 * 36 * UTF-8 was originally defined in RFC-2279, Unicode's "well-formed UTF-8" 37 * is a subset of that enconding. The Unicode enconding prohibits things 38 * like non-shortest encodings (some characters can be represented by more 39 * than one multi-byte encoding) and the encodings for the surrogate code 40 * points. RFC-3629 superceeds RFC-2279 and adopts the same well-formed 41 * rules as Unicode. This is the ABNF in RFC-3629 that describes 42 * well-formed UTF-8 rules: 43 * 44 * UTF8-octets = *( UTF8-char ) 45 * UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4 46 * UTF8-1 = %x00-7F 47 * UTF8-2 = %xC2-DF UTF8-tail 48 * UTF8-3 = %xE0 %xA0-BF UTF8-tail / 49 * %xE1-EC 2( UTF8-tail ) / 50 * %xED %x80-9F UTF8-tail / 51 * %xEE-EF 2( UTF8-tail ) 52 * UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / 53 * %xF1-F3 3( UTF8-tail ) / 54 * %xF4 %x80-8F 2( UTF8-tail ) 55 * UTF8-tail = %x80-BF 56 * 57 */ 58 59#include "private/svn_utf_private.h" 60#include "private/svn_eol_private.h" 61#include "private/svn_dep_compat.h" 62 63/* Lookup table to categorise each octet in the string. */ 64static const char octet_category[256] = { 65 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00-0x7f */ 66 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 67 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 68 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 69 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 70 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 71 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 72 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 73 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80-0x8f */ 74 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x90-0x9f */ 75 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xa0-0xbf */ 76 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 77 4, 4, /* 0xc0-0xc1 */ 78 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, /* 0xc2-0xdf */ 79 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 80 6, /* 0xe0 */ 81 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 0xe1-0xec */ 82 8, /* 0xed */ 83 9, 9, /* 0xee-0xef */ 84 10, /* 0xf0 */ 85 11, 11, 11, /* 0xf1-0xf3 */ 86 12, /* 0xf4 */ 87 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13 /* 0xf5-0xff */ 88}; 89 90/* Machine states */ 91#define FSM_START 0 92#define FSM_80BF 1 93#define FSM_A0BF 2 94#define FSM_80BF80BF 3 95#define FSM_809F 4 96#define FSM_90BF 5 97#define FSM_80BF80BF80BF 6 98#define FSM_808F 7 99#define FSM_ERROR 8 100 101/* In the FSM it appears that categories 0xc0-0xc1 and 0xf5-0xff make the 102 same transitions, as do categories 0xe1-0xec and 0xee-0xef. I wonder if 103 there is any great benefit in combining categories? It would reduce the 104 memory footprint of the transition table by 16 bytes, but might it be 105 harder to understand? */ 106 107/* Machine transition table */ 108static const char machine [9][14] = { 109 /* FSM_START */ 110 {FSM_START, /* 0x00-0x7f */ 111 FSM_ERROR, /* 0x80-0x8f */ 112 FSM_ERROR, /* 0x90-0x9f */ 113 FSM_ERROR, /* 0xa0-0xbf */ 114 FSM_ERROR, /* 0xc0-0xc1 */ 115 FSM_80BF, /* 0xc2-0xdf */ 116 FSM_A0BF, /* 0xe0 */ 117 FSM_80BF80BF, /* 0xe1-0xec */ 118 FSM_809F, /* 0xed */ 119 FSM_80BF80BF, /* 0xee-0xef */ 120 FSM_90BF, /* 0xf0 */ 121 FSM_80BF80BF80BF, /* 0xf1-0xf3 */ 122 FSM_808F, /* 0xf4 */ 123 FSM_ERROR}, /* 0xf5-0xff */ 124 125 /* FSM_80BF */ 126 {FSM_ERROR, /* 0x00-0x7f */ 127 FSM_START, /* 0x80-0x8f */ 128 FSM_START, /* 0x90-0x9f */ 129 FSM_START, /* 0xa0-0xbf */ 130 FSM_ERROR, /* 0xc0-0xc1 */ 131 FSM_ERROR, /* 0xc2-0xdf */ 132 FSM_ERROR, /* 0xe0 */ 133 FSM_ERROR, /* 0xe1-0xec */ 134 FSM_ERROR, /* 0xed */ 135 FSM_ERROR, /* 0xee-0xef */ 136 FSM_ERROR, /* 0xf0 */ 137 FSM_ERROR, /* 0xf1-0xf3 */ 138 FSM_ERROR, /* 0xf4 */ 139 FSM_ERROR}, /* 0xf5-0xff */ 140 141 /* FSM_A0BF */ 142 {FSM_ERROR, /* 0x00-0x7f */ 143 FSM_ERROR, /* 0x80-0x8f */ 144 FSM_ERROR, /* 0x90-0x9f */ 145 FSM_80BF, /* 0xa0-0xbf */ 146 FSM_ERROR, /* 0xc0-0xc1 */ 147 FSM_ERROR, /* 0xc2-0xdf */ 148 FSM_ERROR, /* 0xe0 */ 149 FSM_ERROR, /* 0xe1-0xec */ 150 FSM_ERROR, /* 0xed */ 151 FSM_ERROR, /* 0xee-0xef */ 152 FSM_ERROR, /* 0xf0 */ 153 FSM_ERROR, /* 0xf1-0xf3 */ 154 FSM_ERROR, /* 0xf4 */ 155 FSM_ERROR}, /* 0xf5-0xff */ 156 157 /* FSM_80BF80BF */ 158 {FSM_ERROR, /* 0x00-0x7f */ 159 FSM_80BF, /* 0x80-0x8f */ 160 FSM_80BF, /* 0x90-0x9f */ 161 FSM_80BF, /* 0xa0-0xbf */ 162 FSM_ERROR, /* 0xc0-0xc1 */ 163 FSM_ERROR, /* 0xc2-0xdf */ 164 FSM_ERROR, /* 0xe0 */ 165 FSM_ERROR, /* 0xe1-0xec */ 166 FSM_ERROR, /* 0xed */ 167 FSM_ERROR, /* 0xee-0xef */ 168 FSM_ERROR, /* 0xf0 */ 169 FSM_ERROR, /* 0xf1-0xf3 */ 170 FSM_ERROR, /* 0xf4 */ 171 FSM_ERROR}, /* 0xf5-0xff */ 172 173 /* FSM_809F */ 174 {FSM_ERROR, /* 0x00-0x7f */ 175 FSM_80BF, /* 0x80-0x8f */ 176 FSM_80BF, /* 0x90-0x9f */ 177 FSM_ERROR, /* 0xa0-0xbf */ 178 FSM_ERROR, /* 0xc0-0xc1 */ 179 FSM_ERROR, /* 0xc2-0xdf */ 180 FSM_ERROR, /* 0xe0 */ 181 FSM_ERROR, /* 0xe1-0xec */ 182 FSM_ERROR, /* 0xed */ 183 FSM_ERROR, /* 0xee-0xef */ 184 FSM_ERROR, /* 0xf0 */ 185 FSM_ERROR, /* 0xf1-0xf3 */ 186 FSM_ERROR, /* 0xf4 */ 187 FSM_ERROR}, /* 0xf5-0xff */ 188 189 /* FSM_90BF */ 190 {FSM_ERROR, /* 0x00-0x7f */ 191 FSM_ERROR, /* 0x80-0x8f */ 192 FSM_80BF80BF, /* 0x90-0x9f */ 193 FSM_80BF80BF, /* 0xa0-0xbf */ 194 FSM_ERROR, /* 0xc0-0xc1 */ 195 FSM_ERROR, /* 0xc2-0xdf */ 196 FSM_ERROR, /* 0xe0 */ 197 FSM_ERROR, /* 0xe1-0xec */ 198 FSM_ERROR, /* 0xed */ 199 FSM_ERROR, /* 0xee-0xef */ 200 FSM_ERROR, /* 0xf0 */ 201 FSM_ERROR, /* 0xf1-0xf3 */ 202 FSM_ERROR, /* 0xf4 */ 203 FSM_ERROR}, /* 0xf5-0xff */ 204 205 /* FSM_80BF80BF80BF */ 206 {FSM_ERROR, /* 0x00-0x7f */ 207 FSM_80BF80BF, /* 0x80-0x8f */ 208 FSM_80BF80BF, /* 0x90-0x9f */ 209 FSM_80BF80BF, /* 0xa0-0xbf */ 210 FSM_ERROR, /* 0xc0-0xc1 */ 211 FSM_ERROR, /* 0xc2-0xdf */ 212 FSM_ERROR, /* 0xe0 */ 213 FSM_ERROR, /* 0xe1-0xec */ 214 FSM_ERROR, /* 0xed */ 215 FSM_ERROR, /* 0xee-0xef */ 216 FSM_ERROR, /* 0xf0 */ 217 FSM_ERROR, /* 0xf1-0xf3 */ 218 FSM_ERROR, /* 0xf4 */ 219 FSM_ERROR}, /* 0xf5-0xff */ 220 221 /* FSM_808F */ 222 {FSM_ERROR, /* 0x00-0x7f */ 223 FSM_80BF80BF, /* 0x80-0x8f */ 224 FSM_ERROR, /* 0x90-0x9f */ 225 FSM_ERROR, /* 0xa0-0xbf */ 226 FSM_ERROR, /* 0xc0-0xc1 */ 227 FSM_ERROR, /* 0xc2-0xdf */ 228 FSM_ERROR, /* 0xe0 */ 229 FSM_ERROR, /* 0xe1-0xec */ 230 FSM_ERROR, /* 0xed */ 231 FSM_ERROR, /* 0xee-0xef */ 232 FSM_ERROR, /* 0xf0 */ 233 FSM_ERROR, /* 0xf1-0xf3 */ 234 FSM_ERROR, /* 0xf4 */ 235 FSM_ERROR}, /* 0xf5-0xff */ 236 237 /* FSM_ERROR */ 238 {FSM_ERROR, /* 0x00-0x7f */ 239 FSM_ERROR, /* 0x80-0x8f */ 240 FSM_ERROR, /* 0x90-0x9f */ 241 FSM_ERROR, /* 0xa0-0xbf */ 242 FSM_ERROR, /* 0xc0-0xc1 */ 243 FSM_ERROR, /* 0xc2-0xdf */ 244 FSM_ERROR, /* 0xe0 */ 245 FSM_ERROR, /* 0xe1-0xec */ 246 FSM_ERROR, /* 0xed */ 247 FSM_ERROR, /* 0xee-0xef */ 248 FSM_ERROR, /* 0xf0 */ 249 FSM_ERROR, /* 0xf1-0xf3 */ 250 FSM_ERROR, /* 0xf4 */ 251 FSM_ERROR}, /* 0xf5-0xff */ 252}; 253 254/* Scan MAX_LEN bytes in *DATA for chars that are not in the octet 255 * category 0 (FSM_START). Return the position of the first such char 256 * or DATA + MAX_LEN if all were cat 0. 257 */ 258static const char * 259first_non_fsm_start_char(const char *data, apr_size_t max_len) 260{ 261#if !SVN_UNALIGNED_ACCESS_IS_OK 262 263 /* On some systems, we need to make sure that buf is properly aligned 264 * for chunky data access. 265 */ 266 if ((apr_uintptr_t)data & (sizeof(apr_uintptr_t)-1)) 267 { 268 apr_size_t len = (~(apr_uintptr_t)data) & (sizeof(apr_uintptr_t)-1); 269 if (len > max_len) 270 len = max_len; 271 max_len -= len; 272 273 for (; len > 0; ++data, --len) 274 if (*data < 0 || *data >= 0x80) 275 return data; 276 } 277 278#endif 279 280 /* Scan the input one machine word at a time. */ 281 for (; max_len > sizeof(apr_uintptr_t) 282 ; data += sizeof(apr_uintptr_t), max_len -= sizeof(apr_uintptr_t)) 283 if (*(const apr_uintptr_t *)data & SVN__BIT_7_SET) 284 break; 285 286 /* The remaining odd bytes will be examined the naive way: */ 287 for (; max_len > 0; ++data, --max_len) 288 if (*data < 0 || *data >= 0x80) 289 break; 290 291 return data; 292} 293 294/* Scan the C string in *DATA for chars that are not in the octet 295 * category 0 (FSM_START). Return the position of either the such 296 * char or of the terminating NUL. 297 */ 298static const char * 299first_non_fsm_start_char_cstring(const char *data) 300{ 301 /* We need to make sure that BUF is properly aligned for chunky data 302 * access because we don't know the string's length. Unaligned chunk 303 * read access beyond the NUL terminator could therefore result in a 304 * segfault. 305 */ 306 for (; (apr_uintptr_t)data & (sizeof(apr_uintptr_t)-1); ++data) 307 if (*data <= 0 || *data >= 0x80) 308 return data; 309 310 /* Scan the input one machine word at a time. */ 311#ifndef SVN_UTF_NO_UNINITIALISED_ACCESS 312 /* This may read allocated but initialised bytes beyond the 313 terminating null. Any such bytes are always readable and this 314 code operates correctly whatever the uninitialised values happen 315 to be. However memory checking tools such as valgrind and GCC 316 4.8's address santitizer will object so this bit of code can be 317 disabled at compile time. */ 318 for (; ; data += sizeof(apr_uintptr_t)) 319 { 320 /* Check for non-ASCII chars: */ 321 apr_uintptr_t chunk = *(const apr_uintptr_t *)data; 322 if (chunk & SVN__BIT_7_SET) 323 break; 324 325 /* This is the well-known strlen test: */ 326 chunk |= (chunk & SVN__LOWER_7BITS_SET) + SVN__LOWER_7BITS_SET; 327 if ((chunk & SVN__BIT_7_SET) != SVN__BIT_7_SET) 328 break; 329 } 330#endif 331 332 /* The remaining odd bytes will be examined the naive way: */ 333 for (; ; ++data) 334 if (*data <= 0 || *data >= 0x80) 335 break; 336 337 return data; 338} 339 340const char * 341svn_utf__last_valid(const char *data, apr_size_t len) 342{ 343 const char *start = first_non_fsm_start_char(data, len); 344 const char *end = data + len; 345 int state = FSM_START; 346 347 data = start; 348 while (data < end) 349 { 350 unsigned char octet = *data++; 351 int category = octet_category[octet]; 352 state = machine[state][category]; 353 if (state == FSM_START) 354 start = data; 355 } 356 return start; 357} 358 359svn_boolean_t 360svn_utf__cstring_is_valid(const char *data) 361{ 362 int state = FSM_START; 363 364 if (!data) 365 return FALSE; 366 367 data = first_non_fsm_start_char_cstring(data); 368 369 while (*data) 370 { 371 unsigned char octet = *data++; 372 int category = octet_category[octet]; 373 state = machine[state][category]; 374 } 375 return state == FSM_START; 376} 377 378svn_boolean_t 379svn_utf__is_valid(const char *data, apr_size_t len) 380{ 381 const char *end = data + len; 382 int state = FSM_START; 383 384 if (!data) 385 return FALSE; 386 387 data = first_non_fsm_start_char(data, len); 388 389 while (data < end) 390 { 391 unsigned char octet = *data++; 392 int category = octet_category[octet]; 393 state = machine[state][category]; 394 } 395 return state == FSM_START; 396} 397 398const char * 399svn_utf__last_valid2(const char *data, apr_size_t len) 400{ 401 const char *start = first_non_fsm_start_char(data, len); 402 const char *end = data + len; 403 int state = FSM_START; 404 405 data = start; 406 while (data < end) 407 { 408 unsigned char octet = *data++; 409 switch (state) 410 { 411 case FSM_START: 412 if (octet <= 0x7F) 413 break; 414 else if (octet <= 0xC1) 415 state = FSM_ERROR; 416 else if (octet <= 0xDF) 417 state = FSM_80BF; 418 else if (octet == 0xE0) 419 state = FSM_A0BF; 420 else if (octet <= 0xEC) 421 state = FSM_80BF80BF; 422 else if (octet == 0xED) 423 state = FSM_809F; 424 else if (octet <= 0xEF) 425 state = FSM_80BF80BF; 426 else if (octet == 0xF0) 427 state = FSM_90BF; 428 else if (octet <= 0xF3) 429 state = FSM_80BF80BF80BF; 430 else if (octet <= 0xF4) 431 state = FSM_808F; 432 else 433 state = FSM_ERROR; 434 break; 435 case FSM_80BF: 436 if (octet >= 0x80 && octet <= 0xBF) 437 state = FSM_START; 438 else 439 state = FSM_ERROR; 440 break; 441 case FSM_A0BF: 442 if (octet >= 0xA0 && octet <= 0xBF) 443 state = FSM_80BF; 444 else 445 state = FSM_ERROR; 446 break; 447 case FSM_80BF80BF: 448 if (octet >= 0x80 && octet <= 0xBF) 449 state = FSM_80BF; 450 else 451 state = FSM_ERROR; 452 break; 453 case FSM_809F: 454 if (octet >= 0x80 && octet <= 0x9F) 455 state = FSM_80BF; 456 else 457 state = FSM_ERROR; 458 break; 459 case FSM_90BF: 460 if (octet >= 0x90 && octet <= 0xBF) 461 state = FSM_80BF80BF; 462 else 463 state = FSM_ERROR; 464 break; 465 case FSM_80BF80BF80BF: 466 if (octet >= 0x80 && octet <= 0xBF) 467 state = FSM_80BF80BF; 468 else 469 state = FSM_ERROR; 470 break; 471 case FSM_808F: 472 if (octet >= 0x80 && octet <= 0x8F) 473 state = FSM_80BF80BF; 474 else 475 state = FSM_ERROR; 476 break; 477 default: 478 case FSM_ERROR: 479 return start; 480 } 481 if (state == FSM_START) 482 start = data; 483 } 484 return start; 485} 486