1/*
2 * utf_validate.c:  Validate a UTF-8 string
3 *
4 * ====================================================================
5 *    Licensed to the Apache Software Foundation (ASF) under one
6 *    or more contributor license agreements.  See the NOTICE file
7 *    distributed with this work for additional information
8 *    regarding copyright ownership.  The ASF licenses this file
9 *    to you under the Apache License, Version 2.0 (the
10 *    "License"); you may not use this file except in compliance
11 *    with the License.  You may obtain a copy of the License at
12 *
13 *      http://www.apache.org/licenses/LICENSE-2.0
14 *
15 *    Unless required by applicable law or agreed to in writing,
16 *    software distributed under the License is distributed on an
17 *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 *    KIND, either express or implied.  See the License for the
19 *    specific language governing permissions and limitations
20 *    under the License.
21 * ====================================================================
22 */
23
24/* Validate a UTF-8 string according to the rules in
25 *
26 *    Table 3-6. Well-Formed UTF-8 Bytes Sequences
27 *
28 * in
29 *
30 *    The Unicode Standard, Version 4.0
31 *
32 * which is available at
33 *
34 *    http://www.unicode.org/
35 *
36 * UTF-8 was originally defined in RFC-2279, Unicode's "well-formed UTF-8"
37 * is a subset of that enconding.  The Unicode enconding prohibits things
38 * like non-shortest encodings (some characters can be represented by more
39 * than one multi-byte encoding) and the encodings for the surrogate code
40 * points.  RFC-3629 superceeds RFC-2279 and adopts the same well-formed
41 * rules as Unicode.  This is the ABNF in RFC-3629 that describes
42 * well-formed UTF-8 rules:
43 *
44 *   UTF8-octets = *( UTF8-char )
45 *   UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
46 *   UTF8-1      = %x00-7F
47 *   UTF8-2      = %xC2-DF UTF8-tail
48 *   UTF8-3      = %xE0 %xA0-BF UTF8-tail /
49 *                 %xE1-EC 2( UTF8-tail ) /
50 *                 %xED %x80-9F UTF8-tail /
51 *                 %xEE-EF 2( UTF8-tail )
52 *   UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) /
53 *                 %xF1-F3 3( UTF8-tail ) /
54 *                 %xF4 %x80-8F 2( UTF8-tail )
55 *   UTF8-tail   = %x80-BF
56 *
57 */
58
59#include "private/svn_utf_private.h"
60#include "private/svn_eol_private.h"
61#include "private/svn_dep_compat.h"
62
63/* Lookup table to categorise each octet in the string. */
64static const char octet_category[256] = {
65  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 0x00-0x7f */
66  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
67  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
68  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
69  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
70  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
71  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
72  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
73  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, /* 0x80-0x8f */
74  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, /* 0x90-0x9f */
75  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3, /* 0xa0-0xbf */
76  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
77  4,  4,                                                         /* 0xc0-0xc1 */
78          5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, /* 0xc2-0xdf */
79  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
80  6,                                                             /* 0xe0 */
81      7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,             /* 0xe1-0xec */
82                                                      8,         /* 0xed */
83                                                          9,  9, /* 0xee-0xef */
84  10,                                                            /* 0xf0 */
85      11, 11, 11,                                                /* 0xf1-0xf3 */
86                  12,                                            /* 0xf4 */
87                      13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13 /* 0xf5-0xff */
88};
89
90/* Machine states */
91#define FSM_START         0
92#define FSM_80BF          1
93#define FSM_A0BF          2
94#define FSM_80BF80BF      3
95#define FSM_809F          4
96#define FSM_90BF          5
97#define FSM_80BF80BF80BF  6
98#define FSM_808F          7
99#define FSM_ERROR         8
100
101/* In the FSM it appears that categories 0xc0-0xc1 and 0xf5-0xff make the
102   same transitions, as do categories 0xe1-0xec and 0xee-0xef.  I wonder if
103   there is any great benefit in combining categories?  It would reduce the
104   memory footprint of the transition table by 16 bytes, but might it be
105   harder to understand?  */
106
107/* Machine transition table */
108static const char machine [9][14] = {
109  /* FSM_START */
110  {FSM_START,         /* 0x00-0x7f */
111   FSM_ERROR,         /* 0x80-0x8f */
112   FSM_ERROR,         /* 0x90-0x9f */
113   FSM_ERROR,         /* 0xa0-0xbf */
114   FSM_ERROR,         /* 0xc0-0xc1 */
115   FSM_80BF,          /* 0xc2-0xdf */
116   FSM_A0BF,          /* 0xe0 */
117   FSM_80BF80BF,      /* 0xe1-0xec */
118   FSM_809F,          /* 0xed */
119   FSM_80BF80BF,      /* 0xee-0xef */
120   FSM_90BF,          /* 0xf0 */
121   FSM_80BF80BF80BF,  /* 0xf1-0xf3 */
122   FSM_808F,          /* 0xf4 */
123   FSM_ERROR},        /* 0xf5-0xff */
124
125  /* FSM_80BF */
126  {FSM_ERROR,         /* 0x00-0x7f */
127   FSM_START,         /* 0x80-0x8f */
128   FSM_START,         /* 0x90-0x9f */
129   FSM_START,         /* 0xa0-0xbf */
130   FSM_ERROR,         /* 0xc0-0xc1 */
131   FSM_ERROR,         /* 0xc2-0xdf */
132   FSM_ERROR,         /* 0xe0 */
133   FSM_ERROR,         /* 0xe1-0xec */
134   FSM_ERROR,         /* 0xed */
135   FSM_ERROR,         /* 0xee-0xef */
136   FSM_ERROR,         /* 0xf0 */
137   FSM_ERROR,         /* 0xf1-0xf3 */
138   FSM_ERROR,         /* 0xf4 */
139   FSM_ERROR},        /* 0xf5-0xff */
140
141  /* FSM_A0BF */
142  {FSM_ERROR,         /* 0x00-0x7f */
143   FSM_ERROR,         /* 0x80-0x8f */
144   FSM_ERROR,         /* 0x90-0x9f */
145   FSM_80BF,          /* 0xa0-0xbf */
146   FSM_ERROR,         /* 0xc0-0xc1 */
147   FSM_ERROR,         /* 0xc2-0xdf */
148   FSM_ERROR,         /* 0xe0 */
149   FSM_ERROR,         /* 0xe1-0xec */
150   FSM_ERROR,         /* 0xed */
151   FSM_ERROR,         /* 0xee-0xef */
152   FSM_ERROR,         /* 0xf0 */
153   FSM_ERROR,         /* 0xf1-0xf3 */
154   FSM_ERROR,         /* 0xf4 */
155   FSM_ERROR},        /* 0xf5-0xff */
156
157  /* FSM_80BF80BF */
158  {FSM_ERROR,         /* 0x00-0x7f */
159   FSM_80BF,          /* 0x80-0x8f */
160   FSM_80BF,          /* 0x90-0x9f */
161   FSM_80BF,          /* 0xa0-0xbf */
162   FSM_ERROR,         /* 0xc0-0xc1 */
163   FSM_ERROR,         /* 0xc2-0xdf */
164   FSM_ERROR,         /* 0xe0 */
165   FSM_ERROR,         /* 0xe1-0xec */
166   FSM_ERROR,         /* 0xed */
167   FSM_ERROR,         /* 0xee-0xef */
168   FSM_ERROR,         /* 0xf0 */
169   FSM_ERROR,         /* 0xf1-0xf3 */
170   FSM_ERROR,         /* 0xf4 */
171   FSM_ERROR},        /* 0xf5-0xff */
172
173  /* FSM_809F */
174  {FSM_ERROR,         /* 0x00-0x7f */
175   FSM_80BF,          /* 0x80-0x8f */
176   FSM_80BF,          /* 0x90-0x9f */
177   FSM_ERROR,         /* 0xa0-0xbf */
178   FSM_ERROR,         /* 0xc0-0xc1 */
179   FSM_ERROR,         /* 0xc2-0xdf */
180   FSM_ERROR,         /* 0xe0 */
181   FSM_ERROR,         /* 0xe1-0xec */
182   FSM_ERROR,         /* 0xed */
183   FSM_ERROR,         /* 0xee-0xef */
184   FSM_ERROR,         /* 0xf0 */
185   FSM_ERROR,         /* 0xf1-0xf3 */
186   FSM_ERROR,         /* 0xf4 */
187   FSM_ERROR},        /* 0xf5-0xff */
188
189  /* FSM_90BF */
190  {FSM_ERROR,         /* 0x00-0x7f */
191   FSM_ERROR,         /* 0x80-0x8f */
192   FSM_80BF80BF,      /* 0x90-0x9f */
193   FSM_80BF80BF,      /* 0xa0-0xbf */
194   FSM_ERROR,         /* 0xc0-0xc1 */
195   FSM_ERROR,         /* 0xc2-0xdf */
196   FSM_ERROR,         /* 0xe0 */
197   FSM_ERROR,         /* 0xe1-0xec */
198   FSM_ERROR,         /* 0xed */
199   FSM_ERROR,         /* 0xee-0xef */
200   FSM_ERROR,         /* 0xf0 */
201   FSM_ERROR,         /* 0xf1-0xf3 */
202   FSM_ERROR,         /* 0xf4 */
203   FSM_ERROR},        /* 0xf5-0xff */
204
205  /* FSM_80BF80BF80BF */
206  {FSM_ERROR,         /* 0x00-0x7f */
207   FSM_80BF80BF,      /* 0x80-0x8f */
208   FSM_80BF80BF,      /* 0x90-0x9f */
209   FSM_80BF80BF,      /* 0xa0-0xbf */
210   FSM_ERROR,         /* 0xc0-0xc1 */
211   FSM_ERROR,         /* 0xc2-0xdf */
212   FSM_ERROR,         /* 0xe0 */
213   FSM_ERROR,         /* 0xe1-0xec */
214   FSM_ERROR,         /* 0xed */
215   FSM_ERROR,         /* 0xee-0xef */
216   FSM_ERROR,         /* 0xf0 */
217   FSM_ERROR,         /* 0xf1-0xf3 */
218   FSM_ERROR,         /* 0xf4 */
219   FSM_ERROR},        /* 0xf5-0xff */
220
221  /* FSM_808F */
222  {FSM_ERROR,         /* 0x00-0x7f */
223   FSM_80BF80BF,      /* 0x80-0x8f */
224   FSM_ERROR,         /* 0x90-0x9f */
225   FSM_ERROR,         /* 0xa0-0xbf */
226   FSM_ERROR,         /* 0xc0-0xc1 */
227   FSM_ERROR,         /* 0xc2-0xdf */
228   FSM_ERROR,         /* 0xe0 */
229   FSM_ERROR,         /* 0xe1-0xec */
230   FSM_ERROR,         /* 0xed */
231   FSM_ERROR,         /* 0xee-0xef */
232   FSM_ERROR,         /* 0xf0 */
233   FSM_ERROR,         /* 0xf1-0xf3 */
234   FSM_ERROR,         /* 0xf4 */
235   FSM_ERROR},        /* 0xf5-0xff */
236
237  /* FSM_ERROR */
238  {FSM_ERROR,         /* 0x00-0x7f */
239   FSM_ERROR,         /* 0x80-0x8f */
240   FSM_ERROR,         /* 0x90-0x9f */
241   FSM_ERROR,         /* 0xa0-0xbf */
242   FSM_ERROR,         /* 0xc0-0xc1 */
243   FSM_ERROR,         /* 0xc2-0xdf */
244   FSM_ERROR,         /* 0xe0 */
245   FSM_ERROR,         /* 0xe1-0xec */
246   FSM_ERROR,         /* 0xed */
247   FSM_ERROR,         /* 0xee-0xef */
248   FSM_ERROR,         /* 0xf0 */
249   FSM_ERROR,         /* 0xf1-0xf3 */
250   FSM_ERROR,         /* 0xf4 */
251   FSM_ERROR},        /* 0xf5-0xff */
252};
253
254/* Scan MAX_LEN bytes in *DATA for chars that are not in the octet
255 * category 0 (FSM_START).  Return the position of the first such char
256 * or DATA + MAX_LEN if all were cat 0.
257 */
258static const char *
259first_non_fsm_start_char(const char *data, apr_size_t max_len)
260{
261#if !SVN_UNALIGNED_ACCESS_IS_OK
262
263  /* On some systems, we need to make sure that buf is properly aligned
264   * for chunky data access.
265   */
266  if ((apr_uintptr_t)data & (sizeof(apr_uintptr_t)-1))
267    {
268      apr_size_t len = (~(apr_uintptr_t)data) & (sizeof(apr_uintptr_t)-1);
269      if (len > max_len)
270        len = max_len;
271      max_len -= len;
272
273      for (; len > 0; ++data, --len)
274        if (*data < 0 || *data >= 0x80)
275          return data;
276    }
277
278#endif
279
280  /* Scan the input one machine word at a time. */
281  for (; max_len > sizeof(apr_uintptr_t)
282       ; data += sizeof(apr_uintptr_t), max_len -= sizeof(apr_uintptr_t))
283    if (*(const apr_uintptr_t *)data & SVN__BIT_7_SET)
284      break;
285
286  /* The remaining odd bytes will be examined the naive way: */
287  for (; max_len > 0; ++data, --max_len)
288    if (*data < 0 || *data >= 0x80)
289      break;
290
291  return data;
292}
293
294/* Scan the C string in *DATA for chars that are not in the octet
295 * category 0 (FSM_START).  Return the position of either the such
296 * char or of the terminating NUL.
297 */
298static const char *
299first_non_fsm_start_char_cstring(const char *data)
300{
301  /* We need to make sure that BUF is properly aligned for chunky data
302   * access because we don't know the string's length. Unaligned chunk
303   * read access beyond the NUL terminator could therefore result in a
304   * segfault.
305   */
306  for (; (apr_uintptr_t)data & (sizeof(apr_uintptr_t)-1); ++data)
307    if (*data <= 0 || *data >= 0x80)
308      return data;
309
310  /* Scan the input one machine word at a time. */
311#ifndef SVN_UTF_NO_UNINITIALISED_ACCESS
312  /* This may read allocated but initialised bytes beyond the
313     terminating null.  Any such bytes are always readable and this
314     code operates correctly whatever the uninitialised values happen
315     to be.  However memory checking tools such as valgrind and GCC
316     4.8's address santitizer will object so this bit of code can be
317     disabled at compile time. */
318  for (; ; data += sizeof(apr_uintptr_t))
319    {
320      /* Check for non-ASCII chars: */
321      apr_uintptr_t chunk = *(const apr_uintptr_t *)data;
322      if (chunk & SVN__BIT_7_SET)
323        break;
324
325      /* This is the well-known strlen test: */
326      chunk |= (chunk & SVN__LOWER_7BITS_SET) + SVN__LOWER_7BITS_SET;
327      if ((chunk & SVN__BIT_7_SET) != SVN__BIT_7_SET)
328        break;
329    }
330#endif
331
332  /* The remaining odd bytes will be examined the naive way: */
333  for (; ; ++data)
334    if (*data <= 0 || *data >= 0x80)
335      break;
336
337  return data;
338}
339
340const char *
341svn_utf__last_valid(const char *data, apr_size_t len)
342{
343  const char *start = first_non_fsm_start_char(data, len);
344  const char *end = data + len;
345  int state = FSM_START;
346
347  data = start;
348  while (data < end)
349    {
350      unsigned char octet = *data++;
351      int category = octet_category[octet];
352      state = machine[state][category];
353      if (state == FSM_START)
354        start = data;
355    }
356  return start;
357}
358
359svn_boolean_t
360svn_utf__cstring_is_valid(const char *data)
361{
362  int state = FSM_START;
363
364  if (!data)
365    return FALSE;
366
367  data = first_non_fsm_start_char_cstring(data);
368
369  while (*data)
370    {
371      unsigned char octet = *data++;
372      int category = octet_category[octet];
373      state = machine[state][category];
374    }
375  return state == FSM_START;
376}
377
378svn_boolean_t
379svn_utf__is_valid(const char *data, apr_size_t len)
380{
381  const char *end = data + len;
382  int state = FSM_START;
383
384  if (!data)
385    return FALSE;
386
387  data = first_non_fsm_start_char(data, len);
388
389  while (data < end)
390    {
391      unsigned char octet = *data++;
392      int category = octet_category[octet];
393      state = machine[state][category];
394    }
395  return state == FSM_START;
396}
397
398const char *
399svn_utf__last_valid2(const char *data, apr_size_t len)
400{
401  const char *start = first_non_fsm_start_char(data, len);
402  const char *end = data + len;
403  int state = FSM_START;
404
405  data = start;
406  while (data < end)
407    {
408      unsigned char octet = *data++;
409      switch (state)
410        {
411        case FSM_START:
412          if (octet <= 0x7F)
413            break;
414          else if (octet <= 0xC1)
415            state = FSM_ERROR;
416          else if (octet <= 0xDF)
417            state = FSM_80BF;
418          else if (octet == 0xE0)
419            state = FSM_A0BF;
420          else if (octet <= 0xEC)
421            state = FSM_80BF80BF;
422          else if (octet == 0xED)
423            state = FSM_809F;
424          else if (octet <= 0xEF)
425            state = FSM_80BF80BF;
426          else if (octet == 0xF0)
427            state = FSM_90BF;
428          else if (octet <= 0xF3)
429            state = FSM_80BF80BF80BF;
430          else if (octet <= 0xF4)
431            state = FSM_808F;
432          else
433            state = FSM_ERROR;
434          break;
435        case FSM_80BF:
436          if (octet >= 0x80 && octet <= 0xBF)
437            state = FSM_START;
438          else
439            state = FSM_ERROR;
440          break;
441        case FSM_A0BF:
442          if (octet >= 0xA0 && octet <= 0xBF)
443            state = FSM_80BF;
444          else
445            state = FSM_ERROR;
446          break;
447        case FSM_80BF80BF:
448          if (octet >= 0x80 && octet <= 0xBF)
449            state = FSM_80BF;
450          else
451            state = FSM_ERROR;
452          break;
453        case FSM_809F:
454          if (octet >= 0x80 && octet <= 0x9F)
455            state = FSM_80BF;
456          else
457            state = FSM_ERROR;
458          break;
459        case FSM_90BF:
460          if (octet >= 0x90 && octet <= 0xBF)
461            state = FSM_80BF80BF;
462          else
463            state = FSM_ERROR;
464          break;
465        case FSM_80BF80BF80BF:
466          if (octet >= 0x80 && octet <= 0xBF)
467            state = FSM_80BF80BF;
468          else
469            state = FSM_ERROR;
470          break;
471        case FSM_808F:
472          if (octet >= 0x80 && octet <= 0x8F)
473            state = FSM_80BF80BF;
474          else
475            state = FSM_ERROR;
476          break;
477        default:
478        case FSM_ERROR:
479          return start;
480        }
481      if (state == FSM_START)
482        start = data;
483    }
484  return start;
485}
486