utf_validate.c revision 299742
1/*
2 * utf_validate.c:  Validate a UTF-8 string
3 *
4 * ====================================================================
5 *    Licensed to the Apache Software Foundation (ASF) under one
6 *    or more contributor license agreements.  See the NOTICE file
7 *    distributed with this work for additional information
8 *    regarding copyright ownership.  The ASF licenses this file
9 *    to you under the Apache License, Version 2.0 (the
10 *    "License"); you may not use this file except in compliance
11 *    with the License.  You may obtain a copy of the License at
12 *
13 *      http://www.apache.org/licenses/LICENSE-2.0
14 *
15 *    Unless required by applicable law or agreed to in writing,
16 *    software distributed under the License is distributed on an
17 *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 *    KIND, either express or implied.  See the License for the
19 *    specific language governing permissions and limitations
20 *    under the License.
21 * ====================================================================
22 */
23
24/* Validate a UTF-8 string according to the rules in
25 *
26 *    Table 3-6. Well-Formed UTF-8 Bytes Sequences
27 *
28 * in
29 *
30 *    The Unicode Standard, Version 4.0
31 *
32 * which is available at
33 *
34 *    http://www.unicode.org/
35 *
36 * UTF-8 was originally defined in RFC-2279, Unicode's "well-formed UTF-8"
37 * is a subset of that enconding.  The Unicode enconding prohibits things
38 * like non-shortest encodings (some characters can be represented by more
39 * than one multi-byte encoding) and the encodings for the surrogate code
40 * points.  RFC-3629 superceeds RFC-2279 and adopts the same well-formed
41 * rules as Unicode.  This is the ABNF in RFC-3629 that describes
42 * well-formed UTF-8 rules:
43 *
44 *   UTF8-octets = *( UTF8-char )
45 *   UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
46 *   UTF8-1      = %x00-7F
47 *   UTF8-2      = %xC2-DF UTF8-tail
48 *   UTF8-3      = %xE0 %xA0-BF UTF8-tail /
49 *                 %xE1-EC 2( UTF8-tail ) /
50 *                 %xED %x80-9F UTF8-tail /
51 *                 %xEE-EF 2( UTF8-tail )
52 *   UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) /
53 *                 %xF1-F3 3( UTF8-tail ) /
54 *                 %xF4 %x80-8F 2( UTF8-tail )
55 *   UTF8-tail   = %x80-BF
56 *
57 */
58
59#include "private/svn_utf_private.h"
60#include "private/svn_eol_private.h"
61#include "private/svn_dep_compat.h"
62
63/* Lookup table to categorise each octet in the string. */
64static const char octet_category[256] = {
65  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 0x00-0x7f */
66  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
67  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
68  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
69  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
70  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
71  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
72  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
73  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, /* 0x80-0x8f */
74  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, /* 0x90-0x9f */
75  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3, /* 0xa0-0xbf */
76  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
77  4,  4,                                                         /* 0xc0-0xc1 */
78          5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, /* 0xc2-0xdf */
79  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
80  6,                                                             /* 0xe0 */
81      7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,             /* 0xe1-0xec */
82                                                      8,         /* 0xed */
83                                                          9,  9, /* 0xee-0xef */
84  10,                                                            /* 0xf0 */
85      11, 11, 11,                                                /* 0xf1-0xf3 */
86                  12,                                            /* 0xf4 */
87                      13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13 /* 0xf5-0xff */
88};
89
90/* Machine states */
91#define FSM_START         0
92#define FSM_80BF          1
93#define FSM_A0BF          2
94#define FSM_80BF80BF      3
95#define FSM_809F          4
96#define FSM_90BF          5
97#define FSM_80BF80BF80BF  6
98#define FSM_808F          7
99#define FSM_ERROR         8
100
101/* In the FSM it appears that categories 0xc0-0xc1 and 0xf5-0xff make the
102   same transitions, as do categories 0xe1-0xec and 0xee-0xef.  I wonder if
103   there is any great benefit in combining categories?  It would reduce the
104   memory footprint of the transition table by 16 bytes, but might it be
105   harder to understand?  */
106
107/* Machine transition table */
108static const char machine [9][14] = {
109  /* FSM_START */
110  {FSM_START,         /* 0x00-0x7f */
111   FSM_ERROR,         /* 0x80-0x8f */
112   FSM_ERROR,         /* 0x90-0x9f */
113   FSM_ERROR,         /* 0xa0-0xbf */
114   FSM_ERROR,         /* 0xc0-0xc1 */
115   FSM_80BF,          /* 0xc2-0xdf */
116   FSM_A0BF,          /* 0xe0 */
117   FSM_80BF80BF,      /* 0xe1-0xec */
118   FSM_809F,          /* 0xed */
119   FSM_80BF80BF,      /* 0xee-0xef */
120   FSM_90BF,          /* 0xf0 */
121   FSM_80BF80BF80BF,  /* 0xf1-0xf3 */
122   FSM_808F,          /* 0xf4 */
123   FSM_ERROR},        /* 0xf5-0xff */
124
125  /* FSM_80BF */
126  {FSM_ERROR,         /* 0x00-0x7f */
127   FSM_START,         /* 0x80-0x8f */
128   FSM_START,         /* 0x90-0x9f */
129   FSM_START,         /* 0xa0-0xbf */
130   FSM_ERROR,         /* 0xc0-0xc1 */
131   FSM_ERROR,         /* 0xc2-0xdf */
132   FSM_ERROR,         /* 0xe0 */
133   FSM_ERROR,         /* 0xe1-0xec */
134   FSM_ERROR,         /* 0xed */
135   FSM_ERROR,         /* 0xee-0xef */
136   FSM_ERROR,         /* 0xf0 */
137   FSM_ERROR,         /* 0xf1-0xf3 */
138   FSM_ERROR,         /* 0xf4 */
139   FSM_ERROR},        /* 0xf5-0xff */
140
141  /* FSM_A0BF */
142  {FSM_ERROR,         /* 0x00-0x7f */
143   FSM_ERROR,         /* 0x80-0x8f */
144   FSM_ERROR,         /* 0x90-0x9f */
145   FSM_80BF,          /* 0xa0-0xbf */
146   FSM_ERROR,         /* 0xc0-0xc1 */
147   FSM_ERROR,         /* 0xc2-0xdf */
148   FSM_ERROR,         /* 0xe0 */
149   FSM_ERROR,         /* 0xe1-0xec */
150   FSM_ERROR,         /* 0xed */
151   FSM_ERROR,         /* 0xee-0xef */
152   FSM_ERROR,         /* 0xf0 */
153   FSM_ERROR,         /* 0xf1-0xf3 */
154   FSM_ERROR,         /* 0xf4 */
155   FSM_ERROR},        /* 0xf5-0xff */
156
157  /* FSM_80BF80BF */
158  {FSM_ERROR,         /* 0x00-0x7f */
159   FSM_80BF,          /* 0x80-0x8f */
160   FSM_80BF,          /* 0x90-0x9f */
161   FSM_80BF,          /* 0xa0-0xbf */
162   FSM_ERROR,         /* 0xc0-0xc1 */
163   FSM_ERROR,         /* 0xc2-0xdf */
164   FSM_ERROR,         /* 0xe0 */
165   FSM_ERROR,         /* 0xe1-0xec */
166   FSM_ERROR,         /* 0xed */
167   FSM_ERROR,         /* 0xee-0xef */
168   FSM_ERROR,         /* 0xf0 */
169   FSM_ERROR,         /* 0xf1-0xf3 */
170   FSM_ERROR,         /* 0xf4 */
171   FSM_ERROR},        /* 0xf5-0xff */
172
173  /* FSM_809F */
174  {FSM_ERROR,         /* 0x00-0x7f */
175   FSM_80BF,          /* 0x80-0x8f */
176   FSM_80BF,          /* 0x90-0x9f */
177   FSM_ERROR,         /* 0xa0-0xbf */
178   FSM_ERROR,         /* 0xc0-0xc1 */
179   FSM_ERROR,         /* 0xc2-0xdf */
180   FSM_ERROR,         /* 0xe0 */
181   FSM_ERROR,         /* 0xe1-0xec */
182   FSM_ERROR,         /* 0xed */
183   FSM_ERROR,         /* 0xee-0xef */
184   FSM_ERROR,         /* 0xf0 */
185   FSM_ERROR,         /* 0xf1-0xf3 */
186   FSM_ERROR,         /* 0xf4 */
187   FSM_ERROR},        /* 0xf5-0xff */
188
189  /* FSM_90BF */
190  {FSM_ERROR,         /* 0x00-0x7f */
191   FSM_ERROR,         /* 0x80-0x8f */
192   FSM_80BF80BF,      /* 0x90-0x9f */
193   FSM_80BF80BF,      /* 0xa0-0xbf */
194   FSM_ERROR,         /* 0xc0-0xc1 */
195   FSM_ERROR,         /* 0xc2-0xdf */
196   FSM_ERROR,         /* 0xe0 */
197   FSM_ERROR,         /* 0xe1-0xec */
198   FSM_ERROR,         /* 0xed */
199   FSM_ERROR,         /* 0xee-0xef */
200   FSM_ERROR,         /* 0xf0 */
201   FSM_ERROR,         /* 0xf1-0xf3 */
202   FSM_ERROR,         /* 0xf4 */
203   FSM_ERROR},        /* 0xf5-0xff */
204
205  /* FSM_80BF80BF80BF */
206  {FSM_ERROR,         /* 0x00-0x7f */
207   FSM_80BF80BF,      /* 0x80-0x8f */
208   FSM_80BF80BF,      /* 0x90-0x9f */
209   FSM_80BF80BF,      /* 0xa0-0xbf */
210   FSM_ERROR,         /* 0xc0-0xc1 */
211   FSM_ERROR,         /* 0xc2-0xdf */
212   FSM_ERROR,         /* 0xe0 */
213   FSM_ERROR,         /* 0xe1-0xec */
214   FSM_ERROR,         /* 0xed */
215   FSM_ERROR,         /* 0xee-0xef */
216   FSM_ERROR,         /* 0xf0 */
217   FSM_ERROR,         /* 0xf1-0xf3 */
218   FSM_ERROR,         /* 0xf4 */
219   FSM_ERROR},        /* 0xf5-0xff */
220
221  /* FSM_808F */
222  {FSM_ERROR,         /* 0x00-0x7f */
223   FSM_80BF80BF,      /* 0x80-0x8f */
224   FSM_ERROR,         /* 0x90-0x9f */
225   FSM_ERROR,         /* 0xa0-0xbf */
226   FSM_ERROR,         /* 0xc0-0xc1 */
227   FSM_ERROR,         /* 0xc2-0xdf */
228   FSM_ERROR,         /* 0xe0 */
229   FSM_ERROR,         /* 0xe1-0xec */
230   FSM_ERROR,         /* 0xed */
231   FSM_ERROR,         /* 0xee-0xef */
232   FSM_ERROR,         /* 0xf0 */
233   FSM_ERROR,         /* 0xf1-0xf3 */
234   FSM_ERROR,         /* 0xf4 */
235   FSM_ERROR},        /* 0xf5-0xff */
236
237  /* FSM_ERROR */
238  {FSM_ERROR,         /* 0x00-0x7f */
239   FSM_ERROR,         /* 0x80-0x8f */
240   FSM_ERROR,         /* 0x90-0x9f */
241   FSM_ERROR,         /* 0xa0-0xbf */
242   FSM_ERROR,         /* 0xc0-0xc1 */
243   FSM_ERROR,         /* 0xc2-0xdf */
244   FSM_ERROR,         /* 0xe0 */
245   FSM_ERROR,         /* 0xe1-0xec */
246   FSM_ERROR,         /* 0xed */
247   FSM_ERROR,         /* 0xee-0xef */
248   FSM_ERROR,         /* 0xf0 */
249   FSM_ERROR,         /* 0xf1-0xf3 */
250   FSM_ERROR,         /* 0xf4 */
251   FSM_ERROR},        /* 0xf5-0xff */
252};
253
254/* Scan MAX_LEN bytes in *DATA for chars that are not in the octet
255 * category 0 (FSM_START).  Return the position of the first such char
256 * or DATA + MAX_LEN if all were cat 0.
257 */
258static const char *
259first_non_fsm_start_char(const char *data, apr_size_t max_len)
260{
261#if SVN_UNALIGNED_ACCESS_IS_OK
262
263  /* Scan the input one machine word at a time. */
264  for (; max_len > sizeof(apr_uintptr_t)
265       ; data += sizeof(apr_uintptr_t), max_len -= sizeof(apr_uintptr_t))
266    if (*(const apr_uintptr_t *)data & SVN__BIT_7_SET)
267      break;
268
269#endif
270
271  /* The remaining odd bytes will be examined the naive way: */
272  for (; max_len > 0; ++data, --max_len)
273    if ((unsigned char)*data >= 0x80)
274      break;
275
276  return data;
277}
278
279const char *
280svn_utf__last_valid(const char *data, apr_size_t len)
281{
282  const char *start = first_non_fsm_start_char(data, len);
283  const char *end = data + len;
284  int state = FSM_START;
285
286  data = start;
287  while (data < end)
288    {
289      unsigned char octet = *data++;
290      int category = octet_category[octet];
291      state = machine[state][category];
292      if (state == FSM_START)
293        start = data;
294    }
295  return start;
296}
297
298svn_boolean_t
299svn_utf__cstring_is_valid(const char *data)
300{
301  if (!data)
302    return FALSE;
303
304  return svn_utf__is_valid(data, strlen(data));
305}
306
307svn_boolean_t
308svn_utf__is_valid(const char *data, apr_size_t len)
309{
310  const char *end = data + len;
311  int state = FSM_START;
312
313  if (!data)
314    return FALSE;
315
316  data = first_non_fsm_start_char(data, len);
317
318  while (data < end)
319    {
320      unsigned char octet = *data++;
321      int category = octet_category[octet];
322      state = machine[state][category];
323    }
324  return state == FSM_START;
325}
326
327const char *
328svn_utf__last_valid2(const char *data, apr_size_t len)
329{
330  const char *start = first_non_fsm_start_char(data, len);
331  const char *end = data + len;
332  int state = FSM_START;
333
334  data = start;
335  while (data < end)
336    {
337      unsigned char octet = *data++;
338      switch (state)
339        {
340        case FSM_START:
341          if (octet <= 0x7F)
342            break;
343          else if (octet <= 0xC1)
344            state = FSM_ERROR;
345          else if (octet <= 0xDF)
346            state = FSM_80BF;
347          else if (octet == 0xE0)
348            state = FSM_A0BF;
349          else if (octet <= 0xEC)
350            state = FSM_80BF80BF;
351          else if (octet == 0xED)
352            state = FSM_809F;
353          else if (octet <= 0xEF)
354            state = FSM_80BF80BF;
355          else if (octet == 0xF0)
356            state = FSM_90BF;
357          else if (octet <= 0xF3)
358            state = FSM_80BF80BF80BF;
359          else if (octet <= 0xF4)
360            state = FSM_808F;
361          else
362            state = FSM_ERROR;
363          break;
364        case FSM_80BF:
365          if (octet >= 0x80 && octet <= 0xBF)
366            state = FSM_START;
367          else
368            state = FSM_ERROR;
369          break;
370        case FSM_A0BF:
371          if (octet >= 0xA0 && octet <= 0xBF)
372            state = FSM_80BF;
373          else
374            state = FSM_ERROR;
375          break;
376        case FSM_80BF80BF:
377          if (octet >= 0x80 && octet <= 0xBF)
378            state = FSM_80BF;
379          else
380            state = FSM_ERROR;
381          break;
382        case FSM_809F:
383          if (octet >= 0x80 && octet <= 0x9F)
384            state = FSM_80BF;
385          else
386            state = FSM_ERROR;
387          break;
388        case FSM_90BF:
389          if (octet >= 0x90 && octet <= 0xBF)
390            state = FSM_80BF80BF;
391          else
392            state = FSM_ERROR;
393          break;
394        case FSM_80BF80BF80BF:
395          if (octet >= 0x80 && octet <= 0xBF)
396            state = FSM_80BF80BF;
397          else
398            state = FSM_ERROR;
399          break;
400        case FSM_808F:
401          if (octet >= 0x80 && octet <= 0x8F)
402            state = FSM_80BF80BF;
403          else
404            state = FSM_ERROR;
405          break;
406        default:
407        case FSM_ERROR:
408          return start;
409        }
410      if (state == FSM_START)
411        start = data;
412    }
413  return start;
414}
415