subversion/libsvn_subr/utf_validate.c

/*
 * utf_validate.c:  Validate a UTF-8 string
 *
 * ====================================================================
 *    Licensed to the Apache Software Foundation (ASF) under one
 *    or more contributor license agreements.  See the NOTICE file
 *    distributed with this work for additional information
 *    regarding copyright ownership.  The ASF licenses this file
 *    to you under the Apache License, Version 2.0 (the
 *    "License"); you may not use this file except in compliance
 *    with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing,
 *    software distributed under the License is distributed on an
 *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 *    KIND, either express or implied.  See the License for the
 *    specific language governing permissions and limitations
 *    under the License.
 * ====================================================================
 */

/* Validate a UTF-8 string according to the rules in
 *
 *    Table 3-6. Well-Formed UTF-8 Bytes Sequences
 *
 * in
 *
 *    The Unicode Standard, Version 4.0
 *
 * which is available at
 *
 *    http://www.unicode.org/
 *
 * UTF-8 was originally defined in RFC-2279, Unicode's "well-formed UTF-8"
 * is a subset of that enconding.  The Unicode enconding prohibits things
 * like non-shortest encodings (some characters can be represented by more
 * than one multi-byte encoding) and the encodings for the surrogate code
 * points.  RFC-3629 superceeds RFC-2279 and adopts the same well-formed
 * rules as Unicode.  This is the ABNF in RFC-3629 that describes
 * well-formed UTF-8 rules:
 *
 *   UTF8-octets = *( UTF8-char )
 *   UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
 *   UTF8-1      = %x00-7F
 *   UTF8-2      = %xC2-DF UTF8-tail
 *   UTF8-3      = %xE0 %xA0-BF UTF8-tail /
 *                 %xE1-EC 2( UTF8-tail ) /
 *                 %xED %x80-9F UTF8-tail /
 *                 %xEE-EF 2( UTF8-tail )
 *   UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) /
 *                 %xF1-F3 3( UTF8-tail ) /
 *                 %xF4 %x80-8F 2( UTF8-tail )
 *   UTF8-tail   = %x80-BF
 *
 */

#include "private/svn_utf_private.h"
#include "private/svn_eol_private.h"
#include "private/svn_dep_compat.h"

/* Lookup table to categorise each octet in the string. */
static const char octet_category[256] = {
  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, /* 0x00-0x7f */
  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, /* 0x80-0x8f */
  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, /* 0x90-0x9f */
  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3, /* 0xa0-0xbf */
  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
  4,  4,                                                         /* 0xc0-0xc1 */
          5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, /* 0xc2-0xdf */
  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
  6,                                                             /* 0xe0 */
      7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,             /* 0xe1-0xec */
                                                      8,         /* 0xed */
                                                          9,  9, /* 0xee-0xef */
  10,                                                            /* 0xf0 */
      11, 11, 11,                                                /* 0xf1-0xf3 */
                  12,                                            /* 0xf4 */
                      13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13 /* 0xf5-0xff */
};

/* Machine states */
#define FSM_START         0
#define FSM_80BF          1
#define FSM_A0BF          2
#define FSM_80BF80BF      3
#define FSM_809F          4
#define FSM_90BF          5
#define FSM_80BF80BF80BF  6
#define FSM_808F          7
#define FSM_ERROR         8

/* In the FSM it appears that categories 0xc0-0xc1 and 0xf5-0xff make the
   same transitions, as do categories 0xe1-0xec and 0xee-0xef.  I wonder if
   there is any great benefit in combining categories?  It would reduce the
   memory footprint of the transition table by 16 bytes, but might it be
   harder to understand?  */

/* Machine transition table */
static const char machine [9][14] = {
  /* FSM_START */
  {FSM_START,         /* 0x00-0x7f */
   FSM_ERROR,         /* 0x80-0x8f */
   FSM_ERROR,         /* 0x90-0x9f */
   FSM_ERROR,         /* 0xa0-0xbf */
   FSM_ERROR,         /* 0xc0-0xc1 */
   FSM_80BF,          /* 0xc2-0xdf */
   FSM_A0BF,          /* 0xe0 */
   FSM_80BF80BF,      /* 0xe1-0xec */
   FSM_809F,          /* 0xed */
   FSM_80BF80BF,      /* 0xee-0xef */
   FSM_90BF,          /* 0xf0 */
   FSM_80BF80BF80BF,  /* 0xf1-0xf3 */
   FSM_808F,          /* 0xf4 */
   FSM_ERROR},        /* 0xf5-0xff */

  /* FSM_80BF */
  {FSM_ERROR,         /* 0x00-0x7f */
   FSM_START,         /* 0x80-0x8f */
   FSM_START,         /* 0x90-0x9f */
   FSM_START,         /* 0xa0-0xbf */
   FSM_ERROR,         /* 0xc0-0xc1 */
   FSM_ERROR,         /* 0xc2-0xdf */
   FSM_ERROR,         /* 0xe0 */
   FSM_ERROR,         /* 0xe1-0xec */
   FSM_ERROR,         /* 0xed */
   FSM_ERROR,         /* 0xee-0xef */
   FSM_ERROR,         /* 0xf0 */
   FSM_ERROR,         /* 0xf1-0xf3 */
   FSM_ERROR,         /* 0xf4 */
   FSM_ERROR},        /* 0xf5-0xff */

  /* FSM_A0BF */
  {FSM_ERROR,         /* 0x00-0x7f */
   FSM_ERROR,         /* 0x80-0x8f */
   FSM_ERROR,         /* 0x90-0x9f */
   FSM_80BF,          /* 0xa0-0xbf */
   FSM_ERROR,         /* 0xc0-0xc1 */
   FSM_ERROR,         /* 0xc2-0xdf */
   FSM_ERROR,         /* 0xe0 */
   FSM_ERROR,         /* 0xe1-0xec */
   FSM_ERROR,         /* 0xed */
   FSM_ERROR,         /* 0xee-0xef */
   FSM_ERROR,         /* 0xf0 */
   FSM_ERROR,         /* 0xf1-0xf3 */
   FSM_ERROR,         /* 0xf4 */
   FSM_ERROR},        /* 0xf5-0xff */

  /* FSM_80BF80BF */
  {FSM_ERROR,         /* 0x00-0x7f */
   FSM_80BF,          /* 0x80-0x8f */
   FSM_80BF,          /* 0x90-0x9f */
   FSM_80BF,          /* 0xa0-0xbf */
   FSM_ERROR,         /* 0xc0-0xc1 */
   FSM_ERROR,         /* 0xc2-0xdf */
   FSM_ERROR,         /* 0xe0 */
   FSM_ERROR,         /* 0xe1-0xec */
   FSM_ERROR,         /* 0xed */
   FSM_ERROR,         /* 0xee-0xef */
   FSM_ERROR,         /* 0xf0 */
   FSM_ERROR,         /* 0xf1-0xf3 */
   FSM_ERROR,         /* 0xf4 */
   FSM_ERROR},        /* 0xf5-0xff */

  /* FSM_809F */
  {FSM_ERROR,         /* 0x00-0x7f */
   FSM_80BF,          /* 0x80-0x8f */
   FSM_80BF,          /* 0x90-0x9f */
   FSM_ERROR,         /* 0xa0-0xbf */
   FSM_ERROR,         /* 0xc0-0xc1 */
   FSM_ERROR,         /* 0xc2-0xdf */
   FSM_ERROR,         /* 0xe0 */
   FSM_ERROR,         /* 0xe1-0xec */
   FSM_ERROR,         /* 0xed */
   FSM_ERROR,         /* 0xee-0xef */
   FSM_ERROR,         /* 0xf0 */
   FSM_ERROR,         /* 0xf1-0xf3 */
   FSM_ERROR,         /* 0xf4 */
   FSM_ERROR},        /* 0xf5-0xff */

  /* FSM_90BF */
  {FSM_ERROR,         /* 0x00-0x7f */
   FSM_ERROR,         /* 0x80-0x8f */
   FSM_80BF80BF,      /* 0x90-0x9f */
   FSM_80BF80BF,      /* 0xa0-0xbf */
   FSM_ERROR,         /* 0xc0-0xc1 */
   FSM_ERROR,         /* 0xc2-0xdf */
   FSM_ERROR,         /* 0xe0 */
   FSM_ERROR,         /* 0xe1-0xec */
   FSM_ERROR,         /* 0xed */
   FSM_ERROR,         /* 0xee-0xef */
   FSM_ERROR,         /* 0xf0 */
   FSM_ERROR,         /* 0xf1-0xf3 */
   FSM_ERROR,         /* 0xf4 */
   FSM_ERROR},        /* 0xf5-0xff */

  /* FSM_80BF80BF80BF */
  {FSM_ERROR,         /* 0x00-0x7f */
   FSM_80BF80BF,      /* 0x80-0x8f */
   FSM_80BF80BF,      /* 0x90-0x9f */
   FSM_80BF80BF,      /* 0xa0-0xbf */
   FSM_ERROR,         /* 0xc0-0xc1 */
   FSM_ERROR,         /* 0xc2-0xdf */
   FSM_ERROR,         /* 0xe0 */
   FSM_ERROR,         /* 0xe1-0xec */
   FSM_ERROR,         /* 0xed */
   FSM_ERROR,         /* 0xee-0xef */
   FSM_ERROR,         /* 0xf0 */
   FSM_ERROR,         /* 0xf1-0xf3 */
   FSM_ERROR,         /* 0xf4 */
   FSM_ERROR},        /* 0xf5-0xff */

  /* FSM_808F */
  {FSM_ERROR,         /* 0x00-0x7f */
   FSM_80BF80BF,      /* 0x80-0x8f */
   FSM_ERROR,         /* 0x90-0x9f */
   FSM_ERROR,         /* 0xa0-0xbf */
   FSM_ERROR,         /* 0xc0-0xc1 */
   FSM_ERROR,         /* 0xc2-0xdf */
   FSM_ERROR,         /* 0xe0 */
   FSM_ERROR,         /* 0xe1-0xec */
   FSM_ERROR,         /* 0xed */
   FSM_ERROR,         /* 0xee-0xef */
   FSM_ERROR,         /* 0xf0 */
   FSM_ERROR,         /* 0xf1-0xf3 */
   FSM_ERROR,         /* 0xf4 */
   FSM_ERROR},        /* 0xf5-0xff */

  /* FSM_ERROR */
  {FSM_ERROR,         /* 0x00-0x7f */
   FSM_ERROR,         /* 0x80-0x8f */
   FSM_ERROR,         /* 0x90-0x9f */
   FSM_ERROR,         /* 0xa0-0xbf */
   FSM_ERROR,         /* 0xc0-0xc1 */
   FSM_ERROR,         /* 0xc2-0xdf */
   FSM_ERROR,         /* 0xe0 */
   FSM_ERROR,         /* 0xe1-0xec */
   FSM_ERROR,         /* 0xed */
   FSM_ERROR,         /* 0xee-0xef */
   FSM_ERROR,         /* 0xf0 */
   FSM_ERROR,         /* 0xf1-0xf3 */
   FSM_ERROR,         /* 0xf4 */
   FSM_ERROR},        /* 0xf5-0xff */
};

/* Scan MAX_LEN bytes in *DATA for chars that are not in the octet
 * category 0 (FSM_START).  Return the position of the first such char
 * or DATA + MAX_LEN if all were cat 0.
 */
static const char *
first_non_fsm_start_char(const char *data, apr_size_t max_len)
{
#if !SVN_UNALIGNED_ACCESS_IS_OK

  /* On some systems, we need to make sure that buf is properly aligned
   * for chunky data access.
   */
  if ((apr_uintptr_t)data & (sizeof(apr_uintptr_t)-1))
    {
      apr_size_t len = (~(apr_uintptr_t)data) & (sizeof(apr_uintptr_t)-1);
      if (len > max_len)
        len = max_len;
      max_len -= len;

      for (; len > 0; ++data, --len)
        if (*data < 0 || *data >= 0x80)
          return data;
    }

#endif

  /* Scan the input one machine word at a time. */
  for (; max_len > sizeof(apr_uintptr_t)
       ; data += sizeof(apr_uintptr_t), max_len -= sizeof(apr_uintptr_t))
    if (*(const apr_uintptr_t *)data & SVN__BIT_7_SET)
      break;

  /* The remaining odd bytes will be examined the naive way: */
  for (; max_len > 0; ++data, --max_len)
    if (*data < 0 || *data >= 0x80)
      break;

  return data;
}

/* Scan the C string in *DATA for chars that are not in the octet
 * category 0 (FSM_START).  Return the position of either the such
 * char or of the terminating NUL.
 */
static const char *
first_non_fsm_start_char_cstring(const char *data)
{
  /* We need to make sure that BUF is properly aligned for chunky data
   * access because we don't know the string's length. Unaligned chunk
   * read access beyond the NUL terminator could therefore result in a
   * segfault.
   */
  for (; (apr_uintptr_t)data & (sizeof(apr_uintptr_t)-1); ++data)
    if (*data <= 0 || *data >= 0x80)
      return data;

  /* Scan the input one machine word at a time. */
#ifndef SVN_UTF_NO_UNINITIALISED_ACCESS
  /* This may read allocated but initialised bytes beyond the
     terminating null.  Any such bytes are always readable and this
     code operates correctly whatever the uninitialised values happen
     to be.  However memory checking tools such as valgrind and GCC
     4.8's address santitizer will object so this bit of code can be
     disabled at compile time. */
  for (; ; data += sizeof(apr_uintptr_t))
    {
      /* Check for non-ASCII chars: */
      apr_uintptr_t chunk = *(const apr_uintptr_t *)data;
      if (chunk & SVN__BIT_7_SET)
        break;

      /* This is the well-known strlen test: */
      chunk |= (chunk & SVN__LOWER_7BITS_SET) + SVN__LOWER_7BITS_SET;
      if ((chunk & SVN__BIT_7_SET) != SVN__BIT_7_SET)
        break;
    }
#endif

  /* The remaining odd bytes will be examined the naive way: */
  for (; ; ++data)
    if (*data <= 0 || *data >= 0x80)
      break;

  return data;
}

const char *
svn_utf__last_valid(const char *data, apr_size_t len)
{
  const char *start = first_non_fsm_start_char(data, len);
  const char *end = data + len;
  int state = FSM_START;

  data = start;
  while (data < end)
    {
      unsigned char octet = *data++;
      int category = octet_category[octet];
      state = machine[state][category];
      if (state == FSM_START)
        start = data;
    }
  return start;
}

svn_boolean_t
svn_utf__cstring_is_valid(const char *data)
{
  int state = FSM_START;

  if (!data)
    return FALSE;

  data = first_non_fsm_start_char_cstring(data);

  while (*data)
    {
      unsigned char octet = *data++;
      int category = octet_category[octet];
      state = machine[state][category];
    }
  return state == FSM_START;
}

svn_boolean_t
svn_utf__is_valid(const char *data, apr_size_t len)
{
  const char *end = data + len;
  int state = FSM_START;

  if (!data)
    return FALSE;

  data = first_non_fsm_start_char(data, len);

  while (data < end)
    {
      unsigned char octet = *data++;
      int category = octet_category[octet];
      state = machine[state][category];
    }
  return state == FSM_START;
}

const char *
svn_utf__last_valid2(const char *data, apr_size_t len)
{
  const char *start = first_non_fsm_start_char(data, len);
  const char *end = data + len;
  int state = FSM_START;

  data = start;
  while (data < end)
    {
      unsigned char octet = *data++;
      switch (state)
        {
        case FSM_START:
          if (octet <= 0x7F)
            break;
          else if (octet <= 0xC1)
            state = FSM_ERROR;
          else if (octet <= 0xDF)
            state = FSM_80BF;
          else if (octet == 0xE0)
            state = FSM_A0BF;
          else if (octet <= 0xEC)
            state = FSM_80BF80BF;
          else if (octet == 0xED)
            state = FSM_809F;
          else if (octet <= 0xEF)
            state = FSM_80BF80BF;
          else if (octet == 0xF0)
            state = FSM_90BF;
          else if (octet <= 0xF3)
            state = FSM_80BF80BF80BF;
          else if (octet <= 0xF4)
            state = FSM_808F;
          else
            state = FSM_ERROR;
          break;
        case FSM_80BF:
          if (octet >= 0x80 && octet <= 0xBF)
            state = FSM_START;
          else
            state = FSM_ERROR;
          break;
        case FSM_A0BF:
          if (octet >= 0xA0 && octet <= 0xBF)
            state = FSM_80BF;
          else
            state = FSM_ERROR;
          break;
        case FSM_80BF80BF:
          if (octet >= 0x80 && octet <= 0xBF)
            state = FSM_80BF;
          else
            state = FSM_ERROR;
          break;
        case FSM_809F:
          if (octet >= 0x80 && octet <= 0x9F)
            state = FSM_80BF;
          else
            state = FSM_ERROR;
          break;
        case FSM_90BF:
          if (octet >= 0x90 && octet <= 0xBF)
            state = FSM_80BF80BF;
          else
            state = FSM_ERROR;
          break;
        case FSM_80BF80BF80BF:
          if (octet >= 0x80 && octet <= 0xBF)
            state = FSM_80BF80BF;
          else
            state = FSM_ERROR;
          break;
        case FSM_808F:
          if (octet >= 0x80 && octet <= 0x8F)
            state = FSM_80BF80BF;
          else
            state = FSM_ERROR;
          break;
        default:
        case FSM_ERROR:
          return start;
        }
      if (state == FSM_START)
        start = data;
    }
  return start;
}