utf.c revision 299742
1/*
2 * utf.c:  UTF-8 conversion routines
3 *
4 * ====================================================================
5 *    Licensed to the Apache Software Foundation (ASF) under one
6 *    or more contributor license agreements.  See the NOTICE file
7 *    distributed with this work for additional information
8 *    regarding copyright ownership.  The ASF licenses this file
9 *    to you under the Apache License, Version 2.0 (the
10 *    "License"); you may not use this file except in compliance
11 *    with the License.  You may obtain a copy of the License at
12 *
13 *      http://www.apache.org/licenses/LICENSE-2.0
14 *
15 *    Unless required by applicable law or agreed to in writing,
16 *    software distributed under the License is distributed on an
17 *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 *    KIND, either express or implied.  See the License for the
19 *    specific language governing permissions and limitations
20 *    under the License.
21 * ====================================================================
22 */
23
24
25
26#include <stdlib.h>
27#include <string.h>
28#include <assert.h>
29
30#include <apr_strings.h>
31#include <apr_lib.h>
32#include <apr_xlate.h>
33#include <apr_atomic.h>
34
35#include "svn_hash.h"
36#include "svn_string.h"
37#include "svn_error.h"
38#include "svn_pools.h"
39#include "svn_ctype.h"
40#include "svn_utf.h"
41#include "svn_private_config.h"
42#include "win32_xlate.h"
43
44#include "private/svn_utf_private.h"
45#include "private/svn_dep_compat.h"
46#include "private/svn_string_private.h"
47#include "private/svn_mutex.h"
48
49
50
51/* Use these static strings to maximize performance on standard conversions.
52 * Any strings on other locations are still valid, however.
53 */
54static const char *SVN_UTF_NTOU_XLATE_HANDLE = "svn-utf-ntou-xlate-handle";
55static const char *SVN_UTF_UTON_XLATE_HANDLE = "svn-utf-uton-xlate-handle";
56
57static const char *SVN_APR_UTF8_CHARSET = "UTF-8";
58
59static svn_mutex__t *xlate_handle_mutex = NULL;
60static svn_boolean_t assume_native_charset_is_utf8 = FALSE;
61
62#if defined(WIN32)
63typedef svn_subr__win32_xlate_t xlate_handle_t;
64#else
65typedef apr_xlate_t xlate_handle_t;
66#endif
67
/* The xlate handle cache is a global hash table with linked lists of xlate
 * handles.  In multi-threaded environments, a thread "borrows" an xlate
 * handle from the cache during a translation and puts it back afterwards.
 * This avoids holding a global lock for all translations.
 * If there is no handle for a particular key when needed, a new
 * handle is created and put in the cache after use.
 * This means that there will be at most N handles open for a key, where N
 * is the number of simultaneous handles in use for that key. */
76
77typedef struct xlate_handle_node_t {
78  xlate_handle_t *handle;
79  /* FALSE if the handle is not valid, since its pool is being
80     destroyed. */
81  svn_boolean_t valid;
82  /* The name of a char encoding or APR_LOCALE_CHARSET. */
83  const char *frompage, *topage;
84  struct xlate_handle_node_t *next;
85} xlate_handle_node_t;
86
87/* This maps const char * userdata_key strings to xlate_handle_node_t **
88   handles to the first entry in the linked list of xlate handles.  We don't
89   store the pointer to the list head directly in the hash table, since we
90   remove/insert entries at the head in the list in the code below, and
91   we can't use apr_hash_set() in each character translation because that
92   function allocates memory in each call where the value is non-NULL.
93   Since these allocations take place in a global pool, this would be a
94   memory leak. */
95static apr_hash_t *xlate_handle_hash = NULL;
96
97/* "1st level cache" to standard conversion maps. We may access these
98 * using atomic xchange ops, i.e. without further thread synchronization.
99 * If the respective item is NULL, fallback to hash lookup.
100 */
101static void * volatile xlat_ntou_static_handle = NULL;
102static void * volatile xlat_uton_static_handle = NULL;
103
104/* Clean up the xlate handle cache. */
105static apr_status_t
106xlate_cleanup(void *arg)
107{
108  /* We set the cache variables to NULL so that translation works in other
109     cleanup functions, even if it isn't cached then. */
110  xlate_handle_hash = NULL;
111
112  /* ensure no stale objects get accessed */
113  xlat_ntou_static_handle = NULL;
114  xlat_uton_static_handle = NULL;
115
116  return APR_SUCCESS;
117}
118
119/* Set the handle of ARG to NULL. */
120static apr_status_t
121xlate_handle_node_cleanup(void *arg)
122{
123  xlate_handle_node_t *node = arg;
124
125  node->valid = FALSE;
126  return APR_SUCCESS;
127}
128
129void
130svn_utf_initialize2(svn_boolean_t assume_native_utf8,
131                    apr_pool_t *pool)
132{
133  if (!xlate_handle_hash)
134    {
135      /* We create our own subpool, which we protect with the mutex.
136         We can't use the pool passed to us by the caller, since we will
137         use it for xlate handle allocations, possibly in multiple threads,
138         and pool allocation is not thread-safe. */
139      apr_pool_t *subpool = svn_pool_create(pool);
140      svn_mutex__t *mutex;
141      svn_error_t *err = svn_mutex__init(&mutex, TRUE, subpool);
142      if (err)
143        {
144          svn_error_clear(err);
145          return;
146        }
147
148      xlate_handle_mutex = mutex;
149      xlate_handle_hash = apr_hash_make(subpool);
150
151      apr_pool_cleanup_register(subpool, NULL, xlate_cleanup,
152                                apr_pool_cleanup_null);
153    }
154
155    if (!assume_native_charset_is_utf8)
156      assume_native_charset_is_utf8 = assume_native_utf8;
157}
158
159/* Return a unique string key based on TOPAGE and FROMPAGE.  TOPAGE and
160 * FROMPAGE can be any valid arguments of the same name to
161 * apr_xlate_open().  Allocate the returned string in POOL. */
162static const char*
163get_xlate_key(const char *topage,
164              const char *frompage,
165              apr_pool_t *pool)
166{
167  /* In the cases of SVN_APR_LOCALE_CHARSET and SVN_APR_DEFAULT_CHARSET
168   * topage/frompage is really an int, not a valid string.  So generate a
169   * unique key accordingly. */
170  if (frompage == SVN_APR_LOCALE_CHARSET)
171    frompage = "APR_LOCALE_CHARSET";
172  else if (frompage == SVN_APR_DEFAULT_CHARSET)
173    frompage = "APR_DEFAULT_CHARSET";
174
175  if (topage == SVN_APR_LOCALE_CHARSET)
176    topage = "APR_LOCALE_CHARSET";
177  else if (topage == SVN_APR_DEFAULT_CHARSET)
178    topage = "APR_DEFAULT_CHARSET";
179
180  return apr_pstrcat(pool, "svn-utf-", frompage, "to", topage,
181                     "-xlate-handle", SVN_VA_NULL);
182}
183
184/* Atomically replace the content in *MEM with NEW_VALUE and return
185 * the previous content of *MEM. If atomicy cannot be guaranteed,
186 * *MEM will not be modified and NEW_VALUE is simply returned to
187 * the caller.
188 */
189static APR_INLINE void*
190atomic_swap(void * volatile * mem, void *new_value)
191{
192#if APR_HAS_THREADS
193  /* Cast is necessary because of APR bug:
194     https://issues.apache.org/bugzilla/show_bug.cgi?id=50731 */
195   return apr_atomic_xchgptr((volatile void **)mem, new_value);
196#else
197   /* no threads - no sync. necessary */
198   void *old_value = (void*)*mem;
199   *mem = new_value;
200   return old_value;
201#endif
202}
203
204/* Set *RET to a newly created handle node for converting from FROMPAGE
205   to TOPAGE, If apr_xlate_open() returns APR_EINVAL or APR_ENOTIMPL, set
206   (*RET)->handle to NULL.  If fail for any other reason, return the error.
207   Allocate *RET and its xlate handle in POOL. */
208static svn_error_t *
209xlate_alloc_handle(xlate_handle_node_t **ret,
210                   const char *topage, const char *frompage,
211                   apr_pool_t *pool)
212{
213  apr_status_t apr_err;
214  xlate_handle_t *handle;
215  const char *name;
216
217  /* The error handling doesn't support the following cases, since we don't
218     use them currently.  Catch this here. */
219  SVN_ERR_ASSERT(frompage != SVN_APR_DEFAULT_CHARSET
220                 && topage != SVN_APR_DEFAULT_CHARSET
221                 && (frompage != SVN_APR_LOCALE_CHARSET
222                     || topage != SVN_APR_LOCALE_CHARSET));
223
224  /* Try to create a handle. */
225#if defined(WIN32)
226  apr_err = svn_subr__win32_xlate_open(&handle, topage,
227                                       frompage, pool);
228  name = "win32-xlate: ";
229#else
230  apr_err = apr_xlate_open(&handle, topage, frompage, pool);
231  name = "APR: ";
232#endif
233
234  if (APR_STATUS_IS_EINVAL(apr_err) || APR_STATUS_IS_ENOTIMPL(apr_err))
235    handle = NULL;
236  else if (apr_err != APR_SUCCESS)
237    {
238      const char *errstr;
239      char apr_strerr[512];
240
241      /* Can't use svn_error_wrap_apr here because it calls functions in
242         this file, leading to infinite recursion. */
243      if (frompage == SVN_APR_LOCALE_CHARSET)
244        errstr = apr_psprintf(pool,
245                              _("Can't create a character converter from "
246                                "native encoding to '%s'"), topage);
247      else if (topage == SVN_APR_LOCALE_CHARSET)
248        errstr = apr_psprintf(pool,
249                              _("Can't create a character converter from "
250                                "'%s' to native encoding"), frompage);
251      else
252        errstr = apr_psprintf(pool,
253                              _("Can't create a character converter from "
254                                "'%s' to '%s'"), frompage, topage);
255
256      /* Just put the error on the stack, since svn_error_create duplicates it
257         later.  APR_STRERR will be in the local encoding, not in UTF-8, though.
258       */
259      svn_strerror(apr_err, apr_strerr, sizeof(apr_strerr));
260      return svn_error_createf(SVN_ERR_PLUGIN_LOAD_FAILURE,
261                               svn_error_create(apr_err, NULL, apr_strerr),
262                               "%s%s", name, errstr);
263    }
264
265  /* Allocate and initialize the node. */
266  *ret = apr_palloc(pool, sizeof(xlate_handle_node_t));
267  (*ret)->handle = handle;
268  (*ret)->valid = TRUE;
269  (*ret)->frompage = ((frompage != SVN_APR_LOCALE_CHARSET)
270                      ? apr_pstrdup(pool, frompage) : frompage);
271  (*ret)->topage = ((topage != SVN_APR_LOCALE_CHARSET)
272                    ? apr_pstrdup(pool, topage) : topage);
273  (*ret)->next = NULL;
274
275  /* If we are called from inside a pool cleanup handler, the just created
276     xlate handle will be closed when that handler returns by a newly
277     registered cleanup handler, however, the handle is still cached by us.
278     To prevent this, we register a cleanup handler that will reset the valid
279     flag of our node, so we don't use an invalid handle. */
280  if (handle)
281    apr_pool_cleanup_register(pool, *ret, xlate_handle_node_cleanup,
282                              apr_pool_cleanup_null);
283
284  return SVN_NO_ERROR;
285}
286
287/* Extend xlate_alloc_handle by using USERDATA_KEY as a key in our
288   global hash map, if available.
289
290   Allocate *RET and its xlate handle in POOL if svn_utf_initialize()
291   hasn't been called or USERDATA_KEY is NULL.  Else, allocate them
292   in the pool of xlate_handle_hash.
293
294   Note: this function is not thread-safe. Call get_xlate_handle_node
295   instead. */
296static svn_error_t *
297get_xlate_handle_node_internal(xlate_handle_node_t **ret,
298                               const char *topage, const char *frompage,
299                               const char *userdata_key, apr_pool_t *pool)
300{
301  /* If we already have a handle, just return it. */
302  if (userdata_key && xlate_handle_hash)
303    {
304      xlate_handle_node_t *old_node = NULL;
305
306      /* 2nd level: hash lookup */
307      xlate_handle_node_t **old_node_p = svn_hash_gets(xlate_handle_hash,
308                                                       userdata_key);
309      if (old_node_p)
310        old_node = *old_node_p;
311      if (old_node)
312        {
313          /* Ensure that the handle is still valid. */
314          if (old_node->valid)
315            {
316              /* Remove from the list. */
317              *old_node_p = old_node->next;
318              old_node->next = NULL;
319              *ret = old_node;
320              return SVN_NO_ERROR;
321            }
322        }
323    }
324
325  /* Note that we still have the mutex locked (if it is initialized), so we
326     can use the global pool for creating the new xlate handle. */
327
328  /* Use the correct pool for creating the handle. */
329  pool = apr_hash_pool_get(xlate_handle_hash);
330
331  return xlate_alloc_handle(ret, topage, frompage, pool);
332}
333
334/* Set *RET to a handle node for converting from FROMPAGE to TOPAGE,
335   creating the handle node if it doesn't exist in USERDATA_KEY.
336   If a node is not cached and apr_xlate_open() returns APR_EINVAL or
337   APR_ENOTIMPL, set (*RET)->handle to NULL.  If fail for any other
338   reason, return the error.
339
340   Allocate *RET and its xlate handle in POOL if svn_utf_initialize()
341   hasn't been called or USERDATA_KEY is NULL.  Else, allocate them
342   in the pool of xlate_handle_hash. */
343static svn_error_t *
344get_xlate_handle_node(xlate_handle_node_t **ret,
345                      const char *topage, const char *frompage,
346                      const char *userdata_key, apr_pool_t *pool)
347{
348  xlate_handle_node_t *old_node = NULL;
349
350  /* If we already have a handle, just return it. */
351  if (userdata_key)
352    {
353      if (xlate_handle_hash)
354        {
355          /* 1st level: global, static items */
356          if (userdata_key == SVN_UTF_NTOU_XLATE_HANDLE)
357            old_node = atomic_swap(&xlat_ntou_static_handle, NULL);
358          else if (userdata_key == SVN_UTF_UTON_XLATE_HANDLE)
359            old_node = atomic_swap(&xlat_uton_static_handle, NULL);
360
361          if (old_node && old_node->valid)
362            {
363              *ret = old_node;
364              return SVN_NO_ERROR;
365            }
366        }
367      else
368        {
369          void *p;
370          /* We fall back on a per-pool cache instead. */
371          apr_pool_userdata_get(&p, userdata_key, pool);
372          old_node = p;
373          /* Ensure that the handle is still valid. */
374          if (old_node && old_node->valid)
375            {
376              *ret = old_node;
377              return SVN_NO_ERROR;
378            }
379
380          return xlate_alloc_handle(ret, topage, frompage, pool);
381        }
382    }
383
384  SVN_MUTEX__WITH_LOCK(xlate_handle_mutex,
385                       get_xlate_handle_node_internal(ret,
386                                                      topage,
387                                                      frompage,
388                                                      userdata_key,
389                                                      pool));
390
391  return SVN_NO_ERROR;
392}
393
394/* Put back NODE into the xlate handle cache for use by other calls.
395
396   Note: this function is not thread-safe. Call put_xlate_handle_node
397   instead. */
398static svn_error_t *
399put_xlate_handle_node_internal(xlate_handle_node_t *node,
400                               const char *userdata_key)
401{
402  xlate_handle_node_t **node_p = svn_hash_gets(xlate_handle_hash, userdata_key);
403  if (node_p == NULL)
404    {
405      userdata_key = apr_pstrdup(apr_hash_pool_get(xlate_handle_hash),
406                                  userdata_key);
407      node_p = apr_palloc(apr_hash_pool_get(xlate_handle_hash),
408                          sizeof(*node_p));
409      *node_p = NULL;
410      svn_hash_sets(xlate_handle_hash, userdata_key, node_p);
411    }
412  node->next = *node_p;
413  *node_p = node;
414
415  return SVN_NO_ERROR;
416}
417
418/* Put back NODE into the xlate handle cache for use by other calls.
419   If there is no global cache, store the handle in POOL.
420   Ignore errors related to locking/unlocking the mutex. */
421static svn_error_t *
422put_xlate_handle_node(xlate_handle_node_t *node,
423                      const char *userdata_key,
424                      apr_pool_t *pool)
425{
426  assert(node->next == NULL);
427  if (!userdata_key)
428    return SVN_NO_ERROR;
429
430  /* push previous global node to the hash */
431  if (xlate_handle_hash)
432    {
433      /* 1st level: global, static items */
434      if (userdata_key == SVN_UTF_NTOU_XLATE_HANDLE)
435        node = atomic_swap(&xlat_ntou_static_handle, node);
436      else if (userdata_key == SVN_UTF_UTON_XLATE_HANDLE)
437        node = atomic_swap(&xlat_uton_static_handle, node);
438      if (node == NULL)
439        return SVN_NO_ERROR;
440
441      SVN_MUTEX__WITH_LOCK(xlate_handle_mutex,
442                           put_xlate_handle_node_internal(node,
443                                                          userdata_key));
444    }
445  else
446    {
447      /* Store it in the per-pool cache. */
448      apr_pool_userdata_set(node, userdata_key, apr_pool_cleanup_null, pool);
449    }
450
451  return SVN_NO_ERROR;
452}
453
454/* Return the apr_xlate handle for converting native characters to UTF-8. */
455static svn_error_t *
456get_ntou_xlate_handle_node(xlate_handle_node_t **ret, apr_pool_t *pool)
457{
458  return get_xlate_handle_node(ret, SVN_APR_UTF8_CHARSET,
459                               assume_native_charset_is_utf8
460                                 ? SVN_APR_UTF8_CHARSET
461                                 : SVN_APR_LOCALE_CHARSET,
462                               SVN_UTF_NTOU_XLATE_HANDLE, pool);
463}
464
465
466/* Return the apr_xlate handle for converting UTF-8 to native characters.
467   Create one if it doesn't exist.  If unable to find a handle, or
468   unable to create one because apr_xlate_open returned APR_EINVAL, then
469   set *RET to null and return SVN_NO_ERROR; if fail for some other
470   reason, return error. */
471static svn_error_t *
472get_uton_xlate_handle_node(xlate_handle_node_t **ret, apr_pool_t *pool)
473{
474  return get_xlate_handle_node(ret,
475                               assume_native_charset_is_utf8
476                                 ? SVN_APR_UTF8_CHARSET
477                                 : SVN_APR_LOCALE_CHARSET,
478                               SVN_APR_UTF8_CHARSET,
479                               SVN_UTF_UTON_XLATE_HANDLE, pool);
480}
481
482
483/* Convert SRC_LENGTH bytes of SRC_DATA in NODE->handle, store the result
484   in *DEST, which is allocated in POOL. */
485static svn_error_t *
486convert_to_stringbuf(xlate_handle_node_t *node,
487                     const char *src_data,
488                     apr_size_t src_length,
489                     svn_stringbuf_t **dest,
490                     apr_pool_t *pool)
491{
492#ifdef WIN32
493  apr_status_t apr_err;
494
495  apr_err = svn_subr__win32_xlate_to_stringbuf(node->handle, src_data,
496                                               src_length, dest, pool);
497#else
498  apr_size_t buflen = src_length * 2;
499  apr_status_t apr_err;
500  apr_size_t srclen = src_length;
501  apr_size_t destlen = buflen;
502
503  /* Initialize *DEST to an empty stringbuf.
504     A 1:2 ratio of input bytes to output bytes (as assigned above)
505     should be enough for most translations, and if it turns out not
506     to be enough, we'll grow the buffer again, sizing it based on a
507     1:3 ratio of the remainder of the string. */
508  *dest = svn_stringbuf_create_ensure(buflen + 1, pool);
509
510  /* Not only does it not make sense to convert an empty string, but
511     apr-iconv is quite unreasonable about not allowing that. */
512  if (src_length == 0)
513    return SVN_NO_ERROR;
514
515  do
516    {
517      /* Set up state variables for xlate. */
518      destlen = buflen - (*dest)->len;
519
520      /* Attempt the conversion. */
521      apr_err = apr_xlate_conv_buffer(node->handle,
522                                      src_data + (src_length - srclen),
523                                      &srclen,
524                                      (*dest)->data + (*dest)->len,
525                                      &destlen);
526
527      /* Now, update the *DEST->len to track the amount of output data
528         churned out so far from this loop. */
529      (*dest)->len += ((buflen - (*dest)->len) - destlen);
530      buflen += srclen * 3; /* 3 is middle ground, 2 wasn't enough
531                               for all characters in the buffer, 4 is
532                               maximum character size (currently) */
533
534
535    } while (apr_err == APR_SUCCESS && srclen != 0);
536#endif
537
538  /* If we exited the loop with an error, return the error. */
539  if (apr_err)
540    {
541      const char *errstr;
542      svn_error_t *err;
543
544      /* Can't use svn_error_wrap_apr here because it calls functions in
545         this file, leading to infinite recursion. */
546      if (node->frompage == SVN_APR_LOCALE_CHARSET)
547        errstr = apr_psprintf
548          (pool, _("Can't convert string from native encoding to '%s':"),
549           node->topage);
550      else if (node->topage == SVN_APR_LOCALE_CHARSET)
551        errstr = apr_psprintf
552          (pool, _("Can't convert string from '%s' to native encoding:"),
553           node->frompage);
554      else
555        errstr = apr_psprintf
556          (pool, _("Can't convert string from '%s' to '%s':"),
557           node->frompage, node->topage);
558
559      err = svn_error_create(
560          apr_err, NULL, svn_utf__fuzzy_escape(src_data, src_length, pool));
561      return svn_error_create(apr_err, err, errstr);
562    }
563  /* Else, exited due to success.  Trim the result buffer down to the
564     right length. */
565  (*dest)->data[(*dest)->len] = '\0';
566
567  return SVN_NO_ERROR;
568}
569
570
571/* Return APR_EINVAL if the first LEN bytes of DATA contain anything
572   other than seven-bit, non-control (except for whitespace) ASCII
573   characters, finding the error pool from POOL.  Otherwise, return
574   SVN_NO_ERROR. */
575static svn_error_t *
576check_non_ascii(const char *data, apr_size_t len, apr_pool_t *pool)
577{
578  const char *data_start = data;
579
580  for (; len > 0; --len, data++)
581    {
582      if ((! svn_ctype_isascii(*data))
583          || ((! svn_ctype_isspace(*data))
584              && svn_ctype_iscntrl(*data)))
585        {
586          /* Show the printable part of the data, followed by the
587             decimal code of the questionable character.  Because if a
588             user ever gets this error, she's going to have to spend
589             time tracking down the non-ASCII data, so we want to help
590             as much as possible.  And yes, we just call the unsafe
591             data "non-ASCII", even though the actual constraint is
592             somewhat more complex than that. */
593
594          if (data - data_start)
595            {
596              const char *error_data
597                = apr_pstrndup(pool, data_start, (data - data_start));
598
599              return svn_error_createf
600                (APR_EINVAL, NULL,
601                 _("Safe data '%s' was followed by non-ASCII byte %d: "
602                   "unable to convert to/from UTF-8"),
603                 error_data, *((const unsigned char *) data));
604            }
605          else
606            {
607              return svn_error_createf
608                (APR_EINVAL, NULL,
609                 _("Non-ASCII character (code %d) detected, "
610                   "and unable to convert to/from UTF-8"),
611                 *((const unsigned char *) data));
612            }
613        }
614    }
615
616  return SVN_NO_ERROR;
617}
618
619/* Construct an error with code APR_EINVAL and with a suitable message
620 * to describe the invalid UTF-8 sequence DATA of length LEN (which
621 * may have embedded NULLs).  We can't simply print the data, almost
622 * by definition we don't really know how it is encoded.
623 */
624static svn_error_t *
625invalid_utf8(const char *data, apr_size_t len, apr_pool_t *pool)
626{
627  const char *last = svn_utf__last_valid(data, len);
628  const char *valid_txt = "", *invalid_txt = "";
629  apr_size_t i;
630  size_t valid, invalid;
631
632  /* We will display at most 24 valid octets (this may split a leading
633     multi-byte character) as that should fit on one 80 character line. */
634  valid = last - data;
635  if (valid > 24)
636    valid = 24;
637  for (i = 0; i < valid; ++i)
638    valid_txt = apr_pstrcat(pool, valid_txt,
639                            apr_psprintf(pool, " %02x",
640                                         (unsigned char)last[i-valid]),
641                                         SVN_VA_NULL);
642
643  /* 4 invalid octets will guarantee that the faulty octet is displayed */
644  invalid = data + len - last;
645  if (invalid > 4)
646    invalid = 4;
647  for (i = 0; i < invalid; ++i)
648    invalid_txt = apr_pstrcat(pool, invalid_txt,
649                              apr_psprintf(pool, " %02x",
650                                           (unsigned char)last[i]),
651                                           SVN_VA_NULL);
652
653  return svn_error_createf(APR_EINVAL, NULL,
654                           _("Valid UTF-8 data\n(hex:%s)\n"
655                             "followed by invalid UTF-8 sequence\n(hex:%s)"),
656                           valid_txt, invalid_txt);
657}
658
659/* Verify that the sequence DATA of length LEN is valid UTF-8.
660   If it is not, return an error with code APR_EINVAL. */
661static svn_error_t *
662check_utf8(const char *data, apr_size_t len, apr_pool_t *pool)
663{
664  if (! svn_utf__is_valid(data, len))
665    return invalid_utf8(data, len, pool);
666  return SVN_NO_ERROR;
667}
668
669/* Verify that the NULL terminated sequence DATA is valid UTF-8.
670   If it is not, return an error with code APR_EINVAL. */
671static svn_error_t *
672check_cstring_utf8(const char *data, apr_pool_t *pool)
673{
674
675  if (! svn_utf__cstring_is_valid(data))
676    return invalid_utf8(data, strlen(data), pool);
677  return SVN_NO_ERROR;
678}
679
680
681svn_error_t *
682svn_utf_stringbuf_to_utf8(svn_stringbuf_t **dest,
683                          const svn_stringbuf_t *src,
684                          apr_pool_t *pool)
685{
686  xlate_handle_node_t *node;
687  svn_error_t *err;
688
689  SVN_ERR(get_ntou_xlate_handle_node(&node, pool));
690
691  if (node->handle)
692    {
693      err = convert_to_stringbuf(node, src->data, src->len, dest, pool);
694      if (! err)
695        err = check_utf8((*dest)->data, (*dest)->len, pool);
696    }
697  else
698    {
699      err = check_non_ascii(src->data, src->len, pool);
700      if (! err)
701        *dest = svn_stringbuf_dup(src, pool);
702    }
703
704  return svn_error_compose_create(err,
705                                  put_xlate_handle_node
706                                     (node,
707                                      SVN_UTF_NTOU_XLATE_HANDLE,
708                                      pool));
709}
710
711
712svn_error_t *
713svn_utf_string_to_utf8(const svn_string_t **dest,
714                       const svn_string_t *src,
715                       apr_pool_t *pool)
716{
717  svn_stringbuf_t *destbuf;
718  xlate_handle_node_t *node;
719  svn_error_t *err;
720
721  SVN_ERR(get_ntou_xlate_handle_node(&node, pool));
722
723  if (node->handle)
724    {
725      err = convert_to_stringbuf(node, src->data, src->len, &destbuf, pool);
726      if (! err)
727        err = check_utf8(destbuf->data, destbuf->len, pool);
728      if (! err)
729        *dest = svn_stringbuf__morph_into_string(destbuf);
730    }
731  else
732    {
733      err = check_non_ascii(src->data, src->len, pool);
734      if (! err)
735        *dest = svn_string_dup(src, pool);
736    }
737
738  return svn_error_compose_create(err,
739                                  put_xlate_handle_node
740                                     (node,
741                                      SVN_UTF_NTOU_XLATE_HANDLE,
742                                      pool));
743}
744
745
746/* Common implementation for svn_utf_cstring_to_utf8,
747   svn_utf_cstring_to_utf8_ex, svn_utf_cstring_from_utf8 and
748   svn_utf_cstring_from_utf8_ex. Convert SRC to DEST using NODE->handle as
749   the translator and allocating from POOL. */
750static svn_error_t *
751convert_cstring(const char **dest,
752                const char *src,
753                xlate_handle_node_t *node,
754                apr_pool_t *pool)
755{
756  if (node->handle)
757    {
758      svn_stringbuf_t *destbuf;
759      SVN_ERR(convert_to_stringbuf(node, src, strlen(src),
760                                   &destbuf, pool));
761      *dest = destbuf->data;
762    }
763  else
764    {
765      apr_size_t len = strlen(src);
766      SVN_ERR(check_non_ascii(src, len, pool));
767      *dest = apr_pstrmemdup(pool, src, len);
768    }
769  return SVN_NO_ERROR;
770}
771
772
773svn_error_t *
774svn_utf_cstring_to_utf8(const char **dest,
775                        const char *src,
776                        apr_pool_t *pool)
777{
778  xlate_handle_node_t *node;
779  svn_error_t *err;
780
781  SVN_ERR(get_ntou_xlate_handle_node(&node, pool));
782  err = convert_cstring(dest, src, node, pool);
783  SVN_ERR(svn_error_compose_create(err,
784                                   put_xlate_handle_node
785                                      (node,
786                                       SVN_UTF_NTOU_XLATE_HANDLE,
787                                       pool)));
788  return check_cstring_utf8(*dest, pool);
789}
790
791
792svn_error_t *
793svn_utf_cstring_to_utf8_ex2(const char **dest,
794                            const char *src,
795                            const char *frompage,
796                            apr_pool_t *pool)
797{
798  xlate_handle_node_t *node;
799  svn_error_t *err;
800  const char *convset_key = get_xlate_key(SVN_APR_UTF8_CHARSET, frompage,
801                                          pool);
802
803  SVN_ERR(get_xlate_handle_node(&node, SVN_APR_UTF8_CHARSET, frompage,
804                                convset_key, pool));
805  err = convert_cstring(dest, src, node, pool);
806  SVN_ERR(svn_error_compose_create(err,
807                                   put_xlate_handle_node
808                                      (node,
809                                       SVN_UTF_NTOU_XLATE_HANDLE,
810                                       pool)));
811
812  return check_cstring_utf8(*dest, pool);
813}
814
815
816svn_error_t *
817svn_utf_cstring_to_utf8_ex(const char **dest,
818                           const char *src,
819                           const char *frompage,
820                           const char *convset_key,
821                           apr_pool_t *pool)
822{
823  return svn_utf_cstring_to_utf8_ex2(dest, src, frompage, pool);
824}
825
826
827svn_error_t *
828svn_utf_stringbuf_from_utf8(svn_stringbuf_t **dest,
829                            const svn_stringbuf_t *src,
830                            apr_pool_t *pool)
831{
832  xlate_handle_node_t *node;
833  svn_error_t *err;
834
835  SVN_ERR(get_uton_xlate_handle_node(&node, pool));
836
837  if (node->handle)
838    {
839      err = check_utf8(src->data, src->len, pool);
840      if (! err)
841        err = convert_to_stringbuf(node, src->data, src->len, dest, pool);
842    }
843  else
844    {
845      err = check_non_ascii(src->data, src->len, pool);
846      if (! err)
847        *dest = svn_stringbuf_dup(src, pool);
848    }
849
850  err = svn_error_compose_create(
851          err,
852          put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
853
854  return err;
855}
856
857
858svn_error_t *
859svn_utf_string_from_utf8(const svn_string_t **dest,
860                         const svn_string_t *src,
861                         apr_pool_t *pool)
862{
863  svn_stringbuf_t *dbuf;
864  xlate_handle_node_t *node;
865  svn_error_t *err;
866
867  SVN_ERR(get_uton_xlate_handle_node(&node, pool));
868
869  if (node->handle)
870    {
871      err = check_utf8(src->data, src->len, pool);
872      if (! err)
873        err = convert_to_stringbuf(node, src->data, src->len,
874                                   &dbuf, pool);
875      if (! err)
876        *dest = svn_stringbuf__morph_into_string(dbuf);
877    }
878  else
879    {
880      err = check_non_ascii(src->data, src->len, pool);
881      if (! err)
882        *dest = svn_string_dup(src, pool);
883    }
884
885  err = svn_error_compose_create(
886          err,
887          put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
888
889  return err;
890}
891
892
893svn_error_t *
894svn_utf_cstring_from_utf8(const char **dest,
895                          const char *src,
896                          apr_pool_t *pool)
897{
898  xlate_handle_node_t *node;
899  svn_error_t *err;
900
901  SVN_ERR(check_cstring_utf8(src, pool));
902
903  SVN_ERR(get_uton_xlate_handle_node(&node, pool));
904  err = convert_cstring(dest, src, node, pool);
905  err = svn_error_compose_create(
906          err,
907          put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
908
909  return err;
910}
911
912
913svn_error_t *
914svn_utf_cstring_from_utf8_ex2(const char **dest,
915                              const char *src,
916                              const char *topage,
917                              apr_pool_t *pool)
918{
919  xlate_handle_node_t *node;
920  svn_error_t *err;
921  const char *convset_key = get_xlate_key(topage, SVN_APR_UTF8_CHARSET,
922                                          pool);
923
924  SVN_ERR(check_cstring_utf8(src, pool));
925
926  SVN_ERR(get_xlate_handle_node(&node, topage, SVN_APR_UTF8_CHARSET,
927                                convset_key, pool));
928  err = convert_cstring(dest, src, node, pool);
929  err = svn_error_compose_create(
930          err,
931          put_xlate_handle_node(node, convset_key, pool));
932
933  return err;
934}
935
936const char *
937svn_utf__cstring_from_utf8_fuzzy(const char *src,
938                                 apr_pool_t *pool,
939                                 svn_error_t *(*convert_from_utf8)
940                                 (const char **, const char *, apr_pool_t *))
941{
942  const char *escaped, *converted;
943  svn_error_t *err;
944
945  escaped = svn_utf__fuzzy_escape(src, strlen(src), pool);
946
947  /* Okay, now we have a *new* UTF-8 string, one that's guaranteed to
948     contain only 7-bit bytes :-).  Recode to native... */
949  err = convert_from_utf8(((const char **) &converted), escaped, pool);
950
951  if (err)
952    {
953      svn_error_clear(err);
954      return escaped;
955    }
956  else
957    return converted;
958
959  /* ### Check the client locale, maybe we can avoid that second
960   * conversion!  See Ulrich Drepper's patch at
961   * http://subversion.tigris.org/issues/show_bug.cgi?id=807.
962   */
963}
964
965
966const char *
967svn_utf_cstring_from_utf8_fuzzy(const char *src,
968                                apr_pool_t *pool)
969{
970  return svn_utf__cstring_from_utf8_fuzzy(src, pool,
971                                          svn_utf_cstring_from_utf8);
972}
973
974
975svn_error_t *
976svn_utf_cstring_from_utf8_stringbuf(const char **dest,
977                                    const svn_stringbuf_t *src,
978                                    apr_pool_t *pool)
979{
980  svn_stringbuf_t *destbuf;
981
982  SVN_ERR(svn_utf_stringbuf_from_utf8(&destbuf, src, pool));
983  *dest = destbuf->data;
984
985  return SVN_NO_ERROR;
986}
987
988
989svn_error_t *
990svn_utf_cstring_from_utf8_string(const char **dest,
991                                 const svn_string_t *src,
992                                 apr_pool_t *pool)
993{
994  svn_stringbuf_t *dbuf;
995  xlate_handle_node_t *node;
996  svn_error_t *err;
997
998  SVN_ERR(get_uton_xlate_handle_node(&node, pool));
999
1000  if (node->handle)
1001    {
1002      err = check_utf8(src->data, src->len, pool);
1003      if (! err)
1004        err = convert_to_stringbuf(node, src->data, src->len,
1005                                   &dbuf, pool);
1006      if (! err)
1007        *dest = dbuf->data;
1008    }
1009  else
1010    {
1011      err = check_non_ascii(src->data, src->len, pool);
1012      if (! err)
1013        *dest = apr_pstrmemdup(pool, src->data, src->len);
1014    }
1015
1016  err = svn_error_compose_create(
1017          err,
1018          put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
1019
1020  return err;
1021}
1022
1023
1024/* Insert the given UCS-4 VALUE into BUF at the given OFFSET. */
1025static void
1026membuf_insert_ucs4(svn_membuf_t *buf, apr_size_t offset, apr_int32_t value)
1027{
1028  svn_membuf__resize(buf, (offset + 1) * sizeof(value));
1029  ((apr_int32_t*)buf->data)[offset] = value;
1030}
1031
/* TODO: Use compiler intrinsics for byte swaps. */

/* Exchange the two low-order bytes of X.  Bits above the low 16 are
   discarded. */
#define SWAP_SHORT(x)  ((((x) & 0xff) << 8) | (((x) >> 8) & 0xff))

/* Reverse the byte order of the 32-bit value X.  Every byte is
   isolated with an unsigned mask *before* it is shifted, so all shifts
   are performed on unsigned operands: shifting a signed value left
   into (or past) the sign bit would be undefined behaviour.  The
   result has an unsigned type. */
#define SWAP_LONG(x)   ((((x) & 0xffu) << 24) | (((x) & 0xff00u) << 8)    \
                        | (((x) & 0xff0000u) >> 8)                        \
                        | (((x) & 0xff000000u) >> 24))

/* True if the 16-bit code C lies in the UTF-16 lead (high) surrogate
   range 0xD800..0xDBFF, resp. the trail (low) surrogate range
   0xDC00..0xDFFF. */
#define IS_UTF16_LEAD_SURROGATE(c)   ((c) >= 0xd800 && (c) <= 0xdbff)
#define IS_UTF16_TRAIL_SURROGATE(c)  ((c) >= 0xdc00 && (c) <= 0xdfff)
1039
1040svn_error_t *
1041svn_utf__utf16_to_utf8(const svn_string_t **result,
1042                       const apr_uint16_t *utf16str,
1043                       apr_size_t utf16len,
1044                       svn_boolean_t big_endian,
1045                       apr_pool_t *result_pool,
1046                       apr_pool_t *scratch_pool)
1047{
1048  static const apr_uint16_t endiancheck = 0xa55a;
1049  const svn_boolean_t arch_big_endian =
1050    (((const char*)&endiancheck)[sizeof(endiancheck) - 1] == '\x5a');
1051  const svn_boolean_t swap_order = (!big_endian != !arch_big_endian);
1052
1053  apr_uint16_t lead_surrogate;
1054  apr_size_t length;
1055  apr_size_t offset;
1056  svn_membuf_t ucs4buf;
1057  svn_membuf_t resultbuf;
1058  svn_string_t *res;
1059
1060  if (utf16len == SVN_UTF__UNKNOWN_LENGTH)
1061    {
1062      const apr_uint16_t *endp = utf16str;
1063      while (*endp++)
1064        ;
1065      utf16len = (endp - utf16str);
1066    }
1067
1068  svn_membuf__create(&ucs4buf, utf16len * sizeof(apr_int32_t), scratch_pool);
1069
1070  for (lead_surrogate = 0, length = 0, offset = 0;
1071       offset < utf16len; ++offset)
1072    {
1073      const apr_uint16_t code =
1074        (swap_order ? SWAP_SHORT(utf16str[offset]) : utf16str[offset]);
1075
1076      if (lead_surrogate)
1077        {
1078          if (IS_UTF16_TRAIL_SURROGATE(code))
1079            {
1080              /* Combine the lead and trail currogates into a 32-bit code. */
1081              membuf_insert_ucs4(&ucs4buf, length++,
1082                                 (0x010000
1083                                  + (((lead_surrogate & 0x03ff) << 10)
1084                                     | (code & 0x03ff))));
1085              lead_surrogate = 0;
1086              continue;
1087            }
1088          else
1089            {
1090              /* If we didn't find a surrogate pair, just dump the
1091                 lead surrogate into the stream. */
1092              membuf_insert_ucs4(&ucs4buf, length++, lead_surrogate);
1093              lead_surrogate = 0;
1094            }
1095        }
1096
1097      if ((offset + 1) < utf16len && IS_UTF16_LEAD_SURROGATE(code))
1098        {
1099          /* Store a lead surrogate that is followed by at least one
1100             code for the next iteration. */
1101          lead_surrogate = code;
1102          continue;
1103        }
1104      else
1105        membuf_insert_ucs4(&ucs4buf, length++, code);
1106    }
1107
1108  /* Convert the UCS-4 buffer to UTF-8, assuming an average of 2 bytes
1109     per code point for encoding. The buffer will grow as
1110     necessary. */
1111  svn_membuf__create(&resultbuf, length * 2, result_pool);
1112  SVN_ERR(svn_utf__encode_ucs4_string(
1113              &resultbuf, ucs4buf.data, length, &length));
1114
1115  res = apr_palloc(result_pool, sizeof(*res));
1116  res->data = resultbuf.data;
1117  res->len = length;
1118  *result = res;
1119  return SVN_NO_ERROR;
1120}
1121
1122
1123svn_error_t *
1124svn_utf__utf32_to_utf8(const svn_string_t **result,
1125                       const apr_int32_t *utf32str,
1126                       apr_size_t utf32len,
1127                       svn_boolean_t big_endian,
1128                       apr_pool_t *result_pool,
1129                       apr_pool_t *scratch_pool)
1130{
1131  static const apr_int32_t endiancheck = 0xa5cbbc5a;
1132  const svn_boolean_t arch_big_endian =
1133    (((const char*)&endiancheck)[sizeof(endiancheck) - 1] == '\x5a');
1134  const svn_boolean_t swap_order = (!big_endian != !arch_big_endian);
1135
1136  apr_size_t length;
1137  svn_membuf_t resultbuf;
1138  svn_string_t *res;
1139
1140  if (utf32len == SVN_UTF__UNKNOWN_LENGTH)
1141    {
1142      const apr_int32_t *endp = utf32str;
1143      while (*endp++)
1144        ;
1145      utf32len = (endp - utf32str);
1146    }
1147
1148  if (swap_order)
1149    {
1150      apr_size_t offset;
1151      svn_membuf_t ucs4buf;
1152
1153      svn_membuf__create(&ucs4buf, utf32len * sizeof(apr_int32_t),
1154                         scratch_pool);
1155
1156      for (offset = 0; offset < utf32len; ++offset)
1157        {
1158          const apr_int32_t code = SWAP_LONG(utf32str[offset]);
1159          membuf_insert_ucs4(&ucs4buf, offset, code);
1160        }
1161      utf32str = ucs4buf.data;
1162    }
1163
1164  /* Convert the UCS-4 buffer to UTF-8, assuming an average of 2 bytes
1165     per code point for encoding. The buffer will grow as
1166     necessary. */
1167  svn_membuf__create(&resultbuf, utf32len * 2, result_pool);
1168  SVN_ERR(svn_utf__encode_ucs4_string(
1169              &resultbuf, utf32str, utf32len, &length));
1170
1171  res = apr_palloc(result_pool, sizeof(*res));
1172  res->data = resultbuf.data;
1173  res->len = length;
1174  *result = res;
1175  return SVN_NO_ERROR;
1176}
1177
1178
1179#ifdef WIN32
1180
1181
1182svn_error_t *
1183svn_utf__win32_utf8_to_utf16(const WCHAR **result,
1184                             const char *src,
1185                             const WCHAR *prefix,
1186                             apr_pool_t *result_pool)
1187{
1188  const int utf8_count = strlen(src);
1189  const int prefix_len = (prefix ? lstrlenW(prefix) : 0);
1190  WCHAR *wide_str;
1191  int wide_count;
1192
1193  if (0 == prefix_len + utf8_count)
1194    {
1195      *result = L"";
1196      return SVN_NO_ERROR;
1197    }
1198
1199  wide_count = MultiByteToWideChar(CP_UTF8, 0, src, utf8_count, NULL, 0);
1200  if (wide_count == 0)
1201    return svn_error_wrap_apr(apr_get_os_error(),
1202                              _("Conversion to UTF-16 failed"));
1203
1204  wide_str = apr_palloc(result_pool,
1205                        (prefix_len + wide_count + 1) * sizeof(*wide_str));
1206  if (prefix_len)
1207    memcpy(wide_str, prefix, prefix_len * sizeof(*wide_str));
1208  if (0 == MultiByteToWideChar(CP_UTF8, 0, src, utf8_count,
1209                               wide_str + prefix_len, wide_count))
1210    return svn_error_wrap_apr(apr_get_os_error(),
1211                              _("Conversion to UTF-16 failed"));
1212
1213  wide_str[prefix_len + wide_count] = 0;
1214  *result = wide_str;
1215
1216  return SVN_NO_ERROR;
1217}
1218
1219svn_error_t *
1220svn_utf__win32_utf16_to_utf8(const char **result,
1221                             const WCHAR *src,
1222                             const char *prefix,
1223                             apr_pool_t *result_pool)
1224{
1225  const int wide_count = lstrlenW(src);
1226  const int prefix_len = (prefix ? strlen(prefix) : 0);
1227  char *utf8_str;
1228  int utf8_count;
1229
1230  if (0 == prefix_len + wide_count)
1231    {
1232      *result = "";
1233      return SVN_NO_ERROR;
1234    }
1235
1236  utf8_count = WideCharToMultiByte(CP_UTF8, 0, src, wide_count,
1237                                   NULL, 0, NULL, FALSE);
1238  if (utf8_count == 0)
1239    return svn_error_wrap_apr(apr_get_os_error(),
1240                              _("Conversion from UTF-16 failed"));
1241
1242  utf8_str = apr_palloc(result_pool,
1243                        (prefix_len + utf8_count + 1) * sizeof(*utf8_str));
1244  if (prefix_len)
1245    memcpy(utf8_str, prefix, prefix_len * sizeof(*utf8_str));
1246  if (0 == WideCharToMultiByte(CP_UTF8, 0, src, wide_count,
1247                               utf8_str + prefix_len, utf8_count,
1248                               NULL, FALSE))
1249    return svn_error_wrap_apr(apr_get_os_error(),
1250                              _("Conversion from UTF-16 failed"));
1251
1252  utf8_str[prefix_len + utf8_count] = 0;
1253  *result = utf8_str;
1254
1255  return SVN_NO_ERROR;
1256}
1257
1258#endif /* WIN32 */
1259