utf.c revision 262253
1/*
2 * utf.c:  UTF-8 conversion routines
3 *
4 * ====================================================================
5 *    Licensed to the Apache Software Foundation (ASF) under one
6 *    or more contributor license agreements.  See the NOTICE file
7 *    distributed with this work for additional information
8 *    regarding copyright ownership.  The ASF licenses this file
9 *    to you under the Apache License, Version 2.0 (the
10 *    "License"); you may not use this file except in compliance
11 *    with the License.  You may obtain a copy of the License at
12 *
13 *      http://www.apache.org/licenses/LICENSE-2.0
14 *
15 *    Unless required by applicable law or agreed to in writing,
16 *    software distributed under the License is distributed on an
17 *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 *    KIND, either express or implied.  See the License for the
19 *    specific language governing permissions and limitations
20 *    under the License.
21 * ====================================================================
22 */
23
24
25
26#include <stdlib.h>
27#include <string.h>
28#include <assert.h>
29
30#include <apr_strings.h>
31#include <apr_lib.h>
32#include <apr_xlate.h>
33#include <apr_atomic.h>
34
35#include "svn_hash.h"
36#include "svn_string.h"
37#include "svn_error.h"
38#include "svn_pools.h"
39#include "svn_ctype.h"
40#include "svn_utf.h"
41#include "svn_private_config.h"
42#include "win32_xlate.h"
43
44#include "private/svn_utf_private.h"
45#include "private/svn_dep_compat.h"
46#include "private/svn_string_private.h"
47#include "private/svn_mutex.h"
48
49
50
51/* Use these static strings to maximize performance on standard conversions.
52 * Any strings on other locations are still valid, however.
53 */
54static const char *SVN_UTF_NTOU_XLATE_HANDLE = "svn-utf-ntou-xlate-handle";
55static const char *SVN_UTF_UTON_XLATE_HANDLE = "svn-utf-uton-xlate-handle";
56
57static const char *SVN_APR_UTF8_CHARSET = "UTF-8";
58
59static svn_mutex__t *xlate_handle_mutex = NULL;
60static svn_boolean_t assume_native_charset_is_utf8 = FALSE;
61
62/* The xlate handle cache is a global hash table with linked lists of xlate
63 * handles.  In multi-threaded environments, a thread "borrows" an xlate
64 * handle from the cache during a translation and puts it back afterwards.
65 * This avoids holding a global lock for all translations.
 * If there is no handle for a particular key when needed, a new
 * handle is created and put in the cache after use.
 * This means that there will be at most N handles open for a key, where N
 * is the number of simultaneous handles in use for that key. */
70
71typedef struct xlate_handle_node_t {
72  apr_xlate_t *handle;
73  /* FALSE if the handle is not valid, since its pool is being
74     destroyed. */
75  svn_boolean_t valid;
76  /* The name of a char encoding or APR_LOCALE_CHARSET. */
77  const char *frompage, *topage;
78  struct xlate_handle_node_t *next;
79} xlate_handle_node_t;
80
81/* This maps const char * userdata_key strings to xlate_handle_node_t **
82   handles to the first entry in the linked list of xlate handles.  We don't
83   store the pointer to the list head directly in the hash table, since we
84   remove/insert entries at the head in the list in the code below, and
85   we can't use apr_hash_set() in each character translation because that
86   function allocates memory in each call where the value is non-NULL.
87   Since these allocations take place in a global pool, this would be a
88   memory leak. */
89static apr_hash_t *xlate_handle_hash = NULL;
90
91/* "1st level cache" to standard conversion maps. We may access these
92 * using atomic xchange ops, i.e. without further thread synchronization.
93 * If the respective item is NULL, fallback to hash lookup.
94 */
95static void * volatile xlat_ntou_static_handle = NULL;
96static void * volatile xlat_uton_static_handle = NULL;
97
98/* Clean up the xlate handle cache. */
99static apr_status_t
100xlate_cleanup(void *arg)
101{
102  /* We set the cache variables to NULL so that translation works in other
103     cleanup functions, even if it isn't cached then. */
104  xlate_handle_hash = NULL;
105
106  /* ensure no stale objects get accessed */
107  xlat_ntou_static_handle = NULL;
108  xlat_uton_static_handle = NULL;
109
110  return APR_SUCCESS;
111}
112
113/* Set the handle of ARG to NULL. */
114static apr_status_t
115xlate_handle_node_cleanup(void *arg)
116{
117  xlate_handle_node_t *node = arg;
118
119  node->valid = FALSE;
120  return APR_SUCCESS;
121}
122
123void
124svn_utf_initialize2(svn_boolean_t assume_native_utf8,
125                    apr_pool_t *pool)
126{
127  if (!xlate_handle_hash)
128    {
129      /* We create our own subpool, which we protect with the mutex.
130         We can't use the pool passed to us by the caller, since we will
131         use it for xlate handle allocations, possibly in multiple threads,
132         and pool allocation is not thread-safe. */
133      apr_pool_t *subpool = svn_pool_create(pool);
134      svn_mutex__t *mutex;
135      svn_error_t *err = svn_mutex__init(&mutex, TRUE, subpool);
136      if (err)
137        {
138          svn_error_clear(err);
139          return;
140        }
141
142      xlate_handle_mutex = mutex;
143      xlate_handle_hash = apr_hash_make(subpool);
144
145      apr_pool_cleanup_register(subpool, NULL, xlate_cleanup,
146                                apr_pool_cleanup_null);
147    }
148
149    if (!assume_native_charset_is_utf8)
150      assume_native_charset_is_utf8 = assume_native_utf8;
151}
152
153/* Return a unique string key based on TOPAGE and FROMPAGE.  TOPAGE and
154 * FROMPAGE can be any valid arguments of the same name to
155 * apr_xlate_open().  Allocate the returned string in POOL. */
156static const char*
157get_xlate_key(const char *topage,
158              const char *frompage,
159              apr_pool_t *pool)
160{
161  /* In the cases of SVN_APR_LOCALE_CHARSET and SVN_APR_DEFAULT_CHARSET
162   * topage/frompage is really an int, not a valid string.  So generate a
163   * unique key accordingly. */
164  if (frompage == SVN_APR_LOCALE_CHARSET)
165    frompage = "APR_LOCALE_CHARSET";
166  else if (frompage == SVN_APR_DEFAULT_CHARSET)
167    frompage = "APR_DEFAULT_CHARSET";
168
169  if (topage == SVN_APR_LOCALE_CHARSET)
170    topage = "APR_LOCALE_CHARSET";
171  else if (topage == SVN_APR_DEFAULT_CHARSET)
172    topage = "APR_DEFAULT_CHARSET";
173
174  return apr_pstrcat(pool, "svn-utf-", frompage, "to", topage,
175                     "-xlate-handle", (char *)NULL);
176}
177
178/* Atomically replace the content in *MEM with NEW_VALUE and return
179 * the previous content of *MEM. If atomicy cannot be guaranteed,
180 * *MEM will not be modified and NEW_VALUE is simply returned to
181 * the caller.
182 */
183static APR_INLINE void*
184atomic_swap(void * volatile * mem, void *new_value)
185{
186#if APR_HAS_THREADS
187#if APR_VERSION_AT_LEAST(1,3,0)
188  /* Cast is necessary because of APR bug:
189     https://issues.apache.org/bugzilla/show_bug.cgi?id=50731 */
190   return apr_atomic_xchgptr((volatile void **)mem, new_value);
191#else
192   /* old APRs don't support atomic swaps. Simply return the
193    * input to the caller for further proccessing. */
194   return new_value;
195#endif
196#else
197   /* no threads - no sync. necessary */
198   void *old_value = (void*)*mem;
199   *mem = new_value;
200   return old_value;
201#endif
202}
203
204/* Set *RET to a newly created handle node for converting from FROMPAGE
205   to TOPAGE, If apr_xlate_open() returns APR_EINVAL or APR_ENOTIMPL, set
206   (*RET)->handle to NULL.  If fail for any other reason, return the error.
207   Allocate *RET and its xlate handle in POOL. */
208static svn_error_t *
209xlate_alloc_handle(xlate_handle_node_t **ret,
210                   const char *topage, const char *frompage,
211                   apr_pool_t *pool)
212{
213  apr_status_t apr_err;
214  apr_xlate_t *handle;
215  const char *name;
216
217  /* The error handling doesn't support the following cases, since we don't
218     use them currently.  Catch this here. */
219  SVN_ERR_ASSERT(frompage != SVN_APR_DEFAULT_CHARSET
220                 && topage != SVN_APR_DEFAULT_CHARSET
221                 && (frompage != SVN_APR_LOCALE_CHARSET
222                     || topage != SVN_APR_LOCALE_CHARSET));
223
224  /* Try to create a handle. */
225#if defined(WIN32)
226  apr_err = svn_subr__win32_xlate_open((win32_xlate_t **)&handle, topage,
227                                       frompage, pool);
228  name = "win32-xlate: ";
229#else
230  apr_err = apr_xlate_open(&handle, topage, frompage, pool);
231  name = "APR: ";
232#endif
233
234  if (APR_STATUS_IS_EINVAL(apr_err) || APR_STATUS_IS_ENOTIMPL(apr_err))
235    handle = NULL;
236  else if (apr_err != APR_SUCCESS)
237    {
238      const char *errstr;
239      char apr_strerr[512];
240
241      /* Can't use svn_error_wrap_apr here because it calls functions in
242         this file, leading to infinite recursion. */
243      if (frompage == SVN_APR_LOCALE_CHARSET)
244        errstr = apr_psprintf(pool,
245                              _("Can't create a character converter from "
246                                "native encoding to '%s'"), topage);
247      else if (topage == SVN_APR_LOCALE_CHARSET)
248        errstr = apr_psprintf(pool,
249                              _("Can't create a character converter from "
250                                "'%s' to native encoding"), frompage);
251      else
252        errstr = apr_psprintf(pool,
253                              _("Can't create a character converter from "
254                                "'%s' to '%s'"), frompage, topage);
255
256      /* Just put the error on the stack, since svn_error_create duplicates it
257         later.  APR_STRERR will be in the local encoding, not in UTF-8, though.
258       */
259      svn_strerror(apr_err, apr_strerr, sizeof(apr_strerr));
260      return svn_error_createf(SVN_ERR_PLUGIN_LOAD_FAILURE,
261                               svn_error_create(apr_err, NULL, apr_strerr),
262                               "%s%s", name, errstr);
263    }
264
265  /* Allocate and initialize the node. */
266  *ret = apr_palloc(pool, sizeof(xlate_handle_node_t));
267  (*ret)->handle = handle;
268  (*ret)->valid = TRUE;
269  (*ret)->frompage = ((frompage != SVN_APR_LOCALE_CHARSET)
270                      ? apr_pstrdup(pool, frompage) : frompage);
271  (*ret)->topage = ((topage != SVN_APR_LOCALE_CHARSET)
272                    ? apr_pstrdup(pool, topage) : topage);
273  (*ret)->next = NULL;
274
275  /* If we are called from inside a pool cleanup handler, the just created
276     xlate handle will be closed when that handler returns by a newly
277     registered cleanup handler, however, the handle is still cached by us.
278     To prevent this, we register a cleanup handler that will reset the valid
279     flag of our node, so we don't use an invalid handle. */
280  if (handle)
281    apr_pool_cleanup_register(pool, *ret, xlate_handle_node_cleanup,
282                              apr_pool_cleanup_null);
283
284  return SVN_NO_ERROR;
285}
286
287/* Extend xlate_alloc_handle by using USERDATA_KEY as a key in our
288   global hash map, if available.
289
290   Allocate *RET and its xlate handle in POOL if svn_utf_initialize()
291   hasn't been called or USERDATA_KEY is NULL.  Else, allocate them
292   in the pool of xlate_handle_hash.
293
294   Note: this function is not thread-safe. Call get_xlate_handle_node
295   instead. */
296static svn_error_t *
297get_xlate_handle_node_internal(xlate_handle_node_t **ret,
298                               const char *topage, const char *frompage,
299                               const char *userdata_key, apr_pool_t *pool)
300{
301  /* If we already have a handle, just return it. */
302  if (userdata_key && xlate_handle_hash)
303    {
304      xlate_handle_node_t *old_node = NULL;
305
306      /* 2nd level: hash lookup */
307      xlate_handle_node_t **old_node_p = svn_hash_gets(xlate_handle_hash,
308                                                       userdata_key);
309      if (old_node_p)
310        old_node = *old_node_p;
311      if (old_node)
312        {
313          /* Ensure that the handle is still valid. */
314          if (old_node->valid)
315            {
316              /* Remove from the list. */
317              *old_node_p = old_node->next;
318              old_node->next = NULL;
319              *ret = old_node;
320              return SVN_NO_ERROR;
321            }
322        }
323    }
324
325  /* Note that we still have the mutex locked (if it is initialized), so we
326     can use the global pool for creating the new xlate handle. */
327
328  /* Use the correct pool for creating the handle. */
329  pool = apr_hash_pool_get(xlate_handle_hash);
330
331  return xlate_alloc_handle(ret, topage, frompage, pool);
332}
333
334/* Set *RET to a handle node for converting from FROMPAGE to TOPAGE,
335   creating the handle node if it doesn't exist in USERDATA_KEY.
336   If a node is not cached and apr_xlate_open() returns APR_EINVAL or
337   APR_ENOTIMPL, set (*RET)->handle to NULL.  If fail for any other
338   reason, return the error.
339
340   Allocate *RET and its xlate handle in POOL if svn_utf_initialize()
341   hasn't been called or USERDATA_KEY is NULL.  Else, allocate them
342   in the pool of xlate_handle_hash. */
343static svn_error_t *
344get_xlate_handle_node(xlate_handle_node_t **ret,
345                      const char *topage, const char *frompage,
346                      const char *userdata_key, apr_pool_t *pool)
347{
348  xlate_handle_node_t *old_node = NULL;
349
350  /* If we already have a handle, just return it. */
351  if (userdata_key)
352    {
353      if (xlate_handle_hash)
354        {
355          /* 1st level: global, static items */
356          if (userdata_key == SVN_UTF_NTOU_XLATE_HANDLE)
357            old_node = atomic_swap(&xlat_ntou_static_handle, NULL);
358          else if (userdata_key == SVN_UTF_UTON_XLATE_HANDLE)
359            old_node = atomic_swap(&xlat_uton_static_handle, NULL);
360
361          if (old_node && old_node->valid)
362            {
363              *ret = old_node;
364              return SVN_NO_ERROR;
365            }
366        }
367      else
368        {
369          void *p;
370          /* We fall back on a per-pool cache instead. */
371          apr_pool_userdata_get(&p, userdata_key, pool);
372          old_node = p;
373          /* Ensure that the handle is still valid. */
374          if (old_node && old_node->valid)
375            {
376              *ret = old_node;
377              return SVN_NO_ERROR;
378            }
379
380          return xlate_alloc_handle(ret, topage, frompage, pool);
381        }
382    }
383
384  SVN_MUTEX__WITH_LOCK(xlate_handle_mutex,
385                       get_xlate_handle_node_internal(ret,
386                                                      topage,
387                                                      frompage,
388                                                      userdata_key,
389                                                      pool));
390
391  return SVN_NO_ERROR;
392}
393
394/* Put back NODE into the xlate handle cache for use by other calls.
395
396   Note: this function is not thread-safe. Call put_xlate_handle_node
397   instead. */
398static svn_error_t *
399put_xlate_handle_node_internal(xlate_handle_node_t *node,
400                               const char *userdata_key)
401{
402  xlate_handle_node_t **node_p = svn_hash_gets(xlate_handle_hash, userdata_key);
403  if (node_p == NULL)
404    {
405      userdata_key = apr_pstrdup(apr_hash_pool_get(xlate_handle_hash),
406                                  userdata_key);
407      node_p = apr_palloc(apr_hash_pool_get(xlate_handle_hash),
408                          sizeof(*node_p));
409      *node_p = NULL;
410      svn_hash_sets(xlate_handle_hash, userdata_key, node_p);
411    }
412  node->next = *node_p;
413  *node_p = node;
414
415  return SVN_NO_ERROR;
416}
417
418/* Put back NODE into the xlate handle cache for use by other calls.
419   If there is no global cache, store the handle in POOL.
420   Ignore errors related to locking/unlocking the mutex. */
421static svn_error_t *
422put_xlate_handle_node(xlate_handle_node_t *node,
423                      const char *userdata_key,
424                      apr_pool_t *pool)
425{
426  assert(node->next == NULL);
427  if (!userdata_key)
428    return SVN_NO_ERROR;
429
430  /* push previous global node to the hash */
431  if (xlate_handle_hash)
432    {
433      /* 1st level: global, static items */
434      if (userdata_key == SVN_UTF_NTOU_XLATE_HANDLE)
435        node = atomic_swap(&xlat_ntou_static_handle, node);
436      else if (userdata_key == SVN_UTF_UTON_XLATE_HANDLE)
437        node = atomic_swap(&xlat_uton_static_handle, node);
438      if (node == NULL)
439        return SVN_NO_ERROR;
440
441      SVN_MUTEX__WITH_LOCK(xlate_handle_mutex,
442                           put_xlate_handle_node_internal(node,
443                                                          userdata_key));
444    }
445  else
446    {
447      /* Store it in the per-pool cache. */
448      apr_pool_userdata_set(node, userdata_key, apr_pool_cleanup_null, pool);
449    }
450
451  return SVN_NO_ERROR;
452}
453
454/* Return the apr_xlate handle for converting native characters to UTF-8. */
455static svn_error_t *
456get_ntou_xlate_handle_node(xlate_handle_node_t **ret, apr_pool_t *pool)
457{
458  return get_xlate_handle_node(ret, SVN_APR_UTF8_CHARSET,
459                               assume_native_charset_is_utf8
460                                 ? SVN_APR_UTF8_CHARSET
461                                 : SVN_APR_LOCALE_CHARSET,
462                               SVN_UTF_NTOU_XLATE_HANDLE, pool);
463}
464
465
466/* Return the apr_xlate handle for converting UTF-8 to native characters.
467   Create one if it doesn't exist.  If unable to find a handle, or
468   unable to create one because apr_xlate_open returned APR_EINVAL, then
469   set *RET to null and return SVN_NO_ERROR; if fail for some other
470   reason, return error. */
471static svn_error_t *
472get_uton_xlate_handle_node(xlate_handle_node_t **ret, apr_pool_t *pool)
473{
474  return get_xlate_handle_node(ret,
475                               assume_native_charset_is_utf8
476                                 ? SVN_APR_UTF8_CHARSET
477                                 : SVN_APR_LOCALE_CHARSET,
478                               SVN_APR_UTF8_CHARSET,
479                               SVN_UTF_UTON_XLATE_HANDLE, pool);
480}
481
482
483/* Copy LEN bytes of SRC, converting non-ASCII and zero bytes to ?\nnn
484   sequences, allocating the result in POOL. */
485static const char *
486fuzzy_escape(const char *src, apr_size_t len, apr_pool_t *pool)
487{
488  const char *src_orig = src, *src_end = src + len;
489  apr_size_t new_len = 0;
490  char *new;
491  const char *new_orig;
492
493  /* First count how big a dest string we'll need. */
494  while (src < src_end)
495    {
496      if (! svn_ctype_isascii(*src) || *src == '\0')
497        new_len += 5;  /* 5 slots, for "?\XXX" */
498      else
499        new_len += 1;  /* one slot for the 7-bit char */
500
501      src++;
502    }
503
504  /* Allocate that amount, plus one slot for '\0' character. */
505  new = apr_palloc(pool, new_len + 1);
506
507  new_orig = new;
508
509  /* And fill it up. */
510  while (src_orig < src_end)
511    {
512      if (! svn_ctype_isascii(*src_orig) || src_orig == '\0')
513        {
514          /* This is the same format as svn_xml_fuzzy_escape uses, but that
515             function escapes different characters.  Please keep in sync!
516             ### If we add another fuzzy escape somewhere, we should abstract
517             ### this out to a common function. */
518          apr_snprintf(new, 6, "?\\%03u", (unsigned char) *src_orig);
519          new += 5;
520        }
521      else
522        {
523          *new = *src_orig;
524          new += 1;
525        }
526
527      src_orig++;
528    }
529
530  *new = '\0';
531
532  return new_orig;
533}
534
535/* Convert SRC_LENGTH bytes of SRC_DATA in NODE->handle, store the result
536   in *DEST, which is allocated in POOL. */
537static svn_error_t *
538convert_to_stringbuf(xlate_handle_node_t *node,
539                     const char *src_data,
540                     apr_size_t src_length,
541                     svn_stringbuf_t **dest,
542                     apr_pool_t *pool)
543{
544#ifdef WIN32
545  apr_status_t apr_err;
546
547  apr_err = svn_subr__win32_xlate_to_stringbuf((win32_xlate_t *) node->handle,
548                                               src_data, src_length,
549                                               dest, pool);
550#else
551  apr_size_t buflen = src_length * 2;
552  apr_status_t apr_err;
553  apr_size_t srclen = src_length;
554  apr_size_t destlen = buflen;
555
556  /* Initialize *DEST to an empty stringbuf.
557     A 1:2 ratio of input bytes to output bytes (as assigned above)
558     should be enough for most translations, and if it turns out not
559     to be enough, we'll grow the buffer again, sizing it based on a
560     1:3 ratio of the remainder of the string. */
561  *dest = svn_stringbuf_create_ensure(buflen + 1, pool);
562
563  /* Not only does it not make sense to convert an empty string, but
564     apr-iconv is quite unreasonable about not allowing that. */
565  if (src_length == 0)
566    return SVN_NO_ERROR;
567
568  do
569    {
570      /* Set up state variables for xlate. */
571      destlen = buflen - (*dest)->len;
572
573      /* Attempt the conversion. */
574      apr_err = apr_xlate_conv_buffer(node->handle,
575                                      src_data + (src_length - srclen),
576                                      &srclen,
577                                      (*dest)->data + (*dest)->len,
578                                      &destlen);
579
580      /* Now, update the *DEST->len to track the amount of output data
581         churned out so far from this loop. */
582      (*dest)->len += ((buflen - (*dest)->len) - destlen);
583      buflen += srclen * 3; /* 3 is middle ground, 2 wasn't enough
584                               for all characters in the buffer, 4 is
585                               maximum character size (currently) */
586
587
588    } while (apr_err == APR_SUCCESS && srclen != 0);
589#endif
590
591  /* If we exited the loop with an error, return the error. */
592  if (apr_err)
593    {
594      const char *errstr;
595      svn_error_t *err;
596
597      /* Can't use svn_error_wrap_apr here because it calls functions in
598         this file, leading to infinite recursion. */
599      if (node->frompage == SVN_APR_LOCALE_CHARSET)
600        errstr = apr_psprintf
601          (pool, _("Can't convert string from native encoding to '%s':"),
602           node->topage);
603      else if (node->topage == SVN_APR_LOCALE_CHARSET)
604        errstr = apr_psprintf
605          (pool, _("Can't convert string from '%s' to native encoding:"),
606           node->frompage);
607      else
608        errstr = apr_psprintf
609          (pool, _("Can't convert string from '%s' to '%s':"),
610           node->frompage, node->topage);
611
612      err = svn_error_create(apr_err, NULL, fuzzy_escape(src_data,
613                                                         src_length, pool));
614      return svn_error_create(apr_err, err, errstr);
615    }
616  /* Else, exited due to success.  Trim the result buffer down to the
617     right length. */
618  (*dest)->data[(*dest)->len] = '\0';
619
620  return SVN_NO_ERROR;
621}
622
623
624/* Return APR_EINVAL if the first LEN bytes of DATA contain anything
625   other than seven-bit, non-control (except for whitespace) ASCII
626   characters, finding the error pool from POOL.  Otherwise, return
627   SVN_NO_ERROR. */
628static svn_error_t *
629check_non_ascii(const char *data, apr_size_t len, apr_pool_t *pool)
630{
631  const char *data_start = data;
632
633  for (; len > 0; --len, data++)
634    {
635      if ((! svn_ctype_isascii(*data))
636          || ((! svn_ctype_isspace(*data))
637              && svn_ctype_iscntrl(*data)))
638        {
639          /* Show the printable part of the data, followed by the
640             decimal code of the questionable character.  Because if a
641             user ever gets this error, she's going to have to spend
642             time tracking down the non-ASCII data, so we want to help
643             as much as possible.  And yes, we just call the unsafe
644             data "non-ASCII", even though the actual constraint is
645             somewhat more complex than that. */
646
647          if (data - data_start)
648            {
649              const char *error_data
650                = apr_pstrndup(pool, data_start, (data - data_start));
651
652              return svn_error_createf
653                (APR_EINVAL, NULL,
654                 _("Safe data '%s' was followed by non-ASCII byte %d: "
655                   "unable to convert to/from UTF-8"),
656                 error_data, *((const unsigned char *) data));
657            }
658          else
659            {
660              return svn_error_createf
661                (APR_EINVAL, NULL,
662                 _("Non-ASCII character (code %d) detected, "
663                   "and unable to convert to/from UTF-8"),
664                 *((const unsigned char *) data));
665            }
666        }
667    }
668
669  return SVN_NO_ERROR;
670}
671
672/* Construct an error with code APR_EINVAL and with a suitable message
673 * to describe the invalid UTF-8 sequence DATA of length LEN (which
674 * may have embedded NULLs).  We can't simply print the data, almost
675 * by definition we don't really know how it is encoded.
676 */
677static svn_error_t *
678invalid_utf8(const char *data, apr_size_t len, apr_pool_t *pool)
679{
680  const char *last = svn_utf__last_valid(data, len);
681  const char *valid_txt = "", *invalid_txt = "";
682  apr_size_t i;
683  size_t valid, invalid;
684
685  /* We will display at most 24 valid octets (this may split a leading
686     multi-byte character) as that should fit on one 80 character line. */
687  valid = last - data;
688  if (valid > 24)
689    valid = 24;
690  for (i = 0; i < valid; ++i)
691    valid_txt = apr_pstrcat(pool, valid_txt,
692                            apr_psprintf(pool, " %02x",
693                                         (unsigned char)last[i-valid]),
694                                         (char *)NULL);
695
696  /* 4 invalid octets will guarantee that the faulty octet is displayed */
697  invalid = data + len - last;
698  if (invalid > 4)
699    invalid = 4;
700  for (i = 0; i < invalid; ++i)
701    invalid_txt = apr_pstrcat(pool, invalid_txt,
702                              apr_psprintf(pool, " %02x",
703                                           (unsigned char)last[i]),
704                                           (char *)NULL);
705
706  return svn_error_createf(APR_EINVAL, NULL,
707                           _("Valid UTF-8 data\n(hex:%s)\n"
708                             "followed by invalid UTF-8 sequence\n(hex:%s)"),
709                           valid_txt, invalid_txt);
710}
711
712/* Verify that the sequence DATA of length LEN is valid UTF-8.
713   If it is not, return an error with code APR_EINVAL. */
714static svn_error_t *
715check_utf8(const char *data, apr_size_t len, apr_pool_t *pool)
716{
717  if (! svn_utf__is_valid(data, len))
718    return invalid_utf8(data, len, pool);
719  return SVN_NO_ERROR;
720}
721
722/* Verify that the NULL terminated sequence DATA is valid UTF-8.
723   If it is not, return an error with code APR_EINVAL. */
724static svn_error_t *
725check_cstring_utf8(const char *data, apr_pool_t *pool)
726{
727
728  if (! svn_utf__cstring_is_valid(data))
729    return invalid_utf8(data, strlen(data), pool);
730  return SVN_NO_ERROR;
731}
732
733
734svn_error_t *
735svn_utf_stringbuf_to_utf8(svn_stringbuf_t **dest,
736                          const svn_stringbuf_t *src,
737                          apr_pool_t *pool)
738{
739  xlate_handle_node_t *node;
740  svn_error_t *err;
741
742  SVN_ERR(get_ntou_xlate_handle_node(&node, pool));
743
744  if (node->handle)
745    {
746      err = convert_to_stringbuf(node, src->data, src->len, dest, pool);
747      if (! err)
748        err = check_utf8((*dest)->data, (*dest)->len, pool);
749    }
750  else
751    {
752      err = check_non_ascii(src->data, src->len, pool);
753      if (! err)
754        *dest = svn_stringbuf_dup(src, pool);
755    }
756
757  return svn_error_compose_create(err,
758                                  put_xlate_handle_node
759                                     (node,
760                                      SVN_UTF_NTOU_XLATE_HANDLE,
761                                      pool));
762}
763
764
765svn_error_t *
766svn_utf_string_to_utf8(const svn_string_t **dest,
767                       const svn_string_t *src,
768                       apr_pool_t *pool)
769{
770  svn_stringbuf_t *destbuf;
771  xlate_handle_node_t *node;
772  svn_error_t *err;
773
774  SVN_ERR(get_ntou_xlate_handle_node(&node, pool));
775
776  if (node->handle)
777    {
778      err = convert_to_stringbuf(node, src->data, src->len, &destbuf, pool);
779      if (! err)
780        err = check_utf8(destbuf->data, destbuf->len, pool);
781      if (! err)
782        *dest = svn_stringbuf__morph_into_string(destbuf);
783    }
784  else
785    {
786      err = check_non_ascii(src->data, src->len, pool);
787      if (! err)
788        *dest = svn_string_dup(src, pool);
789    }
790
791  return svn_error_compose_create(err,
792                                  put_xlate_handle_node
793                                     (node,
794                                      SVN_UTF_NTOU_XLATE_HANDLE,
795                                      pool));
796}
797
798
799/* Common implementation for svn_utf_cstring_to_utf8,
800   svn_utf_cstring_to_utf8_ex, svn_utf_cstring_from_utf8 and
801   svn_utf_cstring_from_utf8_ex. Convert SRC to DEST using NODE->handle as
802   the translator and allocating from POOL. */
803static svn_error_t *
804convert_cstring(const char **dest,
805                const char *src,
806                xlate_handle_node_t *node,
807                apr_pool_t *pool)
808{
809  if (node->handle)
810    {
811      svn_stringbuf_t *destbuf;
812      SVN_ERR(convert_to_stringbuf(node, src, strlen(src),
813                                   &destbuf, pool));
814      *dest = destbuf->data;
815    }
816  else
817    {
818      apr_size_t len = strlen(src);
819      SVN_ERR(check_non_ascii(src, len, pool));
820      *dest = apr_pstrmemdup(pool, src, len);
821    }
822  return SVN_NO_ERROR;
823}
824
825
826svn_error_t *
827svn_utf_cstring_to_utf8(const char **dest,
828                        const char *src,
829                        apr_pool_t *pool)
830{
831  xlate_handle_node_t *node;
832  svn_error_t *err;
833
834  SVN_ERR(get_ntou_xlate_handle_node(&node, pool));
835  err = convert_cstring(dest, src, node, pool);
836  SVN_ERR(svn_error_compose_create(err,
837                                   put_xlate_handle_node
838                                      (node,
839                                       SVN_UTF_NTOU_XLATE_HANDLE,
840                                       pool)));
841  return check_cstring_utf8(*dest, pool);
842}
843
844
845svn_error_t *
846svn_utf_cstring_to_utf8_ex2(const char **dest,
847                            const char *src,
848                            const char *frompage,
849                            apr_pool_t *pool)
850{
851  xlate_handle_node_t *node;
852  svn_error_t *err;
853  const char *convset_key = get_xlate_key(SVN_APR_UTF8_CHARSET, frompage,
854                                          pool);
855
856  SVN_ERR(get_xlate_handle_node(&node, SVN_APR_UTF8_CHARSET, frompage,
857                                convset_key, pool));
858  err = convert_cstring(dest, src, node, pool);
859  SVN_ERR(svn_error_compose_create(err,
860                                   put_xlate_handle_node
861                                      (node,
862                                       SVN_UTF_NTOU_XLATE_HANDLE,
863                                       pool)));
864
865  return check_cstring_utf8(*dest, pool);
866}
867
868
869svn_error_t *
870svn_utf_cstring_to_utf8_ex(const char **dest,
871                           const char *src,
872                           const char *frompage,
873                           const char *convset_key,
874                           apr_pool_t *pool)
875{
876  return svn_utf_cstring_to_utf8_ex2(dest, src, frompage, pool);
877}
878
879
880svn_error_t *
881svn_utf_stringbuf_from_utf8(svn_stringbuf_t **dest,
882                            const svn_stringbuf_t *src,
883                            apr_pool_t *pool)
884{
885  xlate_handle_node_t *node;
886  svn_error_t *err;
887
888  SVN_ERR(get_uton_xlate_handle_node(&node, pool));
889
890  if (node->handle)
891    {
892      err = check_utf8(src->data, src->len, pool);
893      if (! err)
894        err = convert_to_stringbuf(node, src->data, src->len, dest, pool);
895    }
896  else
897    {
898      err = check_non_ascii(src->data, src->len, pool);
899      if (! err)
900        *dest = svn_stringbuf_dup(src, pool);
901    }
902
903  err = svn_error_compose_create(
904          err,
905          put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
906
907  return err;
908}
909
910
911svn_error_t *
912svn_utf_string_from_utf8(const svn_string_t **dest,
913                         const svn_string_t *src,
914                         apr_pool_t *pool)
915{
916  svn_stringbuf_t *dbuf;
917  xlate_handle_node_t *node;
918  svn_error_t *err;
919
920  SVN_ERR(get_uton_xlate_handle_node(&node, pool));
921
922  if (node->handle)
923    {
924      err = check_utf8(src->data, src->len, pool);
925      if (! err)
926        err = convert_to_stringbuf(node, src->data, src->len,
927                                   &dbuf, pool);
928      if (! err)
929        *dest = svn_stringbuf__morph_into_string(dbuf);
930    }
931  else
932    {
933      err = check_non_ascii(src->data, src->len, pool);
934      if (! err)
935        *dest = svn_string_dup(src, pool);
936    }
937
938  err = svn_error_compose_create(
939          err,
940          put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
941
942  return err;
943}
944
945
946svn_error_t *
947svn_utf_cstring_from_utf8(const char **dest,
948                          const char *src,
949                          apr_pool_t *pool)
950{
951  xlate_handle_node_t *node;
952  svn_error_t *err;
953
954  SVN_ERR(check_cstring_utf8(src, pool));
955
956  SVN_ERR(get_uton_xlate_handle_node(&node, pool));
957  err = convert_cstring(dest, src, node, pool);
958  err = svn_error_compose_create(
959          err,
960          put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
961
962  return err;
963}
964
965
966svn_error_t *
967svn_utf_cstring_from_utf8_ex2(const char **dest,
968                              const char *src,
969                              const char *topage,
970                              apr_pool_t *pool)
971{
972  xlate_handle_node_t *node;
973  svn_error_t *err;
974  const char *convset_key = get_xlate_key(topage, SVN_APR_UTF8_CHARSET,
975                                          pool);
976
977  SVN_ERR(check_cstring_utf8(src, pool));
978
979  SVN_ERR(get_xlate_handle_node(&node, topage, SVN_APR_UTF8_CHARSET,
980                                convset_key, pool));
981  err = convert_cstring(dest, src, node, pool);
982  err = svn_error_compose_create(
983          err,
984          put_xlate_handle_node(node, convset_key, pool));
985
986  return err;
987}
988
989
990svn_error_t *
991svn_utf_cstring_from_utf8_ex(const char **dest,
992                             const char *src,
993                             const char *topage,
994                             const char *convset_key,
995                             apr_pool_t *pool)
996{
997  return svn_utf_cstring_from_utf8_ex2(dest, src, topage, pool);
998}
999
1000
1001const char *
1002svn_utf__cstring_from_utf8_fuzzy(const char *src,
1003                                 apr_pool_t *pool,
1004                                 svn_error_t *(*convert_from_utf8)
1005                                 (const char **, const char *, apr_pool_t *))
1006{
1007  const char *escaped, *converted;
1008  svn_error_t *err;
1009
1010  escaped = fuzzy_escape(src, strlen(src), pool);
1011
1012  /* Okay, now we have a *new* UTF-8 string, one that's guaranteed to
1013     contain only 7-bit bytes :-).  Recode to native... */
1014  err = convert_from_utf8(((const char **) &converted), escaped, pool);
1015
1016  if (err)
1017    {
1018      svn_error_clear(err);
1019      return escaped;
1020    }
1021  else
1022    return converted;
1023
1024  /* ### Check the client locale, maybe we can avoid that second
1025   * conversion!  See Ulrich Drepper's patch at
1026   * http://subversion.tigris.org/issues/show_bug.cgi?id=807.
1027   */
1028}
1029
1030
1031const char *
1032svn_utf_cstring_from_utf8_fuzzy(const char *src,
1033                                apr_pool_t *pool)
1034{
1035  return svn_utf__cstring_from_utf8_fuzzy(src, pool,
1036                                          svn_utf_cstring_from_utf8);
1037}
1038
1039
1040svn_error_t *
1041svn_utf_cstring_from_utf8_stringbuf(const char **dest,
1042                                    const svn_stringbuf_t *src,
1043                                    apr_pool_t *pool)
1044{
1045  svn_stringbuf_t *destbuf;
1046
1047  SVN_ERR(svn_utf_stringbuf_from_utf8(&destbuf, src, pool));
1048  *dest = destbuf->data;
1049
1050  return SVN_NO_ERROR;
1051}
1052
1053
1054svn_error_t *
1055svn_utf_cstring_from_utf8_string(const char **dest,
1056                                 const svn_string_t *src,
1057                                 apr_pool_t *pool)
1058{
1059  svn_stringbuf_t *dbuf;
1060  xlate_handle_node_t *node;
1061  svn_error_t *err;
1062
1063  SVN_ERR(get_uton_xlate_handle_node(&node, pool));
1064
1065  if (node->handle)
1066    {
1067      err = check_utf8(src->data, src->len, pool);
1068      if (! err)
1069        err = convert_to_stringbuf(node, src->data, src->len,
1070                                   &dbuf, pool);
1071      if (! err)
1072        *dest = dbuf->data;
1073    }
1074  else
1075    {
1076      err = check_non_ascii(src->data, src->len, pool);
1077      if (! err)
1078        *dest = apr_pstrmemdup(pool, src->data, src->len);
1079    }
1080
1081  err = svn_error_compose_create(
1082          err,
1083          put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
1084
1085  return err;
1086}
1087