1/*
2 * utf.c:  UTF-8 conversion routines
3 *
4 * ====================================================================
5 *    Licensed to the Apache Software Foundation (ASF) under one
6 *    or more contributor license agreements.  See the NOTICE file
7 *    distributed with this work for additional information
8 *    regarding copyright ownership.  The ASF licenses this file
9 *    to you under the Apache License, Version 2.0 (the
10 *    "License"); you may not use this file except in compliance
11 *    with the License.  You may obtain a copy of the License at
12 *
13 *      http://www.apache.org/licenses/LICENSE-2.0
14 *
15 *    Unless required by applicable law or agreed to in writing,
16 *    software distributed under the License is distributed on an
17 *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 *    KIND, either express or implied.  See the License for the
19 *    specific language governing permissions and limitations
20 *    under the License.
21 * ====================================================================
22 */
23
24
25
26#include <stdlib.h>
27#include <string.h>
28#include <assert.h>
29
30#include <apr_strings.h>
31#include <apr_lib.h>
32#include <apr_xlate.h>
33#include <apr_atomic.h>
34
35#include "svn_hash.h"
36#include "svn_string.h"
37#include "svn_error.h"
38#include "svn_pools.h"
39#include "svn_ctype.h"
40#include "svn_utf.h"
41#include "svn_private_config.h"
42#include "win32_xlate.h"
43
44#include "private/svn_utf_private.h"
45#include "private/svn_dep_compat.h"
46#include "private/svn_string_private.h"
47#include "private/svn_mutex.h"
48
49
50
51/* Use these static strings to maximize performance on standard conversions.
52 * Any strings on other locations are still valid, however.
53 */
54static const char *SVN_UTF_NTOU_XLATE_HANDLE = "svn-utf-ntou-xlate-handle";
55static const char *SVN_UTF_UTON_XLATE_HANDLE = "svn-utf-uton-xlate-handle";
56
57static const char *SVN_APR_UTF8_CHARSET = "UTF-8";
58
59static svn_mutex__t *xlate_handle_mutex = NULL;
60static svn_boolean_t assume_native_charset_is_utf8 = FALSE;
61
/* The xlate handle cache is a global hash table with linked lists of xlate
 * handles.  In multi-threaded environments, a thread "borrows" an xlate
 * handle from the cache during a translation and puts it back afterwards.
 * This avoids holding a global lock for all translations.
 * If there is no handle for a particular key when needed, a new
 * handle is created and put in the cache after use.
 * This means that there will be at most N handles open for a key, where N
 * is the number of simultaneous handles in use for that key. */
70
71typedef struct xlate_handle_node_t {
72  apr_xlate_t *handle;
73  /* FALSE if the handle is not valid, since its pool is being
74     destroyed. */
75  svn_boolean_t valid;
76  /* The name of a char encoding or APR_LOCALE_CHARSET. */
77  const char *frompage, *topage;
78  struct xlate_handle_node_t *next;
79} xlate_handle_node_t;
80
81/* This maps const char * userdata_key strings to xlate_handle_node_t **
82   handles to the first entry in the linked list of xlate handles.  We don't
83   store the pointer to the list head directly in the hash table, since we
84   remove/insert entries at the head in the list in the code below, and
85   we can't use apr_hash_set() in each character translation because that
86   function allocates memory in each call where the value is non-NULL.
87   Since these allocations take place in a global pool, this would be a
88   memory leak. */
89static apr_hash_t *xlate_handle_hash = NULL;
90
91/* "1st level cache" to standard conversion maps. We may access these
92 * using atomic xchange ops, i.e. without further thread synchronization.
93 * If the respective item is NULL, fallback to hash lookup.
94 */
95static void * volatile xlat_ntou_static_handle = NULL;
96static void * volatile xlat_uton_static_handle = NULL;
97
98/* Clean up the xlate handle cache. */
99static apr_status_t
100xlate_cleanup(void *arg)
101{
102  /* We set the cache variables to NULL so that translation works in other
103     cleanup functions, even if it isn't cached then. */
104  xlate_handle_hash = NULL;
105
106  /* ensure no stale objects get accessed */
107  xlat_ntou_static_handle = NULL;
108  xlat_uton_static_handle = NULL;
109
110  return APR_SUCCESS;
111}
112
113/* Set the handle of ARG to NULL. */
114static apr_status_t
115xlate_handle_node_cleanup(void *arg)
116{
117  xlate_handle_node_t *node = arg;
118
119  node->valid = FALSE;
120  return APR_SUCCESS;
121}
122
123void
124svn_utf_initialize2(svn_boolean_t assume_native_utf8,
125                    apr_pool_t *pool)
126{
127  if (!xlate_handle_hash)
128    {
129      /* We create our own subpool, which we protect with the mutex.
130         We can't use the pool passed to us by the caller, since we will
131         use it for xlate handle allocations, possibly in multiple threads,
132         and pool allocation is not thread-safe. */
133      apr_pool_t *subpool = svn_pool_create(pool);
134      svn_mutex__t *mutex;
135      svn_error_t *err = svn_mutex__init(&mutex, TRUE, subpool);
136      if (err)
137        {
138          svn_error_clear(err);
139          return;
140        }
141
142      xlate_handle_mutex = mutex;
143      xlate_handle_hash = apr_hash_make(subpool);
144
145      apr_pool_cleanup_register(subpool, NULL, xlate_cleanup,
146                                apr_pool_cleanup_null);
147    }
148
149    if (!assume_native_charset_is_utf8)
150      assume_native_charset_is_utf8 = assume_native_utf8;
151}
152
153/* Return a unique string key based on TOPAGE and FROMPAGE.  TOPAGE and
154 * FROMPAGE can be any valid arguments of the same name to
155 * apr_xlate_open().  Allocate the returned string in POOL. */
156static const char*
157get_xlate_key(const char *topage,
158              const char *frompage,
159              apr_pool_t *pool)
160{
161  /* In the cases of SVN_APR_LOCALE_CHARSET and SVN_APR_DEFAULT_CHARSET
162   * topage/frompage is really an int, not a valid string.  So generate a
163   * unique key accordingly. */
164  if (frompage == SVN_APR_LOCALE_CHARSET)
165    frompage = "APR_LOCALE_CHARSET";
166  else if (frompage == SVN_APR_DEFAULT_CHARSET)
167    frompage = "APR_DEFAULT_CHARSET";
168
169  if (topage == SVN_APR_LOCALE_CHARSET)
170    topage = "APR_LOCALE_CHARSET";
171  else if (topage == SVN_APR_DEFAULT_CHARSET)
172    topage = "APR_DEFAULT_CHARSET";
173
174  return apr_pstrcat(pool, "svn-utf-", frompage, "to", topage,
175                     "-xlate-handle", (char *)NULL);
176}
177
178/* Atomically replace the content in *MEM with NEW_VALUE and return
179 * the previous content of *MEM. If atomicy cannot be guaranteed,
180 * *MEM will not be modified and NEW_VALUE is simply returned to
181 * the caller.
182 */
183static APR_INLINE void*
184atomic_swap(void * volatile * mem, void *new_value)
185{
186#if APR_HAS_THREADS
187#if APR_VERSION_AT_LEAST(1,3,0)
188  /* Cast is necessary because of APR bug:
189     https://issues.apache.org/bugzilla/show_bug.cgi?id=50731 */
190   return apr_atomic_xchgptr((volatile void **)mem, new_value);
191#else
192   /* old APRs don't support atomic swaps. Simply return the
193    * input to the caller for further proccessing. */
194   return new_value;
195#endif
196#else
197   /* no threads - no sync. necessary */
198   void *old_value = (void*)*mem;
199   *mem = new_value;
200   return old_value;
201#endif
202}
203
204/* Set *RET to a newly created handle node for converting from FROMPAGE
205   to TOPAGE, If apr_xlate_open() returns APR_EINVAL or APR_ENOTIMPL, set
206   (*RET)->handle to NULL.  If fail for any other reason, return the error.
207   Allocate *RET and its xlate handle in POOL. */
208static svn_error_t *
209xlate_alloc_handle(xlate_handle_node_t **ret,
210                   const char *topage, const char *frompage,
211                   apr_pool_t *pool)
212{
213  apr_status_t apr_err;
214  apr_xlate_t *handle;
215
216  /* The error handling doesn't support the following cases, since we don't
217     use them currently.  Catch this here. */
218  SVN_ERR_ASSERT(frompage != SVN_APR_DEFAULT_CHARSET
219                 && topage != SVN_APR_DEFAULT_CHARSET
220                 && (frompage != SVN_APR_LOCALE_CHARSET
221                     || topage != SVN_APR_LOCALE_CHARSET));
222
223  /* Try to create a handle. */
224#if defined(WIN32)
225  apr_err = svn_subr__win32_xlate_open((win32_xlate_t **)&handle, topage,
226                                       frompage, pool);
227#else
228  apr_err = apr_xlate_open(&handle, topage, frompage, pool);
229#endif
230
231  if (APR_STATUS_IS_EINVAL(apr_err) || APR_STATUS_IS_ENOTIMPL(apr_err))
232    handle = NULL;
233  else if (apr_err != APR_SUCCESS)
234    {
235      const char *errstr;
236      char apr_strerr[512];
237
238      /* Can't use svn_error_wrap_apr here because it calls functions in
239         this file, leading to infinite recursion. */
240      if (frompage == SVN_APR_LOCALE_CHARSET)
241        errstr = apr_psprintf(pool,
242                              _("Can't create a character converter from "
243                                "native encoding to '%s'"), topage);
244      else if (topage == SVN_APR_LOCALE_CHARSET)
245        errstr = apr_psprintf(pool,
246                              _("Can't create a character converter from "
247                                "'%s' to native encoding"), frompage);
248      else
249        errstr = apr_psprintf(pool,
250                              _("Can't create a character converter from "
251                                "'%s' to '%s'"), frompage, topage);
252
253      /* Just put the error on the stack, since svn_error_create duplicates it
254         later.  APR_STRERR will be in the local encoding, not in UTF-8, though.
255       */
256      svn_strerror(apr_err, apr_strerr, sizeof(apr_strerr));
257      return svn_error_create(apr_err,
258                              svn_error_create(apr_err, NULL, apr_strerr),
259                              errstr);
260    }
261
262  /* Allocate and initialize the node. */
263  *ret = apr_palloc(pool, sizeof(xlate_handle_node_t));
264  (*ret)->handle = handle;
265  (*ret)->valid = TRUE;
266  (*ret)->frompage = ((frompage != SVN_APR_LOCALE_CHARSET)
267                      ? apr_pstrdup(pool, frompage) : frompage);
268  (*ret)->topage = ((topage != SVN_APR_LOCALE_CHARSET)
269                    ? apr_pstrdup(pool, topage) : topage);
270  (*ret)->next = NULL;
271
272  /* If we are called from inside a pool cleanup handler, the just created
273     xlate handle will be closed when that handler returns by a newly
274     registered cleanup handler, however, the handle is still cached by us.
275     To prevent this, we register a cleanup handler that will reset the valid
276     flag of our node, so we don't use an invalid handle. */
277  if (handle)
278    apr_pool_cleanup_register(pool, *ret, xlate_handle_node_cleanup,
279                              apr_pool_cleanup_null);
280
281  return SVN_NO_ERROR;
282}
283
284/* Extend xlate_alloc_handle by using USERDATA_KEY as a key in our
285   global hash map, if available.
286
287   Allocate *RET and its xlate handle in POOL if svn_utf_initialize()
288   hasn't been called or USERDATA_KEY is NULL.  Else, allocate them
289   in the pool of xlate_handle_hash.
290
291   Note: this function is not thread-safe. Call get_xlate_handle_node
292   instead. */
293static svn_error_t *
294get_xlate_handle_node_internal(xlate_handle_node_t **ret,
295                               const char *topage, const char *frompage,
296                               const char *userdata_key, apr_pool_t *pool)
297{
298  /* If we already have a handle, just return it. */
299  if (userdata_key && xlate_handle_hash)
300    {
301      xlate_handle_node_t *old_node = NULL;
302
303      /* 2nd level: hash lookup */
304      xlate_handle_node_t **old_node_p = svn_hash_gets(xlate_handle_hash,
305                                                       userdata_key);
306      if (old_node_p)
307        old_node = *old_node_p;
308      if (old_node)
309        {
310          /* Ensure that the handle is still valid. */
311          if (old_node->valid)
312            {
313              /* Remove from the list. */
314              *old_node_p = old_node->next;
315              old_node->next = NULL;
316              *ret = old_node;
317              return SVN_NO_ERROR;
318            }
319        }
320    }
321
322  /* Note that we still have the mutex locked (if it is initialized), so we
323     can use the global pool for creating the new xlate handle. */
324
325  /* Use the correct pool for creating the handle. */
326  pool = apr_hash_pool_get(xlate_handle_hash);
327
328  return xlate_alloc_handle(ret, topage, frompage, pool);
329}
330
331/* Set *RET to a handle node for converting from FROMPAGE to TOPAGE,
332   creating the handle node if it doesn't exist in USERDATA_KEY.
333   If a node is not cached and apr_xlate_open() returns APR_EINVAL or
334   APR_ENOTIMPL, set (*RET)->handle to NULL.  If fail for any other
335   reason, return the error.
336
337   Allocate *RET and its xlate handle in POOL if svn_utf_initialize()
338   hasn't been called or USERDATA_KEY is NULL.  Else, allocate them
339   in the pool of xlate_handle_hash. */
340static svn_error_t *
341get_xlate_handle_node(xlate_handle_node_t **ret,
342                      const char *topage, const char *frompage,
343                      const char *userdata_key, apr_pool_t *pool)
344{
345  xlate_handle_node_t *old_node = NULL;
346
347  /* If we already have a handle, just return it. */
348  if (userdata_key)
349    {
350      if (xlate_handle_hash)
351        {
352          /* 1st level: global, static items */
353          if (userdata_key == SVN_UTF_NTOU_XLATE_HANDLE)
354            old_node = atomic_swap(&xlat_ntou_static_handle, NULL);
355          else if (userdata_key == SVN_UTF_UTON_XLATE_HANDLE)
356            old_node = atomic_swap(&xlat_uton_static_handle, NULL);
357
358          if (old_node && old_node->valid)
359            {
360              *ret = old_node;
361              return SVN_NO_ERROR;
362            }
363        }
364      else
365        {
366          void *p;
367          /* We fall back on a per-pool cache instead. */
368          apr_pool_userdata_get(&p, userdata_key, pool);
369          old_node = p;
370          /* Ensure that the handle is still valid. */
371          if (old_node && old_node->valid)
372            {
373              *ret = old_node;
374              return SVN_NO_ERROR;
375            }
376
377          return xlate_alloc_handle(ret, topage, frompage, pool);
378        }
379    }
380
381  SVN_MUTEX__WITH_LOCK(xlate_handle_mutex,
382                       get_xlate_handle_node_internal(ret,
383                                                      topage,
384                                                      frompage,
385                                                      userdata_key,
386                                                      pool));
387
388  return SVN_NO_ERROR;
389}
390
391/* Put back NODE into the xlate handle cache for use by other calls.
392
393   Note: this function is not thread-safe. Call put_xlate_handle_node
394   instead. */
395static svn_error_t *
396put_xlate_handle_node_internal(xlate_handle_node_t *node,
397                               const char *userdata_key)
398{
399  xlate_handle_node_t **node_p = svn_hash_gets(xlate_handle_hash, userdata_key);
400  if (node_p == NULL)
401    {
402      userdata_key = apr_pstrdup(apr_hash_pool_get(xlate_handle_hash),
403                                  userdata_key);
404      node_p = apr_palloc(apr_hash_pool_get(xlate_handle_hash),
405                          sizeof(*node_p));
406      *node_p = NULL;
407      svn_hash_sets(xlate_handle_hash, userdata_key, node_p);
408    }
409  node->next = *node_p;
410  *node_p = node;
411
412  return SVN_NO_ERROR;
413}
414
415/* Put back NODE into the xlate handle cache for use by other calls.
416   If there is no global cache, store the handle in POOL.
417   Ignore errors related to locking/unlocking the mutex. */
418static svn_error_t *
419put_xlate_handle_node(xlate_handle_node_t *node,
420                      const char *userdata_key,
421                      apr_pool_t *pool)
422{
423  assert(node->next == NULL);
424  if (!userdata_key)
425    return SVN_NO_ERROR;
426
427  /* push previous global node to the hash */
428  if (xlate_handle_hash)
429    {
430      /* 1st level: global, static items */
431      if (userdata_key == SVN_UTF_NTOU_XLATE_HANDLE)
432        node = atomic_swap(&xlat_ntou_static_handle, node);
433      else if (userdata_key == SVN_UTF_UTON_XLATE_HANDLE)
434        node = atomic_swap(&xlat_uton_static_handle, node);
435      if (node == NULL)
436        return SVN_NO_ERROR;
437
438      SVN_MUTEX__WITH_LOCK(xlate_handle_mutex,
439                           put_xlate_handle_node_internal(node,
440                                                          userdata_key));
441    }
442  else
443    {
444      /* Store it in the per-pool cache. */
445      apr_pool_userdata_set(node, userdata_key, apr_pool_cleanup_null, pool);
446    }
447
448  return SVN_NO_ERROR;
449}
450
451/* Return the apr_xlate handle for converting native characters to UTF-8. */
452static svn_error_t *
453get_ntou_xlate_handle_node(xlate_handle_node_t **ret, apr_pool_t *pool)
454{
455  return get_xlate_handle_node(ret, SVN_APR_UTF8_CHARSET,
456                               assume_native_charset_is_utf8
457                                 ? SVN_APR_UTF8_CHARSET
458                                 : SVN_APR_LOCALE_CHARSET,
459                               SVN_UTF_NTOU_XLATE_HANDLE, pool);
460}
461
462
463/* Return the apr_xlate handle for converting UTF-8 to native characters.
464   Create one if it doesn't exist.  If unable to find a handle, or
465   unable to create one because apr_xlate_open returned APR_EINVAL, then
466   set *RET to null and return SVN_NO_ERROR; if fail for some other
467   reason, return error. */
468static svn_error_t *
469get_uton_xlate_handle_node(xlate_handle_node_t **ret, apr_pool_t *pool)
470{
471  return get_xlate_handle_node(ret,
472                               assume_native_charset_is_utf8
473                                 ? SVN_APR_UTF8_CHARSET
474                                 : SVN_APR_LOCALE_CHARSET,
475                               SVN_APR_UTF8_CHARSET,
476                               SVN_UTF_UTON_XLATE_HANDLE, pool);
477}
478
479
480/* Copy LEN bytes of SRC, converting non-ASCII and zero bytes to ?\nnn
481   sequences, allocating the result in POOL. */
482static const char *
483fuzzy_escape(const char *src, apr_size_t len, apr_pool_t *pool)
484{
485  const char *src_orig = src, *src_end = src + len;
486  apr_size_t new_len = 0;
487  char *new;
488  const char *new_orig;
489
490  /* First count how big a dest string we'll need. */
491  while (src < src_end)
492    {
493      if (! svn_ctype_isascii(*src) || *src == '\0')
494        new_len += 5;  /* 5 slots, for "?\XXX" */
495      else
496        new_len += 1;  /* one slot for the 7-bit char */
497
498      src++;
499    }
500
501  /* Allocate that amount, plus one slot for '\0' character. */
502  new = apr_palloc(pool, new_len + 1);
503
504  new_orig = new;
505
506  /* And fill it up. */
507  while (src_orig < src_end)
508    {
509      if (! svn_ctype_isascii(*src_orig) || src_orig == '\0')
510        {
511          /* This is the same format as svn_xml_fuzzy_escape uses, but that
512             function escapes different characters.  Please keep in sync!
513             ### If we add another fuzzy escape somewhere, we should abstract
514             ### this out to a common function. */
515          apr_snprintf(new, 6, "?\\%03u", (unsigned char) *src_orig);
516          new += 5;
517        }
518      else
519        {
520          *new = *src_orig;
521          new += 1;
522        }
523
524      src_orig++;
525    }
526
527  *new = '\0';
528
529  return new_orig;
530}
531
532/* Convert SRC_LENGTH bytes of SRC_DATA in NODE->handle, store the result
533   in *DEST, which is allocated in POOL. */
534static svn_error_t *
535convert_to_stringbuf(xlate_handle_node_t *node,
536                     const char *src_data,
537                     apr_size_t src_length,
538                     svn_stringbuf_t **dest,
539                     apr_pool_t *pool)
540{
541#ifdef WIN32
542  apr_status_t apr_err;
543
544  apr_err = svn_subr__win32_xlate_to_stringbuf((win32_xlate_t *) node->handle,
545                                               src_data, src_length,
546                                               dest, pool);
547#else
548  apr_size_t buflen = src_length * 2;
549  apr_status_t apr_err;
550  apr_size_t srclen = src_length;
551  apr_size_t destlen = buflen;
552
553  /* Initialize *DEST to an empty stringbuf.
554     A 1:2 ratio of input bytes to output bytes (as assigned above)
555     should be enough for most translations, and if it turns out not
556     to be enough, we'll grow the buffer again, sizing it based on a
557     1:3 ratio of the remainder of the string. */
558  *dest = svn_stringbuf_create_ensure(buflen + 1, pool);
559
560  /* Not only does it not make sense to convert an empty string, but
561     apr-iconv is quite unreasonable about not allowing that. */
562  if (src_length == 0)
563    return SVN_NO_ERROR;
564
565  do
566    {
567      /* Set up state variables for xlate. */
568      destlen = buflen - (*dest)->len;
569
570      /* Attempt the conversion. */
571      apr_err = apr_xlate_conv_buffer(node->handle,
572                                      src_data + (src_length - srclen),
573                                      &srclen,
574                                      (*dest)->data + (*dest)->len,
575                                      &destlen);
576
577      /* Now, update the *DEST->len to track the amount of output data
578         churned out so far from this loop. */
579      (*dest)->len += ((buflen - (*dest)->len) - destlen);
580      buflen += srclen * 3; /* 3 is middle ground, 2 wasn't enough
581                               for all characters in the buffer, 4 is
582                               maximum character size (currently) */
583
584
585    } while (apr_err == APR_SUCCESS && srclen != 0);
586#endif
587
588  /* If we exited the loop with an error, return the error. */
589  if (apr_err)
590    {
591      const char *errstr;
592      svn_error_t *err;
593
594      /* Can't use svn_error_wrap_apr here because it calls functions in
595         this file, leading to infinite recursion. */
596      if (node->frompage == SVN_APR_LOCALE_CHARSET)
597        errstr = apr_psprintf
598          (pool, _("Can't convert string from native encoding to '%s':"),
599           node->topage);
600      else if (node->topage == SVN_APR_LOCALE_CHARSET)
601        errstr = apr_psprintf
602          (pool, _("Can't convert string from '%s' to native encoding:"),
603           node->frompage);
604      else
605        errstr = apr_psprintf
606          (pool, _("Can't convert string from '%s' to '%s':"),
607           node->frompage, node->topage);
608
609      err = svn_error_create(apr_err, NULL, fuzzy_escape(src_data,
610                                                         src_length, pool));
611      return svn_error_create(apr_err, err, errstr);
612    }
613  /* Else, exited due to success.  Trim the result buffer down to the
614     right length. */
615  (*dest)->data[(*dest)->len] = '\0';
616
617  return SVN_NO_ERROR;
618}
619
620
621/* Return APR_EINVAL if the first LEN bytes of DATA contain anything
622   other than seven-bit, non-control (except for whitespace) ASCII
623   characters, finding the error pool from POOL.  Otherwise, return
624   SVN_NO_ERROR. */
625static svn_error_t *
626check_non_ascii(const char *data, apr_size_t len, apr_pool_t *pool)
627{
628  const char *data_start = data;
629
630  for (; len > 0; --len, data++)
631    {
632      if ((! svn_ctype_isascii(*data))
633          || ((! svn_ctype_isspace(*data))
634              && svn_ctype_iscntrl(*data)))
635        {
636          /* Show the printable part of the data, followed by the
637             decimal code of the questionable character.  Because if a
638             user ever gets this error, she's going to have to spend
639             time tracking down the non-ASCII data, so we want to help
640             as much as possible.  And yes, we just call the unsafe
641             data "non-ASCII", even though the actual constraint is
642             somewhat more complex than that. */
643
644          if (data - data_start)
645            {
646              const char *error_data
647                = apr_pstrndup(pool, data_start, (data - data_start));
648
649              return svn_error_createf
650                (APR_EINVAL, NULL,
651                 _("Safe data '%s' was followed by non-ASCII byte %d: "
652                   "unable to convert to/from UTF-8"),
653                 error_data, *((const unsigned char *) data));
654            }
655          else
656            {
657              return svn_error_createf
658                (APR_EINVAL, NULL,
659                 _("Non-ASCII character (code %d) detected, "
660                   "and unable to convert to/from UTF-8"),
661                 *((const unsigned char *) data));
662            }
663        }
664    }
665
666  return SVN_NO_ERROR;
667}
668
669/* Construct an error with code APR_EINVAL and with a suitable message
670 * to describe the invalid UTF-8 sequence DATA of length LEN (which
671 * may have embedded NULLs).  We can't simply print the data, almost
672 * by definition we don't really know how it is encoded.
673 */
674static svn_error_t *
675invalid_utf8(const char *data, apr_size_t len, apr_pool_t *pool)
676{
677  const char *last = svn_utf__last_valid(data, len);
678  const char *valid_txt = "", *invalid_txt = "";
679  apr_size_t i;
680  size_t valid, invalid;
681
682  /* We will display at most 24 valid octets (this may split a leading
683     multi-byte character) as that should fit on one 80 character line. */
684  valid = last - data;
685  if (valid > 24)
686    valid = 24;
687  for (i = 0; i < valid; ++i)
688    valid_txt = apr_pstrcat(pool, valid_txt,
689                            apr_psprintf(pool, " %02x",
690                                         (unsigned char)last[i-valid]),
691                                         (char *)NULL);
692
693  /* 4 invalid octets will guarantee that the faulty octet is displayed */
694  invalid = data + len - last;
695  if (invalid > 4)
696    invalid = 4;
697  for (i = 0; i < invalid; ++i)
698    invalid_txt = apr_pstrcat(pool, invalid_txt,
699                              apr_psprintf(pool, " %02x",
700                                           (unsigned char)last[i]),
701                                           (char *)NULL);
702
703  return svn_error_createf(APR_EINVAL, NULL,
704                           _("Valid UTF-8 data\n(hex:%s)\n"
705                             "followed by invalid UTF-8 sequence\n(hex:%s)"),
706                           valid_txt, invalid_txt);
707}
708
709/* Verify that the sequence DATA of length LEN is valid UTF-8.
710   If it is not, return an error with code APR_EINVAL. */
711static svn_error_t *
712check_utf8(const char *data, apr_size_t len, apr_pool_t *pool)
713{
714  if (! svn_utf__is_valid(data, len))
715    return invalid_utf8(data, len, pool);
716  return SVN_NO_ERROR;
717}
718
719/* Verify that the NULL terminated sequence DATA is valid UTF-8.
720   If it is not, return an error with code APR_EINVAL. */
721static svn_error_t *
722check_cstring_utf8(const char *data, apr_pool_t *pool)
723{
724
725  if (! svn_utf__cstring_is_valid(data))
726    return invalid_utf8(data, strlen(data), pool);
727  return SVN_NO_ERROR;
728}
729
730
731svn_error_t *
732svn_utf_stringbuf_to_utf8(svn_stringbuf_t **dest,
733                          const svn_stringbuf_t *src,
734                          apr_pool_t *pool)
735{
736  xlate_handle_node_t *node;
737  svn_error_t *err;
738
739  SVN_ERR(get_ntou_xlate_handle_node(&node, pool));
740
741  if (node->handle)
742    {
743      err = convert_to_stringbuf(node, src->data, src->len, dest, pool);
744      if (! err)
745        err = check_utf8((*dest)->data, (*dest)->len, pool);
746    }
747  else
748    {
749      err = check_non_ascii(src->data, src->len, pool);
750      if (! err)
751        *dest = svn_stringbuf_dup(src, pool);
752    }
753
754  return svn_error_compose_create(err,
755                                  put_xlate_handle_node
756                                     (node,
757                                      SVN_UTF_NTOU_XLATE_HANDLE,
758                                      pool));
759}
760
761
762svn_error_t *
763svn_utf_string_to_utf8(const svn_string_t **dest,
764                       const svn_string_t *src,
765                       apr_pool_t *pool)
766{
767  svn_stringbuf_t *destbuf;
768  xlate_handle_node_t *node;
769  svn_error_t *err;
770
771  SVN_ERR(get_ntou_xlate_handle_node(&node, pool));
772
773  if (node->handle)
774    {
775      err = convert_to_stringbuf(node, src->data, src->len, &destbuf, pool);
776      if (! err)
777        err = check_utf8(destbuf->data, destbuf->len, pool);
778      if (! err)
779        *dest = svn_stringbuf__morph_into_string(destbuf);
780    }
781  else
782    {
783      err = check_non_ascii(src->data, src->len, pool);
784      if (! err)
785        *dest = svn_string_dup(src, pool);
786    }
787
788  return svn_error_compose_create(err,
789                                  put_xlate_handle_node
790                                     (node,
791                                      SVN_UTF_NTOU_XLATE_HANDLE,
792                                      pool));
793}
794
795
796/* Common implementation for svn_utf_cstring_to_utf8,
797   svn_utf_cstring_to_utf8_ex, svn_utf_cstring_from_utf8 and
798   svn_utf_cstring_from_utf8_ex. Convert SRC to DEST using NODE->handle as
799   the translator and allocating from POOL. */
800static svn_error_t *
801convert_cstring(const char **dest,
802                const char *src,
803                xlate_handle_node_t *node,
804                apr_pool_t *pool)
805{
806  if (node->handle)
807    {
808      svn_stringbuf_t *destbuf;
809      SVN_ERR(convert_to_stringbuf(node, src, strlen(src),
810                                   &destbuf, pool));
811      *dest = destbuf->data;
812    }
813  else
814    {
815      apr_size_t len = strlen(src);
816      SVN_ERR(check_non_ascii(src, len, pool));
817      *dest = apr_pstrmemdup(pool, src, len);
818    }
819  return SVN_NO_ERROR;
820}
821
822
823svn_error_t *
824svn_utf_cstring_to_utf8(const char **dest,
825                        const char *src,
826                        apr_pool_t *pool)
827{
828  xlate_handle_node_t *node;
829  svn_error_t *err;
830
831  SVN_ERR(get_ntou_xlate_handle_node(&node, pool));
832  err = convert_cstring(dest, src, node, pool);
833  SVN_ERR(svn_error_compose_create(err,
834                                   put_xlate_handle_node
835                                      (node,
836                                       SVN_UTF_NTOU_XLATE_HANDLE,
837                                       pool)));
838  return check_cstring_utf8(*dest, pool);
839}
840
841
842svn_error_t *
843svn_utf_cstring_to_utf8_ex2(const char **dest,
844                            const char *src,
845                            const char *frompage,
846                            apr_pool_t *pool)
847{
848  xlate_handle_node_t *node;
849  svn_error_t *err;
850  const char *convset_key = get_xlate_key(SVN_APR_UTF8_CHARSET, frompage,
851                                          pool);
852
853  SVN_ERR(get_xlate_handle_node(&node, SVN_APR_UTF8_CHARSET, frompage,
854                                convset_key, pool));
855  err = convert_cstring(dest, src, node, pool);
856  SVN_ERR(svn_error_compose_create(err,
857                                   put_xlate_handle_node
858                                      (node,
859                                       SVN_UTF_NTOU_XLATE_HANDLE,
860                                       pool)));
861
862  return check_cstring_utf8(*dest, pool);
863}
864
865
866svn_error_t *
867svn_utf_cstring_to_utf8_ex(const char **dest,
868                           const char *src,
869                           const char *frompage,
870                           const char *convset_key,
871                           apr_pool_t *pool)
872{
873  return svn_utf_cstring_to_utf8_ex2(dest, src, frompage, pool);
874}
875
876
877svn_error_t *
878svn_utf_stringbuf_from_utf8(svn_stringbuf_t **dest,
879                            const svn_stringbuf_t *src,
880                            apr_pool_t *pool)
881{
882  xlate_handle_node_t *node;
883  svn_error_t *err;
884
885  SVN_ERR(get_uton_xlate_handle_node(&node, pool));
886
887  if (node->handle)
888    {
889      err = check_utf8(src->data, src->len, pool);
890      if (! err)
891        err = convert_to_stringbuf(node, src->data, src->len, dest, pool);
892    }
893  else
894    {
895      err = check_non_ascii(src->data, src->len, pool);
896      if (! err)
897        *dest = svn_stringbuf_dup(src, pool);
898    }
899
900  err = svn_error_compose_create(
901          err,
902          put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
903
904  return err;
905}
906
907
908svn_error_t *
909svn_utf_string_from_utf8(const svn_string_t **dest,
910                         const svn_string_t *src,
911                         apr_pool_t *pool)
912{
913  svn_stringbuf_t *dbuf;
914  xlate_handle_node_t *node;
915  svn_error_t *err;
916
917  SVN_ERR(get_uton_xlate_handle_node(&node, pool));
918
919  if (node->handle)
920    {
921      err = check_utf8(src->data, src->len, pool);
922      if (! err)
923        err = convert_to_stringbuf(node, src->data, src->len,
924                                   &dbuf, pool);
925      if (! err)
926        *dest = svn_stringbuf__morph_into_string(dbuf);
927    }
928  else
929    {
930      err = check_non_ascii(src->data, src->len, pool);
931      if (! err)
932        *dest = svn_string_dup(src, pool);
933    }
934
935  err = svn_error_compose_create(
936          err,
937          put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
938
939  return err;
940}
941
942
943svn_error_t *
944svn_utf_cstring_from_utf8(const char **dest,
945                          const char *src,
946                          apr_pool_t *pool)
947{
948  xlate_handle_node_t *node;
949  svn_error_t *err;
950
951  SVN_ERR(check_cstring_utf8(src, pool));
952
953  SVN_ERR(get_uton_xlate_handle_node(&node, pool));
954  err = convert_cstring(dest, src, node, pool);
955  err = svn_error_compose_create(
956          err,
957          put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
958
959  return err;
960}
961
962
963svn_error_t *
964svn_utf_cstring_from_utf8_ex2(const char **dest,
965                              const char *src,
966                              const char *topage,
967                              apr_pool_t *pool)
968{
969  xlate_handle_node_t *node;
970  svn_error_t *err;
971  const char *convset_key = get_xlate_key(topage, SVN_APR_UTF8_CHARSET,
972                                          pool);
973
974  SVN_ERR(check_cstring_utf8(src, pool));
975
976  SVN_ERR(get_xlate_handle_node(&node, topage, SVN_APR_UTF8_CHARSET,
977                                convset_key, pool));
978  err = convert_cstring(dest, src, node, pool);
979  err = svn_error_compose_create(
980          err,
981          put_xlate_handle_node(node, convset_key, pool));
982
983  return err;
984}
985
986
987svn_error_t *
988svn_utf_cstring_from_utf8_ex(const char **dest,
989                             const char *src,
990                             const char *topage,
991                             const char *convset_key,
992                             apr_pool_t *pool)
993{
994  return svn_utf_cstring_from_utf8_ex2(dest, src, topage, pool);
995}
996
997
998const char *
999svn_utf__cstring_from_utf8_fuzzy(const char *src,
1000                                 apr_pool_t *pool,
1001                                 svn_error_t *(*convert_from_utf8)
1002                                 (const char **, const char *, apr_pool_t *))
1003{
1004  const char *escaped, *converted;
1005  svn_error_t *err;
1006
1007  escaped = fuzzy_escape(src, strlen(src), pool);
1008
1009  /* Okay, now we have a *new* UTF-8 string, one that's guaranteed to
1010     contain only 7-bit bytes :-).  Recode to native... */
1011  err = convert_from_utf8(((const char **) &converted), escaped, pool);
1012
1013  if (err)
1014    {
1015      svn_error_clear(err);
1016      return escaped;
1017    }
1018  else
1019    return converted;
1020
1021  /* ### Check the client locale, maybe we can avoid that second
1022   * conversion!  See Ulrich Drepper's patch at
1023   * http://subversion.tigris.org/issues/show_bug.cgi?id=807.
1024   */
1025}
1026
1027
1028const char *
1029svn_utf_cstring_from_utf8_fuzzy(const char *src,
1030                                apr_pool_t *pool)
1031{
1032  return svn_utf__cstring_from_utf8_fuzzy(src, pool,
1033                                          svn_utf_cstring_from_utf8);
1034}
1035
1036
1037svn_error_t *
1038svn_utf_cstring_from_utf8_stringbuf(const char **dest,
1039                                    const svn_stringbuf_t *src,
1040                                    apr_pool_t *pool)
1041{
1042  svn_stringbuf_t *destbuf;
1043
1044  SVN_ERR(svn_utf_stringbuf_from_utf8(&destbuf, src, pool));
1045  *dest = destbuf->data;
1046
1047  return SVN_NO_ERROR;
1048}
1049
1050
1051svn_error_t *
1052svn_utf_cstring_from_utf8_string(const char **dest,
1053                                 const svn_string_t *src,
1054                                 apr_pool_t *pool)
1055{
1056  svn_stringbuf_t *dbuf;
1057  xlate_handle_node_t *node;
1058  svn_error_t *err;
1059
1060  SVN_ERR(get_uton_xlate_handle_node(&node, pool));
1061
1062  if (node->handle)
1063    {
1064      err = check_utf8(src->data, src->len, pool);
1065      if (! err)
1066        err = convert_to_stringbuf(node, src->data, src->len,
1067                                   &dbuf, pool);
1068      if (! err)
1069        *dest = dbuf->data;
1070    }
1071  else
1072    {
1073      err = check_non_ascii(src->data, src->len, pool);
1074      if (! err)
1075        *dest = apr_pstrmemdup(pool, src->data, src->len);
1076    }
1077
1078  err = svn_error_compose_create(
1079          err,
1080          put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
1081
1082  return err;
1083}
1084