1/* 2 * utf.c: UTF-8 conversion routines 3 * 4 * ==================================================================== 5 * Licensed to the Apache Software Foundation (ASF) under one 6 * or more contributor license agreements. See the NOTICE file 7 * distributed with this work for additional information 8 * regarding copyright ownership. The ASF licenses this file 9 * to you under the Apache License, Version 2.0 (the 10 * "License"); you may not use this file except in compliance 11 * with the License. You may obtain a copy of the License at 12 * 13 * http://www.apache.org/licenses/LICENSE-2.0 14 * 15 * Unless required by applicable law or agreed to in writing, 16 * software distributed under the License is distributed on an 17 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 18 * KIND, either express or implied. See the License for the 19 * specific language governing permissions and limitations 20 * under the License. 21 * ==================================================================== 22 */ 23 24 25 26#include <stdlib.h> 27#include <string.h> 28#include <assert.h> 29 30#include <apr_strings.h> 31#include <apr_lib.h> 32#include <apr_xlate.h> 33#include <apr_atomic.h> 34 35#include "svn_hash.h" 36#include "svn_string.h" 37#include "svn_error.h" 38#include "svn_pools.h" 39#include "svn_ctype.h" 40#include "svn_utf.h" 41#include "svn_private_config.h" 42#include "win32_xlate.h" 43 44#include "private/svn_utf_private.h" 45#include "private/svn_dep_compat.h" 46#include "private/svn_string_private.h" 47#include "private/svn_mutex.h" 48 49 50 51/* Use these static strings to maximize performance on standard conversions. 52 * Any strings on other locations are still valid, however. 53 */ 54static const char *SVN_UTF_NTOU_XLATE_HANDLE = "svn-utf-ntou-xlate-handle"; 55static const char *SVN_UTF_UTON_XLATE_HANDLE = "svn-utf-uton-xlate-handle"; 56 57static const char *SVN_APR_UTF8_CHARSET = "UTF-8"; 58 59static svn_mutex__t *xlate_handle_mutex = NULL; 60static svn_boolean_t assume_native_charset_is_utf8 = FALSE; 61 62/* The xlate handle cache is a global hash table with linked lists of xlate 63 * handles. In multi-threaded environments, a thread "borrows" an xlate 64 * handle from the cache during a translation and puts it back afterwards. 65 * This avoids holding a global lock for all translations. 66 * If there is no handle for a particular key when needed, a new is 67 * handle is created and put in the cache after use. 68 * This means that there will be at most N handles open for a key, where N 69 * is the number of simultanous handles in use for that key. */ 70 71typedef struct xlate_handle_node_t { 72 apr_xlate_t *handle; 73 /* FALSE if the handle is not valid, since its pool is being 74 destroyed. */ 75 svn_boolean_t valid; 76 /* The name of a char encoding or APR_LOCALE_CHARSET. */ 77 const char *frompage, *topage; 78 struct xlate_handle_node_t *next; 79} xlate_handle_node_t; 80 81/* This maps const char * userdata_key strings to xlate_handle_node_t ** 82 handles to the first entry in the linked list of xlate handles. We don't 83 store the pointer to the list head directly in the hash table, since we 84 remove/insert entries at the head in the list in the code below, and 85 we can't use apr_hash_set() in each character translation because that 86 function allocates memory in each call where the value is non-NULL. 87 Since these allocations take place in a global pool, this would be a 88 memory leak. */ 89static apr_hash_t *xlate_handle_hash = NULL; 90 91/* "1st level cache" to standard conversion maps. We may access these 92 * using atomic xchange ops, i.e. without further thread synchronization. 93 * If the respective item is NULL, fallback to hash lookup. 94 */ 95static void * volatile xlat_ntou_static_handle = NULL; 96static void * volatile xlat_uton_static_handle = NULL; 97 98/* Clean up the xlate handle cache. */ 99static apr_status_t 100xlate_cleanup(void *arg) 101{ 102 /* We set the cache variables to NULL so that translation works in other 103 cleanup functions, even if it isn't cached then. */ 104 xlate_handle_hash = NULL; 105 106 /* ensure no stale objects get accessed */ 107 xlat_ntou_static_handle = NULL; 108 xlat_uton_static_handle = NULL; 109 110 return APR_SUCCESS; 111} 112 113/* Set the handle of ARG to NULL. */ 114static apr_status_t 115xlate_handle_node_cleanup(void *arg) 116{ 117 xlate_handle_node_t *node = arg; 118 119 node->valid = FALSE; 120 return APR_SUCCESS; 121} 122 123void 124svn_utf_initialize2(svn_boolean_t assume_native_utf8, 125 apr_pool_t *pool) 126{ 127 if (!xlate_handle_hash) 128 { 129 /* We create our own subpool, which we protect with the mutex. 130 We can't use the pool passed to us by the caller, since we will 131 use it for xlate handle allocations, possibly in multiple threads, 132 and pool allocation is not thread-safe. */ 133 apr_pool_t *subpool = svn_pool_create(pool); 134 svn_mutex__t *mutex; 135 svn_error_t *err = svn_mutex__init(&mutex, TRUE, subpool); 136 if (err) 137 { 138 svn_error_clear(err); 139 return; 140 } 141 142 xlate_handle_mutex = mutex; 143 xlate_handle_hash = apr_hash_make(subpool); 144 145 apr_pool_cleanup_register(subpool, NULL, xlate_cleanup, 146 apr_pool_cleanup_null); 147 } 148 149 if (!assume_native_charset_is_utf8) 150 assume_native_charset_is_utf8 = assume_native_utf8; 151} 152 153/* Return a unique string key based on TOPAGE and FROMPAGE. TOPAGE and 154 * FROMPAGE can be any valid arguments of the same name to 155 * apr_xlate_open(). Allocate the returned string in POOL. */ 156static const char* 157get_xlate_key(const char *topage, 158 const char *frompage, 159 apr_pool_t *pool) 160{ 161 /* In the cases of SVN_APR_LOCALE_CHARSET and SVN_APR_DEFAULT_CHARSET 162 * topage/frompage is really an int, not a valid string. So generate a 163 * unique key accordingly. */ 164 if (frompage == SVN_APR_LOCALE_CHARSET) 165 frompage = "APR_LOCALE_CHARSET"; 166 else if (frompage == SVN_APR_DEFAULT_CHARSET) 167 frompage = "APR_DEFAULT_CHARSET"; 168 169 if (topage == SVN_APR_LOCALE_CHARSET) 170 topage = "APR_LOCALE_CHARSET"; 171 else if (topage == SVN_APR_DEFAULT_CHARSET) 172 topage = "APR_DEFAULT_CHARSET"; 173 174 return apr_pstrcat(pool, "svn-utf-", frompage, "to", topage, 175 "-xlate-handle", (char *)NULL); 176} 177 178/* Atomically replace the content in *MEM with NEW_VALUE and return 179 * the previous content of *MEM. If atomicy cannot be guaranteed, 180 * *MEM will not be modified and NEW_VALUE is simply returned to 181 * the caller. 182 */ 183static APR_INLINE void* 184atomic_swap(void * volatile * mem, void *new_value) 185{ 186#if APR_HAS_THREADS 187#if APR_VERSION_AT_LEAST(1,3,0) 188 /* Cast is necessary because of APR bug: 189 https://issues.apache.org/bugzilla/show_bug.cgi?id=50731 */ 190 return apr_atomic_xchgptr((volatile void **)mem, new_value); 191#else 192 /* old APRs don't support atomic swaps. Simply return the 193 * input to the caller for further proccessing. */ 194 return new_value; 195#endif 196#else 197 /* no threads - no sync. necessary */ 198 void *old_value = (void*)*mem; 199 *mem = new_value; 200 return old_value; 201#endif 202} 203 204/* Set *RET to a newly created handle node for converting from FROMPAGE 205 to TOPAGE, If apr_xlate_open() returns APR_EINVAL or APR_ENOTIMPL, set 206 (*RET)->handle to NULL. If fail for any other reason, return the error. 207 Allocate *RET and its xlate handle in POOL. */ 208static svn_error_t * 209xlate_alloc_handle(xlate_handle_node_t **ret, 210 const char *topage, const char *frompage, 211 apr_pool_t *pool) 212{ 213 apr_status_t apr_err; 214 apr_xlate_t *handle; 215 216 /* The error handling doesn't support the following cases, since we don't 217 use them currently. Catch this here. */ 218 SVN_ERR_ASSERT(frompage != SVN_APR_DEFAULT_CHARSET 219 && topage != SVN_APR_DEFAULT_CHARSET 220 && (frompage != SVN_APR_LOCALE_CHARSET 221 || topage != SVN_APR_LOCALE_CHARSET)); 222 223 /* Try to create a handle. */ 224#if defined(WIN32) 225 apr_err = svn_subr__win32_xlate_open((win32_xlate_t **)&handle, topage, 226 frompage, pool); 227#else 228 apr_err = apr_xlate_open(&handle, topage, frompage, pool); 229#endif 230 231 if (APR_STATUS_IS_EINVAL(apr_err) || APR_STATUS_IS_ENOTIMPL(apr_err)) 232 handle = NULL; 233 else if (apr_err != APR_SUCCESS) 234 { 235 const char *errstr; 236 char apr_strerr[512]; 237 238 /* Can't use svn_error_wrap_apr here because it calls functions in 239 this file, leading to infinite recursion. */ 240 if (frompage == SVN_APR_LOCALE_CHARSET) 241 errstr = apr_psprintf(pool, 242 _("Can't create a character converter from " 243 "native encoding to '%s'"), topage); 244 else if (topage == SVN_APR_LOCALE_CHARSET) 245 errstr = apr_psprintf(pool, 246 _("Can't create a character converter from " 247 "'%s' to native encoding"), frompage); 248 else 249 errstr = apr_psprintf(pool, 250 _("Can't create a character converter from " 251 "'%s' to '%s'"), frompage, topage); 252 253 /* Just put the error on the stack, since svn_error_create duplicates it 254 later. APR_STRERR will be in the local encoding, not in UTF-8, though. 255 */ 256 svn_strerror(apr_err, apr_strerr, sizeof(apr_strerr)); 257 return svn_error_create(apr_err, 258 svn_error_create(apr_err, NULL, apr_strerr), 259 errstr); 260 } 261 262 /* Allocate and initialize the node. */ 263 *ret = apr_palloc(pool, sizeof(xlate_handle_node_t)); 264 (*ret)->handle = handle; 265 (*ret)->valid = TRUE; 266 (*ret)->frompage = ((frompage != SVN_APR_LOCALE_CHARSET) 267 ? apr_pstrdup(pool, frompage) : frompage); 268 (*ret)->topage = ((topage != SVN_APR_LOCALE_CHARSET) 269 ? apr_pstrdup(pool, topage) : topage); 270 (*ret)->next = NULL; 271 272 /* If we are called from inside a pool cleanup handler, the just created 273 xlate handle will be closed when that handler returns by a newly 274 registered cleanup handler, however, the handle is still cached by us. 275 To prevent this, we register a cleanup handler that will reset the valid 276 flag of our node, so we don't use an invalid handle. */ 277 if (handle) 278 apr_pool_cleanup_register(pool, *ret, xlate_handle_node_cleanup, 279 apr_pool_cleanup_null); 280 281 return SVN_NO_ERROR; 282} 283 284/* Extend xlate_alloc_handle by using USERDATA_KEY as a key in our 285 global hash map, if available. 286 287 Allocate *RET and its xlate handle in POOL if svn_utf_initialize() 288 hasn't been called or USERDATA_KEY is NULL. Else, allocate them 289 in the pool of xlate_handle_hash. 290 291 Note: this function is not thread-safe. Call get_xlate_handle_node 292 instead. */ 293static svn_error_t * 294get_xlate_handle_node_internal(xlate_handle_node_t **ret, 295 const char *topage, const char *frompage, 296 const char *userdata_key, apr_pool_t *pool) 297{ 298 /* If we already have a handle, just return it. */ 299 if (userdata_key && xlate_handle_hash) 300 { 301 xlate_handle_node_t *old_node = NULL; 302 303 /* 2nd level: hash lookup */ 304 xlate_handle_node_t **old_node_p = svn_hash_gets(xlate_handle_hash, 305 userdata_key); 306 if (old_node_p) 307 old_node = *old_node_p; 308 if (old_node) 309 { 310 /* Ensure that the handle is still valid. */ 311 if (old_node->valid) 312 { 313 /* Remove from the list. */ 314 *old_node_p = old_node->next; 315 old_node->next = NULL; 316 *ret = old_node; 317 return SVN_NO_ERROR; 318 } 319 } 320 } 321 322 /* Note that we still have the mutex locked (if it is initialized), so we 323 can use the global pool for creating the new xlate handle. */ 324 325 /* Use the correct pool for creating the handle. */ 326 pool = apr_hash_pool_get(xlate_handle_hash); 327 328 return xlate_alloc_handle(ret, topage, frompage, pool); 329} 330 331/* Set *RET to a handle node for converting from FROMPAGE to TOPAGE, 332 creating the handle node if it doesn't exist in USERDATA_KEY. 333 If a node is not cached and apr_xlate_open() returns APR_EINVAL or 334 APR_ENOTIMPL, set (*RET)->handle to NULL. If fail for any other 335 reason, return the error. 336 337 Allocate *RET and its xlate handle in POOL if svn_utf_initialize() 338 hasn't been called or USERDATA_KEY is NULL. Else, allocate them 339 in the pool of xlate_handle_hash. */ 340static svn_error_t * 341get_xlate_handle_node(xlate_handle_node_t **ret, 342 const char *topage, const char *frompage, 343 const char *userdata_key, apr_pool_t *pool) 344{ 345 xlate_handle_node_t *old_node = NULL; 346 347 /* If we already have a handle, just return it. */ 348 if (userdata_key) 349 { 350 if (xlate_handle_hash) 351 { 352 /* 1st level: global, static items */ 353 if (userdata_key == SVN_UTF_NTOU_XLATE_HANDLE) 354 old_node = atomic_swap(&xlat_ntou_static_handle, NULL); 355 else if (userdata_key == SVN_UTF_UTON_XLATE_HANDLE) 356 old_node = atomic_swap(&xlat_uton_static_handle, NULL); 357 358 if (old_node && old_node->valid) 359 { 360 *ret = old_node; 361 return SVN_NO_ERROR; 362 } 363 } 364 else 365 { 366 void *p; 367 /* We fall back on a per-pool cache instead. */ 368 apr_pool_userdata_get(&p, userdata_key, pool); 369 old_node = p; 370 /* Ensure that the handle is still valid. */ 371 if (old_node && old_node->valid) 372 { 373 *ret = old_node; 374 return SVN_NO_ERROR; 375 } 376 377 return xlate_alloc_handle(ret, topage, frompage, pool); 378 } 379 } 380 381 SVN_MUTEX__WITH_LOCK(xlate_handle_mutex, 382 get_xlate_handle_node_internal(ret, 383 topage, 384 frompage, 385 userdata_key, 386 pool)); 387 388 return SVN_NO_ERROR; 389} 390 391/* Put back NODE into the xlate handle cache for use by other calls. 392 393 Note: this function is not thread-safe. Call put_xlate_handle_node 394 instead. */ 395static svn_error_t * 396put_xlate_handle_node_internal(xlate_handle_node_t *node, 397 const char *userdata_key) 398{ 399 xlate_handle_node_t **node_p = svn_hash_gets(xlate_handle_hash, userdata_key); 400 if (node_p == NULL) 401 { 402 userdata_key = apr_pstrdup(apr_hash_pool_get(xlate_handle_hash), 403 userdata_key); 404 node_p = apr_palloc(apr_hash_pool_get(xlate_handle_hash), 405 sizeof(*node_p)); 406 *node_p = NULL; 407 svn_hash_sets(xlate_handle_hash, userdata_key, node_p); 408 } 409 node->next = *node_p; 410 *node_p = node; 411 412 return SVN_NO_ERROR; 413} 414 415/* Put back NODE into the xlate handle cache for use by other calls. 416 If there is no global cache, store the handle in POOL. 417 Ignore errors related to locking/unlocking the mutex. */ 418static svn_error_t * 419put_xlate_handle_node(xlate_handle_node_t *node, 420 const char *userdata_key, 421 apr_pool_t *pool) 422{ 423 assert(node->next == NULL); 424 if (!userdata_key) 425 return SVN_NO_ERROR; 426 427 /* push previous global node to the hash */ 428 if (xlate_handle_hash) 429 { 430 /* 1st level: global, static items */ 431 if (userdata_key == SVN_UTF_NTOU_XLATE_HANDLE) 432 node = atomic_swap(&xlat_ntou_static_handle, node); 433 else if (userdata_key == SVN_UTF_UTON_XLATE_HANDLE) 434 node = atomic_swap(&xlat_uton_static_handle, node); 435 if (node == NULL) 436 return SVN_NO_ERROR; 437 438 SVN_MUTEX__WITH_LOCK(xlate_handle_mutex, 439 put_xlate_handle_node_internal(node, 440 userdata_key)); 441 } 442 else 443 { 444 /* Store it in the per-pool cache. */ 445 apr_pool_userdata_set(node, userdata_key, apr_pool_cleanup_null, pool); 446 } 447 448 return SVN_NO_ERROR; 449} 450 451/* Return the apr_xlate handle for converting native characters to UTF-8. */ 452static svn_error_t * 453get_ntou_xlate_handle_node(xlate_handle_node_t **ret, apr_pool_t *pool) 454{ 455 return get_xlate_handle_node(ret, SVN_APR_UTF8_CHARSET, 456 assume_native_charset_is_utf8 457 ? SVN_APR_UTF8_CHARSET 458 : SVN_APR_LOCALE_CHARSET, 459 SVN_UTF_NTOU_XLATE_HANDLE, pool); 460} 461 462 463/* Return the apr_xlate handle for converting UTF-8 to native characters. 464 Create one if it doesn't exist. If unable to find a handle, or 465 unable to create one because apr_xlate_open returned APR_EINVAL, then 466 set *RET to null and return SVN_NO_ERROR; if fail for some other 467 reason, return error. */ 468static svn_error_t * 469get_uton_xlate_handle_node(xlate_handle_node_t **ret, apr_pool_t *pool) 470{ 471 return get_xlate_handle_node(ret, 472 assume_native_charset_is_utf8 473 ? SVN_APR_UTF8_CHARSET 474 : SVN_APR_LOCALE_CHARSET, 475 SVN_APR_UTF8_CHARSET, 476 SVN_UTF_UTON_XLATE_HANDLE, pool); 477} 478 479 480/* Copy LEN bytes of SRC, converting non-ASCII and zero bytes to ?\nnn 481 sequences, allocating the result in POOL. */ 482static const char * 483fuzzy_escape(const char *src, apr_size_t len, apr_pool_t *pool) 484{ 485 const char *src_orig = src, *src_end = src + len; 486 apr_size_t new_len = 0; 487 char *new; 488 const char *new_orig; 489 490 /* First count how big a dest string we'll need. */ 491 while (src < src_end) 492 { 493 if (! svn_ctype_isascii(*src) || *src == '\0') 494 new_len += 5; /* 5 slots, for "?\XXX" */ 495 else 496 new_len += 1; /* one slot for the 7-bit char */ 497 498 src++; 499 } 500 501 /* Allocate that amount, plus one slot for '\0' character. */ 502 new = apr_palloc(pool, new_len + 1); 503 504 new_orig = new; 505 506 /* And fill it up. */ 507 while (src_orig < src_end) 508 { 509 if (! svn_ctype_isascii(*src_orig) || src_orig == '\0') 510 { 511 /* This is the same format as svn_xml_fuzzy_escape uses, but that 512 function escapes different characters. Please keep in sync! 513 ### If we add another fuzzy escape somewhere, we should abstract 514 ### this out to a common function. */ 515 apr_snprintf(new, 6, "?\\%03u", (unsigned char) *src_orig); 516 new += 5; 517 } 518 else 519 { 520 *new = *src_orig; 521 new += 1; 522 } 523 524 src_orig++; 525 } 526 527 *new = '\0'; 528 529 return new_orig; 530} 531 532/* Convert SRC_LENGTH bytes of SRC_DATA in NODE->handle, store the result 533 in *DEST, which is allocated in POOL. */ 534static svn_error_t * 535convert_to_stringbuf(xlate_handle_node_t *node, 536 const char *src_data, 537 apr_size_t src_length, 538 svn_stringbuf_t **dest, 539 apr_pool_t *pool) 540{ 541#ifdef WIN32 542 apr_status_t apr_err; 543 544 apr_err = svn_subr__win32_xlate_to_stringbuf((win32_xlate_t *) node->handle, 545 src_data, src_length, 546 dest, pool); 547#else 548 apr_size_t buflen = src_length * 2; 549 apr_status_t apr_err; 550 apr_size_t srclen = src_length; 551 apr_size_t destlen = buflen; 552 553 /* Initialize *DEST to an empty stringbuf. 554 A 1:2 ratio of input bytes to output bytes (as assigned above) 555 should be enough for most translations, and if it turns out not 556 to be enough, we'll grow the buffer again, sizing it based on a 557 1:3 ratio of the remainder of the string. */ 558 *dest = svn_stringbuf_create_ensure(buflen + 1, pool); 559 560 /* Not only does it not make sense to convert an empty string, but 561 apr-iconv is quite unreasonable about not allowing that. */ 562 if (src_length == 0) 563 return SVN_NO_ERROR; 564 565 do 566 { 567 /* Set up state variables for xlate. */ 568 destlen = buflen - (*dest)->len; 569 570 /* Attempt the conversion. */ 571 apr_err = apr_xlate_conv_buffer(node->handle, 572 src_data + (src_length - srclen), 573 &srclen, 574 (*dest)->data + (*dest)->len, 575 &destlen); 576 577 /* Now, update the *DEST->len to track the amount of output data 578 churned out so far from this loop. */ 579 (*dest)->len += ((buflen - (*dest)->len) - destlen); 580 buflen += srclen * 3; /* 3 is middle ground, 2 wasn't enough 581 for all characters in the buffer, 4 is 582 maximum character size (currently) */ 583 584 585 } while (apr_err == APR_SUCCESS && srclen != 0); 586#endif 587 588 /* If we exited the loop with an error, return the error. */ 589 if (apr_err) 590 { 591 const char *errstr; 592 svn_error_t *err; 593 594 /* Can't use svn_error_wrap_apr here because it calls functions in 595 this file, leading to infinite recursion. */ 596 if (node->frompage == SVN_APR_LOCALE_CHARSET) 597 errstr = apr_psprintf 598 (pool, _("Can't convert string from native encoding to '%s':"), 599 node->topage); 600 else if (node->topage == SVN_APR_LOCALE_CHARSET) 601 errstr = apr_psprintf 602 (pool, _("Can't convert string from '%s' to native encoding:"), 603 node->frompage); 604 else 605 errstr = apr_psprintf 606 (pool, _("Can't convert string from '%s' to '%s':"), 607 node->frompage, node->topage); 608 609 err = svn_error_create(apr_err, NULL, fuzzy_escape(src_data, 610 src_length, pool)); 611 return svn_error_create(apr_err, err, errstr); 612 } 613 /* Else, exited due to success. Trim the result buffer down to the 614 right length. */ 615 (*dest)->data[(*dest)->len] = '\0'; 616 617 return SVN_NO_ERROR; 618} 619 620 621/* Return APR_EINVAL if the first LEN bytes of DATA contain anything 622 other than seven-bit, non-control (except for whitespace) ASCII 623 characters, finding the error pool from POOL. Otherwise, return 624 SVN_NO_ERROR. */ 625static svn_error_t * 626check_non_ascii(const char *data, apr_size_t len, apr_pool_t *pool) 627{ 628 const char *data_start = data; 629 630 for (; len > 0; --len, data++) 631 { 632 if ((! svn_ctype_isascii(*data)) 633 || ((! svn_ctype_isspace(*data)) 634 && svn_ctype_iscntrl(*data))) 635 { 636 /* Show the printable part of the data, followed by the 637 decimal code of the questionable character. Because if a 638 user ever gets this error, she's going to have to spend 639 time tracking down the non-ASCII data, so we want to help 640 as much as possible. And yes, we just call the unsafe 641 data "non-ASCII", even though the actual constraint is 642 somewhat more complex than that. */ 643 644 if (data - data_start) 645 { 646 const char *error_data 647 = apr_pstrndup(pool, data_start, (data - data_start)); 648 649 return svn_error_createf 650 (APR_EINVAL, NULL, 651 _("Safe data '%s' was followed by non-ASCII byte %d: " 652 "unable to convert to/from UTF-8"), 653 error_data, *((const unsigned char *) data)); 654 } 655 else 656 { 657 return svn_error_createf 658 (APR_EINVAL, NULL, 659 _("Non-ASCII character (code %d) detected, " 660 "and unable to convert to/from UTF-8"), 661 *((const unsigned char *) data)); 662 } 663 } 664 } 665 666 return SVN_NO_ERROR; 667} 668 669/* Construct an error with code APR_EINVAL and with a suitable message 670 * to describe the invalid UTF-8 sequence DATA of length LEN (which 671 * may have embedded NULLs). We can't simply print the data, almost 672 * by definition we don't really know how it is encoded. 673 */ 674static svn_error_t * 675invalid_utf8(const char *data, apr_size_t len, apr_pool_t *pool) 676{ 677 const char *last = svn_utf__last_valid(data, len); 678 const char *valid_txt = "", *invalid_txt = ""; 679 apr_size_t i; 680 size_t valid, invalid; 681 682 /* We will display at most 24 valid octets (this may split a leading 683 multi-byte character) as that should fit on one 80 character line. */ 684 valid = last - data; 685 if (valid > 24) 686 valid = 24; 687 for (i = 0; i < valid; ++i) 688 valid_txt = apr_pstrcat(pool, valid_txt, 689 apr_psprintf(pool, " %02x", 690 (unsigned char)last[i-valid]), 691 (char *)NULL); 692 693 /* 4 invalid octets will guarantee that the faulty octet is displayed */ 694 invalid = data + len - last; 695 if (invalid > 4) 696 invalid = 4; 697 for (i = 0; i < invalid; ++i) 698 invalid_txt = apr_pstrcat(pool, invalid_txt, 699 apr_psprintf(pool, " %02x", 700 (unsigned char)last[i]), 701 (char *)NULL); 702 703 return svn_error_createf(APR_EINVAL, NULL, 704 _("Valid UTF-8 data\n(hex:%s)\n" 705 "followed by invalid UTF-8 sequence\n(hex:%s)"), 706 valid_txt, invalid_txt); 707} 708 709/* Verify that the sequence DATA of length LEN is valid UTF-8. 710 If it is not, return an error with code APR_EINVAL. */ 711static svn_error_t * 712check_utf8(const char *data, apr_size_t len, apr_pool_t *pool) 713{ 714 if (! svn_utf__is_valid(data, len)) 715 return invalid_utf8(data, len, pool); 716 return SVN_NO_ERROR; 717} 718 719/* Verify that the NULL terminated sequence DATA is valid UTF-8. 720 If it is not, return an error with code APR_EINVAL. */ 721static svn_error_t * 722check_cstring_utf8(const char *data, apr_pool_t *pool) 723{ 724 725 if (! svn_utf__cstring_is_valid(data)) 726 return invalid_utf8(data, strlen(data), pool); 727 return SVN_NO_ERROR; 728} 729 730 731svn_error_t * 732svn_utf_stringbuf_to_utf8(svn_stringbuf_t **dest, 733 const svn_stringbuf_t *src, 734 apr_pool_t *pool) 735{ 736 xlate_handle_node_t *node; 737 svn_error_t *err; 738 739 SVN_ERR(get_ntou_xlate_handle_node(&node, pool)); 740 741 if (node->handle) 742 { 743 err = convert_to_stringbuf(node, src->data, src->len, dest, pool); 744 if (! err) 745 err = check_utf8((*dest)->data, (*dest)->len, pool); 746 } 747 else 748 { 749 err = check_non_ascii(src->data, src->len, pool); 750 if (! err) 751 *dest = svn_stringbuf_dup(src, pool); 752 } 753 754 return svn_error_compose_create(err, 755 put_xlate_handle_node 756 (node, 757 SVN_UTF_NTOU_XLATE_HANDLE, 758 pool)); 759} 760 761 762svn_error_t * 763svn_utf_string_to_utf8(const svn_string_t **dest, 764 const svn_string_t *src, 765 apr_pool_t *pool) 766{ 767 svn_stringbuf_t *destbuf; 768 xlate_handle_node_t *node; 769 svn_error_t *err; 770 771 SVN_ERR(get_ntou_xlate_handle_node(&node, pool)); 772 773 if (node->handle) 774 { 775 err = convert_to_stringbuf(node, src->data, src->len, &destbuf, pool); 776 if (! err) 777 err = check_utf8(destbuf->data, destbuf->len, pool); 778 if (! err) 779 *dest = svn_stringbuf__morph_into_string(destbuf); 780 } 781 else 782 { 783 err = check_non_ascii(src->data, src->len, pool); 784 if (! err) 785 *dest = svn_string_dup(src, pool); 786 } 787 788 return svn_error_compose_create(err, 789 put_xlate_handle_node 790 (node, 791 SVN_UTF_NTOU_XLATE_HANDLE, 792 pool)); 793} 794 795 796/* Common implementation for svn_utf_cstring_to_utf8, 797 svn_utf_cstring_to_utf8_ex, svn_utf_cstring_from_utf8 and 798 svn_utf_cstring_from_utf8_ex. Convert SRC to DEST using NODE->handle as 799 the translator and allocating from POOL. */ 800static svn_error_t * 801convert_cstring(const char **dest, 802 const char *src, 803 xlate_handle_node_t *node, 804 apr_pool_t *pool) 805{ 806 if (node->handle) 807 { 808 svn_stringbuf_t *destbuf; 809 SVN_ERR(convert_to_stringbuf(node, src, strlen(src), 810 &destbuf, pool)); 811 *dest = destbuf->data; 812 } 813 else 814 { 815 apr_size_t len = strlen(src); 816 SVN_ERR(check_non_ascii(src, len, pool)); 817 *dest = apr_pstrmemdup(pool, src, len); 818 } 819 return SVN_NO_ERROR; 820} 821 822 823svn_error_t * 824svn_utf_cstring_to_utf8(const char **dest, 825 const char *src, 826 apr_pool_t *pool) 827{ 828 xlate_handle_node_t *node; 829 svn_error_t *err; 830 831 SVN_ERR(get_ntou_xlate_handle_node(&node, pool)); 832 err = convert_cstring(dest, src, node, pool); 833 SVN_ERR(svn_error_compose_create(err, 834 put_xlate_handle_node 835 (node, 836 SVN_UTF_NTOU_XLATE_HANDLE, 837 pool))); 838 return check_cstring_utf8(*dest, pool); 839} 840 841 842svn_error_t * 843svn_utf_cstring_to_utf8_ex2(const char **dest, 844 const char *src, 845 const char *frompage, 846 apr_pool_t *pool) 847{ 848 xlate_handle_node_t *node; 849 svn_error_t *err; 850 const char *convset_key = get_xlate_key(SVN_APR_UTF8_CHARSET, frompage, 851 pool); 852 853 SVN_ERR(get_xlate_handle_node(&node, SVN_APR_UTF8_CHARSET, frompage, 854 convset_key, pool)); 855 err = convert_cstring(dest, src, node, pool); 856 SVN_ERR(svn_error_compose_create(err, 857 put_xlate_handle_node 858 (node, 859 SVN_UTF_NTOU_XLATE_HANDLE, 860 pool))); 861 862 return check_cstring_utf8(*dest, pool); 863} 864 865 866svn_error_t * 867svn_utf_cstring_to_utf8_ex(const char **dest, 868 const char *src, 869 const char *frompage, 870 const char *convset_key, 871 apr_pool_t *pool) 872{ 873 return svn_utf_cstring_to_utf8_ex2(dest, src, frompage, pool); 874} 875 876 877svn_error_t * 878svn_utf_stringbuf_from_utf8(svn_stringbuf_t **dest, 879 const svn_stringbuf_t *src, 880 apr_pool_t *pool) 881{ 882 xlate_handle_node_t *node; 883 svn_error_t *err; 884 885 SVN_ERR(get_uton_xlate_handle_node(&node, pool)); 886 887 if (node->handle) 888 { 889 err = check_utf8(src->data, src->len, pool); 890 if (! err) 891 err = convert_to_stringbuf(node, src->data, src->len, dest, pool); 892 } 893 else 894 { 895 err = check_non_ascii(src->data, src->len, pool); 896 if (! err) 897 *dest = svn_stringbuf_dup(src, pool); 898 } 899 900 err = svn_error_compose_create( 901 err, 902 put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool)); 903 904 return err; 905} 906 907 908svn_error_t * 909svn_utf_string_from_utf8(const svn_string_t **dest, 910 const svn_string_t *src, 911 apr_pool_t *pool) 912{ 913 svn_stringbuf_t *dbuf; 914 xlate_handle_node_t *node; 915 svn_error_t *err; 916 917 SVN_ERR(get_uton_xlate_handle_node(&node, pool)); 918 919 if (node->handle) 920 { 921 err = check_utf8(src->data, src->len, pool); 922 if (! err) 923 err = convert_to_stringbuf(node, src->data, src->len, 924 &dbuf, pool); 925 if (! err) 926 *dest = svn_stringbuf__morph_into_string(dbuf); 927 } 928 else 929 { 930 err = check_non_ascii(src->data, src->len, pool); 931 if (! err) 932 *dest = svn_string_dup(src, pool); 933 } 934 935 err = svn_error_compose_create( 936 err, 937 put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool)); 938 939 return err; 940} 941 942 943svn_error_t * 944svn_utf_cstring_from_utf8(const char **dest, 945 const char *src, 946 apr_pool_t *pool) 947{ 948 xlate_handle_node_t *node; 949 svn_error_t *err; 950 951 SVN_ERR(check_cstring_utf8(src, pool)); 952 953 SVN_ERR(get_uton_xlate_handle_node(&node, pool)); 954 err = convert_cstring(dest, src, node, pool); 955 err = svn_error_compose_create( 956 err, 957 put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool)); 958 959 return err; 960} 961 962 963svn_error_t * 964svn_utf_cstring_from_utf8_ex2(const char **dest, 965 const char *src, 966 const char *topage, 967 apr_pool_t *pool) 968{ 969 xlate_handle_node_t *node; 970 svn_error_t *err; 971 const char *convset_key = get_xlate_key(topage, SVN_APR_UTF8_CHARSET, 972 pool); 973 974 SVN_ERR(check_cstring_utf8(src, pool)); 975 976 SVN_ERR(get_xlate_handle_node(&node, topage, SVN_APR_UTF8_CHARSET, 977 convset_key, pool)); 978 err = convert_cstring(dest, src, node, pool); 979 err = svn_error_compose_create( 980 err, 981 put_xlate_handle_node(node, convset_key, pool)); 982 983 return err; 984} 985 986 987svn_error_t * 988svn_utf_cstring_from_utf8_ex(const char **dest, 989 const char *src, 990 const char *topage, 991 const char *convset_key, 992 apr_pool_t *pool) 993{ 994 return svn_utf_cstring_from_utf8_ex2(dest, src, topage, pool); 995} 996 997 998const char * 999svn_utf__cstring_from_utf8_fuzzy(const char *src, 1000 apr_pool_t *pool, 1001 svn_error_t *(*convert_from_utf8) 1002 (const char **, const char *, apr_pool_t *)) 1003{ 1004 const char *escaped, *converted; 1005 svn_error_t *err; 1006 1007 escaped = fuzzy_escape(src, strlen(src), pool); 1008 1009 /* Okay, now we have a *new* UTF-8 string, one that's guaranteed to 1010 contain only 7-bit bytes :-). Recode to native... */ 1011 err = convert_from_utf8(((const char **) &converted), escaped, pool); 1012 1013 if (err) 1014 { 1015 svn_error_clear(err); 1016 return escaped; 1017 } 1018 else 1019 return converted; 1020 1021 /* ### Check the client locale, maybe we can avoid that second 1022 * conversion! See Ulrich Drepper's patch at 1023 * http://subversion.tigris.org/issues/show_bug.cgi?id=807. 1024 */ 1025} 1026 1027 1028const char * 1029svn_utf_cstring_from_utf8_fuzzy(const char *src, 1030 apr_pool_t *pool) 1031{ 1032 return svn_utf__cstring_from_utf8_fuzzy(src, pool, 1033 svn_utf_cstring_from_utf8); 1034} 1035 1036 1037svn_error_t * 1038svn_utf_cstring_from_utf8_stringbuf(const char **dest, 1039 const svn_stringbuf_t *src, 1040 apr_pool_t *pool) 1041{ 1042 svn_stringbuf_t *destbuf; 1043 1044 SVN_ERR(svn_utf_stringbuf_from_utf8(&destbuf, src, pool)); 1045 *dest = destbuf->data; 1046 1047 return SVN_NO_ERROR; 1048} 1049 1050 1051svn_error_t * 1052svn_utf_cstring_from_utf8_string(const char **dest, 1053 const svn_string_t *src, 1054 apr_pool_t *pool) 1055{ 1056 svn_stringbuf_t *dbuf; 1057 xlate_handle_node_t *node; 1058 svn_error_t *err; 1059 1060 SVN_ERR(get_uton_xlate_handle_node(&node, pool)); 1061 1062 if (node->handle) 1063 { 1064 err = check_utf8(src->data, src->len, pool); 1065 if (! err) 1066 err = convert_to_stringbuf(node, src->data, src->len, 1067 &dbuf, pool); 1068 if (! err) 1069 *dest = dbuf->data; 1070 } 1071 else 1072 { 1073 err = check_non_ascii(src->data, src->len, pool); 1074 if (! err) 1075 *dest = apr_pstrmemdup(pool, src->data, src->len); 1076 } 1077 1078 err = svn_error_compose_create( 1079 err, 1080 put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool)); 1081 1082 return err; 1083} 1084