utf.c revision 262253
1/* 2 * utf.c: UTF-8 conversion routines 3 * 4 * ==================================================================== 5 * Licensed to the Apache Software Foundation (ASF) under one 6 * or more contributor license agreements. See the NOTICE file 7 * distributed with this work for additional information 8 * regarding copyright ownership. The ASF licenses this file 9 * to you under the Apache License, Version 2.0 (the 10 * "License"); you may not use this file except in compliance 11 * with the License. You may obtain a copy of the License at 12 * 13 * http://www.apache.org/licenses/LICENSE-2.0 14 * 15 * Unless required by applicable law or agreed to in writing, 16 * software distributed under the License is distributed on an 17 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 18 * KIND, either express or implied. See the License for the 19 * specific language governing permissions and limitations 20 * under the License. 21 * ==================================================================== 22 */ 23 24 25 26#include <stdlib.h> 27#include <string.h> 28#include <assert.h> 29 30#include <apr_strings.h> 31#include <apr_lib.h> 32#include <apr_xlate.h> 33#include <apr_atomic.h> 34 35#include "svn_hash.h" 36#include "svn_string.h" 37#include "svn_error.h" 38#include "svn_pools.h" 39#include "svn_ctype.h" 40#include "svn_utf.h" 41#include "svn_private_config.h" 42#include "win32_xlate.h" 43 44#include "private/svn_utf_private.h" 45#include "private/svn_dep_compat.h" 46#include "private/svn_string_private.h" 47#include "private/svn_mutex.h" 48 49 50 51/* Use these static strings to maximize performance on standard conversions. 52 * Any strings on other locations are still valid, however. 53 */ 54static const char *SVN_UTF_NTOU_XLATE_HANDLE = "svn-utf-ntou-xlate-handle"; 55static const char *SVN_UTF_UTON_XLATE_HANDLE = "svn-utf-uton-xlate-handle"; 56 57static const char *SVN_APR_UTF8_CHARSET = "UTF-8"; 58 59static svn_mutex__t *xlate_handle_mutex = NULL; 60static svn_boolean_t assume_native_charset_is_utf8 = FALSE; 61 62/* The xlate handle cache is a global hash table with linked lists of xlate 63 * handles. In multi-threaded environments, a thread "borrows" an xlate 64 * handle from the cache during a translation and puts it back afterwards. 65 * This avoids holding a global lock for all translations. 66 * If there is no handle for a particular key when needed, a new is 67 * handle is created and put in the cache after use. 68 * This means that there will be at most N handles open for a key, where N 69 * is the number of simultanous handles in use for that key. */ 70 71typedef struct xlate_handle_node_t { 72 apr_xlate_t *handle; 73 /* FALSE if the handle is not valid, since its pool is being 74 destroyed. */ 75 svn_boolean_t valid; 76 /* The name of a char encoding or APR_LOCALE_CHARSET. */ 77 const char *frompage, *topage; 78 struct xlate_handle_node_t *next; 79} xlate_handle_node_t; 80 81/* This maps const char * userdata_key strings to xlate_handle_node_t ** 82 handles to the first entry in the linked list of xlate handles. We don't 83 store the pointer to the list head directly in the hash table, since we 84 remove/insert entries at the head in the list in the code below, and 85 we can't use apr_hash_set() in each character translation because that 86 function allocates memory in each call where the value is non-NULL. 87 Since these allocations take place in a global pool, this would be a 88 memory leak. */ 89static apr_hash_t *xlate_handle_hash = NULL; 90 91/* "1st level cache" to standard conversion maps. We may access these 92 * using atomic xchange ops, i.e. without further thread synchronization. 93 * If the respective item is NULL, fallback to hash lookup. 94 */ 95static void * volatile xlat_ntou_static_handle = NULL; 96static void * volatile xlat_uton_static_handle = NULL; 97 98/* Clean up the xlate handle cache. */ 99static apr_status_t 100xlate_cleanup(void *arg) 101{ 102 /* We set the cache variables to NULL so that translation works in other 103 cleanup functions, even if it isn't cached then. */ 104 xlate_handle_hash = NULL; 105 106 /* ensure no stale objects get accessed */ 107 xlat_ntou_static_handle = NULL; 108 xlat_uton_static_handle = NULL; 109 110 return APR_SUCCESS; 111} 112 113/* Set the handle of ARG to NULL. */ 114static apr_status_t 115xlate_handle_node_cleanup(void *arg) 116{ 117 xlate_handle_node_t *node = arg; 118 119 node->valid = FALSE; 120 return APR_SUCCESS; 121} 122 123void 124svn_utf_initialize2(svn_boolean_t assume_native_utf8, 125 apr_pool_t *pool) 126{ 127 if (!xlate_handle_hash) 128 { 129 /* We create our own subpool, which we protect with the mutex. 130 We can't use the pool passed to us by the caller, since we will 131 use it for xlate handle allocations, possibly in multiple threads, 132 and pool allocation is not thread-safe. */ 133 apr_pool_t *subpool = svn_pool_create(pool); 134 svn_mutex__t *mutex; 135 svn_error_t *err = svn_mutex__init(&mutex, TRUE, subpool); 136 if (err) 137 { 138 svn_error_clear(err); 139 return; 140 } 141 142 xlate_handle_mutex = mutex; 143 xlate_handle_hash = apr_hash_make(subpool); 144 145 apr_pool_cleanup_register(subpool, NULL, xlate_cleanup, 146 apr_pool_cleanup_null); 147 } 148 149 if (!assume_native_charset_is_utf8) 150 assume_native_charset_is_utf8 = assume_native_utf8; 151} 152 153/* Return a unique string key based on TOPAGE and FROMPAGE. TOPAGE and 154 * FROMPAGE can be any valid arguments of the same name to 155 * apr_xlate_open(). Allocate the returned string in POOL. */ 156static const char* 157get_xlate_key(const char *topage, 158 const char *frompage, 159 apr_pool_t *pool) 160{ 161 /* In the cases of SVN_APR_LOCALE_CHARSET and SVN_APR_DEFAULT_CHARSET 162 * topage/frompage is really an int, not a valid string. So generate a 163 * unique key accordingly. */ 164 if (frompage == SVN_APR_LOCALE_CHARSET) 165 frompage = "APR_LOCALE_CHARSET"; 166 else if (frompage == SVN_APR_DEFAULT_CHARSET) 167 frompage = "APR_DEFAULT_CHARSET"; 168 169 if (topage == SVN_APR_LOCALE_CHARSET) 170 topage = "APR_LOCALE_CHARSET"; 171 else if (topage == SVN_APR_DEFAULT_CHARSET) 172 topage = "APR_DEFAULT_CHARSET"; 173 174 return apr_pstrcat(pool, "svn-utf-", frompage, "to", topage, 175 "-xlate-handle", (char *)NULL); 176} 177 178/* Atomically replace the content in *MEM with NEW_VALUE and return 179 * the previous content of *MEM. If atomicy cannot be guaranteed, 180 * *MEM will not be modified and NEW_VALUE is simply returned to 181 * the caller. 182 */ 183static APR_INLINE void* 184atomic_swap(void * volatile * mem, void *new_value) 185{ 186#if APR_HAS_THREADS 187#if APR_VERSION_AT_LEAST(1,3,0) 188 /* Cast is necessary because of APR bug: 189 https://issues.apache.org/bugzilla/show_bug.cgi?id=50731 */ 190 return apr_atomic_xchgptr((volatile void **)mem, new_value); 191#else 192 /* old APRs don't support atomic swaps. Simply return the 193 * input to the caller for further proccessing. */ 194 return new_value; 195#endif 196#else 197 /* no threads - no sync. necessary */ 198 void *old_value = (void*)*mem; 199 *mem = new_value; 200 return old_value; 201#endif 202} 203 204/* Set *RET to a newly created handle node for converting from FROMPAGE 205 to TOPAGE, If apr_xlate_open() returns APR_EINVAL or APR_ENOTIMPL, set 206 (*RET)->handle to NULL. If fail for any other reason, return the error. 207 Allocate *RET and its xlate handle in POOL. */ 208static svn_error_t * 209xlate_alloc_handle(xlate_handle_node_t **ret, 210 const char *topage, const char *frompage, 211 apr_pool_t *pool) 212{ 213 apr_status_t apr_err; 214 apr_xlate_t *handle; 215 const char *name; 216 217 /* The error handling doesn't support the following cases, since we don't 218 use them currently. Catch this here. */ 219 SVN_ERR_ASSERT(frompage != SVN_APR_DEFAULT_CHARSET 220 && topage != SVN_APR_DEFAULT_CHARSET 221 && (frompage != SVN_APR_LOCALE_CHARSET 222 || topage != SVN_APR_LOCALE_CHARSET)); 223 224 /* Try to create a handle. */ 225#if defined(WIN32) 226 apr_err = svn_subr__win32_xlate_open((win32_xlate_t **)&handle, topage, 227 frompage, pool); 228 name = "win32-xlate: "; 229#else 230 apr_err = apr_xlate_open(&handle, topage, frompage, pool); 231 name = "APR: "; 232#endif 233 234 if (APR_STATUS_IS_EINVAL(apr_err) || APR_STATUS_IS_ENOTIMPL(apr_err)) 235 handle = NULL; 236 else if (apr_err != APR_SUCCESS) 237 { 238 const char *errstr; 239 char apr_strerr[512]; 240 241 /* Can't use svn_error_wrap_apr here because it calls functions in 242 this file, leading to infinite recursion. */ 243 if (frompage == SVN_APR_LOCALE_CHARSET) 244 errstr = apr_psprintf(pool, 245 _("Can't create a character converter from " 246 "native encoding to '%s'"), topage); 247 else if (topage == SVN_APR_LOCALE_CHARSET) 248 errstr = apr_psprintf(pool, 249 _("Can't create a character converter from " 250 "'%s' to native encoding"), frompage); 251 else 252 errstr = apr_psprintf(pool, 253 _("Can't create a character converter from " 254 "'%s' to '%s'"), frompage, topage); 255 256 /* Just put the error on the stack, since svn_error_create duplicates it 257 later. APR_STRERR will be in the local encoding, not in UTF-8, though. 258 */ 259 svn_strerror(apr_err, apr_strerr, sizeof(apr_strerr)); 260 return svn_error_createf(SVN_ERR_PLUGIN_LOAD_FAILURE, 261 svn_error_create(apr_err, NULL, apr_strerr), 262 "%s%s", name, errstr); 263 } 264 265 /* Allocate and initialize the node. */ 266 *ret = apr_palloc(pool, sizeof(xlate_handle_node_t)); 267 (*ret)->handle = handle; 268 (*ret)->valid = TRUE; 269 (*ret)->frompage = ((frompage != SVN_APR_LOCALE_CHARSET) 270 ? apr_pstrdup(pool, frompage) : frompage); 271 (*ret)->topage = ((topage != SVN_APR_LOCALE_CHARSET) 272 ? apr_pstrdup(pool, topage) : topage); 273 (*ret)->next = NULL; 274 275 /* If we are called from inside a pool cleanup handler, the just created 276 xlate handle will be closed when that handler returns by a newly 277 registered cleanup handler, however, the handle is still cached by us. 278 To prevent this, we register a cleanup handler that will reset the valid 279 flag of our node, so we don't use an invalid handle. */ 280 if (handle) 281 apr_pool_cleanup_register(pool, *ret, xlate_handle_node_cleanup, 282 apr_pool_cleanup_null); 283 284 return SVN_NO_ERROR; 285} 286 287/* Extend xlate_alloc_handle by using USERDATA_KEY as a key in our 288 global hash map, if available. 289 290 Allocate *RET and its xlate handle in POOL if svn_utf_initialize() 291 hasn't been called or USERDATA_KEY is NULL. Else, allocate them 292 in the pool of xlate_handle_hash. 293 294 Note: this function is not thread-safe. Call get_xlate_handle_node 295 instead. */ 296static svn_error_t * 297get_xlate_handle_node_internal(xlate_handle_node_t **ret, 298 const char *topage, const char *frompage, 299 const char *userdata_key, apr_pool_t *pool) 300{ 301 /* If we already have a handle, just return it. */ 302 if (userdata_key && xlate_handle_hash) 303 { 304 xlate_handle_node_t *old_node = NULL; 305 306 /* 2nd level: hash lookup */ 307 xlate_handle_node_t **old_node_p = svn_hash_gets(xlate_handle_hash, 308 userdata_key); 309 if (old_node_p) 310 old_node = *old_node_p; 311 if (old_node) 312 { 313 /* Ensure that the handle is still valid. */ 314 if (old_node->valid) 315 { 316 /* Remove from the list. */ 317 *old_node_p = old_node->next; 318 old_node->next = NULL; 319 *ret = old_node; 320 return SVN_NO_ERROR; 321 } 322 } 323 } 324 325 /* Note that we still have the mutex locked (if it is initialized), so we 326 can use the global pool for creating the new xlate handle. */ 327 328 /* Use the correct pool for creating the handle. */ 329 pool = apr_hash_pool_get(xlate_handle_hash); 330 331 return xlate_alloc_handle(ret, topage, frompage, pool); 332} 333 334/* Set *RET to a handle node for converting from FROMPAGE to TOPAGE, 335 creating the handle node if it doesn't exist in USERDATA_KEY. 336 If a node is not cached and apr_xlate_open() returns APR_EINVAL or 337 APR_ENOTIMPL, set (*RET)->handle to NULL. If fail for any other 338 reason, return the error. 339 340 Allocate *RET and its xlate handle in POOL if svn_utf_initialize() 341 hasn't been called or USERDATA_KEY is NULL. Else, allocate them 342 in the pool of xlate_handle_hash. */ 343static svn_error_t * 344get_xlate_handle_node(xlate_handle_node_t **ret, 345 const char *topage, const char *frompage, 346 const char *userdata_key, apr_pool_t *pool) 347{ 348 xlate_handle_node_t *old_node = NULL; 349 350 /* If we already have a handle, just return it. */ 351 if (userdata_key) 352 { 353 if (xlate_handle_hash) 354 { 355 /* 1st level: global, static items */ 356 if (userdata_key == SVN_UTF_NTOU_XLATE_HANDLE) 357 old_node = atomic_swap(&xlat_ntou_static_handle, NULL); 358 else if (userdata_key == SVN_UTF_UTON_XLATE_HANDLE) 359 old_node = atomic_swap(&xlat_uton_static_handle, NULL); 360 361 if (old_node && old_node->valid) 362 { 363 *ret = old_node; 364 return SVN_NO_ERROR; 365 } 366 } 367 else 368 { 369 void *p; 370 /* We fall back on a per-pool cache instead. */ 371 apr_pool_userdata_get(&p, userdata_key, pool); 372 old_node = p; 373 /* Ensure that the handle is still valid. */ 374 if (old_node && old_node->valid) 375 { 376 *ret = old_node; 377 return SVN_NO_ERROR; 378 } 379 380 return xlate_alloc_handle(ret, topage, frompage, pool); 381 } 382 } 383 384 SVN_MUTEX__WITH_LOCK(xlate_handle_mutex, 385 get_xlate_handle_node_internal(ret, 386 topage, 387 frompage, 388 userdata_key, 389 pool)); 390 391 return SVN_NO_ERROR; 392} 393 394/* Put back NODE into the xlate handle cache for use by other calls. 395 396 Note: this function is not thread-safe. Call put_xlate_handle_node 397 instead. */ 398static svn_error_t * 399put_xlate_handle_node_internal(xlate_handle_node_t *node, 400 const char *userdata_key) 401{ 402 xlate_handle_node_t **node_p = svn_hash_gets(xlate_handle_hash, userdata_key); 403 if (node_p == NULL) 404 { 405 userdata_key = apr_pstrdup(apr_hash_pool_get(xlate_handle_hash), 406 userdata_key); 407 node_p = apr_palloc(apr_hash_pool_get(xlate_handle_hash), 408 sizeof(*node_p)); 409 *node_p = NULL; 410 svn_hash_sets(xlate_handle_hash, userdata_key, node_p); 411 } 412 node->next = *node_p; 413 *node_p = node; 414 415 return SVN_NO_ERROR; 416} 417 418/* Put back NODE into the xlate handle cache for use by other calls. 419 If there is no global cache, store the handle in POOL. 420 Ignore errors related to locking/unlocking the mutex. */ 421static svn_error_t * 422put_xlate_handle_node(xlate_handle_node_t *node, 423 const char *userdata_key, 424 apr_pool_t *pool) 425{ 426 assert(node->next == NULL); 427 if (!userdata_key) 428 return SVN_NO_ERROR; 429 430 /* push previous global node to the hash */ 431 if (xlate_handle_hash) 432 { 433 /* 1st level: global, static items */ 434 if (userdata_key == SVN_UTF_NTOU_XLATE_HANDLE) 435 node = atomic_swap(&xlat_ntou_static_handle, node); 436 else if (userdata_key == SVN_UTF_UTON_XLATE_HANDLE) 437 node = atomic_swap(&xlat_uton_static_handle, node); 438 if (node == NULL) 439 return SVN_NO_ERROR; 440 441 SVN_MUTEX__WITH_LOCK(xlate_handle_mutex, 442 put_xlate_handle_node_internal(node, 443 userdata_key)); 444 } 445 else 446 { 447 /* Store it in the per-pool cache. */ 448 apr_pool_userdata_set(node, userdata_key, apr_pool_cleanup_null, pool); 449 } 450 451 return SVN_NO_ERROR; 452} 453 454/* Return the apr_xlate handle for converting native characters to UTF-8. */ 455static svn_error_t * 456get_ntou_xlate_handle_node(xlate_handle_node_t **ret, apr_pool_t *pool) 457{ 458 return get_xlate_handle_node(ret, SVN_APR_UTF8_CHARSET, 459 assume_native_charset_is_utf8 460 ? SVN_APR_UTF8_CHARSET 461 : SVN_APR_LOCALE_CHARSET, 462 SVN_UTF_NTOU_XLATE_HANDLE, pool); 463} 464 465 466/* Return the apr_xlate handle for converting UTF-8 to native characters. 467 Create one if it doesn't exist. If unable to find a handle, or 468 unable to create one because apr_xlate_open returned APR_EINVAL, then 469 set *RET to null and return SVN_NO_ERROR; if fail for some other 470 reason, return error. */ 471static svn_error_t * 472get_uton_xlate_handle_node(xlate_handle_node_t **ret, apr_pool_t *pool) 473{ 474 return get_xlate_handle_node(ret, 475 assume_native_charset_is_utf8 476 ? SVN_APR_UTF8_CHARSET 477 : SVN_APR_LOCALE_CHARSET, 478 SVN_APR_UTF8_CHARSET, 479 SVN_UTF_UTON_XLATE_HANDLE, pool); 480} 481 482 483/* Copy LEN bytes of SRC, converting non-ASCII and zero bytes to ?\nnn 484 sequences, allocating the result in POOL. */ 485static const char * 486fuzzy_escape(const char *src, apr_size_t len, apr_pool_t *pool) 487{ 488 const char *src_orig = src, *src_end = src + len; 489 apr_size_t new_len = 0; 490 char *new; 491 const char *new_orig; 492 493 /* First count how big a dest string we'll need. */ 494 while (src < src_end) 495 { 496 if (! svn_ctype_isascii(*src) || *src == '\0') 497 new_len += 5; /* 5 slots, for "?\XXX" */ 498 else 499 new_len += 1; /* one slot for the 7-bit char */ 500 501 src++; 502 } 503 504 /* Allocate that amount, plus one slot for '\0' character. */ 505 new = apr_palloc(pool, new_len + 1); 506 507 new_orig = new; 508 509 /* And fill it up. */ 510 while (src_orig < src_end) 511 { 512 if (! svn_ctype_isascii(*src_orig) || src_orig == '\0') 513 { 514 /* This is the same format as svn_xml_fuzzy_escape uses, but that 515 function escapes different characters. Please keep in sync! 516 ### If we add another fuzzy escape somewhere, we should abstract 517 ### this out to a common function. */ 518 apr_snprintf(new, 6, "?\\%03u", (unsigned char) *src_orig); 519 new += 5; 520 } 521 else 522 { 523 *new = *src_orig; 524 new += 1; 525 } 526 527 src_orig++; 528 } 529 530 *new = '\0'; 531 532 return new_orig; 533} 534 535/* Convert SRC_LENGTH bytes of SRC_DATA in NODE->handle, store the result 536 in *DEST, which is allocated in POOL. */ 537static svn_error_t * 538convert_to_stringbuf(xlate_handle_node_t *node, 539 const char *src_data, 540 apr_size_t src_length, 541 svn_stringbuf_t **dest, 542 apr_pool_t *pool) 543{ 544#ifdef WIN32 545 apr_status_t apr_err; 546 547 apr_err = svn_subr__win32_xlate_to_stringbuf((win32_xlate_t *) node->handle, 548 src_data, src_length, 549 dest, pool); 550#else 551 apr_size_t buflen = src_length * 2; 552 apr_status_t apr_err; 553 apr_size_t srclen = src_length; 554 apr_size_t destlen = buflen; 555 556 /* Initialize *DEST to an empty stringbuf. 557 A 1:2 ratio of input bytes to output bytes (as assigned above) 558 should be enough for most translations, and if it turns out not 559 to be enough, we'll grow the buffer again, sizing it based on a 560 1:3 ratio of the remainder of the string. */ 561 *dest = svn_stringbuf_create_ensure(buflen + 1, pool); 562 563 /* Not only does it not make sense to convert an empty string, but 564 apr-iconv is quite unreasonable about not allowing that. */ 565 if (src_length == 0) 566 return SVN_NO_ERROR; 567 568 do 569 { 570 /* Set up state variables for xlate. */ 571 destlen = buflen - (*dest)->len; 572 573 /* Attempt the conversion. */ 574 apr_err = apr_xlate_conv_buffer(node->handle, 575 src_data + (src_length - srclen), 576 &srclen, 577 (*dest)->data + (*dest)->len, 578 &destlen); 579 580 /* Now, update the *DEST->len to track the amount of output data 581 churned out so far from this loop. */ 582 (*dest)->len += ((buflen - (*dest)->len) - destlen); 583 buflen += srclen * 3; /* 3 is middle ground, 2 wasn't enough 584 for all characters in the buffer, 4 is 585 maximum character size (currently) */ 586 587 588 } while (apr_err == APR_SUCCESS && srclen != 0); 589#endif 590 591 /* If we exited the loop with an error, return the error. */ 592 if (apr_err) 593 { 594 const char *errstr; 595 svn_error_t *err; 596 597 /* Can't use svn_error_wrap_apr here because it calls functions in 598 this file, leading to infinite recursion. */ 599 if (node->frompage == SVN_APR_LOCALE_CHARSET) 600 errstr = apr_psprintf 601 (pool, _("Can't convert string from native encoding to '%s':"), 602 node->topage); 603 else if (node->topage == SVN_APR_LOCALE_CHARSET) 604 errstr = apr_psprintf 605 (pool, _("Can't convert string from '%s' to native encoding:"), 606 node->frompage); 607 else 608 errstr = apr_psprintf 609 (pool, _("Can't convert string from '%s' to '%s':"), 610 node->frompage, node->topage); 611 612 err = svn_error_create(apr_err, NULL, fuzzy_escape(src_data, 613 src_length, pool)); 614 return svn_error_create(apr_err, err, errstr); 615 } 616 /* Else, exited due to success. Trim the result buffer down to the 617 right length. */ 618 (*dest)->data[(*dest)->len] = '\0'; 619 620 return SVN_NO_ERROR; 621} 622 623 624/* Return APR_EINVAL if the first LEN bytes of DATA contain anything 625 other than seven-bit, non-control (except for whitespace) ASCII 626 characters, finding the error pool from POOL. Otherwise, return 627 SVN_NO_ERROR. */ 628static svn_error_t * 629check_non_ascii(const char *data, apr_size_t len, apr_pool_t *pool) 630{ 631 const char *data_start = data; 632 633 for (; len > 0; --len, data++) 634 { 635 if ((! svn_ctype_isascii(*data)) 636 || ((! svn_ctype_isspace(*data)) 637 && svn_ctype_iscntrl(*data))) 638 { 639 /* Show the printable part of the data, followed by the 640 decimal code of the questionable character. Because if a 641 user ever gets this error, she's going to have to spend 642 time tracking down the non-ASCII data, so we want to help 643 as much as possible. And yes, we just call the unsafe 644 data "non-ASCII", even though the actual constraint is 645 somewhat more complex than that. */ 646 647 if (data - data_start) 648 { 649 const char *error_data 650 = apr_pstrndup(pool, data_start, (data - data_start)); 651 652 return svn_error_createf 653 (APR_EINVAL, NULL, 654 _("Safe data '%s' was followed by non-ASCII byte %d: " 655 "unable to convert to/from UTF-8"), 656 error_data, *((const unsigned char *) data)); 657 } 658 else 659 { 660 return svn_error_createf 661 (APR_EINVAL, NULL, 662 _("Non-ASCII character (code %d) detected, " 663 "and unable to convert to/from UTF-8"), 664 *((const unsigned char *) data)); 665 } 666 } 667 } 668 669 return SVN_NO_ERROR; 670} 671 672/* Construct an error with code APR_EINVAL and with a suitable message 673 * to describe the invalid UTF-8 sequence DATA of length LEN (which 674 * may have embedded NULLs). We can't simply print the data, almost 675 * by definition we don't really know how it is encoded. 676 */ 677static svn_error_t * 678invalid_utf8(const char *data, apr_size_t len, apr_pool_t *pool) 679{ 680 const char *last = svn_utf__last_valid(data, len); 681 const char *valid_txt = "", *invalid_txt = ""; 682 apr_size_t i; 683 size_t valid, invalid; 684 685 /* We will display at most 24 valid octets (this may split a leading 686 multi-byte character) as that should fit on one 80 character line. */ 687 valid = last - data; 688 if (valid > 24) 689 valid = 24; 690 for (i = 0; i < valid; ++i) 691 valid_txt = apr_pstrcat(pool, valid_txt, 692 apr_psprintf(pool, " %02x", 693 (unsigned char)last[i-valid]), 694 (char *)NULL); 695 696 /* 4 invalid octets will guarantee that the faulty octet is displayed */ 697 invalid = data + len - last; 698 if (invalid > 4) 699 invalid = 4; 700 for (i = 0; i < invalid; ++i) 701 invalid_txt = apr_pstrcat(pool, invalid_txt, 702 apr_psprintf(pool, " %02x", 703 (unsigned char)last[i]), 704 (char *)NULL); 705 706 return svn_error_createf(APR_EINVAL, NULL, 707 _("Valid UTF-8 data\n(hex:%s)\n" 708 "followed by invalid UTF-8 sequence\n(hex:%s)"), 709 valid_txt, invalid_txt); 710} 711 712/* Verify that the sequence DATA of length LEN is valid UTF-8. 713 If it is not, return an error with code APR_EINVAL. */ 714static svn_error_t * 715check_utf8(const char *data, apr_size_t len, apr_pool_t *pool) 716{ 717 if (! svn_utf__is_valid(data, len)) 718 return invalid_utf8(data, len, pool); 719 return SVN_NO_ERROR; 720} 721 722/* Verify that the NULL terminated sequence DATA is valid UTF-8. 723 If it is not, return an error with code APR_EINVAL. */ 724static svn_error_t * 725check_cstring_utf8(const char *data, apr_pool_t *pool) 726{ 727 728 if (! svn_utf__cstring_is_valid(data)) 729 return invalid_utf8(data, strlen(data), pool); 730 return SVN_NO_ERROR; 731} 732 733 734svn_error_t * 735svn_utf_stringbuf_to_utf8(svn_stringbuf_t **dest, 736 const svn_stringbuf_t *src, 737 apr_pool_t *pool) 738{ 739 xlate_handle_node_t *node; 740 svn_error_t *err; 741 742 SVN_ERR(get_ntou_xlate_handle_node(&node, pool)); 743 744 if (node->handle) 745 { 746 err = convert_to_stringbuf(node, src->data, src->len, dest, pool); 747 if (! err) 748 err = check_utf8((*dest)->data, (*dest)->len, pool); 749 } 750 else 751 { 752 err = check_non_ascii(src->data, src->len, pool); 753 if (! err) 754 *dest = svn_stringbuf_dup(src, pool); 755 } 756 757 return svn_error_compose_create(err, 758 put_xlate_handle_node 759 (node, 760 SVN_UTF_NTOU_XLATE_HANDLE, 761 pool)); 762} 763 764 765svn_error_t * 766svn_utf_string_to_utf8(const svn_string_t **dest, 767 const svn_string_t *src, 768 apr_pool_t *pool) 769{ 770 svn_stringbuf_t *destbuf; 771 xlate_handle_node_t *node; 772 svn_error_t *err; 773 774 SVN_ERR(get_ntou_xlate_handle_node(&node, pool)); 775 776 if (node->handle) 777 { 778 err = convert_to_stringbuf(node, src->data, src->len, &destbuf, pool); 779 if (! err) 780 err = check_utf8(destbuf->data, destbuf->len, pool); 781 if (! err) 782 *dest = svn_stringbuf__morph_into_string(destbuf); 783 } 784 else 785 { 786 err = check_non_ascii(src->data, src->len, pool); 787 if (! err) 788 *dest = svn_string_dup(src, pool); 789 } 790 791 return svn_error_compose_create(err, 792 put_xlate_handle_node 793 (node, 794 SVN_UTF_NTOU_XLATE_HANDLE, 795 pool)); 796} 797 798 799/* Common implementation for svn_utf_cstring_to_utf8, 800 svn_utf_cstring_to_utf8_ex, svn_utf_cstring_from_utf8 and 801 svn_utf_cstring_from_utf8_ex. Convert SRC to DEST using NODE->handle as 802 the translator and allocating from POOL. */ 803static svn_error_t * 804convert_cstring(const char **dest, 805 const char *src, 806 xlate_handle_node_t *node, 807 apr_pool_t *pool) 808{ 809 if (node->handle) 810 { 811 svn_stringbuf_t *destbuf; 812 SVN_ERR(convert_to_stringbuf(node, src, strlen(src), 813 &destbuf, pool)); 814 *dest = destbuf->data; 815 } 816 else 817 { 818 apr_size_t len = strlen(src); 819 SVN_ERR(check_non_ascii(src, len, pool)); 820 *dest = apr_pstrmemdup(pool, src, len); 821 } 822 return SVN_NO_ERROR; 823} 824 825 826svn_error_t * 827svn_utf_cstring_to_utf8(const char **dest, 828 const char *src, 829 apr_pool_t *pool) 830{ 831 xlate_handle_node_t *node; 832 svn_error_t *err; 833 834 SVN_ERR(get_ntou_xlate_handle_node(&node, pool)); 835 err = convert_cstring(dest, src, node, pool); 836 SVN_ERR(svn_error_compose_create(err, 837 put_xlate_handle_node 838 (node, 839 SVN_UTF_NTOU_XLATE_HANDLE, 840 pool))); 841 return check_cstring_utf8(*dest, pool); 842} 843 844 845svn_error_t * 846svn_utf_cstring_to_utf8_ex2(const char **dest, 847 const char *src, 848 const char *frompage, 849 apr_pool_t *pool) 850{ 851 xlate_handle_node_t *node; 852 svn_error_t *err; 853 const char *convset_key = get_xlate_key(SVN_APR_UTF8_CHARSET, frompage, 854 pool); 855 856 SVN_ERR(get_xlate_handle_node(&node, SVN_APR_UTF8_CHARSET, frompage, 857 convset_key, pool)); 858 err = convert_cstring(dest, src, node, pool); 859 SVN_ERR(svn_error_compose_create(err, 860 put_xlate_handle_node 861 (node, 862 SVN_UTF_NTOU_XLATE_HANDLE, 863 pool))); 864 865 return check_cstring_utf8(*dest, pool); 866} 867 868 869svn_error_t * 870svn_utf_cstring_to_utf8_ex(const char **dest, 871 const char *src, 872 const char *frompage, 873 const char *convset_key, 874 apr_pool_t *pool) 875{ 876 return svn_utf_cstring_to_utf8_ex2(dest, src, frompage, pool); 877} 878 879 880svn_error_t * 881svn_utf_stringbuf_from_utf8(svn_stringbuf_t **dest, 882 const svn_stringbuf_t *src, 883 apr_pool_t *pool) 884{ 885 xlate_handle_node_t *node; 886 svn_error_t *err; 887 888 SVN_ERR(get_uton_xlate_handle_node(&node, pool)); 889 890 if (node->handle) 891 { 892 err = check_utf8(src->data, src->len, pool); 893 if (! err) 894 err = convert_to_stringbuf(node, src->data, src->len, dest, pool); 895 } 896 else 897 { 898 err = check_non_ascii(src->data, src->len, pool); 899 if (! err) 900 *dest = svn_stringbuf_dup(src, pool); 901 } 902 903 err = svn_error_compose_create( 904 err, 905 put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool)); 906 907 return err; 908} 909 910 911svn_error_t * 912svn_utf_string_from_utf8(const svn_string_t **dest, 913 const svn_string_t *src, 914 apr_pool_t *pool) 915{ 916 svn_stringbuf_t *dbuf; 917 xlate_handle_node_t *node; 918 svn_error_t *err; 919 920 SVN_ERR(get_uton_xlate_handle_node(&node, pool)); 921 922 if (node->handle) 923 { 924 err = check_utf8(src->data, src->len, pool); 925 if (! err) 926 err = convert_to_stringbuf(node, src->data, src->len, 927 &dbuf, pool); 928 if (! err) 929 *dest = svn_stringbuf__morph_into_string(dbuf); 930 } 931 else 932 { 933 err = check_non_ascii(src->data, src->len, pool); 934 if (! err) 935 *dest = svn_string_dup(src, pool); 936 } 937 938 err = svn_error_compose_create( 939 err, 940 put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool)); 941 942 return err; 943} 944 945 946svn_error_t * 947svn_utf_cstring_from_utf8(const char **dest, 948 const char *src, 949 apr_pool_t *pool) 950{ 951 xlate_handle_node_t *node; 952 svn_error_t *err; 953 954 SVN_ERR(check_cstring_utf8(src, pool)); 955 956 SVN_ERR(get_uton_xlate_handle_node(&node, pool)); 957 err = convert_cstring(dest, src, node, pool); 958 err = svn_error_compose_create( 959 err, 960 put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool)); 961 962 return err; 963} 964 965 966svn_error_t * 967svn_utf_cstring_from_utf8_ex2(const char **dest, 968 const char *src, 969 const char *topage, 970 apr_pool_t *pool) 971{ 972 xlate_handle_node_t *node; 973 svn_error_t *err; 974 const char *convset_key = get_xlate_key(topage, SVN_APR_UTF8_CHARSET, 975 pool); 976 977 SVN_ERR(check_cstring_utf8(src, pool)); 978 979 SVN_ERR(get_xlate_handle_node(&node, topage, SVN_APR_UTF8_CHARSET, 980 convset_key, pool)); 981 err = convert_cstring(dest, src, node, pool); 982 err = svn_error_compose_create( 983 err, 984 put_xlate_handle_node(node, convset_key, pool)); 985 986 return err; 987} 988 989 990svn_error_t * 991svn_utf_cstring_from_utf8_ex(const char **dest, 992 const char *src, 993 const char *topage, 994 const char *convset_key, 995 apr_pool_t *pool) 996{ 997 return svn_utf_cstring_from_utf8_ex2(dest, src, topage, pool); 998} 999 1000 1001const char * 1002svn_utf__cstring_from_utf8_fuzzy(const char *src, 1003 apr_pool_t *pool, 1004 svn_error_t *(*convert_from_utf8) 1005 (const char **, const char *, apr_pool_t *)) 1006{ 1007 const char *escaped, *converted; 1008 svn_error_t *err; 1009 1010 escaped = fuzzy_escape(src, strlen(src), pool); 1011 1012 /* Okay, now we have a *new* UTF-8 string, one that's guaranteed to 1013 contain only 7-bit bytes :-). Recode to native... */ 1014 err = convert_from_utf8(((const char **) &converted), escaped, pool); 1015 1016 if (err) 1017 { 1018 svn_error_clear(err); 1019 return escaped; 1020 } 1021 else 1022 return converted; 1023 1024 /* ### Check the client locale, maybe we can avoid that second 1025 * conversion! See Ulrich Drepper's patch at 1026 * http://subversion.tigris.org/issues/show_bug.cgi?id=807. 1027 */ 1028} 1029 1030 1031const char * 1032svn_utf_cstring_from_utf8_fuzzy(const char *src, 1033 apr_pool_t *pool) 1034{ 1035 return svn_utf__cstring_from_utf8_fuzzy(src, pool, 1036 svn_utf_cstring_from_utf8); 1037} 1038 1039 1040svn_error_t * 1041svn_utf_cstring_from_utf8_stringbuf(const char **dest, 1042 const svn_stringbuf_t *src, 1043 apr_pool_t *pool) 1044{ 1045 svn_stringbuf_t *destbuf; 1046 1047 SVN_ERR(svn_utf_stringbuf_from_utf8(&destbuf, src, pool)); 1048 *dest = destbuf->data; 1049 1050 return SVN_NO_ERROR; 1051} 1052 1053 1054svn_error_t * 1055svn_utf_cstring_from_utf8_string(const char **dest, 1056 const svn_string_t *src, 1057 apr_pool_t *pool) 1058{ 1059 svn_stringbuf_t *dbuf; 1060 xlate_handle_node_t *node; 1061 svn_error_t *err; 1062 1063 SVN_ERR(get_uton_xlate_handle_node(&node, pool)); 1064 1065 if (node->handle) 1066 { 1067 err = check_utf8(src->data, src->len, pool); 1068 if (! err) 1069 err = convert_to_stringbuf(node, src->data, src->len, 1070 &dbuf, pool); 1071 if (! err) 1072 *dest = dbuf->data; 1073 } 1074 else 1075 { 1076 err = check_non_ascii(src->data, src->len, pool); 1077 if (! err) 1078 *dest = apr_pstrmemdup(pool, src->data, src->len); 1079 } 1080 1081 err = svn_error_compose_create( 1082 err, 1083 put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool)); 1084 1085 return err; 1086} 1087