1238730Sdelphij/* 2330571Sdelphij * Copyright (C) 1984-2017 Mark Nudelman 3238730Sdelphij * 4238730Sdelphij * You may distribute under the terms of either the GNU General Public 5238730Sdelphij * License or the Less License, as specified in the README file. 6238730Sdelphij * 7238730Sdelphij * For more information, see the README file. 8238730Sdelphij */ 960786Sps 1060786Sps 1160786Sps/* 1260786Sps * Functions to define the character set 1360786Sps * and do things specific to the character set. 1460786Sps */ 1560786Sps 1660786Sps#include "less.h" 1760786Sps#if HAVE_LOCALE 1860786Sps#include <locale.h> 1960786Sps#include <ctype.h> 20161475Sdelphij#include <langinfo.h> 2160786Sps#endif 2260786Sps 23161475Sdelphij#include "charset.h" 24161475Sdelphij 25330571Sdelphij#if MSDOS_COMPILER==WIN32C 26330571Sdelphij#define WIN32_LEAN_AND_MEAN 27330571Sdelphij#include <windows.h> 28330571Sdelphij#endif 29330571Sdelphij 30330571Sdelphijextern int bs_mode; 31330571Sdelphij 3260786Spspublic int utf_mode = 0; 3360786Sps 3460786Sps/* 3560786Sps * Predefined character sets, 3660786Sps * selected by the LESSCHARSET environment variable. 3760786Sps */ 3860786Spsstruct charset { 3960786Sps char *name; 4060786Sps int *p_flag; 4160786Sps char *desc; 4260786Sps} charsets[] = { 43161475Sdelphij { "ascii", NULL, "8bcccbcc18b95.b" }, 44161475Sdelphij { "utf-8", &utf_mode, "8bcccbcc18b95.b126.bb" }, 45161475Sdelphij { "iso8859", NULL, "8bcccbcc18b95.33b." }, 46161475Sdelphij { "latin3", NULL, "8bcccbcc18b95.33b5.b8.b15.b4.b12.b18.b12.b." }, 47161475Sdelphij { "arabic", NULL, "8bcccbcc18b95.33b.3b.7b2.13b.3b.b26.5b19.b" }, 48161475Sdelphij { "greek", NULL, "8bcccbcc18b95.33b4.2b4.b3.b35.b44.b" }, 49161475Sdelphij { "greek2005", NULL, "8bcccbcc18b95.33b14.b35.b44.b" }, 50161475Sdelphij { "hebrew", NULL, "8bcccbcc18b95.33b.b29.32b28.2b2.b" }, 51161475Sdelphij { "koi8-r", NULL, "8bcccbcc18b95.b." }, 52161475Sdelphij { "KOI8-T", NULL, "8bcccbcc18b95.b8.b6.b8.b.b.5b7.3b4.b4.b3.b.b.3b." }, 53161475Sdelphij { "georgianps", NULL, "8bcccbcc18b95.3b11.4b12.2b." }, 54161475Sdelphij { "tcvn", NULL, "b..b...bcccbccbbb7.8b95.b48.5b." }, 55161475Sdelphij { "TIS-620", NULL, "8bcccbcc18b95.b.4b.11b7.8b." }, 56161475Sdelphij { "next", NULL, "8bcccbcc18b95.bb125.bb" }, 57161475Sdelphij { "dos", NULL, "8bcccbcc12bc5b95.b." }, 58161475Sdelphij { "windows-1251", NULL, "8bcccbcc12bc5b95.b24.b." }, 59161475Sdelphij { "windows-1252", NULL, "8bcccbcc12bc5b95.b.b11.b.2b12.b." }, 60161475Sdelphij { "windows-1255", NULL, "8bcccbcc12bc5b95.b.b8.b.5b9.b.4b." }, 61161475Sdelphij { "ebcdic", NULL, "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." }, 62161475Sdelphij { "IBM-1047", NULL, "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" }, 6360786Sps { NULL, NULL, NULL } 6460786Sps}; 6560786Sps 66161475Sdelphij/* 67161475Sdelphij * Support "locale charmap"/nl_langinfo(CODESET) values, as well as others. 68161475Sdelphij */ 6989019Spsstruct cs_alias { 7089019Sps char *name; 7189019Sps char *oname; 7289019Sps} cs_aliases[] = { 73161475Sdelphij { "UTF-8", "utf-8" }, 74330571Sdelphij { "utf8", "utf-8" }, 75330571Sdelphij { "UTF8", "utf-8" }, 76161475Sdelphij { "ANSI_X3.4-1968", "ascii" }, 77161475Sdelphij { "US-ASCII", "ascii" }, 78161475Sdelphij { "latin1", "iso8859" }, 79161475Sdelphij { "ISO-8859-1", "iso8859" }, 80161475Sdelphij { "latin9", "iso8859" }, 81161475Sdelphij { "ISO-8859-15", "iso8859" }, 82161475Sdelphij { "latin2", "iso8859" }, 83161475Sdelphij { "ISO-8859-2", "iso8859" }, 84161475Sdelphij { "ISO-8859-3", "latin3" }, 85161475Sdelphij { "latin4", "iso8859" }, 86161475Sdelphij { "ISO-8859-4", "iso8859" }, 87161475Sdelphij { "cyrillic", "iso8859" }, 88161475Sdelphij { "ISO-8859-5", "iso8859" }, 89161475Sdelphij { "ISO-8859-6", "arabic" }, 90161475Sdelphij { "ISO-8859-7", "greek" }, 91161475Sdelphij { "IBM9005", "greek2005" }, 92161475Sdelphij { "ISO-8859-8", "hebrew" }, 93161475Sdelphij { "latin5", "iso8859" }, 94161475Sdelphij { "ISO-8859-9", "iso8859" }, 95161475Sdelphij { "latin6", "iso8859" }, 96161475Sdelphij { "ISO-8859-10", "iso8859" }, 97161475Sdelphij { "latin7", "iso8859" }, 98161475Sdelphij { "ISO-8859-13", "iso8859" }, 99161475Sdelphij { "latin8", "iso8859" }, 100161475Sdelphij { "ISO-8859-14", "iso8859" }, 101161475Sdelphij { "latin10", "iso8859" }, 102161475Sdelphij { "ISO-8859-16", "iso8859" }, 103161475Sdelphij { "IBM437", "dos" }, 104161475Sdelphij { "EBCDIC-US", "ebcdic" }, 105161475Sdelphij { "IBM1047", "IBM-1047" }, 106161475Sdelphij { "KOI8-R", "koi8-r" }, 107161475Sdelphij { "KOI8-U", "koi8-r" }, 108161475Sdelphij { "GEORGIAN-PS", "georgianps" }, 109161475Sdelphij { "TCVN5712-1", "tcvn" }, 110161475Sdelphij { "NEXTSTEP", "next" }, 111161475Sdelphij { "windows", "windows-1252" }, /* backward compatibility */ 112161475Sdelphij { "CP1251", "windows-1251" }, 113161475Sdelphij { "CP1252", "windows-1252" }, 114161475Sdelphij { "CP1255", "windows-1255" }, 11589019Sps { NULL, NULL } 11689019Sps}; 11789019Sps 11860786Sps#define IS_BINARY_CHAR 01 11960786Sps#define IS_CONTROL_CHAR 02 12060786Sps 12160786Spsstatic char chardef[256]; 12260786Spsstatic char *binfmt = NULL; 123161475Sdelphijstatic char *utfbinfmt = NULL; 12460786Spspublic int binattr = AT_STANDOUT; 12560786Sps 12660786Sps 12760786Sps/* 12860786Sps * Define a charset, given a description string. 12960786Sps * The string consists of 256 letters, 13060786Sps * one for each character in the charset. 13160786Sps * If the string is shorter than 256 letters, missing letters 13260786Sps * are taken to be identical to the last one. 13360786Sps * A decimal number followed by a letter is taken to be a 13460786Sps * repetition of the letter. 13560786Sps * 13660786Sps * Each letter is one of: 13760786Sps * . normal character 13860786Sps * b binary character 13960786Sps * c control character 14060786Sps */ 14160786Sps static void 14260786Spsichardef(s) 14360786Sps char *s; 14460786Sps{ 145330571Sdelphij char *cp; 146330571Sdelphij int n; 147330571Sdelphij char v; 14860786Sps 14960786Sps n = 0; 15060786Sps v = 0; 15160786Sps cp = chardef; 15260786Sps while (*s != '\0') 15360786Sps { 15460786Sps switch (*s++) 15560786Sps { 15660786Sps case '.': 15760786Sps v = 0; 15860786Sps break; 15960786Sps case 'c': 16060786Sps v = IS_CONTROL_CHAR; 16160786Sps break; 16260786Sps case 'b': 16360786Sps v = IS_BINARY_CHAR|IS_CONTROL_CHAR; 16460786Sps break; 16560786Sps 16660786Sps case '0': case '1': case '2': case '3': case '4': 16760786Sps case '5': case '6': case '7': case '8': case '9': 16860786Sps n = (10 * n) + (s[-1] - '0'); 16960786Sps continue; 17060786Sps 17160786Sps default: 17260786Sps error("invalid chardef", NULL_PARG); 17360786Sps quit(QUIT_ERROR); 17460786Sps /*NOTREACHED*/ 17560786Sps } 17660786Sps 17760786Sps do 17860786Sps { 17960786Sps if (cp >= chardef + sizeof(chardef)) 18060786Sps { 18160786Sps error("chardef longer than 256", NULL_PARG); 18260786Sps quit(QUIT_ERROR); 18360786Sps /*NOTREACHED*/ 18460786Sps } 18560786Sps *cp++ = v; 18660786Sps } while (--n > 0); 18760786Sps n = 0; 18860786Sps } 18960786Sps 19060786Sps while (cp < chardef + sizeof(chardef)) 19160786Sps *cp++ = v; 19260786Sps} 19360786Sps 19460786Sps/* 19560786Sps * Define a charset, given a charset name. 19660786Sps * The valid charset names are listed in the "charsets" array. 19760786Sps */ 19860786Sps static int 199161475Sdelphijicharset(name, no_error) 200330571Sdelphij char *name; 201161475Sdelphij int no_error; 20260786Sps{ 203330571Sdelphij struct charset *p; 204330571Sdelphij struct cs_alias *a; 20560786Sps 20660786Sps if (name == NULL || *name == '\0') 20760786Sps return (0); 20860786Sps 20989019Sps /* First see if the name is an alias. */ 21089019Sps for (a = cs_aliases; a->name != NULL; a++) 21189019Sps { 21289019Sps if (strcmp(name, a->name) == 0) 21389019Sps { 21489019Sps name = a->oname; 21589019Sps break; 21689019Sps } 21789019Sps } 21889019Sps 21960786Sps for (p = charsets; p->name != NULL; p++) 22060786Sps { 22160786Sps if (strcmp(name, p->name) == 0) 22260786Sps { 22360786Sps ichardef(p->desc); 22460786Sps if (p->p_flag != NULL) 225330571Sdelphij { 226330571Sdelphij#if MSDOS_COMPILER==WIN32C 227330571Sdelphij *(p->p_flag) = 1 + (GetConsoleOutputCP() != CP_UTF8); 228330571Sdelphij#else 22960786Sps *(p->p_flag) = 1; 230330571Sdelphij#endif 231330571Sdelphij } 23260786Sps return (1); 23360786Sps } 23460786Sps } 23560786Sps 236161475Sdelphij if (!no_error) { 237161475Sdelphij error("invalid charset name", NULL_PARG); 238161475Sdelphij quit(QUIT_ERROR); 239161475Sdelphij } 240128345Stjr return (0); 24160786Sps} 24260786Sps 24360786Sps#if HAVE_LOCALE 24460786Sps/* 24560786Sps * Define a charset, given a locale name. 24660786Sps */ 24760786Sps static void 24860786Spsilocale() 24960786Sps{ 250330571Sdelphij int c; 25160786Sps 25260786Sps for (c = 0; c < (int) sizeof(chardef); c++) 25360786Sps { 25460786Sps if (isprint(c)) 25560786Sps chardef[c] = 0; 25660786Sps else if (iscntrl(c)) 25760786Sps chardef[c] = IS_CONTROL_CHAR; 25860786Sps else 25960786Sps chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR; 26060786Sps } 26160786Sps} 26260786Sps#endif 26360786Sps 26460786Sps/* 265161475Sdelphij * Define the printing format for control (or binary utf) chars. 26660786Sps */ 267330571Sdelphij public void 268330571Sdelphijsetfmt(s, fmtvarptr, attrptr, default_fmt) 26960786Sps char *s; 270161475Sdelphij char **fmtvarptr; 271330571Sdelphij int *attrptr; 272161475Sdelphij char *default_fmt; 27360786Sps{ 274161475Sdelphij if (s && utf_mode) 275161475Sdelphij { 276161475Sdelphij /* It would be too hard to account for width otherwise. */ 277330571Sdelphij char constant *t = s; 278161475Sdelphij while (*t) 279161475Sdelphij { 280161475Sdelphij if (*t < ' ' || *t > '~') 281161475Sdelphij { 282161475Sdelphij s = default_fmt; 283161475Sdelphij goto attr; 284161475Sdelphij } 285161475Sdelphij t++; 286161475Sdelphij } 287161475Sdelphij } 288161475Sdelphij 289161475Sdelphij /* %n is evil */ 290161475Sdelphij if (s == NULL || *s == '\0' || 291161475Sdelphij (*s == '*' && (s[1] == '\0' || s[2] == '\0' || strchr(s + 2, 'n'))) || 292161475Sdelphij (*s != '*' && strchr(s, 'n'))) 293161475Sdelphij s = default_fmt; 294161475Sdelphij 29560786Sps /* 29660786Sps * Select the attributes if it starts with "*". 29760786Sps */ 298161475Sdelphij attr: 299330571Sdelphij if (*s == '*' && s[1] != '\0') 30060786Sps { 30160786Sps switch (s[1]) 30260786Sps { 303330571Sdelphij case 'd': *attrptr = AT_BOLD; break; 304330571Sdelphij case 'k': *attrptr = AT_BLINK; break; 305330571Sdelphij case 's': *attrptr = AT_STANDOUT; break; 306330571Sdelphij case 'u': *attrptr = AT_UNDERLINE; break; 307330571Sdelphij default: *attrptr = AT_NORMAL; break; 30860786Sps } 30960786Sps s += 2; 31060786Sps } 311161475Sdelphij *fmtvarptr = s; 31260786Sps} 31360786Sps 31460786Sps/* 315161475Sdelphij * 31660786Sps */ 317161475Sdelphij static void 318161475Sdelphijset_charset() 31960786Sps{ 320161475Sdelphij char *s; 32160786Sps 322330571Sdelphij#if MSDOS_COMPILER==WIN32C 32360786Sps /* 324330571Sdelphij * If the Windows console is using UTF-8, we'll use it too. 325330571Sdelphij */ 326330571Sdelphij if (GetConsoleOutputCP() == CP_UTF8) 327330571Sdelphij if (icharset("utf-8", 1)) 328330571Sdelphij return; 329330571Sdelphij#endif 330330571Sdelphij /* 33160786Sps * See if environment variable LESSCHARSET is defined. 33260786Sps */ 33360786Sps s = lgetenv("LESSCHARSET"); 334161475Sdelphij if (icharset(s, 0)) 33560786Sps return; 336161475Sdelphij 33760786Sps /* 33860786Sps * LESSCHARSET is not defined: try LESSCHARDEF. 33960786Sps */ 34060786Sps s = lgetenv("LESSCHARDEF"); 34160786Sps if (s != NULL && *s != '\0') 34260786Sps { 34360786Sps ichardef(s); 34460786Sps return; 34560786Sps } 34660786Sps 347161475Sdelphij#if HAVE_LOCALE 348170256Sdelphij#ifdef CODESET 349161475Sdelphij /* 350161475Sdelphij * Try using the codeset name as the charset name. 351161475Sdelphij */ 352161475Sdelphij s = nl_langinfo(CODESET); 353161475Sdelphij if (icharset(s, 1)) 354161475Sdelphij return; 355161475Sdelphij#endif 356170256Sdelphij#endif 357161475Sdelphij 35860786Sps#if HAVE_STRSTR 35960786Sps /* 36060786Sps * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used. 36160786Sps */ 36260786Sps if ((s = lgetenv("LC_ALL")) != NULL || 36360786Sps (s = lgetenv("LC_CTYPE")) != NULL || 36460786Sps (s = lgetenv("LANG")) != NULL) 36560786Sps { 366161475Sdelphij if ( strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL 367161475Sdelphij || strstr(s, "UTF8") != NULL || strstr(s, "utf8") != NULL) 368161475Sdelphij if (icharset("utf-8", 1)) 36960786Sps return; 37060786Sps } 37160786Sps#endif 37260786Sps 37360786Sps#if HAVE_LOCALE 37460786Sps /* 375161475Sdelphij * Get character definitions from locale functions, 376161475Sdelphij * rather than from predefined charset entry. 37760786Sps */ 37860786Sps ilocale(); 379330571Sdelphij#else 38089019Sps#if MSDOS_COMPILER 38160786Sps /* 38289019Sps * Default to "dos". 38389019Sps */ 384161475Sdelphij (void) icharset("dos", 1); 38589019Sps#else 38689019Sps /* 38760786Sps * Default to "latin1". 38860786Sps */ 389161475Sdelphij (void) icharset("latin1", 1); 39060786Sps#endif 39189019Sps#endif 39260786Sps} 39360786Sps 39460786Sps/* 395161475Sdelphij * Initialize charset data structures. 396161475Sdelphij */ 397161475Sdelphij public void 398161475Sdelphijinit_charset() 399161475Sdelphij{ 400161475Sdelphij char *s; 401161475Sdelphij 402161475Sdelphij#if HAVE_LOCALE 403161475Sdelphij setlocale(LC_ALL, ""); 404161475Sdelphij#endif 405161475Sdelphij 406161475Sdelphij set_charset(); 407161475Sdelphij 408161475Sdelphij s = lgetenv("LESSBINFMT"); 409330571Sdelphij setfmt(s, &binfmt, &binattr, "*s<%02X>"); 410161475Sdelphij 411161475Sdelphij s = lgetenv("LESSUTFBINFMT"); 412330571Sdelphij setfmt(s, &utfbinfmt, &binattr, "<U+%04lX>"); 413161475Sdelphij} 414161475Sdelphij 415161475Sdelphij/* 41660786Sps * Is a given character a "binary" character? 41760786Sps */ 41860786Sps public int 41960786Spsbinary_char(c) 420191930Sdelphij LWCHAR c; 42160786Sps{ 422221715Sdelphij if (utf_mode) 423191930Sdelphij return (is_ubin_char(c)); 42460786Sps c &= 0377; 42560786Sps return (chardef[c] & IS_BINARY_CHAR); 42660786Sps} 42760786Sps 42860786Sps/* 42960786Sps * Is a given character a "control" character? 43060786Sps */ 43160786Sps public int 43260786Spscontrol_char(c) 433191930Sdelphij LWCHAR c; 43460786Sps{ 43560786Sps c &= 0377; 43660786Sps return (chardef[c] & IS_CONTROL_CHAR); 43760786Sps} 43860786Sps 43960786Sps/* 44060786Sps * Return the printable form of a character. 44160786Sps * For example, in the "ascii" charset '\3' is printed as "^C". 44260786Sps */ 44360786Sps public char * 44460786Spsprchar(c) 445191930Sdelphij LWCHAR c; 44660786Sps{ 447161475Sdelphij /* {{ This buffer can be overrun if LESSBINFMT is a long string. }} */ 448161475Sdelphij static char buf[32]; 44960786Sps 45060786Sps c &= 0377; 451161475Sdelphij if ((c < 128 || !utf_mode) && !control_char(c)) 452195941Sdelphij SNPRINTF1(buf, sizeof(buf), "%c", (int) c); 45360786Sps else if (c == ESC) 454161475Sdelphij strcpy(buf, "ESC"); 45589019Sps#if IS_EBCDIC_HOST 45689019Sps else if (!binary_char(c) && c < 64) 457161475Sdelphij SNPRINTF1(buf, sizeof(buf), "^%c", 45889019Sps /* 45989019Sps * This array roughly inverts CONTROL() #defined in less.h, 46089019Sps * and should be kept in sync with CONTROL() and IBM-1047. 46189019Sps */ 46289019Sps "@ABC.I.?...KLMNO" 46389019Sps "PQRS.JH.XY.." 46489019Sps "\\]^_" 46589019Sps "......W[.....EFG" 46689019Sps "..V....D....TU.Z"[c]); 46789019Sps#else 46889019Sps else if (c < 128 && !control_char(c ^ 0100)) 469195941Sdelphij SNPRINTF1(buf, sizeof(buf), "^%c", (int) (c ^ 0100)); 47089019Sps#endif 47160786Sps else 472161475Sdelphij SNPRINTF1(buf, sizeof(buf), binfmt, c); 47360786Sps return (buf); 47460786Sps} 475161475Sdelphij 476161475Sdelphij/* 477161475Sdelphij * Return the printable form of a UTF-8 character. 478161475Sdelphij */ 479161475Sdelphij public char * 480161475Sdelphijprutfchar(ch) 481161475Sdelphij LWCHAR ch; 482161475Sdelphij{ 483161475Sdelphij static char buf[32]; 484161475Sdelphij 485161475Sdelphij if (ch == ESC) 486161475Sdelphij strcpy(buf, "ESC"); 487161475Sdelphij else if (ch < 128 && control_char(ch)) 488161475Sdelphij { 489161475Sdelphij if (!control_char(ch ^ 0100)) 490161475Sdelphij SNPRINTF1(buf, sizeof(buf), "^%c", ((char) ch) ^ 0100); 491161475Sdelphij else 492161475Sdelphij SNPRINTF1(buf, sizeof(buf), binfmt, (char) ch); 493161475Sdelphij } else if (is_ubin_char(ch)) 494294286Sdelphij { 495161475Sdelphij SNPRINTF1(buf, sizeof(buf), utfbinfmt, ch); 496294286Sdelphij } else 497161475Sdelphij { 498294286Sdelphij char *p = buf; 499161475Sdelphij if (ch >= 0x80000000) 500294286Sdelphij ch = 0xFFFD; /* REPLACEMENT CHARACTER */ 501294286Sdelphij put_wchar(&p, ch); 502294286Sdelphij *p = '\0'; 503161475Sdelphij } 504161475Sdelphij return (buf); 505161475Sdelphij} 506161475Sdelphij 507161475Sdelphij/* 508161475Sdelphij * Get the length of a UTF-8 character in bytes. 509161475Sdelphij */ 510161475Sdelphij public int 511161475Sdelphijutf_len(ch) 512330571Sdelphij unsigned char ch; 513161475Sdelphij{ 514161475Sdelphij if ((ch & 0x80) == 0) 515161475Sdelphij return 1; 516161475Sdelphij if ((ch & 0xE0) == 0xC0) 517161475Sdelphij return 2; 518161475Sdelphij if ((ch & 0xF0) == 0xE0) 519161475Sdelphij return 3; 520161475Sdelphij if ((ch & 0xF8) == 0xF0) 521161475Sdelphij return 4; 522161475Sdelphij if ((ch & 0xFC) == 0xF8) 523161475Sdelphij return 5; 524161475Sdelphij if ((ch & 0xFE) == 0xFC) 525161475Sdelphij return 6; 526161475Sdelphij /* Invalid UTF-8 encoding. */ 527161475Sdelphij return 1; 528161475Sdelphij} 529161475Sdelphij 530161475Sdelphij/* 531294286Sdelphij * Does the parameter point to the lead byte of a well-formed UTF-8 character? 532161475Sdelphij */ 533161475Sdelphij public int 534330571Sdelphijis_utf8_well_formed(ss, slen) 535330571Sdelphij char *ss; 536294286Sdelphij int slen; 537161475Sdelphij{ 538161475Sdelphij int i; 539161475Sdelphij int len; 540330571Sdelphij unsigned char *s = (unsigned char *) ss; 541161475Sdelphij 542161475Sdelphij if (IS_UTF8_INVALID(s[0])) 543161475Sdelphij return (0); 544161475Sdelphij 545330571Sdelphij len = utf_len(s[0]); 546294286Sdelphij if (len > slen) 547294286Sdelphij return (0); 548161475Sdelphij if (len == 1) 549161475Sdelphij return (1); 550161475Sdelphij if (len == 2) 551161475Sdelphij { 552161475Sdelphij if (s[0] < 0xC2) 553161475Sdelphij return (0); 554161475Sdelphij } else 555161475Sdelphij { 556161475Sdelphij unsigned char mask; 557161475Sdelphij mask = (~((1 << (8-len)) - 1)) & 0xFF; 558161475Sdelphij if (s[0] == mask && (s[1] & mask) == 0x80) 559161475Sdelphij return (0); 560161475Sdelphij } 561161475Sdelphij 562161475Sdelphij for (i = 1; i < len; i++) 563161475Sdelphij if (!IS_UTF8_TRAIL(s[i])) 564161475Sdelphij return (0); 565161475Sdelphij return (1); 566161475Sdelphij} 567161475Sdelphij 568161475Sdelphij/* 569330571Sdelphij * Skip bytes until a UTF-8 lead byte (11xxxxxx) or ASCII byte (0xxxxxxx) is found. 570294286Sdelphij */ 571330571Sdelphij public void 572330571Sdelphijutf_skip_to_lead(pp, limit) 573330571Sdelphij char **pp; 574330571Sdelphij char *limit; 575294286Sdelphij{ 576330571Sdelphij do { 577330571Sdelphij ++(*pp); 578330571Sdelphij } while (*pp < limit && !IS_UTF8_LEAD((*pp)[0] & 0377) && !IS_ASCII_OCTET((*pp)[0])); 579294286Sdelphij} 580294286Sdelphij 581330571Sdelphij 582294286Sdelphij/* 583161475Sdelphij * Get the value of a UTF-8 character. 584161475Sdelphij */ 585161475Sdelphij public LWCHAR 586161475Sdelphijget_wchar(p) 587330571Sdelphij constant char *p; 588161475Sdelphij{ 589161475Sdelphij switch (utf_len(p[0])) 590161475Sdelphij { 591161475Sdelphij case 1: 592161475Sdelphij default: 593172468Sdelphij /* 0xxxxxxx */ 594161475Sdelphij return (LWCHAR) 595161475Sdelphij (p[0] & 0xFF); 596161475Sdelphij case 2: 597172468Sdelphij /* 110xxxxx 10xxxxxx */ 598161475Sdelphij return (LWCHAR) ( 599161475Sdelphij ((p[0] & 0x1F) << 6) | 600161475Sdelphij (p[1] & 0x3F)); 601161475Sdelphij case 3: 602172468Sdelphij /* 1110xxxx 10xxxxxx 10xxxxxx */ 603161475Sdelphij return (LWCHAR) ( 604161475Sdelphij ((p[0] & 0x0F) << 12) | 605161475Sdelphij ((p[1] & 0x3F) << 6) | 606161475Sdelphij (p[2] & 0x3F)); 607161475Sdelphij case 4: 608172468Sdelphij /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 609161475Sdelphij return (LWCHAR) ( 610161475Sdelphij ((p[0] & 0x07) << 18) | 611161475Sdelphij ((p[1] & 0x3F) << 12) | 612161475Sdelphij ((p[2] & 0x3F) << 6) | 613161475Sdelphij (p[3] & 0x3F)); 614161475Sdelphij case 5: 615172468Sdelphij /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ 616161475Sdelphij return (LWCHAR) ( 617161475Sdelphij ((p[0] & 0x03) << 24) | 618161475Sdelphij ((p[1] & 0x3F) << 18) | 619161475Sdelphij ((p[2] & 0x3F) << 12) | 620161475Sdelphij ((p[3] & 0x3F) << 6) | 621161475Sdelphij (p[4] & 0x3F)); 622161475Sdelphij case 6: 623172468Sdelphij /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ 624161475Sdelphij return (LWCHAR) ( 625161475Sdelphij ((p[0] & 0x01) << 30) | 626161475Sdelphij ((p[1] & 0x3F) << 24) | 627161475Sdelphij ((p[2] & 0x3F) << 18) | 628161475Sdelphij ((p[3] & 0x3F) << 12) | 629161475Sdelphij ((p[4] & 0x3F) << 6) | 630161475Sdelphij (p[5] & 0x3F)); 631161475Sdelphij } 632161475Sdelphij} 633161475Sdelphij 634161475Sdelphij/* 635172468Sdelphij * Store a character into a UTF-8 string. 636172468Sdelphij */ 637172468Sdelphij public void 638172468Sdelphijput_wchar(pp, ch) 639172468Sdelphij char **pp; 640172468Sdelphij LWCHAR ch; 641172468Sdelphij{ 642172468Sdelphij if (!utf_mode || ch < 0x80) 643172468Sdelphij { 644172468Sdelphij /* 0xxxxxxx */ 645172468Sdelphij *(*pp)++ = (char) ch; 646172468Sdelphij } else if (ch < 0x800) 647172468Sdelphij { 648172468Sdelphij /* 110xxxxx 10xxxxxx */ 649172468Sdelphij *(*pp)++ = (char) (0xC0 | ((ch >> 6) & 0x1F)); 650172468Sdelphij *(*pp)++ = (char) (0x80 | (ch & 0x3F)); 651172468Sdelphij } else if (ch < 0x10000) 652172468Sdelphij { 653172468Sdelphij /* 1110xxxx 10xxxxxx 10xxxxxx */ 654172468Sdelphij *(*pp)++ = (char) (0xE0 | ((ch >> 12) & 0x0F)); 655172468Sdelphij *(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F)); 656172468Sdelphij *(*pp)++ = (char) (0x80 | (ch & 0x3F)); 657172468Sdelphij } else if (ch < 0x200000) 658172468Sdelphij { 659172468Sdelphij /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 660172468Sdelphij *(*pp)++ = (char) (0xF0 | ((ch >> 18) & 0x07)); 661172468Sdelphij *(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F)); 662172468Sdelphij *(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F)); 663172468Sdelphij *(*pp)++ = (char) (0x80 | (ch & 0x3F)); 664172468Sdelphij } else if (ch < 0x4000000) 665172468Sdelphij { 666172468Sdelphij /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ 667172468Sdelphij *(*pp)++ = (char) (0xF0 | ((ch >> 24) & 0x03)); 668172468Sdelphij *(*pp)++ = (char) (0x80 | ((ch >> 18) & 0x3F)); 669172468Sdelphij *(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F)); 670172468Sdelphij *(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F)); 671172468Sdelphij *(*pp)++ = (char) (0x80 | (ch & 0x3F)); 672172468Sdelphij } else 673172468Sdelphij { 674172468Sdelphij /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ 675172468Sdelphij *(*pp)++ = (char) (0xF0 | ((ch >> 30) & 0x01)); 676172468Sdelphij *(*pp)++ = (char) (0x80 | ((ch >> 24) & 0x3F)); 677172468Sdelphij *(*pp)++ = (char) (0x80 | ((ch >> 18) & 0x3F)); 678172468Sdelphij *(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F)); 679172468Sdelphij *(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F)); 680172468Sdelphij *(*pp)++ = (char) (0x80 | (ch & 0x3F)); 681172468Sdelphij } 682172468Sdelphij} 683172468Sdelphij 684172468Sdelphij/* 685161475Sdelphij * Step forward or backward one character in a string. 686161475Sdelphij */ 687161475Sdelphij public LWCHAR 688161475Sdelphijstep_char(pp, dir, limit) 689161475Sdelphij char **pp; 690161475Sdelphij signed int dir; 691330571Sdelphij constant char *limit; 692161475Sdelphij{ 693161475Sdelphij LWCHAR ch; 694172597Sdelphij int len; 695161475Sdelphij char *p = *pp; 696161475Sdelphij 697161475Sdelphij if (!utf_mode) 698161475Sdelphij { 699161475Sdelphij /* It's easy if chars are one byte. */ 700161475Sdelphij if (dir > 0) 701330571Sdelphij ch = (LWCHAR) (unsigned char) ((p < limit) ? *p++ : 0); 702161475Sdelphij else 703330571Sdelphij ch = (LWCHAR) (unsigned char) ((p > limit) ? *--p : 0); 704161475Sdelphij } else if (dir > 0) 705161475Sdelphij { 706172597Sdelphij len = utf_len(*p); 707172597Sdelphij if (p + len > limit) 708172597Sdelphij { 709161475Sdelphij ch = 0; 710330571Sdelphij p = (char *) limit; 711172597Sdelphij } else 712161475Sdelphij { 713161475Sdelphij ch = get_wchar(p); 714172597Sdelphij p += len; 715161475Sdelphij } 716161475Sdelphij } else 717161475Sdelphij { 718161475Sdelphij while (p > limit && IS_UTF8_TRAIL(p[-1])) 719161475Sdelphij p--; 720161475Sdelphij if (p > limit) 721161475Sdelphij ch = get_wchar(--p); 722161475Sdelphij else 723161475Sdelphij ch = 0; 724161475Sdelphij } 725161475Sdelphij *pp = p; 726161475Sdelphij return ch; 727161475Sdelphij} 728161475Sdelphij 729161475Sdelphij/* 730161475Sdelphij * Unicode characters data 731294286Sdelphij * Actual data is in the generated *.uni files. 732161475Sdelphij */ 733161475Sdelphij 734294286Sdelphij#define DECLARE_RANGE_TABLE_START(name) \ 735294286Sdelphij static struct wchar_range name##_array[] = { 736294286Sdelphij#define DECLARE_RANGE_TABLE_END(name) \ 737294286Sdelphij }; struct wchar_range_table name##_table = { name##_array, sizeof(name##_array)/sizeof(*name##_array) }; 738161475Sdelphij 739294286SdelphijDECLARE_RANGE_TABLE_START(compose) 740294286Sdelphij#include "compose.uni" 741294286SdelphijDECLARE_RANGE_TABLE_END(compose) 742294286Sdelphij 743294286SdelphijDECLARE_RANGE_TABLE_START(ubin) 744294286Sdelphij#include "ubin.uni" 745294286SdelphijDECLARE_RANGE_TABLE_END(ubin) 746294286Sdelphij 747294286SdelphijDECLARE_RANGE_TABLE_START(wide) 748294286Sdelphij#include "wide.uni" 749294286SdelphijDECLARE_RANGE_TABLE_END(wide) 750294286Sdelphij 751330571SdelphijDECLARE_RANGE_TABLE_START(fmt) 752330571Sdelphij#include "fmt.uni" 753330571SdelphijDECLARE_RANGE_TABLE_END(fmt) 754330571Sdelphij 755294286Sdelphij/* comb_table is special pairs, not ranges. */ 756161475Sdelphijstatic struct wchar_range comb_table[] = { 757170256Sdelphij {0x0644,0x0622}, {0x0644,0x0623}, {0x0644,0x0625}, {0x0644,0x0627}, 758161475Sdelphij}; 759161475Sdelphij 760161475Sdelphij 761161475Sdelphij static int 762294286Sdelphijis_in_table(ch, table) 763161475Sdelphij LWCHAR ch; 764294286Sdelphij struct wchar_range_table *table; 765161475Sdelphij{ 766161475Sdelphij int hi; 767161475Sdelphij int lo; 768161475Sdelphij 769161475Sdelphij /* Binary search in the table. */ 770294286Sdelphij if (ch < table->table[0].first) 771161475Sdelphij return 0; 772161475Sdelphij lo = 0; 773294286Sdelphij hi = table->count - 1; 774161475Sdelphij while (lo <= hi) 775161475Sdelphij { 776161475Sdelphij int mid = (lo + hi) / 2; 777294286Sdelphij if (ch > table->table[mid].last) 778161475Sdelphij lo = mid + 1; 779294286Sdelphij else if (ch < table->table[mid].first) 780161475Sdelphij hi = mid - 1; 781161475Sdelphij else 782161475Sdelphij return 1; 783161475Sdelphij } 784161475Sdelphij return 0; 785161475Sdelphij} 786161475Sdelphij 787161475Sdelphij/* 788161475Sdelphij * Is a character a UTF-8 composing character? 789161475Sdelphij * If a composing character follows any char, the two combine into one glyph. 790161475Sdelphij */ 791161475Sdelphij public int 792161475Sdelphijis_composing_char(ch) 793161475Sdelphij LWCHAR ch; 794161475Sdelphij{ 795330571Sdelphij return is_in_table(ch, &compose_table) || 796330571Sdelphij (bs_mode != BS_CONTROL && is_in_table(ch, &fmt_table)); 797161475Sdelphij} 798161475Sdelphij 799161475Sdelphij/* 800161475Sdelphij * Should this UTF-8 character be treated as binary? 801161475Sdelphij */ 802161475Sdelphij public int 803161475Sdelphijis_ubin_char(ch) 804161475Sdelphij LWCHAR ch; 805161475Sdelphij{ 806330571Sdelphij int ubin = is_in_table(ch, &ubin_table) || 807330571Sdelphij (bs_mode == BS_CONTROL && is_in_table(ch, &fmt_table)); 808330571Sdelphij#if MSDOS_COMPILER==WIN32C 809330571Sdelphij if (!ubin && utf_mode == 2 && ch < 0x10000) 810330571Sdelphij { 811330571Sdelphij /* 812330571Sdelphij * Consider it binary if it can't be converted. 813330571Sdelphij */ 814330571Sdelphij BOOL used_default = TRUE; 815330571Sdelphij WideCharToMultiByte(GetConsoleOutputCP(), WC_NO_BEST_FIT_CHARS, (LPCWSTR) &ch, 1, NULL, 0, NULL, &used_default); 816330571Sdelphij if (used_default) 817330571Sdelphij ubin = 1; 818330571Sdelphij } 819330571Sdelphij#endif 820330571Sdelphij return ubin; 821161475Sdelphij} 822161475Sdelphij 823161475Sdelphij/* 824161475Sdelphij * Is this a double width UTF-8 character? 825161475Sdelphij */ 826161475Sdelphij public int 827161475Sdelphijis_wide_char(ch) 828161475Sdelphij LWCHAR ch; 829161475Sdelphij{ 830294286Sdelphij return is_in_table(ch, &wide_table); 831161475Sdelphij} 832161475Sdelphij 833161475Sdelphij/* 834161475Sdelphij * Is a character a UTF-8 combining character? 835161475Sdelphij * A combining char acts like an ordinary char, but if it follows 836161475Sdelphij * a specific char (not any char), the two combine into one glyph. 837161475Sdelphij */ 838161475Sdelphij public int 839161475Sdelphijis_combining_char(ch1, ch2) 840161475Sdelphij LWCHAR ch1; 841161475Sdelphij LWCHAR ch2; 842161475Sdelphij{ 843161475Sdelphij /* The table is small; use linear search. */ 844161475Sdelphij int i; 845161475Sdelphij for (i = 0; i < sizeof(comb_table)/sizeof(*comb_table); i++) 846161475Sdelphij { 847161475Sdelphij if (ch1 == comb_table[i].first && 848161475Sdelphij ch2 == comb_table[i].last) 849161475Sdelphij return 1; 850161475Sdelphij } 851161475Sdelphij return 0; 852161475Sdelphij} 853161475Sdelphij 854