1#include "vterm_internal.h" 2 3#define UNICODE_INVALID 0xFFFD 4 5#ifdef DEBUG_LIBVTERM 6# define DEBUG_PRINT_UTF8 7#endif 8 9struct UTF8DecoderData { 10 // number of bytes remaining in this codepoint 11 int bytes_remaining; 12 13 // number of bytes total in this codepoint once it's finished 14 // (for detecting overlongs) 15 int bytes_total; 16 17 int this_cp; 18}; 19 20static void init_utf8(VTermEncoding *enc, void *data_) 21{ 22 struct UTF8DecoderData *data = data_; 23 24 data->bytes_remaining = 0; 25 data->bytes_total = 0; 26} 27 28static void decode_utf8(VTermEncoding *enc, void *data_, 29 uint32_t cp[], int *cpi, int cplen, 30 const char bytes[], size_t *pos, size_t bytelen) 31{ 32 struct UTF8DecoderData *data = data_; 33 34#ifdef DEBUG_PRINT_UTF8 35 printf("BEGIN UTF-8\n"); 36#endif 37 38 for( ; *pos < bytelen; (*pos)++) { 39 unsigned char c = bytes[*pos]; 40 41#ifdef DEBUG_PRINT_UTF8 42 printf(" pos=%zd c=%02x rem=%d\n", *pos, c, data->bytes_remaining); 43#endif 44 45 if(c < 0x20) 46 return; 47 48 else if(c >= 0x20 && c < 0x80) { 49 if(data->bytes_remaining) 50 cp[(*cpi)++] = UNICODE_INVALID; 51 52 cp[(*cpi)++] = c; 53#ifdef DEBUG_PRINT_UTF8 54 printf(" UTF-8 char: U+%04x\n", c); 55#endif 56 data->bytes_remaining = 0; 57 } 58 59 else if(c >= 0x80 && c < 0xc0) { 60 if(!data->bytes_remaining) { 61 cp[(*cpi)++] = UNICODE_INVALID; 62 continue; 63 } 64 65 data->this_cp <<= 6; 66 data->this_cp |= c & 0x3f; 67 data->bytes_remaining--; 68 69 if(!data->bytes_remaining) { 70#ifdef DEBUG_PRINT_UTF8 71 printf(" UTF-8 raw char U+%04x bytelen=%d ", data->this_cp, data->bytes_total); 72#endif 73 // Check for overlong sequences 74 switch(data->bytes_total) { 75 case 2: 76 if(data->this_cp < 0x0080) data->this_cp = UNICODE_INVALID; break; 77 case 3: 78 if(data->this_cp < 0x0800) data->this_cp = UNICODE_INVALID; break; 79 case 4: 80 if(data->this_cp < 0x10000) data->this_cp = UNICODE_INVALID; break; 81 case 5: 82 if(data->this_cp < 0x200000) data->this_cp = UNICODE_INVALID; break; 83 case 6: 84 if(data->this_cp < 0x4000000) data->this_cp = UNICODE_INVALID; break; 85 } 86 // Now look for plain invalid ones 87 if((data->this_cp >= 0xD800 && data->this_cp <= 0xDFFF) || 88 data->this_cp == 0xFFFE || 89 data->this_cp == 0xFFFF) 90 data->this_cp = UNICODE_INVALID; 91#ifdef DEBUG_PRINT_UTF8 92 printf(" char: U+%04x\n", data->this_cp); 93#endif 94 cp[(*cpi)++] = data->this_cp; 95 } 96 } 97 98 else if(c >= 0xc0 && c < 0xe0) { 99 if(data->bytes_remaining) 100 cp[(*cpi)++] = UNICODE_INVALID; 101 102 data->this_cp = c & 0x1f; 103 data->bytes_total = 2; 104 data->bytes_remaining = 1; 105 } 106 107 else if(c >= 0xe0 && c < 0xf0) { 108 if(data->bytes_remaining) 109 cp[(*cpi)++] = UNICODE_INVALID; 110 111 data->this_cp = c & 0x0f; 112 data->bytes_total = 3; 113 data->bytes_remaining = 2; 114 } 115 116 else if(c >= 0xf0 && c < 0xf8) { 117 if(data->bytes_remaining) 118 cp[(*cpi)++] = UNICODE_INVALID; 119 120 data->this_cp = c & 0x07; 121 data->bytes_total = 4; 122 data->bytes_remaining = 3; 123 } 124 125 else if(c >= 0xf8 && c < 0xfc) { 126 if(data->bytes_remaining) 127 cp[(*cpi)++] = UNICODE_INVALID; 128 129 data->this_cp = c & 0x03; 130 data->bytes_total = 5; 131 data->bytes_remaining = 4; 132 } 133 134 else if(c >= 0xfc && c < 0xfe) { 135 if(data->bytes_remaining) 136 cp[(*cpi)++] = UNICODE_INVALID; 137 138 data->this_cp = c & 0x01; 139 data->bytes_total = 6; 140 data->bytes_remaining = 5; 141 } 142 143 else { 144 cp[(*cpi)++] = UNICODE_INVALID; 145 } 146 } 147} 148 149static VTermEncoding encoding_utf8 = { 150 .init = &init_utf8, 151 .decode = &decode_utf8, 152}; 153 154static void decode_usascii(VTermEncoding *enc, void *data, 155 uint32_t cp[], int *cpi, int cplen, 156 const char bytes[], size_t *pos, size_t bytelen) 157{ 158 for(; *pos < bytelen; (*pos)++) { 159 unsigned char c = bytes[*pos]; 160 161 if(c < 0x20 || c >= 0x80) 162 return; 163 164 cp[(*cpi)++] = c; 165 } 166} 167 168static VTermEncoding encoding_usascii = { 169 .decode = &decode_usascii, 170}; 171 172struct StaticTableEncoding { 173 const VTermEncoding enc; 174 const uint32_t chars[128]; 175}; 176 177static void decode_table(VTermEncoding *enc, void *data, 178 uint32_t cp[], int *cpi, int cplen, 179 const char bytes[], size_t *pos, size_t bytelen) 180{ 181 struct StaticTableEncoding *table = (struct StaticTableEncoding *)enc; 182 183 for(; *pos < bytelen; (*pos)++) { 184 unsigned char c = (bytes[*pos]) & 0x7f; 185 186 if(c < 0x20) 187 return; 188 189 if(table->chars[c]) 190 cp[(*cpi)++] = table->chars[c]; 191 else 192 cp[(*cpi)++] = c; 193 } 194} 195 196#include "encoding/DECdrawing.inc" 197#include "encoding/uk.inc" 198 199static struct { 200 VTermEncodingType type; 201 char designation; 202 VTermEncoding *enc; 203} 204encodings[] = { 205 { ENC_UTF8, 'u', &encoding_utf8 }, 206 { ENC_SINGLE_94, '0', (VTermEncoding*)&encoding_DECdrawing }, 207 { ENC_SINGLE_94, 'A', (VTermEncoding*)&encoding_uk }, 208 { ENC_SINGLE_94, 'B', &encoding_usascii }, 209 { 0, 0 }, 210}; 211 212VTermEncoding *vterm_lookup_encoding(VTermEncodingType type, char designation) 213{ 214 for(int i = 0; encodings[i].designation; i++) 215 if(encodings[i].type == type && encodings[i].designation == designation) 216 return encodings[i].enc; 217 return NULL; 218} 219