/* ascmagic.c revision 133359 */
/*
 * Copyright (c) Ian F. Darwin 1986-1995.
 * Software written by Ian F. Darwin and others;
 * maintained 1995-present by Christos Zoulas and others.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *    This product includes software developed by Ian F. Darwin and others.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * ASCII magic -- file types that we know based on keywords
 * that can appear anywhere in the file.
 *
 * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
 * to handle character codes other than ASCII on a unified basis.
 *
 * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
 * international characters, now subsumed into this file.
4268349Sobrien */ 4368349Sobrien 4468349Sobrien#include "file.h" 45133359Sobrien#include "magic.h" 46133359Sobrien#include <stdio.h> 4768349Sobrien#include <string.h> 4868349Sobrien#include <memory.h> 4968349Sobrien#include <ctype.h> 5068349Sobrien#include <stdlib.h> 5168349Sobrien#ifdef HAVE_UNISTD_H 5268349Sobrien#include <unistd.h> 5368349Sobrien#endif 5468349Sobrien#include "names.h" 5568349Sobrien 5668349Sobrien#ifndef lint 57133359SobrienFILE_RCSID("@(#)$Id: ascmagic.c,v 1.40 2003/11/20 00:25:39 christos Exp $") 5868349Sobrien#endif /* lint */ 5968349Sobrien 6068349Sobrientypedef unsigned long unichar; 6168349Sobrien 6268349Sobrien#define MAXLINELEN 300 /* longest sane line length */ 6368349Sobrien#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \ 6468349Sobrien || (x) == 0x85 || (x) == '\f') 6568349Sobrien 66133359Sobrienprivate int looks_ascii(const unsigned char *, size_t, unichar *, size_t *); 67133359Sobrienprivate int looks_utf8(const unsigned char *, size_t, unichar *, size_t *); 68133359Sobrienprivate int looks_unicode(const unsigned char *, size_t, unichar *, size_t *); 69133359Sobrienprivate int looks_latin1(const unsigned char *, size_t, unichar *, size_t *); 70133359Sobrienprivate int looks_extended(const unsigned char *, size_t, unichar *, size_t *); 71133359Sobrienprivate void from_ebcdic(const unsigned char *, size_t, unsigned char *); 72133359Sobrienprivate int ascmatch(const unsigned char *, const unichar *, size_t); 7368349Sobrien 74133359Sobrien 75133359Sobrienprotected int 76133359Sobrienfile_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes) 7768349Sobrien{ 78133359Sobrien size_t i; 79110949Sobrien unsigned char nbuf[HOWMANY+1]; /* one extra for terminating '\0' */ 8068349Sobrien unichar ubuf[HOWMANY+1]; /* one extra for terminating '\0' */ 81133359Sobrien size_t ulen; 8268349Sobrien struct names *p; 8368349Sobrien 84133359Sobrien const char *code = NULL; 85133359Sobrien const char *code_mime 
= NULL; 86133359Sobrien const char *type = NULL; 87133359Sobrien const char *subtype = NULL; 88133359Sobrien const char *subtype_mime = NULL; 8968349Sobrien 9068349Sobrien int has_escapes = 0; 9168349Sobrien int has_backspace = 0; 9268349Sobrien 9368349Sobrien int n_crlf = 0; 9468349Sobrien int n_lf = 0; 9568349Sobrien int n_cr = 0; 9668349Sobrien int n_nel = 0; 9768349Sobrien 9868349Sobrien int last_line_end = -1; 9968349Sobrien int has_long_lines = 0; 10068349Sobrien 10168349Sobrien /* 10284685Sobrien * Undo the NUL-termination kindly provided by process() 10384685Sobrien * but leave at least one byte to look at 10484685Sobrien */ 10568349Sobrien 10684685Sobrien while (nbytes > 1 && buf[nbytes - 1] == '\0') 10768349Sobrien nbytes--; 10868349Sobrien 109133359Sobrien /* nbuf and ubuf relies on this */ 110133359Sobrien if (nbytes > HOWMANY) 111133359Sobrien nbytes = HOWMANY; 112133359Sobrien 11368349Sobrien /* 11468349Sobrien * Then try to determine whether it's any character code we can 11568349Sobrien * identify. Each of these tests, if it succeeds, will leave 11668349Sobrien * the text converted into one-unichar-per-character Unicode in 11768349Sobrien * ubuf, and the number of characters converted in ulen. 11868349Sobrien */ 11968349Sobrien if (looks_ascii(buf, nbytes, ubuf, &ulen)) { 12068349Sobrien code = "ASCII"; 12168349Sobrien code_mime = "us-ascii"; 12268349Sobrien type = "text"; 12368349Sobrien } else if (looks_utf8(buf, nbytes, ubuf, &ulen)) { 12468349Sobrien code = "UTF-8 Unicode"; 12568349Sobrien code_mime = "utf-8"; 12668349Sobrien type = "text"; 127133359Sobrien } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) { 12868349Sobrien if (i == 1) 12968349Sobrien code = "Little-endian UTF-16 Unicode"; 13068349Sobrien else 13168349Sobrien code = "Big-endian UTF-16 Unicode"; 13268349Sobrien 13368349Sobrien type = "character data"; 13468349Sobrien code_mime = "utf-16"; /* is this defined? 
*/ 13568349Sobrien } else if (looks_latin1(buf, nbytes, ubuf, &ulen)) { 13668349Sobrien code = "ISO-8859"; 13768349Sobrien type = "text"; 13868349Sobrien code_mime = "iso-8859-1"; 13968349Sobrien } else if (looks_extended(buf, nbytes, ubuf, &ulen)) { 14068349Sobrien code = "Non-ISO extended-ASCII"; 14168349Sobrien type = "text"; 14268349Sobrien code_mime = "unknown"; 14368349Sobrien } else { 14468349Sobrien from_ebcdic(buf, nbytes, nbuf); 14568349Sobrien 14668349Sobrien if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) { 14768349Sobrien code = "EBCDIC"; 14868349Sobrien type = "character data"; 14968349Sobrien code_mime = "ebcdic"; 15068349Sobrien } else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) { 15168349Sobrien code = "International EBCDIC"; 15268349Sobrien type = "character data"; 15368349Sobrien code_mime = "ebcdic"; 15468349Sobrien } else { 15568349Sobrien return 0; /* doesn't look like text at all */ 15668349Sobrien } 15768349Sobrien } 15868349Sobrien 15968349Sobrien /* 16068349Sobrien * for troff, look for . + letter + letter or .\"; 16168349Sobrien * this must be done to disambiguate tar archives' ./file 16268349Sobrien * and other trash from real troff input. 16368349Sobrien * 16468349Sobrien * I believe Plan 9 troff allows non-ASCII characters in the names 16568349Sobrien * of macros, so this test might possibly fail on such a file. 
16668349Sobrien */ 16768349Sobrien if (*ubuf == '.') { 16868349Sobrien unichar *tp = ubuf + 1; 16968349Sobrien 17068349Sobrien while (ISSPC(*tp)) 17168349Sobrien ++tp; /* skip leading whitespace */ 17268349Sobrien if ((tp[0] == '\\' && tp[1] == '\"') || 173133359Sobrien (isascii((unsigned char)tp[0]) && 174133359Sobrien isalnum((unsigned char)tp[0]) && 175133359Sobrien isascii((unsigned char)tp[1]) && 176133359Sobrien isalnum((unsigned char)tp[1]) && 17768349Sobrien ISSPC(tp[2]))) { 17868349Sobrien subtype_mime = "text/troff"; 17968349Sobrien subtype = "troff or preprocessor input"; 18068349Sobrien goto subtype_identified; 18168349Sobrien } 18268349Sobrien } 18368349Sobrien 18468349Sobrien if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) { 18568349Sobrien subtype_mime = "text/fortran"; 18668349Sobrien subtype = "fortran program"; 18768349Sobrien goto subtype_identified; 18868349Sobrien } 18968349Sobrien 19068349Sobrien /* look for tokens from names.h - this is expensive! */ 19168349Sobrien 19268349Sobrien i = 0; 19368349Sobrien while (i < ulen) { 194133359Sobrien size_t end; 19568349Sobrien 19668349Sobrien /* 19768349Sobrien * skip past any leading space 19868349Sobrien */ 19968349Sobrien while (i < ulen && ISSPC(ubuf[i])) 20068349Sobrien i++; 20168349Sobrien if (i >= ulen) 20268349Sobrien break; 20368349Sobrien 20468349Sobrien /* 20568349Sobrien * find the next whitespace 20668349Sobrien */ 20768349Sobrien for (end = i + 1; end < nbytes; end++) 20868349Sobrien if (ISSPC(ubuf[end])) 20968349Sobrien break; 21068349Sobrien 21168349Sobrien /* 21268349Sobrien * compare the word thus isolated against the token list 21368349Sobrien */ 21468349Sobrien for (p = names; p < names + NNAMES; p++) { 215133359Sobrien if (ascmatch((const unsigned char *)p->name, ubuf + i, 216110949Sobrien end - i)) { 21768349Sobrien subtype = types[p->type].human; 21868349Sobrien subtype_mime = types[p->type].mime; 21968349Sobrien goto subtype_identified; 22068349Sobrien } 22168349Sobrien } 
22268349Sobrien 22368349Sobrien i = end; 22468349Sobrien } 22568349Sobrien 22668349Sobriensubtype_identified: 22768349Sobrien 22868349Sobrien /* 22968349Sobrien * Now try to discover other details about the file. 23068349Sobrien */ 23168349Sobrien for (i = 0; i < ulen; i++) { 23268349Sobrien if (i > last_line_end + MAXLINELEN) 23368349Sobrien has_long_lines = 1; 23468349Sobrien 23568349Sobrien if (ubuf[i] == '\033') 23668349Sobrien has_escapes = 1; 23768349Sobrien if (ubuf[i] == '\b') 23868349Sobrien has_backspace = 1; 23968349Sobrien 24068349Sobrien if (ubuf[i] == '\r' && (i + 1 < ulen && ubuf[i + 1] == '\n')) { 24168349Sobrien n_crlf++; 24268349Sobrien last_line_end = i; 24368349Sobrien } 24468349Sobrien if (ubuf[i] == '\r' && (i + 1 >= ulen || ubuf[i + 1] != '\n')) { 24568349Sobrien n_cr++; 24668349Sobrien last_line_end = i; 24768349Sobrien } 248133359Sobrien if (ubuf[i] == '\n' && ((int)i - 1 < 0 || ubuf[i - 1] != '\r')){ 24968349Sobrien n_lf++; 25068349Sobrien last_line_end = i; 25168349Sobrien } 25268349Sobrien if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */ 25368349Sobrien n_nel++; 25468349Sobrien last_line_end = i; 25568349Sobrien } 25668349Sobrien } 25768349Sobrien 258133359Sobrien if ((ms->flags & MAGIC_MIME)) { 259133359Sobrien if (subtype_mime) { 260133359Sobrien if (file_printf(ms, subtype_mime) == -1) 261133359Sobrien return -1; 262133359Sobrien } else { 263133359Sobrien if (file_printf(ms, "text/plain") == -1) 264133359Sobrien return -1; 265133359Sobrien } 26668349Sobrien 26768349Sobrien if (code_mime) { 268133359Sobrien if (file_printf(ms, "; charset=") == -1) 269133359Sobrien return -1; 270133359Sobrien if (file_printf(ms, code_mime) == -1) 271133359Sobrien return -1; 27268349Sobrien } 27368349Sobrien } else { 274133359Sobrien if (file_printf(ms, code) == -1) 275133359Sobrien return -1; 27668349Sobrien 27768349Sobrien if (subtype) { 278133359Sobrien if (file_printf(ms, " ") == -1) 279133359Sobrien return -1; 280133359Sobrien if 
(file_printf(ms, subtype) == -1) 281133359Sobrien return -1; 28268349Sobrien } 28368349Sobrien 284133359Sobrien if (file_printf(ms, " ") == -1) 285133359Sobrien return -1; 286133359Sobrien if (file_printf(ms, type) == -1) 287133359Sobrien return -1; 28868349Sobrien 28968349Sobrien if (has_long_lines) 290133359Sobrien if (file_printf(ms, ", with very long lines") == -1) 291133359Sobrien return -1; 29268349Sobrien 29368349Sobrien /* 29468349Sobrien * Only report line terminators if we find one other than LF, 29568349Sobrien * or if we find none at all. 29668349Sobrien */ 29768349Sobrien if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) || 29868349Sobrien (n_crlf != 0 || n_cr != 0 || n_nel != 0)) { 299133359Sobrien if (file_printf(ms, ", with") == -1) 300133359Sobrien return -1; 30168349Sobrien 302133359Sobrien if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) { 303133359Sobrien if (file_printf(ms, " no") == -1) 304133359Sobrien return -1; 305133359Sobrien } else { 30668349Sobrien if (n_crlf) { 307133359Sobrien if (file_printf(ms, " CRLF") == -1) 308133359Sobrien return -1; 30968349Sobrien if (n_cr || n_lf || n_nel) 310133359Sobrien if (file_printf(ms, ",") == -1) 311133359Sobrien return -1; 31268349Sobrien } 31368349Sobrien if (n_cr) { 314133359Sobrien if (file_printf(ms, " CR") == -1) 315133359Sobrien return -1; 31668349Sobrien if (n_lf || n_nel) 317133359Sobrien if (file_printf(ms, ",") == -1) 318133359Sobrien return -1; 31968349Sobrien } 32068349Sobrien if (n_lf) { 321133359Sobrien if (file_printf(ms, " LF") == -1) 322133359Sobrien return -1; 32368349Sobrien if (n_nel) 324133359Sobrien if (file_printf(ms, ",") == -1) 325133359Sobrien return -1; 32668349Sobrien } 32768349Sobrien if (n_nel) 328133359Sobrien if (file_printf(ms, " NEL") == -1) 329133359Sobrien return -1; 33068349Sobrien } 33168349Sobrien 332133359Sobrien if (file_printf(ms, " line terminators") == -1) 333133359Sobrien return -1; 33468349Sobrien } 33568349Sobrien 33668349Sobrien if 
(has_escapes) 337133359Sobrien if (file_printf(ms, ", with escape sequences") == -1) 338133359Sobrien return -1; 33968349Sobrien if (has_backspace) 340133359Sobrien if (file_printf(ms, ", with overstriking") == -1) 341133359Sobrien return -1; 34268349Sobrien } 34368349Sobrien 34468349Sobrien return 1; 34568349Sobrien} 34668349Sobrien 347133359Sobrienprivate int 348133359Sobrienascmatch(const unsigned char *s, const unichar *us, size_t ulen) 34968349Sobrien{ 35068349Sobrien size_t i; 35168349Sobrien 35268349Sobrien for (i = 0; i < ulen; i++) { 35368349Sobrien if (s[i] != us[i]) 35468349Sobrien return 0; 35568349Sobrien } 35668349Sobrien 35768349Sobrien if (s[i]) 35868349Sobrien return 0; 35968349Sobrien else 36068349Sobrien return 1; 36168349Sobrien} 36268349Sobrien 36368349Sobrien/* 36468349Sobrien * This table reflects a particular philosophy about what constitutes 36568349Sobrien * "text," and there is room for disagreement about it. 36668349Sobrien * 36768349Sobrien * Version 3.31 of the file command considered a file to be ASCII if 36868349Sobrien * each of its characters was approved by either the isascii() or 36968349Sobrien * isalpha() function. On most systems, this would mean that any 37068349Sobrien * file consisting only of characters in the range 0x00 ... 0x7F 37168349Sobrien * would be called ASCII text, but many systems might reasonably 37268349Sobrien * consider some characters outside this range to be alphabetic, 37368349Sobrien * so the file command would call such characters ASCII. It might 37468349Sobrien * have been more accurate to call this "considered textual on the 37568349Sobrien * local system" than "ASCII." 37668349Sobrien * 37768349Sobrien * It considered a file to be "International language text" if each 37868349Sobrien * of its characters was either an ASCII printing character (according 37968349Sobrien * to the real ASCII standard, not the above test), a character in 38068349Sobrien * the range 0x80 ... 
 * 0xFF, or one of the following control characters:
 * backspace, tab, line feed, vertical tab, form feed, carriage return,
 * escape.  No attempt was made to determine the language in which files
 * of this type were written.
 *
 *
 * The table below considers a file to be ASCII if all of its characters
 * are either ASCII printing characters (again, according to the X3.4
 * standard, not isascii()) or any of the following controls: bell,
 * backspace, tab, line feed, form feed, carriage return, esc, nextline.
 *
 * I include bell because some programs (particularly shell scripts)
 * use it literally, even though it is rare in normal text.  I exclude
 * vertical tab because it never seems to be used in real text.  I also
 * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
 * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
 * character to.  It might be more appropriate to include it in the 8859
 * set instead of the ASCII set, but it's got to be included in *something*
 * we recognize or EBCDIC files aren't going to be considered textual.
 * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
 * and Latin characters, so these should possibly be allowed.  But they
 * make a real mess on VT100-style displays if they're not paired properly,
 * so we are probably better off not calling them text.
 *
 * A file is considered to be ISO-8859 text if its characters are all
 * either ASCII, according to the above definition, or printing characters
 * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
40768349Sobrien * 40868349Sobrien * Finally, a file is considered to be international text from some other 40968349Sobrien * character code if its characters are all either ISO-8859 (according to 41068349Sobrien * the above definition) or characters in the range 0x80 ... 0x9F, which 41168349Sobrien * ISO-8859 considers to be control characters but the IBM PC and Macintosh 41268349Sobrien * consider to be printing characters. 41368349Sobrien */ 41468349Sobrien 41568349Sobrien#define F 0 /* character never appears in text */ 41668349Sobrien#define T 1 /* character appears in plain ASCII text */ 41768349Sobrien#define I 2 /* character appears in ISO-8859 text */ 41868349Sobrien#define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */ 41968349Sobrien 420133359Sobrienprivate char text_chars[256] = { 42168349Sobrien /* BEL BS HT LF FF CR */ 42268349Sobrien F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */ 42368349Sobrien /* ESC */ 42468349Sobrien F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */ 42568349Sobrien T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */ 42668349Sobrien T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */ 42768349Sobrien T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */ 42868349Sobrien T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */ 42968349Sobrien T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */ 43068349Sobrien T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */ 43168349Sobrien /* NEL */ 43268349Sobrien X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */ 43368349Sobrien X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */ 43468349Sobrien I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */ 43568349Sobrien I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */ 43668349Sobrien I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */ 43768349Sobrien I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */ 43868349Sobrien I, I, I, I, I, 
I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */ 43968349Sobrien I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */ 44068349Sobrien}; 44168349Sobrien 442133359Sobrienprivate int 443133359Sobrienlooks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf, 444133359Sobrien size_t *ulen) 44568349Sobrien{ 44668349Sobrien int i; 44768349Sobrien 44868349Sobrien *ulen = 0; 44968349Sobrien 45068349Sobrien for (i = 0; i < nbytes; i++) { 45168349Sobrien int t = text_chars[buf[i]]; 45268349Sobrien 45368349Sobrien if (t != T) 45468349Sobrien return 0; 45568349Sobrien 45668349Sobrien ubuf[(*ulen)++] = buf[i]; 45768349Sobrien } 45868349Sobrien 45968349Sobrien return 1; 46068349Sobrien} 46168349Sobrien 462133359Sobrienprivate int 463133359Sobrienlooks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) 46468349Sobrien{ 46568349Sobrien int i; 46668349Sobrien 46768349Sobrien *ulen = 0; 46868349Sobrien 46968349Sobrien for (i = 0; i < nbytes; i++) { 47068349Sobrien int t = text_chars[buf[i]]; 47168349Sobrien 47268349Sobrien if (t != T && t != I) 47368349Sobrien return 0; 47468349Sobrien 47568349Sobrien ubuf[(*ulen)++] = buf[i]; 47668349Sobrien } 47768349Sobrien 47868349Sobrien return 1; 47968349Sobrien} 48068349Sobrien 481133359Sobrienprivate int 482133359Sobrienlooks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf, 483133359Sobrien size_t *ulen) 48468349Sobrien{ 48568349Sobrien int i; 48668349Sobrien 48768349Sobrien *ulen = 0; 48868349Sobrien 48968349Sobrien for (i = 0; i < nbytes; i++) { 49068349Sobrien int t = text_chars[buf[i]]; 49168349Sobrien 49268349Sobrien if (t != T && t != I && t != X) 49368349Sobrien return 0; 49468349Sobrien 49568349Sobrien ubuf[(*ulen)++] = buf[i]; 49668349Sobrien } 49768349Sobrien 49868349Sobrien return 1; 49968349Sobrien} 50068349Sobrien 501133359Sobrienprivate int 502133359Sobrienlooks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) 50368349Sobrien{ 50468349Sobrien int i, n; 
50568349Sobrien unichar c; 50668349Sobrien int gotone = 0; 50768349Sobrien 50868349Sobrien *ulen = 0; 50968349Sobrien 51068349Sobrien for (i = 0; i < nbytes; i++) { 51168349Sobrien if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */ 51268349Sobrien /* 51368349Sobrien * Even if the whole file is valid UTF-8 sequences, 51468349Sobrien * still reject it if it uses weird control characters. 51568349Sobrien */ 51668349Sobrien 51768349Sobrien if (text_chars[buf[i]] != T) 51868349Sobrien return 0; 51968349Sobrien 52068349Sobrien ubuf[(*ulen)++] = buf[i]; 52168349Sobrien } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */ 52268349Sobrien return 0; 52368349Sobrien } else { /* 11xxxxxx begins UTF-8 */ 52468349Sobrien int following; 52568349Sobrien 52668349Sobrien if ((buf[i] & 0x20) == 0) { /* 110xxxxx */ 52768349Sobrien c = buf[i] & 0x1f; 52868349Sobrien following = 1; 52968349Sobrien } else if ((buf[i] & 0x10) == 0) { /* 1110xxxx */ 53068349Sobrien c = buf[i] & 0x0f; 53168349Sobrien following = 2; 53268349Sobrien } else if ((buf[i] & 0x08) == 0) { /* 11110xxx */ 53368349Sobrien c = buf[i] & 0x07; 53468349Sobrien following = 3; 53568349Sobrien } else if ((buf[i] & 0x04) == 0) { /* 111110xx */ 53668349Sobrien c = buf[i] & 0x03; 53768349Sobrien following = 4; 53868349Sobrien } else if ((buf[i] & 0x02) == 0) { /* 1111110x */ 53968349Sobrien c = buf[i] & 0x01; 54068349Sobrien following = 5; 54168349Sobrien } else 54268349Sobrien return 0; 54368349Sobrien 54468349Sobrien for (n = 0; n < following; n++) { 54568349Sobrien i++; 54668349Sobrien if (i >= nbytes) 54768349Sobrien goto done; 54868349Sobrien 54968349Sobrien if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40)) 55068349Sobrien return 0; 55168349Sobrien 55268349Sobrien c = (c << 6) + (buf[i] & 0x3f); 55368349Sobrien } 55468349Sobrien 55568349Sobrien ubuf[(*ulen)++] = c; 55668349Sobrien gotone = 1; 55768349Sobrien } 55868349Sobrien } 55968349Sobriendone: 56068349Sobrien return gotone; /* don't claim it's UTF-8 if 
it's all 7-bit */ 56168349Sobrien} 56268349Sobrien 563133359Sobrienprivate int 564133359Sobrienlooks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf, 565133359Sobrien size_t *ulen) 56668349Sobrien{ 56768349Sobrien int bigend; 56868349Sobrien int i; 56968349Sobrien 57068349Sobrien if (nbytes < 2) 57168349Sobrien return 0; 57268349Sobrien 57368349Sobrien if (buf[0] == 0xff && buf[1] == 0xfe) 57468349Sobrien bigend = 0; 57568349Sobrien else if (buf[0] == 0xfe && buf[1] == 0xff) 57668349Sobrien bigend = 1; 57768349Sobrien else 57868349Sobrien return 0; 57968349Sobrien 58068349Sobrien *ulen = 0; 58168349Sobrien 58268349Sobrien for (i = 2; i + 1 < nbytes; i += 2) { 58368349Sobrien /* XXX fix to properly handle chars > 65536 */ 58468349Sobrien 58568349Sobrien if (bigend) 58668349Sobrien ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i]; 58768349Sobrien else 58868349Sobrien ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1]; 58968349Sobrien 59068349Sobrien if (ubuf[*ulen - 1] == 0xfffe) 59168349Sobrien return 0; 592133359Sobrien if (ubuf[*ulen - 1] < 128 && 593133359Sobrien text_chars[(size_t)ubuf[*ulen - 1]] != T) 59468349Sobrien return 0; 59568349Sobrien } 59668349Sobrien 597110949Sobrien return 1 + bigend; 59868349Sobrien} 59968349Sobrien 60068349Sobrien#undef F 60168349Sobrien#undef T 60268349Sobrien#undef I 60368349Sobrien#undef X 60468349Sobrien 60568349Sobrien/* 60668349Sobrien * This table maps each EBCDIC character to an (8-bit extended) ASCII 60768349Sobrien * character, as specified in the rationale for the dd(1) command in 60868349Sobrien * draft 11.2 (September, 1991) of the POSIX P1003.2 standard. 60968349Sobrien * 61068349Sobrien * Unfortunately it does not seem to correspond exactly to any of the 61168349Sobrien * five variants of EBCDIC documented in IBM's _Enterprise Systems 61268349Sobrien * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh 61368349Sobrien * Edition, July, 1999, pp. I-1 - I-4. 
61468349Sobrien * 61568349Sobrien * Fortunately, though, all versions of EBCDIC, including this one, agree 61668349Sobrien * on most of the printing characters that also appear in (7-bit) ASCII. 61768349Sobrien * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all. 61868349Sobrien * 61968349Sobrien * Fortunately too, there is general agreement that codes 0x00 through 62068349Sobrien * 0x3F represent control characters, 0x41 a nonbreaking space, and the 62168349Sobrien * remainder printing characters. 62268349Sobrien * 62368349Sobrien * This is sufficient to allow us to identify EBCDIC text and to distinguish 62468349Sobrien * between old-style and internationalized examples of text. 62568349Sobrien */ 62668349Sobrien 627133359Sobrienprivate unsigned char ebcdic_to_ascii[] = { 62868349Sobrien 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15, 62968349Sobrien 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31, 63068349Sobrien128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7, 63168349Sobrien144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26, 63268349Sobrien' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|', 63368349Sobrien'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~', 63468349Sobrien'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?', 63568349Sobrien186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"', 63668349Sobrien195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201, 63768349Sobrien202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208, 63868349Sobrien209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215, 63968349Sobrien216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231, 64068349Sobrien'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 
237, 64168349Sobrien'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243, 64268349Sobrien'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249, 64368349Sobrien'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255 64468349Sobrien}; 64568349Sobrien 646133359Sobrien#ifdef notdef 64768349Sobrien/* 64868349Sobrien * The following EBCDIC-to-ASCII table may relate more closely to reality, 64968349Sobrien * or at least to modern reality. It comes from 65068349Sobrien * 65168349Sobrien * http://ftp.s390.ibm.com/products/oe/bpxqp9.html 65268349Sobrien * 65368349Sobrien * and maps the characters of EBCDIC code page 1047 (the code used for 65468349Sobrien * Unix-derived software on IBM's 390 systems) to the corresponding 65568349Sobrien * characters from ISO 8859-1. 65668349Sobrien * 65768349Sobrien * If this table is used instead of the above one, some of the special 65868349Sobrien * cases for the NEL character can be taken out of the code. 
65968349Sobrien */ 66068349Sobrien 661133359Sobrienprivate unsigned char ebcdic_1047_to_8859[] = { 66268349Sobrien0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F, 66368349Sobrien0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F, 66468349Sobrien0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07, 66568349Sobrien0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A, 66668349Sobrien0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C, 66768349Sobrien0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E, 66868349Sobrien0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F, 66968349Sobrien0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22, 67068349Sobrien0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1, 67168349Sobrien0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4, 67268349Sobrien0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE, 67368349Sobrien0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7, 67468349Sobrien0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5, 67568349Sobrien0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF, 67668349Sobrien0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5, 67768349Sobrien0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F 67868349Sobrien}; 679133359Sobrien#endif 68068349Sobrien 68168349Sobrien/* 68268349Sobrien * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII. 
68368349Sobrien */ 684133359Sobrienprivate void 685133359Sobrienfrom_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out) 68668349Sobrien{ 68768349Sobrien int i; 68868349Sobrien 68968349Sobrien for (i = 0; i < nbytes; i++) { 69068349Sobrien out[i] = ebcdic_to_ascii[buf[i]]; 69168349Sobrien } 69268349Sobrien} 693