ascmagic.c revision 159764
168349Sobrien/* 2133359Sobrien * Copyright (c) Ian F. Darwin 1986-1995. 3133359Sobrien * Software written by Ian F. Darwin and others; 4133359Sobrien * maintained 1995-present by Christos Zoulas and others. 5133359Sobrien * 6133359Sobrien * Redistribution and use in source and binary forms, with or without 7133359Sobrien * modification, are permitted provided that the following conditions 8133359Sobrien * are met: 9133359Sobrien * 1. Redistributions of source code must retain the above copyright 10133359Sobrien * notice immediately at the beginning of the file, without modification, 11133359Sobrien * this list of conditions, and the following disclaimer. 12133359Sobrien * 2. Redistributions in binary form must reproduce the above copyright 13133359Sobrien * notice, this list of conditions and the following disclaimer in the 14133359Sobrien * documentation and/or other materials provided with the distribution. 15133359Sobrien * 16133359Sobrien * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17133359Sobrien * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18133359Sobrien * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19133359Sobrien * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR 20133359Sobrien * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21133359Sobrien * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22133359Sobrien * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23133359Sobrien * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24133359Sobrien * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25133359Sobrien * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26133359Sobrien * SUCH DAMAGE. 27133359Sobrien */ 28133359Sobrien/* 2968349Sobrien * ASCII magic -- file types that we know based on keywords 3068349Sobrien * that can appear anywhere in the file. 3168349Sobrien * 3268349Sobrien * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000, 3368349Sobrien * to handle character codes other than ASCII on a unified basis. 3468349Sobrien * 3568349Sobrien * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit 3668349Sobrien * international characters, now subsumed into this file. 3768349Sobrien */ 3868349Sobrien 3968349Sobrien#include "file.h" 40133359Sobrien#include "magic.h" 41133359Sobrien#include <stdio.h> 4268349Sobrien#include <string.h> 4368349Sobrien#include <memory.h> 4468349Sobrien#include <ctype.h> 4568349Sobrien#include <stdlib.h> 4668349Sobrien#ifdef HAVE_UNISTD_H 4768349Sobrien#include <unistd.h> 4868349Sobrien#endif 4968349Sobrien#include "names.h" 5068349Sobrien 5168349Sobrien#ifndef lint 52159764SobrienFILE_RCSID("@(#)$Id: ascmagic.c,v 1.45 2006/03/12 22:09:33 christos Exp $") 5368349Sobrien#endif /* lint */ 5468349Sobrien 5568349Sobrientypedef unsigned long unichar; 5668349Sobrien 5768349Sobrien#define MAXLINELEN 300 /* longest sane line length */ 5868349Sobrien#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \ 5968349Sobrien || (x) == 0x85 || (x) == '\f') 6068349Sobrien 61133359Sobrienprivate int looks_ascii(const unsigned char *, size_t, unichar *, size_t *); 62133359Sobrienprivate int looks_utf8(const unsigned char *, size_t, unichar *, size_t *); 63133359Sobrienprivate int looks_unicode(const unsigned char *, size_t, unichar *, size_t *); 64133359Sobrienprivate int looks_latin1(const unsigned char *, size_t, unichar *, size_t *); 65133359Sobrienprivate int looks_extended(const unsigned char *, size_t, unichar *, size_t *); 66133359Sobrienprivate void from_ebcdic(const unsigned char *, size_t, unsigned char *); 67133359Sobrienprivate int ascmatch(const unsigned char *, const unichar *, size_t); 6868349Sobrien 69133359Sobrien 70133359Sobrienprotected int 71133359Sobrienfile_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes) 7268349Sobrien{ 73133359Sobrien size_t i; 74159764Sobrien unsigned char *nbuf = NULL; 75159764Sobrien unichar *ubuf = NULL; 76133359Sobrien size_t ulen; 7768349Sobrien struct names *p; 78159764Sobrien int rv = -1; 7968349Sobrien 80133359Sobrien const char *code = NULL; 81133359Sobrien const char *code_mime = NULL; 82133359Sobrien const char *type = NULL; 83133359Sobrien const char *subtype = NULL; 84133359Sobrien const char *subtype_mime = NULL; 8568349Sobrien 8668349Sobrien int has_escapes = 0; 8768349Sobrien int has_backspace = 0; 88159764Sobrien int seen_cr = 0; 8968349Sobrien 9068349Sobrien int n_crlf = 0; 9168349Sobrien int n_lf = 0; 9268349Sobrien int n_cr = 0; 9368349Sobrien int n_nel = 0; 9468349Sobrien 9568349Sobrien int last_line_end = -1; 9668349Sobrien int has_long_lines = 0; 9768349Sobrien 9868349Sobrien /* 9984685Sobrien * Undo the NUL-termination kindly provided by process() 10084685Sobrien * but leave at least one byte to look at 10184685Sobrien */ 10284685Sobrien while (nbytes > 1 && buf[nbytes - 1] == '\0') 10368349Sobrien nbytes--; 10468349Sobrien 105159764Sobrien if ((nbuf = malloc((nbytes + 1) * sizeof(nbuf[0]))) == NULL) 106159764Sobrien goto done; 107159764Sobrien if ((ubuf = malloc((nbytes + 1) * sizeof(ubuf[0]))) == NULL) 108159764Sobrien goto done; 109133359Sobrien 11068349Sobrien /* 11168349Sobrien * Then try to determine whether it's any character code we can 11268349Sobrien * identify. Each of these tests, if it succeeds, will leave 11368349Sobrien * the text converted into one-unichar-per-character Unicode in 11468349Sobrien * ubuf, and the number of characters converted in ulen. 11568349Sobrien */ 11668349Sobrien if (looks_ascii(buf, nbytes, ubuf, &ulen)) { 11768349Sobrien code = "ASCII"; 11868349Sobrien code_mime = "us-ascii"; 11968349Sobrien type = "text"; 12068349Sobrien } else if (looks_utf8(buf, nbytes, ubuf, &ulen)) { 12168349Sobrien code = "UTF-8 Unicode"; 12268349Sobrien code_mime = "utf-8"; 12368349Sobrien type = "text"; 124133359Sobrien } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) { 12568349Sobrien if (i == 1) 12668349Sobrien code = "Little-endian UTF-16 Unicode"; 12768349Sobrien else 12868349Sobrien code = "Big-endian UTF-16 Unicode"; 12968349Sobrien 13068349Sobrien type = "character data"; 13168349Sobrien code_mime = "utf-16"; /* is this defined? */ 13268349Sobrien } else if (looks_latin1(buf, nbytes, ubuf, &ulen)) { 13368349Sobrien code = "ISO-8859"; 13468349Sobrien type = "text"; 13568349Sobrien code_mime = "iso-8859-1"; 13668349Sobrien } else if (looks_extended(buf, nbytes, ubuf, &ulen)) { 13768349Sobrien code = "Non-ISO extended-ASCII"; 13868349Sobrien type = "text"; 13968349Sobrien code_mime = "unknown"; 14068349Sobrien } else { 14168349Sobrien from_ebcdic(buf, nbytes, nbuf); 14268349Sobrien 14368349Sobrien if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) { 14468349Sobrien code = "EBCDIC"; 14568349Sobrien type = "character data"; 14668349Sobrien code_mime = "ebcdic"; 14768349Sobrien } else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) { 14868349Sobrien code = "International EBCDIC"; 14968349Sobrien type = "character data"; 15068349Sobrien code_mime = "ebcdic"; 15168349Sobrien } else { 152159764Sobrien rv = 0; 153159764Sobrien goto done; /* doesn't look like text at all */ 15468349Sobrien } 15568349Sobrien } 15668349Sobrien 157159764Sobrien if (nbytes <= 1) { 158159764Sobrien rv = 0; 159159764Sobrien goto done; 160159764Sobrien } 161159764Sobrien 16268349Sobrien /* 16368349Sobrien * for troff, look for . + letter + letter or .\"; 16468349Sobrien * this must be done to disambiguate tar archives' ./file 16568349Sobrien * and other trash from real troff input. 16668349Sobrien * 16768349Sobrien * I believe Plan 9 troff allows non-ASCII characters in the names 16868349Sobrien * of macros, so this test might possibly fail on such a file. 16968349Sobrien */ 17068349Sobrien if (*ubuf == '.') { 17168349Sobrien unichar *tp = ubuf + 1; 17268349Sobrien 17368349Sobrien while (ISSPC(*tp)) 17468349Sobrien ++tp; /* skip leading whitespace */ 17568349Sobrien if ((tp[0] == '\\' && tp[1] == '\"') || 176133359Sobrien (isascii((unsigned char)tp[0]) && 177133359Sobrien isalnum((unsigned char)tp[0]) && 178133359Sobrien isascii((unsigned char)tp[1]) && 179133359Sobrien isalnum((unsigned char)tp[1]) && 18068349Sobrien ISSPC(tp[2]))) { 18168349Sobrien subtype_mime = "text/troff"; 18268349Sobrien subtype = "troff or preprocessor input"; 18368349Sobrien goto subtype_identified; 18468349Sobrien } 18568349Sobrien } 18668349Sobrien 18768349Sobrien if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) { 18868349Sobrien subtype_mime = "text/fortran"; 18968349Sobrien subtype = "fortran program"; 19068349Sobrien goto subtype_identified; 19168349Sobrien } 19268349Sobrien 19368349Sobrien /* look for tokens from names.h - this is expensive! */ 19468349Sobrien 19568349Sobrien i = 0; 19668349Sobrien while (i < ulen) { 197133359Sobrien size_t end; 19868349Sobrien 19968349Sobrien /* 20068349Sobrien * skip past any leading space 20168349Sobrien */ 20268349Sobrien while (i < ulen && ISSPC(ubuf[i])) 20368349Sobrien i++; 20468349Sobrien if (i >= ulen) 20568349Sobrien break; 20668349Sobrien 20768349Sobrien /* 20868349Sobrien * find the next whitespace 20968349Sobrien */ 21068349Sobrien for (end = i + 1; end < nbytes; end++) 21168349Sobrien if (ISSPC(ubuf[end])) 21268349Sobrien break; 21368349Sobrien 21468349Sobrien /* 21568349Sobrien * compare the word thus isolated against the token list 21668349Sobrien */ 21768349Sobrien for (p = names; p < names + NNAMES; p++) { 218133359Sobrien if (ascmatch((const unsigned char *)p->name, ubuf + i, 219110949Sobrien end - i)) { 22068349Sobrien subtype = types[p->type].human; 22168349Sobrien subtype_mime = types[p->type].mime; 22268349Sobrien goto subtype_identified; 22368349Sobrien } 22468349Sobrien } 22568349Sobrien 22668349Sobrien i = end; 22768349Sobrien } 22868349Sobrien 22968349Sobriensubtype_identified: 23068349Sobrien 23168349Sobrien /* 23268349Sobrien * Now try to discover other details about the file. 23368349Sobrien */ 23468349Sobrien for (i = 0; i < ulen; i++) { 235159764Sobrien if (ubuf[i] == '\n') { 236159764Sobrien if (seen_cr) 237159764Sobrien n_crlf++; 238159764Sobrien else 239159764Sobrien n_lf++; 240159764Sobrien last_line_end = i; 241159764Sobrien } else if (seen_cr) 242159764Sobrien n_cr++; 243159764Sobrien 244159764Sobrien seen_cr = (ubuf[i] == '\r'); 245159764Sobrien if (seen_cr) 246159764Sobrien last_line_end = i; 247159764Sobrien 248159764Sobrien if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */ 249159764Sobrien n_nel++; 250159764Sobrien last_line_end = i; 251159764Sobrien } 252159764Sobrien 253159764Sobrien /* If this line is _longer_ than MAXLINELEN, remember it. */ 25468349Sobrien if (i > last_line_end + MAXLINELEN) 25568349Sobrien has_long_lines = 1; 25668349Sobrien 25768349Sobrien if (ubuf[i] == '\033') 25868349Sobrien has_escapes = 1; 25968349Sobrien if (ubuf[i] == '\b') 26068349Sobrien has_backspace = 1; 26168349Sobrien } 26268349Sobrien 263159764Sobrien /* Beware, if the data has been truncated, the final CR could have 264159764Sobrien been followed by a LF. If we have HOWMANY bytes, it indicates 265159764Sobrien that the data might have been truncated, probably even before 266159764Sobrien this function was called. */ 267159764Sobrien if (seen_cr && nbytes < HOWMANY) 268159764Sobrien n_cr++; 269159764Sobrien 270133359Sobrien if ((ms->flags & MAGIC_MIME)) { 271133359Sobrien if (subtype_mime) { 272133359Sobrien if (file_printf(ms, subtype_mime) == -1) 273159764Sobrien goto done; 274133359Sobrien } else { 275133359Sobrien if (file_printf(ms, "text/plain") == -1) 276159764Sobrien goto done; 277133359Sobrien } 27868349Sobrien 27968349Sobrien if (code_mime) { 280133359Sobrien if (file_printf(ms, "; charset=") == -1) 281159764Sobrien goto done; 282133359Sobrien if (file_printf(ms, code_mime) == -1) 283159764Sobrien goto done; 28468349Sobrien } 28568349Sobrien } else { 286133359Sobrien if (file_printf(ms, code) == -1) 287159764Sobrien goto done; 28868349Sobrien 28968349Sobrien if (subtype) { 290133359Sobrien if (file_printf(ms, " ") == -1) 291159764Sobrien goto done; 292133359Sobrien if (file_printf(ms, subtype) == -1) 293159764Sobrien goto done; 29468349Sobrien } 29568349Sobrien 296133359Sobrien if (file_printf(ms, " ") == -1) 297159764Sobrien goto done; 298133359Sobrien if (file_printf(ms, type) == -1) 299159764Sobrien goto done; 30068349Sobrien 30168349Sobrien if (has_long_lines) 302133359Sobrien if (file_printf(ms, ", with very long lines") == -1) 303159764Sobrien goto done; 30468349Sobrien 30568349Sobrien /* 30668349Sobrien * Only report line terminators if we find one other than LF, 30768349Sobrien * or if we find none at all. 30868349Sobrien */ 30968349Sobrien if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) || 31068349Sobrien (n_crlf != 0 || n_cr != 0 || n_nel != 0)) { 311133359Sobrien if (file_printf(ms, ", with") == -1) 312159764Sobrien goto done; 31368349Sobrien 314133359Sobrien if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) { 315133359Sobrien if (file_printf(ms, " no") == -1) 316159764Sobrien goto done; 317133359Sobrien } else { 31868349Sobrien if (n_crlf) { 319133359Sobrien if (file_printf(ms, " CRLF") == -1) 320159764Sobrien goto done; 32168349Sobrien if (n_cr || n_lf || n_nel) 322133359Sobrien if (file_printf(ms, ",") == -1) 323159764Sobrien goto done; 32468349Sobrien } 32568349Sobrien if (n_cr) { 326133359Sobrien if (file_printf(ms, " CR") == -1) 327159764Sobrien goto done; 32868349Sobrien if (n_lf || n_nel) 329133359Sobrien if (file_printf(ms, ",") == -1) 330159764Sobrien goto done; 33168349Sobrien } 33268349Sobrien if (n_lf) { 333133359Sobrien if (file_printf(ms, " LF") == -1) 334159764Sobrien goto done; 33568349Sobrien if (n_nel) 336133359Sobrien if (file_printf(ms, ",") == -1) 337159764Sobrien goto done; 33868349Sobrien } 33968349Sobrien if (n_nel) 340133359Sobrien if (file_printf(ms, " NEL") == -1) 341159764Sobrien goto done; 34268349Sobrien } 34368349Sobrien 344133359Sobrien if (file_printf(ms, " line terminators") == -1) 345159764Sobrien goto done; 34668349Sobrien } 34768349Sobrien 34868349Sobrien if (has_escapes) 349133359Sobrien if (file_printf(ms, ", with escape sequences") == -1) 350159764Sobrien goto done; 35168349Sobrien if (has_backspace) 352133359Sobrien if (file_printf(ms, ", with overstriking") == -1) 353159764Sobrien goto done; 35468349Sobrien } 355159764Sobrien rv = 1; 356159764Sobriendone: 357159764Sobrien if (nbuf) 358159764Sobrien free(nbuf); 359159764Sobrien if (ubuf) 360159764Sobrien free(ubuf); 36168349Sobrien 362159764Sobrien return rv; 36368349Sobrien} 36468349Sobrien 365133359Sobrienprivate int 366133359Sobrienascmatch(const unsigned char *s, const unichar *us, size_t ulen) 36768349Sobrien{ 36868349Sobrien size_t i; 36968349Sobrien 37068349Sobrien for (i = 0; i < ulen; i++) { 37168349Sobrien if (s[i] != us[i]) 37268349Sobrien return 0; 37368349Sobrien } 37468349Sobrien 37568349Sobrien if (s[i]) 37668349Sobrien return 0; 37768349Sobrien else 37868349Sobrien return 1; 37968349Sobrien} 38068349Sobrien 38168349Sobrien/* 38268349Sobrien * This table reflects a particular philosophy about what constitutes 38368349Sobrien * "text," and there is room for disagreement about it. 38468349Sobrien * 38568349Sobrien * Version 3.31 of the file command considered a file to be ASCII if 38668349Sobrien * each of its characters was approved by either the isascii() or 38768349Sobrien * isalpha() function. On most systems, this would mean that any 38868349Sobrien * file consisting only of characters in the range 0x00 ... 0x7F 38968349Sobrien * would be called ASCII text, but many systems might reasonably 39068349Sobrien * consider some characters outside this range to be alphabetic, 39168349Sobrien * so the file command would call such characters ASCII. It might 39268349Sobrien * have been more accurate to call this "considered textual on the 39368349Sobrien * local system" than "ASCII." 39468349Sobrien * 39568349Sobrien * It considered a file to be "International language text" if each 39668349Sobrien * of its characters was either an ASCII printing character (according 39768349Sobrien * to the real ASCII standard, not the above test), a character in 39868349Sobrien * the range 0x80 ... 0xFF, or one of the following control characters: 39968349Sobrien * backspace, tab, line feed, vertical tab, form feed, carriage return, 40068349Sobrien * escape. No attempt was made to determine the language in which files 40168349Sobrien * of this type were written. 40268349Sobrien * 40368349Sobrien * 40468349Sobrien * The table below considers a file to be ASCII if all of its characters 40568349Sobrien * are either ASCII printing characters (again, according to the X3.4 40668349Sobrien * standard, not isascii()) or any of the following controls: bell, 40768349Sobrien * backspace, tab, line feed, form feed, carriage return, esc, nextline. 40868349Sobrien * 40968349Sobrien * I include bell because some programs (particularly shell scripts) 41068349Sobrien * use it literally, even though it is rare in normal text. I exclude 41168349Sobrien * vertical tab because it never seems to be used in real text. I also 41268349Sobrien * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85), 41368349Sobrien * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline 41468349Sobrien * character to. It might be more appropriate to include it in the 8859 41568349Sobrien * set instead of the ASCII set, but it's got to be included in *something* 41668349Sobrien * we recognize or EBCDIC files aren't going to be considered textual. 41768349Sobrien * Some old Unix source files use SO/SI (^N/^O) to shift between Greek 41868349Sobrien * and Latin characters, so these should possibly be allowed. But they 41968349Sobrien * make a real mess on VT100-style displays if they're not paired properly, 42068349Sobrien * so we are probably better off not calling them text. 42168349Sobrien * 42268349Sobrien * A file is considered to be ISO-8859 text if its characters are all 42368349Sobrien * either ASCII, according to the above definition, or printing characters 42468349Sobrien * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF. 42568349Sobrien * 42668349Sobrien * Finally, a file is considered to be international text from some other 42768349Sobrien * character code if its characters are all either ISO-8859 (according to 42868349Sobrien * the above definition) or characters in the range 0x80 ... 0x9F, which 42968349Sobrien * ISO-8859 considers to be control characters but the IBM PC and Macintosh 43068349Sobrien * consider to be printing characters. 43168349Sobrien */ 43268349Sobrien 43368349Sobrien#define F 0 /* character never appears in text */ 43468349Sobrien#define T 1 /* character appears in plain ASCII text */ 43568349Sobrien#define I 2 /* character appears in ISO-8859 text */ 43668349Sobrien#define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */ 43768349Sobrien 438133359Sobrienprivate char text_chars[256] = { 43968349Sobrien /* BEL BS HT LF FF CR */ 44068349Sobrien F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */ 44168349Sobrien /* ESC */ 44268349Sobrien F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */ 44368349Sobrien T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */ 44468349Sobrien T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */ 44568349Sobrien T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */ 44668349Sobrien T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */ 44768349Sobrien T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */ 44868349Sobrien T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */ 44968349Sobrien /* NEL */ 45068349Sobrien X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */ 45168349Sobrien X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */ 45268349Sobrien I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */ 45368349Sobrien I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */ 45468349Sobrien I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */ 45568349Sobrien I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */ 45668349Sobrien I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */ 45768349Sobrien I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */ 45868349Sobrien}; 45968349Sobrien 460133359Sobrienprivate int 461133359Sobrienlooks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf, 462133359Sobrien size_t *ulen) 46368349Sobrien{ 46468349Sobrien int i; 46568349Sobrien 46668349Sobrien *ulen = 0; 46768349Sobrien 46868349Sobrien for (i = 0; i < nbytes; i++) { 46968349Sobrien int t = text_chars[buf[i]]; 47068349Sobrien 47168349Sobrien if (t != T) 47268349Sobrien return 0; 47368349Sobrien 47468349Sobrien ubuf[(*ulen)++] = buf[i]; 47568349Sobrien } 47668349Sobrien 47768349Sobrien return 1; 47868349Sobrien} 47968349Sobrien 480133359Sobrienprivate int 481133359Sobrienlooks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) 48268349Sobrien{ 48368349Sobrien int i; 48468349Sobrien 48568349Sobrien *ulen = 0; 48668349Sobrien 48768349Sobrien for (i = 0; i < nbytes; i++) { 48868349Sobrien int t = text_chars[buf[i]]; 48968349Sobrien 49068349Sobrien if (t != T && t != I) 49168349Sobrien return 0; 49268349Sobrien 49368349Sobrien ubuf[(*ulen)++] = buf[i]; 49468349Sobrien } 49568349Sobrien 49668349Sobrien return 1; 49768349Sobrien} 49868349Sobrien 499133359Sobrienprivate int 500133359Sobrienlooks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf, 501133359Sobrien size_t *ulen) 50268349Sobrien{ 50368349Sobrien int i; 50468349Sobrien 50568349Sobrien *ulen = 0; 50668349Sobrien 50768349Sobrien for (i = 0; i < nbytes; i++) { 50868349Sobrien int t = text_chars[buf[i]]; 50968349Sobrien 51068349Sobrien if (t != T && t != I && t != X) 51168349Sobrien return 0; 51268349Sobrien 51368349Sobrien ubuf[(*ulen)++] = buf[i]; 51468349Sobrien } 51568349Sobrien 51668349Sobrien return 1; 51768349Sobrien} 51868349Sobrien 519133359Sobrienprivate int 520133359Sobrienlooks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) 52168349Sobrien{ 52268349Sobrien int i, n; 52368349Sobrien unichar c; 52468349Sobrien int gotone = 0; 52568349Sobrien 52668349Sobrien *ulen = 0; 52768349Sobrien 52868349Sobrien for (i = 0; i < nbytes; i++) { 52968349Sobrien if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */ 53068349Sobrien /* 53168349Sobrien * Even if the whole file is valid UTF-8 sequences, 53268349Sobrien * still reject it if it uses weird control characters. 53368349Sobrien */ 53468349Sobrien 53568349Sobrien if (text_chars[buf[i]] != T) 53668349Sobrien return 0; 53768349Sobrien 53868349Sobrien ubuf[(*ulen)++] = buf[i]; 53968349Sobrien } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */ 54068349Sobrien return 0; 54168349Sobrien } else { /* 11xxxxxx begins UTF-8 */ 54268349Sobrien int following; 54368349Sobrien 54468349Sobrien if ((buf[i] & 0x20) == 0) { /* 110xxxxx */ 54568349Sobrien c = buf[i] & 0x1f; 54668349Sobrien following = 1; 54768349Sobrien } else if ((buf[i] & 0x10) == 0) { /* 1110xxxx */ 54868349Sobrien c = buf[i] & 0x0f; 54968349Sobrien following = 2; 55068349Sobrien } else if ((buf[i] & 0x08) == 0) { /* 11110xxx */ 55168349Sobrien c = buf[i] & 0x07; 55268349Sobrien following = 3; 55368349Sobrien } else if ((buf[i] & 0x04) == 0) { /* 111110xx */ 55468349Sobrien c = buf[i] & 0x03; 55568349Sobrien following = 4; 55668349Sobrien } else if ((buf[i] & 0x02) == 0) { /* 1111110x */ 55768349Sobrien c = buf[i] & 0x01; 55868349Sobrien following = 5; 55968349Sobrien } else 56068349Sobrien return 0; 56168349Sobrien 56268349Sobrien for (n = 0; n < following; n++) { 56368349Sobrien i++; 56468349Sobrien if (i >= nbytes) 56568349Sobrien goto done; 56668349Sobrien 56768349Sobrien if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40)) 56868349Sobrien return 0; 56968349Sobrien 57068349Sobrien c = (c << 6) + (buf[i] & 0x3f); 57168349Sobrien } 57268349Sobrien 57368349Sobrien ubuf[(*ulen)++] = c; 57468349Sobrien gotone = 1; 57568349Sobrien } 57668349Sobrien } 57768349Sobriendone: 57868349Sobrien return gotone; /* don't claim it's UTF-8 if it's all 7-bit */ 57968349Sobrien} 58068349Sobrien 581133359Sobrienprivate int 582133359Sobrienlooks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf, 583133359Sobrien size_t *ulen) 58468349Sobrien{ 58568349Sobrien int bigend; 58668349Sobrien int i; 58768349Sobrien 58868349Sobrien if (nbytes < 2) 58968349Sobrien return 0; 59068349Sobrien 59168349Sobrien if (buf[0] == 0xff && buf[1] == 0xfe) 59268349Sobrien bigend = 0; 59368349Sobrien else if (buf[0] == 0xfe && buf[1] == 0xff) 59468349Sobrien bigend = 1; 59568349Sobrien else 59668349Sobrien return 0; 59768349Sobrien 59868349Sobrien *ulen = 0; 59968349Sobrien 60068349Sobrien for (i = 2; i + 1 < nbytes; i += 2) { 60168349Sobrien /* XXX fix to properly handle chars > 65536 */ 60268349Sobrien 60368349Sobrien if (bigend) 60468349Sobrien ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i]; 60568349Sobrien else 60668349Sobrien ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1]; 60768349Sobrien 60868349Sobrien if (ubuf[*ulen - 1] == 0xfffe) 60968349Sobrien return 0; 610133359Sobrien if (ubuf[*ulen - 1] < 128 && 611133359Sobrien text_chars[(size_t)ubuf[*ulen - 1]] != T) 61268349Sobrien return 0; 61368349Sobrien } 61468349Sobrien 615110949Sobrien return 1 + bigend; 61668349Sobrien} 61768349Sobrien 61868349Sobrien#undef F 61968349Sobrien#undef T 62068349Sobrien#undef I 62168349Sobrien#undef X 62268349Sobrien 62368349Sobrien/* 62468349Sobrien * This table maps each EBCDIC character to an (8-bit extended) ASCII 62568349Sobrien * character, as specified in the rationale for the dd(1) command in 62668349Sobrien * draft 11.2 (September, 1991) of the POSIX P1003.2 standard. 62768349Sobrien * 62868349Sobrien * Unfortunately it does not seem to correspond exactly to any of the 62968349Sobrien * five variants of EBCDIC documented in IBM's _Enterprise Systems 63068349Sobrien * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh 63168349Sobrien * Edition, July, 1999, pp. I-1 - I-4. 63268349Sobrien * 63368349Sobrien * Fortunately, though, all versions of EBCDIC, including this one, agree 63468349Sobrien * on most of the printing characters that also appear in (7-bit) ASCII. 63568349Sobrien * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all. 63668349Sobrien * 63768349Sobrien * Fortunately too, there is general agreement that codes 0x00 through 63868349Sobrien * 0x3F represent control characters, 0x41 a nonbreaking space, and the 63968349Sobrien * remainder printing characters. 64068349Sobrien * 64168349Sobrien * This is sufficient to allow us to identify EBCDIC text and to distinguish 64268349Sobrien * between old-style and internationalized examples of text. 64368349Sobrien */ 64468349Sobrien 645133359Sobrienprivate unsigned char ebcdic_to_ascii[] = { 64668349Sobrien 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15, 64768349Sobrien 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31, 64868349Sobrien128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7, 64968349Sobrien144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26, 65068349Sobrien' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|', 65168349Sobrien'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~', 65268349Sobrien'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?', 65368349Sobrien186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"', 65468349Sobrien195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201, 65568349Sobrien202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208, 65668349Sobrien209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215, 65768349Sobrien216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231, 65868349Sobrien'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237, 65968349Sobrien'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243, 66068349Sobrien'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249, 66168349Sobrien'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255 66268349Sobrien}; 66368349Sobrien 664133359Sobrien#ifdef notdef 66568349Sobrien/* 66668349Sobrien * The following EBCDIC-to-ASCII table may relate more closely to reality, 66768349Sobrien * or at least to modern reality. It comes from 66868349Sobrien * 66968349Sobrien * http://ftp.s390.ibm.com/products/oe/bpxqp9.html 67068349Sobrien * 67168349Sobrien * and maps the characters of EBCDIC code page 1047 (the code used for 67268349Sobrien * Unix-derived software on IBM's 390 systems) to the corresponding 67368349Sobrien * characters from ISO 8859-1. 67468349Sobrien * 67568349Sobrien * If this table is used instead of the above one, some of the special 67668349Sobrien * cases for the NEL character can be taken out of the code. 67768349Sobrien */ 67868349Sobrien 679133359Sobrienprivate unsigned char ebcdic_1047_to_8859[] = { 68068349Sobrien0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F, 68168349Sobrien0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F, 68268349Sobrien0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07, 68368349Sobrien0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A, 68468349Sobrien0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C, 68568349Sobrien0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E, 68668349Sobrien0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F, 68768349Sobrien0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22, 68868349Sobrien0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1, 68968349Sobrien0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4, 69068349Sobrien0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE, 69168349Sobrien0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7, 69268349Sobrien0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5, 69368349Sobrien0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF, 69468349Sobrien0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5, 69568349Sobrien0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F 69668349Sobrien}; 697133359Sobrien#endif 69868349Sobrien 69968349Sobrien/* 70068349Sobrien * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII. 70168349Sobrien */ 702133359Sobrienprivate void 703133359Sobrienfrom_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out) 70468349Sobrien{ 70568349Sobrien int i; 70668349Sobrien 70768349Sobrien for (i = 0; i < nbytes; i++) { 70868349Sobrien out[i] = ebcdic_to_ascii[buf[i]]; 70968349Sobrien } 71068349Sobrien} 711