ascmagic.c revision 133359
168349Sobrien/*
2133359Sobrien * Copyright (c) Ian F. Darwin 1986-1995.
3133359Sobrien * Software written by Ian F. Darwin and others;
4133359Sobrien * maintained 1995-present by Christos Zoulas and others.
5133359Sobrien *
6133359Sobrien * Redistribution and use in source and binary forms, with or without
7133359Sobrien * modification, are permitted provided that the following conditions
8133359Sobrien * are met:
9133359Sobrien * 1. Redistributions of source code must retain the above copyright
10133359Sobrien *    notice immediately at the beginning of the file, without modification,
11133359Sobrien *    this list of conditions, and the following disclaimer.
12133359Sobrien * 2. Redistributions in binary form must reproduce the above copyright
13133359Sobrien *    notice, this list of conditions and the following disclaimer in the
14133359Sobrien *    documentation and/or other materials provided with the distribution.
15133359Sobrien * 3. All advertising materials mentioning features or use of this software
16133359Sobrien *    must display the following acknowledgement:
17133359Sobrien *    This product includes software developed by Ian F. Darwin and others.
18133359Sobrien * 4. The name of the author may not be used to endorse or promote products
19133359Sobrien *    derived from this software without specific prior written permission.
20133359Sobrien *
21133359Sobrien * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
22133359Sobrien * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23133359Sobrien * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24133359Sobrien * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
25133359Sobrien * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26133359Sobrien * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27133359Sobrien * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28133359Sobrien * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29133359Sobrien * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30133359Sobrien * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31133359Sobrien * SUCH DAMAGE.
32133359Sobrien */
33133359Sobrien/*
3468349Sobrien * ASCII magic -- file types that we know based on keywords
3568349Sobrien * that can appear anywhere in the file.
3668349Sobrien *
3768349Sobrien * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
3868349Sobrien * to handle character codes other than ASCII on a unified basis.
3968349Sobrien *
4068349Sobrien * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
4168349Sobrien * international characters, now subsumed into this file.
4268349Sobrien */
4368349Sobrien
4468349Sobrien#include "file.h"
45133359Sobrien#include "magic.h"
46133359Sobrien#include <stdio.h>
4768349Sobrien#include <string.h>
4868349Sobrien#include <memory.h>
4968349Sobrien#include <ctype.h>
5068349Sobrien#include <stdlib.h>
5168349Sobrien#ifdef HAVE_UNISTD_H
5268349Sobrien#include <unistd.h>
5368349Sobrien#endif
5468349Sobrien#include "names.h"
5568349Sobrien
5668349Sobrien#ifndef	lint
57133359SobrienFILE_RCSID("@(#)$Id: ascmagic.c,v 1.40 2003/11/20 00:25:39 christos Exp $")
5868349Sobrien#endif	/* lint */
5968349Sobrien
6068349Sobrientypedef unsigned long unichar;
6168349Sobrien
6268349Sobrien#define MAXLINELEN 300	/* longest sane line length */
6368349Sobrien#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
6468349Sobrien		  || (x) == 0x85 || (x) == '\f')
6568349Sobrien
66133359Sobrienprivate int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
67133359Sobrienprivate int looks_utf8(const unsigned char *, size_t, unichar *, size_t *);
68133359Sobrienprivate int looks_unicode(const unsigned char *, size_t, unichar *, size_t *);
69133359Sobrienprivate int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
70133359Sobrienprivate int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
71133359Sobrienprivate void from_ebcdic(const unsigned char *, size_t, unsigned char *);
72133359Sobrienprivate int ascmatch(const unsigned char *, const unichar *, size_t);
7368349Sobrien
74133359Sobrien
75133359Sobrienprotected int
76133359Sobrienfile_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
7768349Sobrien{
78133359Sobrien	size_t i;
79110949Sobrien	unsigned char nbuf[HOWMANY+1];	/* one extra for terminating '\0' */
8068349Sobrien	unichar ubuf[HOWMANY+1];	/* one extra for terminating '\0' */
81133359Sobrien	size_t ulen;
8268349Sobrien	struct names *p;
8368349Sobrien
84133359Sobrien	const char *code = NULL;
85133359Sobrien	const char *code_mime = NULL;
86133359Sobrien	const char *type = NULL;
87133359Sobrien	const char *subtype = NULL;
88133359Sobrien	const char *subtype_mime = NULL;
8968349Sobrien
9068349Sobrien	int has_escapes = 0;
9168349Sobrien	int has_backspace = 0;
9268349Sobrien
9368349Sobrien	int n_crlf = 0;
9468349Sobrien	int n_lf = 0;
9568349Sobrien	int n_cr = 0;
9668349Sobrien	int n_nel = 0;
9768349Sobrien
9868349Sobrien	int last_line_end = -1;
9968349Sobrien	int has_long_lines = 0;
10068349Sobrien
10168349Sobrien	/*
10284685Sobrien	 * Undo the NUL-termination kindly provided by process()
10384685Sobrien	 * but leave at least one byte to look at
10484685Sobrien	 */
10568349Sobrien
10684685Sobrien	while (nbytes > 1 && buf[nbytes - 1] == '\0')
10768349Sobrien		nbytes--;
10868349Sobrien
109133359Sobrien	/* nbuf and ubuf relies on this */
110133359Sobrien	if (nbytes > HOWMANY)
111133359Sobrien		nbytes = HOWMANY;
112133359Sobrien
11368349Sobrien	/*
11468349Sobrien	 * Then try to determine whether it's any character code we can
11568349Sobrien	 * identify.  Each of these tests, if it succeeds, will leave
11668349Sobrien	 * the text converted into one-unichar-per-character Unicode in
11768349Sobrien	 * ubuf, and the number of characters converted in ulen.
11868349Sobrien	 */
11968349Sobrien	if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
12068349Sobrien		code = "ASCII";
12168349Sobrien		code_mime = "us-ascii";
12268349Sobrien		type = "text";
12368349Sobrien	} else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
12468349Sobrien		code = "UTF-8 Unicode";
12568349Sobrien		code_mime = "utf-8";
12668349Sobrien		type = "text";
127133359Sobrien	} else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) {
12868349Sobrien		if (i == 1)
12968349Sobrien			code = "Little-endian UTF-16 Unicode";
13068349Sobrien		else
13168349Sobrien			code = "Big-endian UTF-16 Unicode";
13268349Sobrien
13368349Sobrien		type = "character data";
13468349Sobrien		code_mime = "utf-16";    /* is this defined? */
13568349Sobrien	} else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
13668349Sobrien		code = "ISO-8859";
13768349Sobrien		type = "text";
13868349Sobrien		code_mime = "iso-8859-1";
13968349Sobrien	} else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
14068349Sobrien		code = "Non-ISO extended-ASCII";
14168349Sobrien		type = "text";
14268349Sobrien		code_mime = "unknown";
14368349Sobrien	} else {
14468349Sobrien		from_ebcdic(buf, nbytes, nbuf);
14568349Sobrien
14668349Sobrien		if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
14768349Sobrien			code = "EBCDIC";
14868349Sobrien			type = "character data";
14968349Sobrien			code_mime = "ebcdic";
15068349Sobrien		} else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
15168349Sobrien			code = "International EBCDIC";
15268349Sobrien			type = "character data";
15368349Sobrien			code_mime = "ebcdic";
15468349Sobrien		} else {
15568349Sobrien			return 0;  /* doesn't look like text at all */
15668349Sobrien		}
15768349Sobrien	}
15868349Sobrien
15968349Sobrien	/*
16068349Sobrien	 * for troff, look for . + letter + letter or .\";
16168349Sobrien	 * this must be done to disambiguate tar archives' ./file
16268349Sobrien	 * and other trash from real troff input.
16368349Sobrien	 *
16468349Sobrien	 * I believe Plan 9 troff allows non-ASCII characters in the names
16568349Sobrien	 * of macros, so this test might possibly fail on such a file.
16668349Sobrien	 */
16768349Sobrien	if (*ubuf == '.') {
16868349Sobrien		unichar *tp = ubuf + 1;
16968349Sobrien
17068349Sobrien		while (ISSPC(*tp))
17168349Sobrien			++tp;	/* skip leading whitespace */
17268349Sobrien		if ((tp[0] == '\\' && tp[1] == '\"') ||
173133359Sobrien		    (isascii((unsigned char)tp[0]) &&
174133359Sobrien		     isalnum((unsigned char)tp[0]) &&
175133359Sobrien		     isascii((unsigned char)tp[1]) &&
176133359Sobrien		     isalnum((unsigned char)tp[1]) &&
17768349Sobrien		     ISSPC(tp[2]))) {
17868349Sobrien			subtype_mime = "text/troff";
17968349Sobrien			subtype = "troff or preprocessor input";
18068349Sobrien			goto subtype_identified;
18168349Sobrien		}
18268349Sobrien	}
18368349Sobrien
18468349Sobrien	if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
18568349Sobrien		subtype_mime = "text/fortran";
18668349Sobrien		subtype = "fortran program";
18768349Sobrien		goto subtype_identified;
18868349Sobrien	}
18968349Sobrien
19068349Sobrien	/* look for tokens from names.h - this is expensive! */
19168349Sobrien
19268349Sobrien	i = 0;
19368349Sobrien	while (i < ulen) {
194133359Sobrien		size_t end;
19568349Sobrien
19668349Sobrien		/*
19768349Sobrien		 * skip past any leading space
19868349Sobrien		 */
19968349Sobrien		while (i < ulen && ISSPC(ubuf[i]))
20068349Sobrien			i++;
20168349Sobrien		if (i >= ulen)
20268349Sobrien			break;
20368349Sobrien
20468349Sobrien		/*
20568349Sobrien		 * find the next whitespace
20668349Sobrien		 */
20768349Sobrien		for (end = i + 1; end < nbytes; end++)
20868349Sobrien			if (ISSPC(ubuf[end]))
20968349Sobrien				break;
21068349Sobrien
21168349Sobrien		/*
21268349Sobrien		 * compare the word thus isolated against the token list
21368349Sobrien		 */
21468349Sobrien		for (p = names; p < names + NNAMES; p++) {
215133359Sobrien			if (ascmatch((const unsigned char *)p->name, ubuf + i,
216110949Sobrien			    end - i)) {
21768349Sobrien				subtype = types[p->type].human;
21868349Sobrien				subtype_mime = types[p->type].mime;
21968349Sobrien				goto subtype_identified;
22068349Sobrien			}
22168349Sobrien		}
22268349Sobrien
22368349Sobrien		i = end;
22468349Sobrien	}
22568349Sobrien
22668349Sobriensubtype_identified:
22768349Sobrien
22868349Sobrien	/*
22968349Sobrien	 * Now try to discover other details about the file.
23068349Sobrien	 */
23168349Sobrien	for (i = 0; i < ulen; i++) {
23268349Sobrien		if (i > last_line_end + MAXLINELEN)
23368349Sobrien			has_long_lines = 1;
23468349Sobrien
23568349Sobrien		if (ubuf[i] == '\033')
23668349Sobrien			has_escapes = 1;
23768349Sobrien		if (ubuf[i] == '\b')
23868349Sobrien			has_backspace = 1;
23968349Sobrien
24068349Sobrien		if (ubuf[i] == '\r' && (i + 1 <  ulen && ubuf[i + 1] == '\n')) {
24168349Sobrien			n_crlf++;
24268349Sobrien			last_line_end = i;
24368349Sobrien		}
24468349Sobrien		if (ubuf[i] == '\r' && (i + 1 >= ulen || ubuf[i + 1] != '\n')) {
24568349Sobrien			n_cr++;
24668349Sobrien			last_line_end = i;
24768349Sobrien		}
248133359Sobrien		if (ubuf[i] == '\n' && ((int)i - 1 < 0 || ubuf[i - 1] != '\r')){
24968349Sobrien			n_lf++;
25068349Sobrien			last_line_end = i;
25168349Sobrien		}
25268349Sobrien		if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
25368349Sobrien			n_nel++;
25468349Sobrien			last_line_end = i;
25568349Sobrien		}
25668349Sobrien	}
25768349Sobrien
258133359Sobrien	if ((ms->flags & MAGIC_MIME)) {
259133359Sobrien		if (subtype_mime) {
260133359Sobrien			if (file_printf(ms, subtype_mime) == -1)
261133359Sobrien				return -1;
262133359Sobrien		} else {
263133359Sobrien			if (file_printf(ms, "text/plain") == -1)
264133359Sobrien				return -1;
265133359Sobrien		}
26668349Sobrien
26768349Sobrien		if (code_mime) {
268133359Sobrien			if (file_printf(ms, "; charset=") == -1)
269133359Sobrien				return -1;
270133359Sobrien			if (file_printf(ms, code_mime) == -1)
271133359Sobrien				return -1;
27268349Sobrien		}
27368349Sobrien	} else {
274133359Sobrien		if (file_printf(ms, code) == -1)
275133359Sobrien			return -1;
27668349Sobrien
27768349Sobrien		if (subtype) {
278133359Sobrien			if (file_printf(ms, " ") == -1)
279133359Sobrien				return -1;
280133359Sobrien			if (file_printf(ms, subtype) == -1)
281133359Sobrien				return -1;
28268349Sobrien		}
28368349Sobrien
284133359Sobrien		if (file_printf(ms, " ") == -1)
285133359Sobrien			return -1;
286133359Sobrien		if (file_printf(ms, type) == -1)
287133359Sobrien			return -1;
28868349Sobrien
28968349Sobrien		if (has_long_lines)
290133359Sobrien			if (file_printf(ms, ", with very long lines") == -1)
291133359Sobrien				return -1;
29268349Sobrien
29368349Sobrien		/*
29468349Sobrien		 * Only report line terminators if we find one other than LF,
29568349Sobrien		 * or if we find none at all.
29668349Sobrien		 */
29768349Sobrien		if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
29868349Sobrien		    (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
299133359Sobrien			if (file_printf(ms, ", with") == -1)
300133359Sobrien				return -1;
30168349Sobrien
302133359Sobrien			if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0)			{
303133359Sobrien				if (file_printf(ms, " no") == -1)
304133359Sobrien					return -1;
305133359Sobrien			} else {
30668349Sobrien				if (n_crlf) {
307133359Sobrien					if (file_printf(ms, " CRLF") == -1)
308133359Sobrien						return -1;
30968349Sobrien					if (n_cr || n_lf || n_nel)
310133359Sobrien						if (file_printf(ms, ",") == -1)
311133359Sobrien							return -1;
31268349Sobrien				}
31368349Sobrien				if (n_cr) {
314133359Sobrien					if (file_printf(ms, " CR") == -1)
315133359Sobrien						return -1;
31668349Sobrien					if (n_lf || n_nel)
317133359Sobrien						if (file_printf(ms, ",") == -1)
318133359Sobrien							return -1;
31968349Sobrien				}
32068349Sobrien				if (n_lf) {
321133359Sobrien					if (file_printf(ms, " LF") == -1)
322133359Sobrien						return -1;
32368349Sobrien					if (n_nel)
324133359Sobrien						if (file_printf(ms, ",") == -1)
325133359Sobrien							return -1;
32668349Sobrien				}
32768349Sobrien				if (n_nel)
328133359Sobrien					if (file_printf(ms, " NEL") == -1)
329133359Sobrien						return -1;
33068349Sobrien			}
33168349Sobrien
332133359Sobrien			if (file_printf(ms, " line terminators") == -1)
333133359Sobrien				return -1;
33468349Sobrien		}
33568349Sobrien
33668349Sobrien		if (has_escapes)
337133359Sobrien			if (file_printf(ms, ", with escape sequences") == -1)
338133359Sobrien				return -1;
33968349Sobrien		if (has_backspace)
340133359Sobrien			if (file_printf(ms, ", with overstriking") == -1)
341133359Sobrien				return -1;
34268349Sobrien	}
34368349Sobrien
34468349Sobrien	return 1;
34568349Sobrien}
34668349Sobrien
347133359Sobrienprivate int
348133359Sobrienascmatch(const unsigned char *s, const unichar *us, size_t ulen)
34968349Sobrien{
35068349Sobrien	size_t i;
35168349Sobrien
35268349Sobrien	for (i = 0; i < ulen; i++) {
35368349Sobrien		if (s[i] != us[i])
35468349Sobrien			return 0;
35568349Sobrien	}
35668349Sobrien
35768349Sobrien	if (s[i])
35868349Sobrien		return 0;
35968349Sobrien	else
36068349Sobrien		return 1;
36168349Sobrien}
36268349Sobrien
36368349Sobrien/*
36468349Sobrien * This table reflects a particular philosophy about what constitutes
36568349Sobrien * "text," and there is room for disagreement about it.
36668349Sobrien *
36768349Sobrien * Version 3.31 of the file command considered a file to be ASCII if
36868349Sobrien * each of its characters was approved by either the isascii() or
36968349Sobrien * isalpha() function.  On most systems, this would mean that any
37068349Sobrien * file consisting only of characters in the range 0x00 ... 0x7F
37168349Sobrien * would be called ASCII text, but many systems might reasonably
37268349Sobrien * consider some characters outside this range to be alphabetic,
37368349Sobrien * so the file command would call such characters ASCII.  It might
37468349Sobrien * have been more accurate to call this "considered textual on the
37568349Sobrien * local system" than "ASCII."
37668349Sobrien *
37768349Sobrien * It considered a file to be "International language text" if each
37868349Sobrien * of its characters was either an ASCII printing character (according
37968349Sobrien * to the real ASCII standard, not the above test), a character in
38068349Sobrien * the range 0x80 ... 0xFF, or one of the following control characters:
38168349Sobrien * backspace, tab, line feed, vertical tab, form feed, carriage return,
38268349Sobrien * escape.  No attempt was made to determine the language in which files
38368349Sobrien * of this type were written.
38468349Sobrien *
38568349Sobrien *
38668349Sobrien * The table below considers a file to be ASCII if all of its characters
38768349Sobrien * are either ASCII printing characters (again, according to the X3.4
38868349Sobrien * standard, not isascii()) or any of the following controls: bell,
38968349Sobrien * backspace, tab, line feed, form feed, carriage return, esc, nextline.
39068349Sobrien *
39168349Sobrien * I include bell because some programs (particularly shell scripts)
39268349Sobrien * use it literally, even though it is rare in normal text.  I exclude
39368349Sobrien * vertical tab because it never seems to be used in real text.  I also
39468349Sobrien * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
39568349Sobrien * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
39668349Sobrien * character to.  It might be more appropriate to include it in the 8859
39768349Sobrien * set instead of the ASCII set, but it's got to be included in *something*
39868349Sobrien * we recognize or EBCDIC files aren't going to be considered textual.
39968349Sobrien * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
40068349Sobrien * and Latin characters, so these should possibly be allowed.  But they
40168349Sobrien * make a real mess on VT100-style displays if they're not paired properly,
40268349Sobrien * so we are probably better off not calling them text.
40368349Sobrien *
40468349Sobrien * A file is considered to be ISO-8859 text if its characters are all
40568349Sobrien * either ASCII, according to the above definition, or printing characters
40668349Sobrien * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
40768349Sobrien *
40868349Sobrien * Finally, a file is considered to be international text from some other
40968349Sobrien * character code if its characters are all either ISO-8859 (according to
41068349Sobrien * the above definition) or characters in the range 0x80 ... 0x9F, which
41168349Sobrien * ISO-8859 considers to be control characters but the IBM PC and Macintosh
41268349Sobrien * consider to be printing characters.
41368349Sobrien */
41468349Sobrien
41568349Sobrien#define F 0   /* character never appears in text */
41668349Sobrien#define T 1   /* character appears in plain ASCII text */
41768349Sobrien#define I 2   /* character appears in ISO-8859 text */
41868349Sobrien#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
41968349Sobrien
420133359Sobrienprivate char text_chars[256] = {
42168349Sobrien	/*                  BEL BS HT LF    FF CR    */
42268349Sobrien	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
42368349Sobrien        /*                              ESC          */
42468349Sobrien	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
42568349Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
42668349Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
42768349Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
42868349Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
42968349Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
43068349Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
43168349Sobrien	/*            NEL                            */
43268349Sobrien	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
43368349Sobrien	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
43468349Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
43568349Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
43668349Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
43768349Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
43868349Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
43968349Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
44068349Sobrien};
44168349Sobrien
442133359Sobrienprivate int
443133359Sobrienlooks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf,
444133359Sobrien    size_t *ulen)
44568349Sobrien{
44668349Sobrien	int i;
44768349Sobrien
44868349Sobrien	*ulen = 0;
44968349Sobrien
45068349Sobrien	for (i = 0; i < nbytes; i++) {
45168349Sobrien		int t = text_chars[buf[i]];
45268349Sobrien
45368349Sobrien		if (t != T)
45468349Sobrien			return 0;
45568349Sobrien
45668349Sobrien		ubuf[(*ulen)++] = buf[i];
45768349Sobrien	}
45868349Sobrien
45968349Sobrien	return 1;
46068349Sobrien}
46168349Sobrien
462133359Sobrienprivate int
463133359Sobrienlooks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
46468349Sobrien{
46568349Sobrien	int i;
46668349Sobrien
46768349Sobrien	*ulen = 0;
46868349Sobrien
46968349Sobrien	for (i = 0; i < nbytes; i++) {
47068349Sobrien		int t = text_chars[buf[i]];
47168349Sobrien
47268349Sobrien		if (t != T && t != I)
47368349Sobrien			return 0;
47468349Sobrien
47568349Sobrien		ubuf[(*ulen)++] = buf[i];
47668349Sobrien	}
47768349Sobrien
47868349Sobrien	return 1;
47968349Sobrien}
48068349Sobrien
481133359Sobrienprivate int
482133359Sobrienlooks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf,
483133359Sobrien    size_t *ulen)
48468349Sobrien{
48568349Sobrien	int i;
48668349Sobrien
48768349Sobrien	*ulen = 0;
48868349Sobrien
48968349Sobrien	for (i = 0; i < nbytes; i++) {
49068349Sobrien		int t = text_chars[buf[i]];
49168349Sobrien
49268349Sobrien		if (t != T && t != I && t != X)
49368349Sobrien			return 0;
49468349Sobrien
49568349Sobrien		ubuf[(*ulen)++] = buf[i];
49668349Sobrien	}
49768349Sobrien
49868349Sobrien	return 1;
49968349Sobrien}
50068349Sobrien
501133359Sobrienprivate int
502133359Sobrienlooks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
50368349Sobrien{
50468349Sobrien	int i, n;
50568349Sobrien	unichar c;
50668349Sobrien	int gotone = 0;
50768349Sobrien
50868349Sobrien	*ulen = 0;
50968349Sobrien
51068349Sobrien	for (i = 0; i < nbytes; i++) {
51168349Sobrien		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
51268349Sobrien			/*
51368349Sobrien			 * Even if the whole file is valid UTF-8 sequences,
51468349Sobrien			 * still reject it if it uses weird control characters.
51568349Sobrien			 */
51668349Sobrien
51768349Sobrien			if (text_chars[buf[i]] != T)
51868349Sobrien				return 0;
51968349Sobrien
52068349Sobrien			ubuf[(*ulen)++] = buf[i];
52168349Sobrien		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
52268349Sobrien			return 0;
52368349Sobrien		} else {			   /* 11xxxxxx begins UTF-8 */
52468349Sobrien			int following;
52568349Sobrien
52668349Sobrien			if ((buf[i] & 0x20) == 0) {		/* 110xxxxx */
52768349Sobrien				c = buf[i] & 0x1f;
52868349Sobrien				following = 1;
52968349Sobrien			} else if ((buf[i] & 0x10) == 0) {	/* 1110xxxx */
53068349Sobrien				c = buf[i] & 0x0f;
53168349Sobrien				following = 2;
53268349Sobrien			} else if ((buf[i] & 0x08) == 0) {	/* 11110xxx */
53368349Sobrien				c = buf[i] & 0x07;
53468349Sobrien				following = 3;
53568349Sobrien			} else if ((buf[i] & 0x04) == 0) {	/* 111110xx */
53668349Sobrien				c = buf[i] & 0x03;
53768349Sobrien				following = 4;
53868349Sobrien			} else if ((buf[i] & 0x02) == 0) {	/* 1111110x */
53968349Sobrien				c = buf[i] & 0x01;
54068349Sobrien				following = 5;
54168349Sobrien			} else
54268349Sobrien				return 0;
54368349Sobrien
54468349Sobrien			for (n = 0; n < following; n++) {
54568349Sobrien				i++;
54668349Sobrien				if (i >= nbytes)
54768349Sobrien					goto done;
54868349Sobrien
54968349Sobrien				if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
55068349Sobrien					return 0;
55168349Sobrien
55268349Sobrien				c = (c << 6) + (buf[i] & 0x3f);
55368349Sobrien			}
55468349Sobrien
55568349Sobrien			ubuf[(*ulen)++] = c;
55668349Sobrien			gotone = 1;
55768349Sobrien		}
55868349Sobrien	}
55968349Sobriendone:
56068349Sobrien	return gotone;   /* don't claim it's UTF-8 if it's all 7-bit */
56168349Sobrien}
56268349Sobrien
563133359Sobrienprivate int
564133359Sobrienlooks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf,
565133359Sobrien    size_t *ulen)
56668349Sobrien{
56768349Sobrien	int bigend;
56868349Sobrien	int i;
56968349Sobrien
57068349Sobrien	if (nbytes < 2)
57168349Sobrien		return 0;
57268349Sobrien
57368349Sobrien	if (buf[0] == 0xff && buf[1] == 0xfe)
57468349Sobrien		bigend = 0;
57568349Sobrien	else if (buf[0] == 0xfe && buf[1] == 0xff)
57668349Sobrien		bigend = 1;
57768349Sobrien	else
57868349Sobrien		return 0;
57968349Sobrien
58068349Sobrien	*ulen = 0;
58168349Sobrien
58268349Sobrien	for (i = 2; i + 1 < nbytes; i += 2) {
58368349Sobrien		/* XXX fix to properly handle chars > 65536 */
58468349Sobrien
58568349Sobrien		if (bigend)
58668349Sobrien			ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
58768349Sobrien		else
58868349Sobrien			ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
58968349Sobrien
59068349Sobrien		if (ubuf[*ulen - 1] == 0xfffe)
59168349Sobrien			return 0;
592133359Sobrien		if (ubuf[*ulen - 1] < 128 &&
593133359Sobrien		    text_chars[(size_t)ubuf[*ulen - 1]] != T)
59468349Sobrien			return 0;
59568349Sobrien	}
59668349Sobrien
597110949Sobrien	return 1 + bigend;
59868349Sobrien}
59968349Sobrien
60068349Sobrien#undef F
60168349Sobrien#undef T
60268349Sobrien#undef I
60368349Sobrien#undef X
60468349Sobrien
60568349Sobrien/*
60668349Sobrien * This table maps each EBCDIC character to an (8-bit extended) ASCII
60768349Sobrien * character, as specified in the rationale for the dd(1) command in
60868349Sobrien * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
60968349Sobrien *
61068349Sobrien * Unfortunately it does not seem to correspond exactly to any of the
61168349Sobrien * five variants of EBCDIC documented in IBM's _Enterprise Systems
61268349Sobrien * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
61368349Sobrien * Edition, July, 1999, pp. I-1 - I-4.
61468349Sobrien *
61568349Sobrien * Fortunately, though, all versions of EBCDIC, including this one, agree
61668349Sobrien * on most of the printing characters that also appear in (7-bit) ASCII.
61768349Sobrien * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
61868349Sobrien *
61968349Sobrien * Fortunately too, there is general agreement that codes 0x00 through
62068349Sobrien * 0x3F represent control characters, 0x41 a nonbreaking space, and the
62168349Sobrien * remainder printing characters.
62268349Sobrien *
62368349Sobrien * This is sufficient to allow us to identify EBCDIC text and to distinguish
62468349Sobrien * between old-style and internationalized examples of text.
62568349Sobrien */
62668349Sobrien
627133359Sobrienprivate unsigned char ebcdic_to_ascii[] = {
62868349Sobrien  0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
62968349Sobrien 16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
63068349Sobrien128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
63168349Sobrien144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
63268349Sobrien' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
63368349Sobrien'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
63468349Sobrien'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
63568349Sobrien186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
63668349Sobrien195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
63768349Sobrien202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
63868349Sobrien209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
63968349Sobrien216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
64068349Sobrien'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
64168349Sobrien'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
64268349Sobrien'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
64368349Sobrien'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
64468349Sobrien};
64568349Sobrien
#ifdef notdef
/*
 * The following EBCDIC-to-ASCII table may relate more closely to reality,
 * or at least to modern reality.  It comes from
 *
 *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
 *
 * and maps the characters of EBCDIC code page 1047 (the code used for
 * Unix-derived software on IBM's 390 systems) to the corresponding
 * characters from ISO 8859-1.
 *
 * If this table is used instead of the above one, some of the special
 * cases for the NEL character can be taken out of the code.
 *
 * The table is only compiled when "notdef" is defined, and none of the
 * code visible in this file refers to it; it is kept as an alternative
 * to the table actually used for translation.
 */

private unsigned char ebcdic_1047_to_8859[] = {
0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
};
#endif
68068349Sobrien
68168349Sobrien/*
68268349Sobrien * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
68368349Sobrien */
684133359Sobrienprivate void
685133359Sobrienfrom_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
68668349Sobrien{
68768349Sobrien	int i;
68868349Sobrien
68968349Sobrien	for (i = 0; i < nbytes; i++) {
69068349Sobrien		out[i] = ebcdic_to_ascii[buf[i]];
69168349Sobrien	}
69268349Sobrien}
693