ascmagic.c revision 159764
168349Sobrien/*
2133359Sobrien * Copyright (c) Ian F. Darwin 1986-1995.
3133359Sobrien * Software written by Ian F. Darwin and others;
4133359Sobrien * maintained 1995-present by Christos Zoulas and others.
5133359Sobrien *
6133359Sobrien * Redistribution and use in source and binary forms, with or without
7133359Sobrien * modification, are permitted provided that the following conditions
8133359Sobrien * are met:
9133359Sobrien * 1. Redistributions of source code must retain the above copyright
10133359Sobrien *    notice immediately at the beginning of the file, without modification,
11133359Sobrien *    this list of conditions, and the following disclaimer.
12133359Sobrien * 2. Redistributions in binary form must reproduce the above copyright
13133359Sobrien *    notice, this list of conditions and the following disclaimer in the
14133359Sobrien *    documentation and/or other materials provided with the distribution.
15133359Sobrien *
16133359Sobrien * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17133359Sobrien * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18133359Sobrien * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19133359Sobrien * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
20133359Sobrien * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21133359Sobrien * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22133359Sobrien * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23133359Sobrien * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24133359Sobrien * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25133359Sobrien * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26133359Sobrien * SUCH DAMAGE.
27133359Sobrien */
28133359Sobrien/*
2968349Sobrien * ASCII magic -- file types that we know based on keywords
3068349Sobrien * that can appear anywhere in the file.
3168349Sobrien *
3268349Sobrien * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
3368349Sobrien * to handle character codes other than ASCII on a unified basis.
3468349Sobrien *
3568349Sobrien * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
3668349Sobrien * international characters, now subsumed into this file.
3768349Sobrien */
3868349Sobrien
3968349Sobrien#include "file.h"
40133359Sobrien#include "magic.h"
41133359Sobrien#include <stdio.h>
4268349Sobrien#include <string.h>
4368349Sobrien#include <memory.h>
4468349Sobrien#include <ctype.h>
4568349Sobrien#include <stdlib.h>
4668349Sobrien#ifdef HAVE_UNISTD_H
4768349Sobrien#include <unistd.h>
4868349Sobrien#endif
4968349Sobrien#include "names.h"
5068349Sobrien
5168349Sobrien#ifndef	lint
52159764SobrienFILE_RCSID("@(#)$Id: ascmagic.c,v 1.45 2006/03/12 22:09:33 christos Exp $")
5368349Sobrien#endif	/* lint */
5468349Sobrien
5568349Sobrientypedef unsigned long unichar;
5668349Sobrien
5768349Sobrien#define MAXLINELEN 300	/* longest sane line length */
5868349Sobrien#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
5968349Sobrien		  || (x) == 0x85 || (x) == '\f')
6068349Sobrien
61133359Sobrienprivate int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
62133359Sobrienprivate int looks_utf8(const unsigned char *, size_t, unichar *, size_t *);
63133359Sobrienprivate int looks_unicode(const unsigned char *, size_t, unichar *, size_t *);
64133359Sobrienprivate int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
65133359Sobrienprivate int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
66133359Sobrienprivate void from_ebcdic(const unsigned char *, size_t, unsigned char *);
67133359Sobrienprivate int ascmatch(const unsigned char *, const unichar *, size_t);
6868349Sobrien
69133359Sobrien
70133359Sobrienprotected int
71133359Sobrienfile_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
7268349Sobrien{
73133359Sobrien	size_t i;
74159764Sobrien	unsigned char *nbuf = NULL;
75159764Sobrien	unichar *ubuf = NULL;
76133359Sobrien	size_t ulen;
7768349Sobrien	struct names *p;
78159764Sobrien	int rv = -1;
7968349Sobrien
80133359Sobrien	const char *code = NULL;
81133359Sobrien	const char *code_mime = NULL;
82133359Sobrien	const char *type = NULL;
83133359Sobrien	const char *subtype = NULL;
84133359Sobrien	const char *subtype_mime = NULL;
8568349Sobrien
8668349Sobrien	int has_escapes = 0;
8768349Sobrien	int has_backspace = 0;
88159764Sobrien	int seen_cr = 0;
8968349Sobrien
9068349Sobrien	int n_crlf = 0;
9168349Sobrien	int n_lf = 0;
9268349Sobrien	int n_cr = 0;
9368349Sobrien	int n_nel = 0;
9468349Sobrien
9568349Sobrien	int last_line_end = -1;
9668349Sobrien	int has_long_lines = 0;
9768349Sobrien
9868349Sobrien	/*
9984685Sobrien	 * Undo the NUL-termination kindly provided by process()
10084685Sobrien	 * but leave at least one byte to look at
10184685Sobrien	 */
10284685Sobrien	while (nbytes > 1 && buf[nbytes - 1] == '\0')
10368349Sobrien		nbytes--;
10468349Sobrien
105159764Sobrien	if ((nbuf = malloc((nbytes + 1) * sizeof(nbuf[0]))) == NULL)
106159764Sobrien		goto done;
107159764Sobrien	if ((ubuf = malloc((nbytes + 1) * sizeof(ubuf[0]))) == NULL)
108159764Sobrien		goto done;
109133359Sobrien
11068349Sobrien	/*
11168349Sobrien	 * Then try to determine whether it's any character code we can
11268349Sobrien	 * identify.  Each of these tests, if it succeeds, will leave
11368349Sobrien	 * the text converted into one-unichar-per-character Unicode in
11468349Sobrien	 * ubuf, and the number of characters converted in ulen.
11568349Sobrien	 */
11668349Sobrien	if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
11768349Sobrien		code = "ASCII";
11868349Sobrien		code_mime = "us-ascii";
11968349Sobrien		type = "text";
12068349Sobrien	} else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
12168349Sobrien		code = "UTF-8 Unicode";
12268349Sobrien		code_mime = "utf-8";
12368349Sobrien		type = "text";
124133359Sobrien	} else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) {
12568349Sobrien		if (i == 1)
12668349Sobrien			code = "Little-endian UTF-16 Unicode";
12768349Sobrien		else
12868349Sobrien			code = "Big-endian UTF-16 Unicode";
12968349Sobrien
13068349Sobrien		type = "character data";
13168349Sobrien		code_mime = "utf-16";    /* is this defined? */
13268349Sobrien	} else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
13368349Sobrien		code = "ISO-8859";
13468349Sobrien		type = "text";
13568349Sobrien		code_mime = "iso-8859-1";
13668349Sobrien	} else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
13768349Sobrien		code = "Non-ISO extended-ASCII";
13868349Sobrien		type = "text";
13968349Sobrien		code_mime = "unknown";
14068349Sobrien	} else {
14168349Sobrien		from_ebcdic(buf, nbytes, nbuf);
14268349Sobrien
14368349Sobrien		if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
14468349Sobrien			code = "EBCDIC";
14568349Sobrien			type = "character data";
14668349Sobrien			code_mime = "ebcdic";
14768349Sobrien		} else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
14868349Sobrien			code = "International EBCDIC";
14968349Sobrien			type = "character data";
15068349Sobrien			code_mime = "ebcdic";
15168349Sobrien		} else {
152159764Sobrien			rv = 0;
153159764Sobrien			goto done;  /* doesn't look like text at all */
15468349Sobrien		}
15568349Sobrien	}
15668349Sobrien
157159764Sobrien	if (nbytes <= 1) {
158159764Sobrien		rv = 0;
159159764Sobrien		goto done;
160159764Sobrien	}
161159764Sobrien
16268349Sobrien	/*
16368349Sobrien	 * for troff, look for . + letter + letter or .\";
16468349Sobrien	 * this must be done to disambiguate tar archives' ./file
16568349Sobrien	 * and other trash from real troff input.
16668349Sobrien	 *
16768349Sobrien	 * I believe Plan 9 troff allows non-ASCII characters in the names
16868349Sobrien	 * of macros, so this test might possibly fail on such a file.
16968349Sobrien	 */
17068349Sobrien	if (*ubuf == '.') {
17168349Sobrien		unichar *tp = ubuf + 1;
17268349Sobrien
17368349Sobrien		while (ISSPC(*tp))
17468349Sobrien			++tp;	/* skip leading whitespace */
17568349Sobrien		if ((tp[0] == '\\' && tp[1] == '\"') ||
176133359Sobrien		    (isascii((unsigned char)tp[0]) &&
177133359Sobrien		     isalnum((unsigned char)tp[0]) &&
178133359Sobrien		     isascii((unsigned char)tp[1]) &&
179133359Sobrien		     isalnum((unsigned char)tp[1]) &&
18068349Sobrien		     ISSPC(tp[2]))) {
18168349Sobrien			subtype_mime = "text/troff";
18268349Sobrien			subtype = "troff or preprocessor input";
18368349Sobrien			goto subtype_identified;
18468349Sobrien		}
18568349Sobrien	}
18668349Sobrien
18768349Sobrien	if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
18868349Sobrien		subtype_mime = "text/fortran";
18968349Sobrien		subtype = "fortran program";
19068349Sobrien		goto subtype_identified;
19168349Sobrien	}
19268349Sobrien
19368349Sobrien	/* look for tokens from names.h - this is expensive! */
19468349Sobrien
19568349Sobrien	i = 0;
19668349Sobrien	while (i < ulen) {
197133359Sobrien		size_t end;
19868349Sobrien
19968349Sobrien		/*
20068349Sobrien		 * skip past any leading space
20168349Sobrien		 */
20268349Sobrien		while (i < ulen && ISSPC(ubuf[i]))
20368349Sobrien			i++;
20468349Sobrien		if (i >= ulen)
20568349Sobrien			break;
20668349Sobrien
20768349Sobrien		/*
20868349Sobrien		 * find the next whitespace
20968349Sobrien		 */
21068349Sobrien		for (end = i + 1; end < nbytes; end++)
21168349Sobrien			if (ISSPC(ubuf[end]))
21268349Sobrien				break;
21368349Sobrien
21468349Sobrien		/*
21568349Sobrien		 * compare the word thus isolated against the token list
21668349Sobrien		 */
21768349Sobrien		for (p = names; p < names + NNAMES; p++) {
218133359Sobrien			if (ascmatch((const unsigned char *)p->name, ubuf + i,
219110949Sobrien			    end - i)) {
22068349Sobrien				subtype = types[p->type].human;
22168349Sobrien				subtype_mime = types[p->type].mime;
22268349Sobrien				goto subtype_identified;
22368349Sobrien			}
22468349Sobrien		}
22568349Sobrien
22668349Sobrien		i = end;
22768349Sobrien	}
22868349Sobrien
22968349Sobriensubtype_identified:
23068349Sobrien
23168349Sobrien	/*
23268349Sobrien	 * Now try to discover other details about the file.
23368349Sobrien	 */
23468349Sobrien	for (i = 0; i < ulen; i++) {
235159764Sobrien		if (ubuf[i] == '\n') {
236159764Sobrien			if (seen_cr)
237159764Sobrien				n_crlf++;
238159764Sobrien			else
239159764Sobrien				n_lf++;
240159764Sobrien			last_line_end = i;
241159764Sobrien		} else if (seen_cr)
242159764Sobrien			n_cr++;
243159764Sobrien
244159764Sobrien		seen_cr = (ubuf[i] == '\r');
245159764Sobrien		if (seen_cr)
246159764Sobrien			last_line_end = i;
247159764Sobrien
248159764Sobrien		if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
249159764Sobrien			n_nel++;
250159764Sobrien			last_line_end = i;
251159764Sobrien		}
252159764Sobrien
253159764Sobrien		/* If this line is _longer_ than MAXLINELEN, remember it. */
25468349Sobrien		if (i > last_line_end + MAXLINELEN)
25568349Sobrien			has_long_lines = 1;
25668349Sobrien
25768349Sobrien		if (ubuf[i] == '\033')
25868349Sobrien			has_escapes = 1;
25968349Sobrien		if (ubuf[i] == '\b')
26068349Sobrien			has_backspace = 1;
26168349Sobrien	}
26268349Sobrien
263159764Sobrien	/* Beware, if the data has been truncated, the final CR could have
264159764Sobrien	   been followed by a LF.  If we have HOWMANY bytes, it indicates
265159764Sobrien	   that the data might have been truncated, probably even before
266159764Sobrien	   this function was called. */
267159764Sobrien	if (seen_cr && nbytes < HOWMANY)
268159764Sobrien		n_cr++;
269159764Sobrien
270133359Sobrien	if ((ms->flags & MAGIC_MIME)) {
271133359Sobrien		if (subtype_mime) {
272133359Sobrien			if (file_printf(ms, subtype_mime) == -1)
273159764Sobrien				goto done;
274133359Sobrien		} else {
275133359Sobrien			if (file_printf(ms, "text/plain") == -1)
276159764Sobrien				goto done;
277133359Sobrien		}
27868349Sobrien
27968349Sobrien		if (code_mime) {
280133359Sobrien			if (file_printf(ms, "; charset=") == -1)
281159764Sobrien				goto done;
282133359Sobrien			if (file_printf(ms, code_mime) == -1)
283159764Sobrien				goto done;
28468349Sobrien		}
28568349Sobrien	} else {
286133359Sobrien		if (file_printf(ms, code) == -1)
287159764Sobrien			goto done;
28868349Sobrien
28968349Sobrien		if (subtype) {
290133359Sobrien			if (file_printf(ms, " ") == -1)
291159764Sobrien				goto done;
292133359Sobrien			if (file_printf(ms, subtype) == -1)
293159764Sobrien				goto done;
29468349Sobrien		}
29568349Sobrien
296133359Sobrien		if (file_printf(ms, " ") == -1)
297159764Sobrien			goto done;
298133359Sobrien		if (file_printf(ms, type) == -1)
299159764Sobrien			goto done;
30068349Sobrien
30168349Sobrien		if (has_long_lines)
302133359Sobrien			if (file_printf(ms, ", with very long lines") == -1)
303159764Sobrien				goto done;
30468349Sobrien
30568349Sobrien		/*
30668349Sobrien		 * Only report line terminators if we find one other than LF,
30768349Sobrien		 * or if we find none at all.
30868349Sobrien		 */
30968349Sobrien		if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
31068349Sobrien		    (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
311133359Sobrien			if (file_printf(ms, ", with") == -1)
312159764Sobrien				goto done;
31368349Sobrien
314133359Sobrien			if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0)			{
315133359Sobrien				if (file_printf(ms, " no") == -1)
316159764Sobrien					goto done;
317133359Sobrien			} else {
31868349Sobrien				if (n_crlf) {
319133359Sobrien					if (file_printf(ms, " CRLF") == -1)
320159764Sobrien						goto done;
32168349Sobrien					if (n_cr || n_lf || n_nel)
322133359Sobrien						if (file_printf(ms, ",") == -1)
323159764Sobrien							goto done;
32468349Sobrien				}
32568349Sobrien				if (n_cr) {
326133359Sobrien					if (file_printf(ms, " CR") == -1)
327159764Sobrien						goto done;
32868349Sobrien					if (n_lf || n_nel)
329133359Sobrien						if (file_printf(ms, ",") == -1)
330159764Sobrien							goto done;
33168349Sobrien				}
33268349Sobrien				if (n_lf) {
333133359Sobrien					if (file_printf(ms, " LF") == -1)
334159764Sobrien						goto done;
33568349Sobrien					if (n_nel)
336133359Sobrien						if (file_printf(ms, ",") == -1)
337159764Sobrien							goto done;
33868349Sobrien				}
33968349Sobrien				if (n_nel)
340133359Sobrien					if (file_printf(ms, " NEL") == -1)
341159764Sobrien						goto done;
34268349Sobrien			}
34368349Sobrien
344133359Sobrien			if (file_printf(ms, " line terminators") == -1)
345159764Sobrien				goto done;
34668349Sobrien		}
34768349Sobrien
34868349Sobrien		if (has_escapes)
349133359Sobrien			if (file_printf(ms, ", with escape sequences") == -1)
350159764Sobrien				goto done;
35168349Sobrien		if (has_backspace)
352133359Sobrien			if (file_printf(ms, ", with overstriking") == -1)
353159764Sobrien				goto done;
35468349Sobrien	}
355159764Sobrien	rv = 1;
356159764Sobriendone:
357159764Sobrien	if (nbuf)
358159764Sobrien		free(nbuf);
359159764Sobrien	if (ubuf)
360159764Sobrien		free(ubuf);
36168349Sobrien
362159764Sobrien	return rv;
36368349Sobrien}
36468349Sobrien
365133359Sobrienprivate int
366133359Sobrienascmatch(const unsigned char *s, const unichar *us, size_t ulen)
36768349Sobrien{
36868349Sobrien	size_t i;
36968349Sobrien
37068349Sobrien	for (i = 0; i < ulen; i++) {
37168349Sobrien		if (s[i] != us[i])
37268349Sobrien			return 0;
37368349Sobrien	}
37468349Sobrien
37568349Sobrien	if (s[i])
37668349Sobrien		return 0;
37768349Sobrien	else
37868349Sobrien		return 1;
37968349Sobrien}
38068349Sobrien
38168349Sobrien/*
38268349Sobrien * This table reflects a particular philosophy about what constitutes
38368349Sobrien * "text," and there is room for disagreement about it.
38468349Sobrien *
38568349Sobrien * Version 3.31 of the file command considered a file to be ASCII if
38668349Sobrien * each of its characters was approved by either the isascii() or
38768349Sobrien * isalpha() function.  On most systems, this would mean that any
38868349Sobrien * file consisting only of characters in the range 0x00 ... 0x7F
38968349Sobrien * would be called ASCII text, but many systems might reasonably
39068349Sobrien * consider some characters outside this range to be alphabetic,
39168349Sobrien * so the file command would call such characters ASCII.  It might
39268349Sobrien * have been more accurate to call this "considered textual on the
39368349Sobrien * local system" than "ASCII."
39468349Sobrien *
39568349Sobrien * It considered a file to be "International language text" if each
39668349Sobrien * of its characters was either an ASCII printing character (according
39768349Sobrien * to the real ASCII standard, not the above test), a character in
39868349Sobrien * the range 0x80 ... 0xFF, or one of the following control characters:
39968349Sobrien * backspace, tab, line feed, vertical tab, form feed, carriage return,
40068349Sobrien * escape.  No attempt was made to determine the language in which files
40168349Sobrien * of this type were written.
40268349Sobrien *
40368349Sobrien *
40468349Sobrien * The table below considers a file to be ASCII if all of its characters
40568349Sobrien * are either ASCII printing characters (again, according to the X3.4
40668349Sobrien * standard, not isascii()) or any of the following controls: bell,
40768349Sobrien * backspace, tab, line feed, form feed, carriage return, esc, nextline.
40868349Sobrien *
40968349Sobrien * I include bell because some programs (particularly shell scripts)
41068349Sobrien * use it literally, even though it is rare in normal text.  I exclude
41168349Sobrien * vertical tab because it never seems to be used in real text.  I also
41268349Sobrien * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
41368349Sobrien * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
41468349Sobrien * character to.  It might be more appropriate to include it in the 8859
41568349Sobrien * set instead of the ASCII set, but it's got to be included in *something*
41668349Sobrien * we recognize or EBCDIC files aren't going to be considered textual.
41768349Sobrien * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
41868349Sobrien * and Latin characters, so these should possibly be allowed.  But they
41968349Sobrien * make a real mess on VT100-style displays if they're not paired properly,
42068349Sobrien * so we are probably better off not calling them text.
42168349Sobrien *
42268349Sobrien * A file is considered to be ISO-8859 text if its characters are all
42368349Sobrien * either ASCII, according to the above definition, or printing characters
42468349Sobrien * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
42568349Sobrien *
42668349Sobrien * Finally, a file is considered to be international text from some other
42768349Sobrien * character code if its characters are all either ISO-8859 (according to
42868349Sobrien * the above definition) or characters in the range 0x80 ... 0x9F, which
42968349Sobrien * ISO-8859 considers to be control characters but the IBM PC and Macintosh
43068349Sobrien * consider to be printing characters.
43168349Sobrien */
43268349Sobrien
43368349Sobrien#define F 0   /* character never appears in text */
43468349Sobrien#define T 1   /* character appears in plain ASCII text */
43568349Sobrien#define I 2   /* character appears in ISO-8859 text */
43668349Sobrien#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
43768349Sobrien
438133359Sobrienprivate char text_chars[256] = {
43968349Sobrien	/*                  BEL BS HT LF    FF CR    */
44068349Sobrien	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
44168349Sobrien        /*                              ESC          */
44268349Sobrien	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
44368349Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
44468349Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
44568349Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
44668349Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
44768349Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
44868349Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
44968349Sobrien	/*            NEL                            */
45068349Sobrien	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
45168349Sobrien	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
45268349Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
45368349Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
45468349Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
45568349Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
45668349Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
45768349Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
45868349Sobrien};
45968349Sobrien
460133359Sobrienprivate int
461133359Sobrienlooks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf,
462133359Sobrien    size_t *ulen)
46368349Sobrien{
46468349Sobrien	int i;
46568349Sobrien
46668349Sobrien	*ulen = 0;
46768349Sobrien
46868349Sobrien	for (i = 0; i < nbytes; i++) {
46968349Sobrien		int t = text_chars[buf[i]];
47068349Sobrien
47168349Sobrien		if (t != T)
47268349Sobrien			return 0;
47368349Sobrien
47468349Sobrien		ubuf[(*ulen)++] = buf[i];
47568349Sobrien	}
47668349Sobrien
47768349Sobrien	return 1;
47868349Sobrien}
47968349Sobrien
480133359Sobrienprivate int
481133359Sobrienlooks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
48268349Sobrien{
48368349Sobrien	int i;
48468349Sobrien
48568349Sobrien	*ulen = 0;
48668349Sobrien
48768349Sobrien	for (i = 0; i < nbytes; i++) {
48868349Sobrien		int t = text_chars[buf[i]];
48968349Sobrien
49068349Sobrien		if (t != T && t != I)
49168349Sobrien			return 0;
49268349Sobrien
49368349Sobrien		ubuf[(*ulen)++] = buf[i];
49468349Sobrien	}
49568349Sobrien
49668349Sobrien	return 1;
49768349Sobrien}
49868349Sobrien
499133359Sobrienprivate int
500133359Sobrienlooks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf,
501133359Sobrien    size_t *ulen)
50268349Sobrien{
50368349Sobrien	int i;
50468349Sobrien
50568349Sobrien	*ulen = 0;
50668349Sobrien
50768349Sobrien	for (i = 0; i < nbytes; i++) {
50868349Sobrien		int t = text_chars[buf[i]];
50968349Sobrien
51068349Sobrien		if (t != T && t != I && t != X)
51168349Sobrien			return 0;
51268349Sobrien
51368349Sobrien		ubuf[(*ulen)++] = buf[i];
51468349Sobrien	}
51568349Sobrien
51668349Sobrien	return 1;
51768349Sobrien}
51868349Sobrien
519133359Sobrienprivate int
520133359Sobrienlooks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
52168349Sobrien{
52268349Sobrien	int i, n;
52368349Sobrien	unichar c;
52468349Sobrien	int gotone = 0;
52568349Sobrien
52668349Sobrien	*ulen = 0;
52768349Sobrien
52868349Sobrien	for (i = 0; i < nbytes; i++) {
52968349Sobrien		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
53068349Sobrien			/*
53168349Sobrien			 * Even if the whole file is valid UTF-8 sequences,
53268349Sobrien			 * still reject it if it uses weird control characters.
53368349Sobrien			 */
53468349Sobrien
53568349Sobrien			if (text_chars[buf[i]] != T)
53668349Sobrien				return 0;
53768349Sobrien
53868349Sobrien			ubuf[(*ulen)++] = buf[i];
53968349Sobrien		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
54068349Sobrien			return 0;
54168349Sobrien		} else {			   /* 11xxxxxx begins UTF-8 */
54268349Sobrien			int following;
54368349Sobrien
54468349Sobrien			if ((buf[i] & 0x20) == 0) {		/* 110xxxxx */
54568349Sobrien				c = buf[i] & 0x1f;
54668349Sobrien				following = 1;
54768349Sobrien			} else if ((buf[i] & 0x10) == 0) {	/* 1110xxxx */
54868349Sobrien				c = buf[i] & 0x0f;
54968349Sobrien				following = 2;
55068349Sobrien			} else if ((buf[i] & 0x08) == 0) {	/* 11110xxx */
55168349Sobrien				c = buf[i] & 0x07;
55268349Sobrien				following = 3;
55368349Sobrien			} else if ((buf[i] & 0x04) == 0) {	/* 111110xx */
55468349Sobrien				c = buf[i] & 0x03;
55568349Sobrien				following = 4;
55668349Sobrien			} else if ((buf[i] & 0x02) == 0) {	/* 1111110x */
55768349Sobrien				c = buf[i] & 0x01;
55868349Sobrien				following = 5;
55968349Sobrien			} else
56068349Sobrien				return 0;
56168349Sobrien
56268349Sobrien			for (n = 0; n < following; n++) {
56368349Sobrien				i++;
56468349Sobrien				if (i >= nbytes)
56568349Sobrien					goto done;
56668349Sobrien
56768349Sobrien				if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
56868349Sobrien					return 0;
56968349Sobrien
57068349Sobrien				c = (c << 6) + (buf[i] & 0x3f);
57168349Sobrien			}
57268349Sobrien
57368349Sobrien			ubuf[(*ulen)++] = c;
57468349Sobrien			gotone = 1;
57568349Sobrien		}
57668349Sobrien	}
57768349Sobriendone:
57868349Sobrien	return gotone;   /* don't claim it's UTF-8 if it's all 7-bit */
57968349Sobrien}
58068349Sobrien
581133359Sobrienprivate int
582133359Sobrienlooks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf,
583133359Sobrien    size_t *ulen)
58468349Sobrien{
58568349Sobrien	int bigend;
58668349Sobrien	int i;
58768349Sobrien
58868349Sobrien	if (nbytes < 2)
58968349Sobrien		return 0;
59068349Sobrien
59168349Sobrien	if (buf[0] == 0xff && buf[1] == 0xfe)
59268349Sobrien		bigend = 0;
59368349Sobrien	else if (buf[0] == 0xfe && buf[1] == 0xff)
59468349Sobrien		bigend = 1;
59568349Sobrien	else
59668349Sobrien		return 0;
59768349Sobrien
59868349Sobrien	*ulen = 0;
59968349Sobrien
60068349Sobrien	for (i = 2; i + 1 < nbytes; i += 2) {
60168349Sobrien		/* XXX fix to properly handle chars > 65536 */
60268349Sobrien
60368349Sobrien		if (bigend)
60468349Sobrien			ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
60568349Sobrien		else
60668349Sobrien			ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
60768349Sobrien
60868349Sobrien		if (ubuf[*ulen - 1] == 0xfffe)
60968349Sobrien			return 0;
610133359Sobrien		if (ubuf[*ulen - 1] < 128 &&
611133359Sobrien		    text_chars[(size_t)ubuf[*ulen - 1]] != T)
61268349Sobrien			return 0;
61368349Sobrien	}
61468349Sobrien
615110949Sobrien	return 1 + bigend;
61668349Sobrien}
61768349Sobrien
61868349Sobrien#undef F
61968349Sobrien#undef T
62068349Sobrien#undef I
62168349Sobrien#undef X
62268349Sobrien
62368349Sobrien/*
62468349Sobrien * This table maps each EBCDIC character to an (8-bit extended) ASCII
62568349Sobrien * character, as specified in the rationale for the dd(1) command in
62668349Sobrien * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
62768349Sobrien *
62868349Sobrien * Unfortunately it does not seem to correspond exactly to any of the
62968349Sobrien * five variants of EBCDIC documented in IBM's _Enterprise Systems
63068349Sobrien * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
63168349Sobrien * Edition, July, 1999, pp. I-1 - I-4.
63268349Sobrien *
63368349Sobrien * Fortunately, though, all versions of EBCDIC, including this one, agree
63468349Sobrien * on most of the printing characters that also appear in (7-bit) ASCII.
63568349Sobrien * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
63668349Sobrien *
63768349Sobrien * Fortunately too, there is general agreement that codes 0x00 through
63868349Sobrien * 0x3F represent control characters, 0x41 a nonbreaking space, and the
63968349Sobrien * remainder printing characters.
64068349Sobrien *
64168349Sobrien * This is sufficient to allow us to identify EBCDIC text and to distinguish
64268349Sobrien * between old-style and internationalized examples of text.
64368349Sobrien */
64468349Sobrien
645133359Sobrienprivate unsigned char ebcdic_to_ascii[] = {
64668349Sobrien  0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
64768349Sobrien 16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
64868349Sobrien128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
64968349Sobrien144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
65068349Sobrien' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
65168349Sobrien'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
65268349Sobrien'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
65368349Sobrien186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
65468349Sobrien195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
65568349Sobrien202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
65668349Sobrien209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
65768349Sobrien216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
65868349Sobrien'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
65968349Sobrien'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
66068349Sobrien'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
66168349Sobrien'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
66268349Sobrien};
66368349Sobrien
664133359Sobrien#ifdef notdef
66568349Sobrien/*
66668349Sobrien * The following EBCDIC-to-ASCII table may relate more closely to reality,
66768349Sobrien * or at least to modern reality.  It comes from
66868349Sobrien *
66968349Sobrien *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
67068349Sobrien *
67168349Sobrien * and maps the characters of EBCDIC code page 1047 (the code used for
67268349Sobrien * Unix-derived software on IBM's 390 systems) to the corresponding
67368349Sobrien * characters from ISO 8859-1.
67468349Sobrien *
67568349Sobrien * If this table is used instead of the above one, some of the special
67668349Sobrien * cases for the NEL character can be taken out of the code.
67768349Sobrien */
67868349Sobrien
679133359Sobrienprivate unsigned char ebcdic_1047_to_8859[] = {
68068349Sobrien0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
68168349Sobrien0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
68268349Sobrien0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
68368349Sobrien0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
68468349Sobrien0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
68568349Sobrien0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
68668349Sobrien0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
68768349Sobrien0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
68868349Sobrien0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
68968349Sobrien0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
69068349Sobrien0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
69168349Sobrien0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
69268349Sobrien0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
69368349Sobrien0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
69468349Sobrien0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
69568349Sobrien0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
69668349Sobrien};
697133359Sobrien#endif
69868349Sobrien
69968349Sobrien/*
70068349Sobrien * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
70168349Sobrien */
702133359Sobrienprivate void
703133359Sobrienfrom_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
70468349Sobrien{
70568349Sobrien	int i;
70668349Sobrien
70768349Sobrien	for (i = 0; i < nbytes; i++) {
70868349Sobrien		out[i] = ebcdic_to_ascii[buf[i]];
70968349Sobrien	}
71068349Sobrien}
711