contrib/file/ascmagic.c

68349Sobrien/*
133359Sobrien * Copyright (c) Ian F. Darwin 1986-1995.
133359Sobrien * Software written by Ian F. Darwin and others;
133359Sobrien * maintained 1995-present by Christos Zoulas and others.
133359Sobrien *
133359Sobrien * Redistribution and use in source and binary forms, with or without
133359Sobrien * modification, are permitted provided that the following conditions
133359Sobrien * are met:
133359Sobrien * 1. Redistributions of source code must retain the above copyright
133359Sobrien *    notice immediately at the beginning of the file, without modification,
133359Sobrien *    this list of conditions, and the following disclaimer.
133359Sobrien * 2. Redistributions in binary form must reproduce the above copyright
133359Sobrien *    notice, this list of conditions and the following disclaimer in the
133359Sobrien *    documentation and/or other materials provided with the distribution.
133359Sobrien *
133359Sobrien * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
133359Sobrien * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
133359Sobrien * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
133359Sobrien * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
133359Sobrien * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
133359Sobrien * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
133359Sobrien * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
133359Sobrien * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
133359Sobrien * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
133359Sobrien * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
133359Sobrien * SUCH DAMAGE.
133359Sobrien */
133359Sobrien/*
68349Sobrien * ASCII magic -- file types that we know based on keywords
68349Sobrien * that can appear anywhere in the file.
68349Sobrien *
68349Sobrien * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
68349Sobrien * to handle character codes other than ASCII on a unified basis.
68349Sobrien *
68349Sobrien * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
68349Sobrien * international characters, now subsumed into this file.
68349Sobrien */
68349Sobrien
68349Sobrien#include "file.h"
133359Sobrien#include "magic.h"
133359Sobrien#include <stdio.h>
68349Sobrien#include <string.h>
68349Sobrien#include <memory.h>
68349Sobrien#include <ctype.h>
68349Sobrien#include <stdlib.h>
68349Sobrien#ifdef HAVE_UNISTD_H
68349Sobrien#include <unistd.h>
68349Sobrien#endif
68349Sobrien#include "names.h"
68349Sobrien
68349Sobrien#ifndef	lint
159764SobrienFILE_RCSID("@(#)$Id: ascmagic.c,v 1.45 2006/03/12 22:09:33 christos Exp $")
68349Sobrien#endif	/* lint */
68349Sobrien
68349Sobrientypedef unsigned long unichar;
68349Sobrien
68349Sobrien#define MAXLINELEN 300	/* longest sane line length */
68349Sobrien#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
68349Sobrien		  || (x) == 0x85 || (x) == '\f')
68349Sobrien
133359Sobrienprivate int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
133359Sobrienprivate int looks_utf8(const unsigned char *, size_t, unichar *, size_t *);
133359Sobrienprivate int looks_unicode(const unsigned char *, size_t, unichar *, size_t *);
133359Sobrienprivate int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
133359Sobrienprivate int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
133359Sobrienprivate void from_ebcdic(const unsigned char *, size_t, unsigned char *);
133359Sobrienprivate int ascmatch(const unsigned char *, const unichar *, size_t);
68349Sobrien
133359Sobrien
133359Sobrienprotected int
133359Sobrienfile_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
68349Sobrien{
133359Sobrien	size_t i;
159764Sobrien	unsigned char *nbuf = NULL;
159764Sobrien	unichar *ubuf = NULL;
133359Sobrien	size_t ulen;
68349Sobrien	struct names *p;
159764Sobrien	int rv = -1;
68349Sobrien
133359Sobrien	const char *code = NULL;
133359Sobrien	const char *code_mime = NULL;
133359Sobrien	const char *type = NULL;
133359Sobrien	const char *subtype = NULL;
133359Sobrien	const char *subtype_mime = NULL;
68349Sobrien
68349Sobrien	int has_escapes = 0;
68349Sobrien	int has_backspace = 0;
159764Sobrien	int seen_cr = 0;
68349Sobrien
68349Sobrien	int n_crlf = 0;
68349Sobrien	int n_lf = 0;
68349Sobrien	int n_cr = 0;
68349Sobrien	int n_nel = 0;
68349Sobrien
68349Sobrien	int last_line_end = -1;
68349Sobrien	int has_long_lines = 0;
68349Sobrien
68349Sobrien	/*
84685Sobrien	 * Undo the NUL-termination kindly provided by process()
84685Sobrien	 * but leave at least one byte to look at
84685Sobrien	 */
84685Sobrien	while (nbytes > 1 && buf[nbytes - 1] == '\0')
68349Sobrien		nbytes--;
68349Sobrien
159764Sobrien	if ((nbuf = malloc((nbytes + 1) * sizeof(nbuf[0]))) == NULL)
159764Sobrien		goto done;
159764Sobrien	if ((ubuf = malloc((nbytes + 1) * sizeof(ubuf[0]))) == NULL)
159764Sobrien		goto done;
133359Sobrien
68349Sobrien	/*
68349Sobrien	 * Then try to determine whether it's any character code we can
68349Sobrien	 * identify.  Each of these tests, if it succeeds, will leave
68349Sobrien	 * the text converted into one-unichar-per-character Unicode in
68349Sobrien	 * ubuf, and the number of characters converted in ulen.
68349Sobrien	 */
68349Sobrien	if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
68349Sobrien		code = "ASCII";
68349Sobrien		code_mime = "us-ascii";
68349Sobrien		type = "text";
68349Sobrien	} else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
68349Sobrien		code = "UTF-8 Unicode";
68349Sobrien		code_mime = "utf-8";
68349Sobrien		type = "text";
133359Sobrien	} else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) {
68349Sobrien		if (i == 1)
68349Sobrien			code = "Little-endian UTF-16 Unicode";
68349Sobrien		else
68349Sobrien			code = "Big-endian UTF-16 Unicode";
68349Sobrien
68349Sobrien		type = "character data";
68349Sobrien		code_mime = "utf-16";    /* is this defined? */
68349Sobrien	} else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
68349Sobrien		code = "ISO-8859";
68349Sobrien		type = "text";
68349Sobrien		code_mime = "iso-8859-1";
68349Sobrien	} else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
68349Sobrien		code = "Non-ISO extended-ASCII";
68349Sobrien		type = "text";
68349Sobrien		code_mime = "unknown";
68349Sobrien	} else {
68349Sobrien		from_ebcdic(buf, nbytes, nbuf);
68349Sobrien
68349Sobrien		if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
68349Sobrien			code = "EBCDIC";
68349Sobrien			type = "character data";
68349Sobrien			code_mime = "ebcdic";
68349Sobrien		} else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
68349Sobrien			code = "International EBCDIC";
68349Sobrien			type = "character data";
68349Sobrien			code_mime = "ebcdic";
68349Sobrien		} else {
159764Sobrien			rv = 0;
159764Sobrien			goto done;  /* doesn't look like text at all */
68349Sobrien		}
68349Sobrien	}
68349Sobrien
159764Sobrien	if (nbytes <= 1) {
159764Sobrien		rv = 0;
159764Sobrien		goto done;
159764Sobrien	}
159764Sobrien
68349Sobrien	/*
68349Sobrien	 * for troff, look for . + letter + letter or .\";
68349Sobrien	 * this must be done to disambiguate tar archives' ./file
68349Sobrien	 * and other trash from real troff input.
68349Sobrien	 *
68349Sobrien	 * I believe Plan 9 troff allows non-ASCII characters in the names
68349Sobrien	 * of macros, so this test might possibly fail on such a file.
68349Sobrien	 */
68349Sobrien	if (*ubuf == '.') {
68349Sobrien		unichar *tp = ubuf + 1;
68349Sobrien
68349Sobrien		while (ISSPC(*tp))
68349Sobrien			++tp;	/* skip leading whitespace */
68349Sobrien		if ((tp[0] == '\\' && tp[1] == '\"') ||
133359Sobrien		    (isascii((unsigned char)tp[0]) &&
133359Sobrien		     isalnum((unsigned char)tp[0]) &&
133359Sobrien		     isascii((unsigned char)tp[1]) &&
133359Sobrien		     isalnum((unsigned char)tp[1]) &&
68349Sobrien		     ISSPC(tp[2]))) {
68349Sobrien			subtype_mime = "text/troff";
68349Sobrien			subtype = "troff or preprocessor input";
68349Sobrien			goto subtype_identified;
68349Sobrien		}
68349Sobrien	}
68349Sobrien
68349Sobrien	if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
68349Sobrien		subtype_mime = "text/fortran";
68349Sobrien		subtype = "fortran program";
68349Sobrien		goto subtype_identified;
68349Sobrien	}
68349Sobrien
68349Sobrien	/* look for tokens from names.h - this is expensive! */
68349Sobrien
68349Sobrien	i = 0;
68349Sobrien	while (i < ulen) {
133359Sobrien		size_t end;
68349Sobrien
68349Sobrien		/*
68349Sobrien		 * skip past any leading space
68349Sobrien		 */
68349Sobrien		while (i < ulen && ISSPC(ubuf[i]))
68349Sobrien			i++;
68349Sobrien		if (i >= ulen)
68349Sobrien			break;
68349Sobrien
68349Sobrien		/*
68349Sobrien		 * find the next whitespace
68349Sobrien		 */
68349Sobrien		for (end = i + 1; end < nbytes; end++)
68349Sobrien			if (ISSPC(ubuf[end]))
68349Sobrien				break;
68349Sobrien
68349Sobrien		/*
68349Sobrien		 * compare the word thus isolated against the token list
68349Sobrien		 */
68349Sobrien		for (p = names; p < names + NNAMES; p++) {
133359Sobrien			if (ascmatch((const unsigned char *)p->name, ubuf + i,
110949Sobrien			    end - i)) {
68349Sobrien				subtype = types[p->type].human;
68349Sobrien				subtype_mime = types[p->type].mime;
68349Sobrien				goto subtype_identified;
68349Sobrien			}
68349Sobrien		}
68349Sobrien
68349Sobrien		i = end;
68349Sobrien	}
68349Sobrien
68349Sobriensubtype_identified:
68349Sobrien
68349Sobrien	/*
68349Sobrien	 * Now try to discover other details about the file.
68349Sobrien	 */
68349Sobrien	for (i = 0; i < ulen; i++) {
159764Sobrien		if (ubuf[i] == '\n') {
159764Sobrien			if (seen_cr)
159764Sobrien				n_crlf++;
159764Sobrien			else
159764Sobrien				n_lf++;
159764Sobrien			last_line_end = i;
159764Sobrien		} else if (seen_cr)
159764Sobrien			n_cr++;
159764Sobrien
159764Sobrien		seen_cr = (ubuf[i] == '\r');
159764Sobrien		if (seen_cr)
159764Sobrien			last_line_end = i;
159764Sobrien
159764Sobrien		if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
159764Sobrien			n_nel++;
159764Sobrien			last_line_end = i;
159764Sobrien		}
159764Sobrien
159764Sobrien		/* If this line is _longer_ than MAXLINELEN, remember it. */
68349Sobrien		if (i > last_line_end + MAXLINELEN)
68349Sobrien			has_long_lines = 1;
68349Sobrien
68349Sobrien		if (ubuf[i] == '\033')
68349Sobrien			has_escapes = 1;
68349Sobrien		if (ubuf[i] == '\b')
68349Sobrien			has_backspace = 1;
68349Sobrien	}
68349Sobrien
159764Sobrien	/* Beware, if the data has been truncated, the final CR could have
159764Sobrien	   been followed by a LF.  If we have HOWMANY bytes, it indicates
159764Sobrien	   that the data might have been truncated, probably even before
159764Sobrien	   this function was called. */
159764Sobrien	if (seen_cr && nbytes < HOWMANY)
159764Sobrien		n_cr++;
159764Sobrien
133359Sobrien	if ((ms->flags & MAGIC_MIME)) {
133359Sobrien		if (subtype_mime) {
133359Sobrien			if (file_printf(ms, subtype_mime) == -1)
159764Sobrien				goto done;
133359Sobrien		} else {
133359Sobrien			if (file_printf(ms, "text/plain") == -1)
159764Sobrien				goto done;
133359Sobrien		}
68349Sobrien
68349Sobrien		if (code_mime) {
133359Sobrien			if (file_printf(ms, "; charset=") == -1)
159764Sobrien				goto done;
133359Sobrien			if (file_printf(ms, code_mime) == -1)
159764Sobrien				goto done;
68349Sobrien		}
68349Sobrien	} else {
133359Sobrien		if (file_printf(ms, code) == -1)
159764Sobrien			goto done;
68349Sobrien
68349Sobrien		if (subtype) {
133359Sobrien			if (file_printf(ms, " ") == -1)
159764Sobrien				goto done;
133359Sobrien			if (file_printf(ms, subtype) == -1)
159764Sobrien				goto done;
68349Sobrien		}
68349Sobrien
133359Sobrien		if (file_printf(ms, " ") == -1)
159764Sobrien			goto done;
133359Sobrien		if (file_printf(ms, type) == -1)
159764Sobrien			goto done;
68349Sobrien
68349Sobrien		if (has_long_lines)
133359Sobrien			if (file_printf(ms, ", with very long lines") == -1)
159764Sobrien				goto done;
68349Sobrien
68349Sobrien		/*
68349Sobrien		 * Only report line terminators if we find one other than LF,
68349Sobrien		 * or if we find none at all.
68349Sobrien		 */
68349Sobrien		if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
68349Sobrien		    (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
133359Sobrien			if (file_printf(ms, ", with") == -1)
159764Sobrien				goto done;
68349Sobrien
133359Sobrien			if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0)			{
133359Sobrien				if (file_printf(ms, " no") == -1)
159764Sobrien					goto done;
133359Sobrien			} else {
68349Sobrien				if (n_crlf) {
133359Sobrien					if (file_printf(ms, " CRLF") == -1)
159764Sobrien						goto done;
68349Sobrien					if (n_cr || n_lf || n_nel)
133359Sobrien						if (file_printf(ms, ",") == -1)
159764Sobrien							goto done;
68349Sobrien				}
68349Sobrien				if (n_cr) {
133359Sobrien					if (file_printf(ms, " CR") == -1)
159764Sobrien						goto done;
68349Sobrien					if (n_lf || n_nel)
133359Sobrien						if (file_printf(ms, ",") == -1)
159764Sobrien							goto done;
68349Sobrien				}
68349Sobrien				if (n_lf) {
133359Sobrien					if (file_printf(ms, " LF") == -1)
159764Sobrien						goto done;
68349Sobrien					if (n_nel)
133359Sobrien						if (file_printf(ms, ",") == -1)
159764Sobrien							goto done;
68349Sobrien				}
68349Sobrien				if (n_nel)
133359Sobrien					if (file_printf(ms, " NEL") == -1)
159764Sobrien						goto done;
68349Sobrien			}
68349Sobrien
133359Sobrien			if (file_printf(ms, " line terminators") == -1)
159764Sobrien				goto done;
68349Sobrien		}
68349Sobrien
68349Sobrien		if (has_escapes)
133359Sobrien			if (file_printf(ms, ", with escape sequences") == -1)
159764Sobrien				goto done;
68349Sobrien		if (has_backspace)
133359Sobrien			if (file_printf(ms, ", with overstriking") == -1)
159764Sobrien				goto done;
68349Sobrien	}
159764Sobrien	rv = 1;
159764Sobriendone:
159764Sobrien	if (nbuf)
159764Sobrien		free(nbuf);
159764Sobrien	if (ubuf)
159764Sobrien		free(ubuf);
68349Sobrien
159764Sobrien	return rv;
68349Sobrien}
68349Sobrien
133359Sobrienprivate int
133359Sobrienascmatch(const unsigned char *s, const unichar *us, size_t ulen)
68349Sobrien{
68349Sobrien	size_t i;
68349Sobrien
68349Sobrien	for (i = 0; i < ulen; i++) {
68349Sobrien		if (s[i] != us[i])
68349Sobrien			return 0;
68349Sobrien	}
68349Sobrien
68349Sobrien	if (s[i])
68349Sobrien		return 0;
68349Sobrien	else
68349Sobrien		return 1;
68349Sobrien}
68349Sobrien
68349Sobrien/*
68349Sobrien * This table reflects a particular philosophy about what constitutes
68349Sobrien * "text," and there is room for disagreement about it.
68349Sobrien *
68349Sobrien * Version 3.31 of the file command considered a file to be ASCII if
68349Sobrien * each of its characters was approved by either the isascii() or
68349Sobrien * isalpha() function.  On most systems, this would mean that any
68349Sobrien * file consisting only of characters in the range 0x00 ... 0x7F
68349Sobrien * would be called ASCII text, but many systems might reasonably
68349Sobrien * consider some characters outside this range to be alphabetic,
68349Sobrien * so the file command would call such characters ASCII.  It might
68349Sobrien * have been more accurate to call this "considered textual on the
68349Sobrien * local system" than "ASCII."
68349Sobrien *
68349Sobrien * It considered a file to be "International language text" if each
68349Sobrien * of its characters was either an ASCII printing character (according
68349Sobrien * to the real ASCII standard, not the above test), a character in
68349Sobrien * the range 0x80 ... 0xFF, or one of the following control characters:
68349Sobrien * backspace, tab, line feed, vertical tab, form feed, carriage return,
68349Sobrien * escape.  No attempt was made to determine the language in which files
68349Sobrien * of this type were written.
68349Sobrien *
68349Sobrien *
68349Sobrien * The table below considers a file to be ASCII if all of its characters
68349Sobrien * are either ASCII printing characters (again, according to the X3.4
68349Sobrien * standard, not isascii()) or any of the following controls: bell,
68349Sobrien * backspace, tab, line feed, form feed, carriage return, esc, nextline.
68349Sobrien *
68349Sobrien * I include bell because some programs (particularly shell scripts)
68349Sobrien * use it literally, even though it is rare in normal text.  I exclude
68349Sobrien * vertical tab because it never seems to be used in real text.  I also
68349Sobrien * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
68349Sobrien * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
68349Sobrien * character to.  It might be more appropriate to include it in the 8859
68349Sobrien * set instead of the ASCII set, but it's got to be included in *something*
68349Sobrien * we recognize or EBCDIC files aren't going to be considered textual.
68349Sobrien * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
68349Sobrien * and Latin characters, so these should possibly be allowed.  But they
68349Sobrien * make a real mess on VT100-style displays if they're not paired properly,
68349Sobrien * so we are probably better off not calling them text.
68349Sobrien *
68349Sobrien * A file is considered to be ISO-8859 text if its characters are all
68349Sobrien * either ASCII, according to the above definition, or printing characters
68349Sobrien * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
68349Sobrien *
68349Sobrien * Finally, a file is considered to be international text from some other
68349Sobrien * character code if its characters are all either ISO-8859 (according to
68349Sobrien * the above definition) or characters in the range 0x80 ... 0x9F, which
68349Sobrien * ISO-8859 considers to be control characters but the IBM PC and Macintosh
68349Sobrien * consider to be printing characters.
68349Sobrien */
68349Sobrien
68349Sobrien#define F 0   /* character never appears in text */
68349Sobrien#define T 1   /* character appears in plain ASCII text */
68349Sobrien#define I 2   /* character appears in ISO-8859 text */
68349Sobrien#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
68349Sobrien
133359Sobrienprivate char text_chars[256] = {
68349Sobrien	/*                  BEL BS HT LF    FF CR    */
68349Sobrien	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
68349Sobrien        /*                              ESC          */
68349Sobrien	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
68349Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
68349Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
68349Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
68349Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
68349Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
68349Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
68349Sobrien	/*            NEL                            */
68349Sobrien	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
68349Sobrien	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
68349Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
68349Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
68349Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
68349Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
68349Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
68349Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
68349Sobrien};
68349Sobrien
133359Sobrienprivate int
133359Sobrienlooks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf,
133359Sobrien    size_t *ulen)
68349Sobrien{
68349Sobrien	int i;
68349Sobrien
68349Sobrien	*ulen = 0;
68349Sobrien
68349Sobrien	for (i = 0; i < nbytes; i++) {
68349Sobrien		int t = text_chars[buf[i]];
68349Sobrien
68349Sobrien		if (t != T)
68349Sobrien			return 0;
68349Sobrien
68349Sobrien		ubuf[(*ulen)++] = buf[i];
68349Sobrien	}
68349Sobrien
68349Sobrien	return 1;
68349Sobrien}
68349Sobrien
133359Sobrienprivate int
133359Sobrienlooks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
68349Sobrien{
68349Sobrien	int i;
68349Sobrien
68349Sobrien	*ulen = 0;
68349Sobrien
68349Sobrien	for (i = 0; i < nbytes; i++) {
68349Sobrien		int t = text_chars[buf[i]];
68349Sobrien
68349Sobrien		if (t != T && t != I)
68349Sobrien			return 0;
68349Sobrien
68349Sobrien		ubuf[(*ulen)++] = buf[i];
68349Sobrien	}
68349Sobrien
68349Sobrien	return 1;
68349Sobrien}
68349Sobrien
133359Sobrienprivate int
133359Sobrienlooks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf,
133359Sobrien    size_t *ulen)
68349Sobrien{
68349Sobrien	int i;
68349Sobrien
68349Sobrien	*ulen = 0;
68349Sobrien
68349Sobrien	for (i = 0; i < nbytes; i++) {
68349Sobrien		int t = text_chars[buf[i]];
68349Sobrien
68349Sobrien		if (t != T && t != I && t != X)
68349Sobrien			return 0;
68349Sobrien
68349Sobrien		ubuf[(*ulen)++] = buf[i];
68349Sobrien	}
68349Sobrien
68349Sobrien	return 1;
68349Sobrien}
68349Sobrien
133359Sobrienprivate int
133359Sobrienlooks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
68349Sobrien{
68349Sobrien	int i, n;
68349Sobrien	unichar c;
68349Sobrien	int gotone = 0;
68349Sobrien
68349Sobrien	*ulen = 0;
68349Sobrien
68349Sobrien	for (i = 0; i < nbytes; i++) {
68349Sobrien		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
68349Sobrien			/*
68349Sobrien			 * Even if the whole file is valid UTF-8 sequences,
68349Sobrien			 * still reject it if it uses weird control characters.
68349Sobrien			 */
68349Sobrien
68349Sobrien			if (text_chars[buf[i]] != T)
68349Sobrien				return 0;
68349Sobrien
68349Sobrien			ubuf[(*ulen)++] = buf[i];
68349Sobrien		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
68349Sobrien			return 0;
68349Sobrien		} else {			   /* 11xxxxxx begins UTF-8 */
68349Sobrien			int following;
68349Sobrien
68349Sobrien			if ((buf[i] & 0x20) == 0) {		/* 110xxxxx */
68349Sobrien				c = buf[i] & 0x1f;
68349Sobrien				following = 1;
68349Sobrien			} else if ((buf[i] & 0x10) == 0) {	/* 1110xxxx */
68349Sobrien				c = buf[i] & 0x0f;
68349Sobrien				following = 2;
68349Sobrien			} else if ((buf[i] & 0x08) == 0) {	/* 11110xxx */
68349Sobrien				c = buf[i] & 0x07;
68349Sobrien				following = 3;
68349Sobrien			} else if ((buf[i] & 0x04) == 0) {	/* 111110xx */
68349Sobrien				c = buf[i] & 0x03;
68349Sobrien				following = 4;
68349Sobrien			} else if ((buf[i] & 0x02) == 0) {	/* 1111110x */
68349Sobrien				c = buf[i] & 0x01;
68349Sobrien				following = 5;
68349Sobrien			} else
68349Sobrien				return 0;
68349Sobrien
68349Sobrien			for (n = 0; n < following; n++) {
68349Sobrien				i++;
68349Sobrien				if (i >= nbytes)
68349Sobrien					goto done;
68349Sobrien
68349Sobrien				if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
68349Sobrien					return 0;
68349Sobrien
68349Sobrien				c = (c << 6) + (buf[i] & 0x3f);
68349Sobrien			}
68349Sobrien
68349Sobrien			ubuf[(*ulen)++] = c;
68349Sobrien			gotone = 1;
68349Sobrien		}
68349Sobrien	}
68349Sobriendone:
68349Sobrien	return gotone;   /* don't claim it's UTF-8 if it's all 7-bit */
68349Sobrien}
68349Sobrien
133359Sobrienprivate int
133359Sobrienlooks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf,
133359Sobrien    size_t *ulen)
68349Sobrien{
68349Sobrien	int bigend;
68349Sobrien	int i;
68349Sobrien
68349Sobrien	if (nbytes < 2)
68349Sobrien		return 0;
68349Sobrien
68349Sobrien	if (buf[0] == 0xff && buf[1] == 0xfe)
68349Sobrien		bigend = 0;
68349Sobrien	else if (buf[0] == 0xfe && buf[1] == 0xff)
68349Sobrien		bigend = 1;
68349Sobrien	else
68349Sobrien		return 0;
68349Sobrien
68349Sobrien	*ulen = 0;
68349Sobrien
68349Sobrien	for (i = 2; i + 1 < nbytes; i += 2) {
68349Sobrien		/* XXX fix to properly handle chars > 65536 */
68349Sobrien
68349Sobrien		if (bigend)
68349Sobrien			ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
68349Sobrien		else
68349Sobrien			ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
68349Sobrien
68349Sobrien		if (ubuf[*ulen - 1] == 0xfffe)
68349Sobrien			return 0;
133359Sobrien		if (ubuf[*ulen - 1] < 128 &&
133359Sobrien		    text_chars[(size_t)ubuf[*ulen - 1]] != T)
68349Sobrien			return 0;
68349Sobrien	}
68349Sobrien
110949Sobrien	return 1 + bigend;
68349Sobrien}
68349Sobrien
68349Sobrien#undef F
68349Sobrien#undef T
68349Sobrien#undef I
68349Sobrien#undef X
68349Sobrien
68349Sobrien/*
68349Sobrien * This table maps each EBCDIC character to an (8-bit extended) ASCII
68349Sobrien * character, as specified in the rationale for the dd(1) command in
68349Sobrien * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
68349Sobrien *
68349Sobrien * Unfortunately it does not seem to correspond exactly to any of the
68349Sobrien * five variants of EBCDIC documented in IBM's _Enterprise Systems
68349Sobrien * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
68349Sobrien * Edition, July, 1999, pp. I-1 - I-4.
68349Sobrien *
68349Sobrien * Fortunately, though, all versions of EBCDIC, including this one, agree
68349Sobrien * on most of the printing characters that also appear in (7-bit) ASCII.
68349Sobrien * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
68349Sobrien *
68349Sobrien * Fortunately too, there is general agreement that codes 0x00 through
68349Sobrien * 0x3F represent control characters, 0x41 a nonbreaking space, and the
68349Sobrien * remainder printing characters.
68349Sobrien *
68349Sobrien * This is sufficient to allow us to identify EBCDIC text and to distinguish
68349Sobrien * between old-style and internationalized examples of text.
68349Sobrien */
68349Sobrien
133359Sobrienprivate unsigned char ebcdic_to_ascii[] = {
68349Sobrien  0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
68349Sobrien 16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
68349Sobrien128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
68349Sobrien144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
68349Sobrien' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
68349Sobrien'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
68349Sobrien'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
68349Sobrien186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
68349Sobrien195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
68349Sobrien202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
68349Sobrien209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
68349Sobrien216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
68349Sobrien'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
68349Sobrien'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
68349Sobrien'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
68349Sobrien'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
68349Sobrien};
68349Sobrien
#ifdef notdef
/*
 * The following EBCDIC-to-ASCII table may relate more closely to reality,
 * or at least to modern reality.  It comes from
 *
 *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
 *
 * and maps the characters of EBCDIC code page 1047 (the code used for
 * Unix-derived software on IBM's 390 systems) to the corresponding
 * characters from ISO 8859-1.
 *
 * If this table is used instead of the above one, some of the special
 * cases for the NEL character can be taken out of the code.
 *
 * Compiled only when notdef is defined; read-only lookup data, hence const.
 */

private const unsigned char ebcdic_1047_to_8859[] = {
0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
};
#endif
68349Sobrien
68349Sobrien/*
68349Sobrien * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
68349Sobrien */
133359Sobrienprivate void
133359Sobrienfrom_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
68349Sobrien{
68349Sobrien	int i;
68349Sobrien
68349Sobrien	for (i = 0; i < nbytes; i++) {
68349Sobrien		out[i] = ebcdic_to_ascii[buf[i]];
68349Sobrien	}
68349Sobrien}