file.c revision 278175
1/*	$NetBSD: file.c,v 1.5 2011/02/16 18:35:39 joerg Exp $	*/
2/*	$FreeBSD: stable/10/usr.bin/grep/file.c 278175 2015-02-04 00:45:02Z delphij $	*/
3/*	$OpenBSD: file.c,v 1.11 2010/07/02 20:48:48 nicm Exp $	*/
4
5/*-
6 * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
7 * Copyright (C) 2008-2010 Gabor Kovesdan <gabor@FreeBSD.org>
8 * Copyright (C) 2010 Dimitry Andric <dimitry@andric.com>
9 * All rights reserved.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 */
32
33#include <sys/cdefs.h>
34__FBSDID("$FreeBSD: stable/10/usr.bin/grep/file.c 278175 2015-02-04 00:45:02Z delphij $");
35
36#include <sys/param.h>
37#include <sys/mman.h>
38#include <sys/stat.h>
39#include <sys/types.h>
40
41#include <err.h>
42#include <errno.h>
43#include <fcntl.h>
44#include <stddef.h>
45#include <stdlib.h>
46#include <string.h>
47#include <unistd.h>
48#include <wchar.h>
49#include <wctype.h>
50#include <zlib.h>
51
52#ifndef WITHOUT_LZMA
53#include <lzma.h>
54#endif
55
56#ifndef WITHOUT_BZIP2
57#include <bzlib.h>
58#endif
59
60#include "grep.h"
61
/*
 * MAXBUFSIZ is the size of the malloc'ed read buffer, of the LZMA input
 * staging buffer, and of every read request issued by grep_refill().
 * LNBUFBUMP is the slack added whenever grep_fgetln() grows the line buffer.
 */
62#define	MAXBUFSIZ	(32 * 1024)
63#define	LNBUFBUMP	80
64
/* Decompressor handles/state; set up in grep_open(), used by grep_refill(). */
65static gzFile gzbufdesc;
66#ifndef WITHOUT_LZMA
67static lzma_stream lstrm = LZMA_STREAM_INIT;
/* Action passed to lzma_code(): LZMA_RUN until read() hits EOF, then LZMA_FINISH. */
68static lzma_action laction;
/* Compressed bytes read from the file, waiting to be decoded into buffer. */
69static uint8_t lin_buf[MAXBUFSIZ];
70#endif
71#ifndef WITHOUT_BZIP2
/* Set to NULL by grep_refill() when it falls back to plain reads. */
72static BZFILE* bzbufdesc;
73#endif
74
/*
 * Read buffer shared by all files: either MAXBUFSIZ malloc'ed bytes or, for
 * FILE_MMAP, the mapping of the whole file (fsiz bytes).  bufpos points at
 * the next unconsumed byte and bufrem counts the bytes left after it.
 */
75static unsigned char *buffer;
76static unsigned char *bufpos;
77static size_t bufrem;
78static size_t fsiz;
79
/*
 * Line buffer used by grep_fgetln() for lines that do not end inside the
 * current read buffer; lnbuflen is its allocated size.
 */
80static unsigned char *lnbuf;
81static size_t lnbuflen;
82
83static inline int
84grep_refill(struct file *f)
85{
86	ssize_t nr;
87
88	if (filebehave == FILE_MMAP)
89		return (0);
90
91	bufpos = buffer;
92	bufrem = 0;
93
94	if (filebehave == FILE_GZIP) {
95		nr = gzread(gzbufdesc, buffer, MAXBUFSIZ);
96#ifndef WITHOUT_BZIP2
97	} else if (filebehave == FILE_BZIP && bzbufdesc != NULL) {
98		int bzerr;
99
100		nr = BZ2_bzRead(&bzerr, bzbufdesc, buffer, MAXBUFSIZ);
101		switch (bzerr) {
102		case BZ_OK:
103		case BZ_STREAM_END:
104			/* No problem, nr will be okay */
105			break;
106		case BZ_DATA_ERROR_MAGIC:
107			/*
108			 * As opposed to gzread(), which simply returns the
109			 * plain file data, if it is not in the correct
110			 * compressed format, BZ2_bzRead() instead aborts.
111			 *
112			 * So, just restart at the beginning of the file again,
113			 * and use plain reads from now on.
114			 */
115			BZ2_bzReadClose(&bzerr, bzbufdesc);
116			bzbufdesc = NULL;
117			if (lseek(f->fd, 0, SEEK_SET) == -1)
118				return (-1);
119			nr = read(f->fd, buffer, MAXBUFSIZ);
120			break;
121		default:
122			/* Make sure we exit with an error */
123			nr = -1;
124		}
125#endif
126#ifndef WITHOUT_LZMA
127	} else if ((filebehave == FILE_XZ) || (filebehave == FILE_LZMA)) {
128		lzma_ret ret;
129		lstrm.next_out = buffer;
130
131		do {
132			if (lstrm.avail_in == 0) {
133				lstrm.next_in = lin_buf;
134				nr = read(f->fd, lin_buf, MAXBUFSIZ);
135
136				if (nr < 0)
137					return (-1);
138				else if (nr == 0)
139					laction = LZMA_FINISH;
140
141				lstrm.avail_in = nr;
142			}
143
144			ret = lzma_code(&lstrm, laction);
145
146			if (ret != LZMA_OK && ret != LZMA_STREAM_END)
147				return (-1);
148
149			if (lstrm.avail_out == 0 || ret == LZMA_STREAM_END) {
150				bufrem = MAXBUFSIZ - lstrm.avail_out;
151				lstrm.next_out = buffer;
152				lstrm.avail_out = MAXBUFSIZ;
153			}
154		} while (bufrem == 0 && ret != LZMA_STREAM_END);
155
156		return (0);
157#endif	/* WIHTOUT_LZMA */
158	} else
159		nr = read(f->fd, buffer, MAXBUFSIZ);
160
161	if (nr < 0)
162		return (-1);
163
164	bufrem = nr;
165	return (0);
166}
167
168static inline int
169grep_lnbufgrow(size_t newlen)
170{
171
172	if (lnbuflen < newlen) {
173		lnbuf = grep_realloc(lnbuf, newlen);
174		lnbuflen = newlen;
175	}
176
177	return (0);
178}
179
180char *
181grep_fgetln(struct file *f, size_t *lenp)
182{
183	unsigned char *p;
184	char *ret;
185	size_t len;
186	size_t off;
187	ptrdiff_t diff;
188
189	/* Fill the buffer, if necessary */
190	if (bufrem == 0 && grep_refill(f) != 0)
191		goto error;
192
193	if (bufrem == 0) {
194		/* Return zero length to indicate EOF */
195		*lenp = 0;
196		return (bufpos);
197	}
198
199	/* Look for a newline in the remaining part of the buffer */
200	if ((p = memchr(bufpos, '\n', bufrem)) != NULL) {
201		++p; /* advance over newline */
202		ret = bufpos;
203		len = p - bufpos;
204		bufrem -= len;
205		bufpos = p;
206		*lenp = len;
207		return (ret);
208	}
209
210	/* We have to copy the current buffered data to the line buffer */
211	for (len = bufrem, off = 0; ; len += bufrem) {
212		/* Make sure there is room for more data */
213		if (grep_lnbufgrow(len + LNBUFBUMP))
214			goto error;
215		memcpy(lnbuf + off, bufpos, len - off);
216		off = len;
217		if (grep_refill(f) != 0)
218			goto error;
219		if (bufrem == 0)
220			/* EOF: return partial line */
221			break;
222		if ((p = memchr(bufpos, '\n', bufrem)) == NULL)
223			continue;
224		/* got it: finish up the line (like code above) */
225		++p;
226		diff = p - bufpos;
227		len += diff;
228		if (grep_lnbufgrow(len))
229		    goto error;
230		memcpy(lnbuf + off, bufpos, diff);
231		bufrem -= diff;
232		bufpos = p;
233		break;
234	}
235	*lenp = len;
236	return (lnbuf);
237
238error:
239	*lenp = 0;
240	return (NULL);
241}
242
243/*
244 * Opens a file for processing.
245 */
246struct file *
247grep_open(const char *path)
248{
249	struct file *f;
250
251	f = grep_malloc(sizeof *f);
252	memset(f, 0, sizeof *f);
253	if (path == NULL) {
254		/* Processing stdin implies --line-buffered. */
255		lbflag = true;
256		f->fd = STDIN_FILENO;
257	} else if ((f->fd = open(path, O_RDONLY)) == -1)
258		goto error1;
259
260	if (filebehave == FILE_MMAP) {
261		struct stat st;
262
263		if ((fstat(f->fd, &st) == -1) || (st.st_size > OFF_MAX) ||
264		    (!S_ISREG(st.st_mode)))
265			filebehave = FILE_STDIO;
266		else {
267			int flags = MAP_PRIVATE | MAP_NOCORE | MAP_NOSYNC;
268#ifdef MAP_PREFAULT_READ
269			flags |= MAP_PREFAULT_READ;
270#endif
271			fsiz = st.st_size;
272			buffer = mmap(NULL, fsiz, PROT_READ, flags,
273			     f->fd, (off_t)0);
274			if (buffer == MAP_FAILED)
275				filebehave = FILE_STDIO;
276			else {
277				bufrem = st.st_size;
278				bufpos = buffer;
279				madvise(buffer, st.st_size, MADV_SEQUENTIAL);
280			}
281		}
282	}
283
284	if ((buffer == NULL) || (buffer == MAP_FAILED))
285		buffer = grep_malloc(MAXBUFSIZ);
286
287	if (filebehave == FILE_GZIP &&
288	    (gzbufdesc = gzdopen(f->fd, "r")) == NULL)
289		goto error2;
290
291#ifndef WITHOUT_BZIP2
292	if (filebehave == FILE_BZIP &&
293	    (bzbufdesc = BZ2_bzdopen(f->fd, "r")) == NULL)
294		goto error2;
295#endif
296#ifndef WITHOUT_LZMA
297	else if ((filebehave == FILE_XZ) || (filebehave == FILE_LZMA)) {
298		lzma_ret ret;
299
300		ret = (filebehave == FILE_XZ) ?
301			lzma_stream_decoder(&lstrm, UINT64_MAX,
302					LZMA_CONCATENATED) :
303			lzma_alone_decoder(&lstrm, UINT64_MAX);
304
305		if (ret != LZMA_OK)
306			goto error2;
307
308		lstrm.avail_in = 0;
309		lstrm.avail_out = MAXBUFSIZ;
310		laction = LZMA_RUN;
311	}
312#endif
313
314	/* Fill read buffer, also catches errors early */
315	if (bufrem == 0 && grep_refill(f) != 0)
316		goto error2;
317
318	/* Check for binary stuff, if necessary */
319	if (binbehave != BINFILE_TEXT && memchr(bufpos, '\0', bufrem) != NULL)
320	f->binary = true;
321
322	return (f);
323
324error2:
325	close(f->fd);
326error1:
327	free(f);
328	return (NULL);
329}
330
331/*
332 * Closes a file.
333 */
334void
335grep_close(struct file *f)
336{
337
338	close(f->fd);
339
340	/* Reset read buffer and line buffer */
341	if (filebehave == FILE_MMAP) {
342		munmap(buffer, fsiz);
343		buffer = NULL;
344	}
345	bufpos = buffer;
346	bufrem = 0;
347
348	free(lnbuf);
349	lnbuf = NULL;
350	lnbuflen = 0;
351}
352