1/* $NetBSD: csplit.c,v 1.5 2009/07/13 19:05:40 roy Exp $ */ 2/* $FreeBSD: src/usr.bin/csplit/csplit.c,v 1.9 2004/03/22 11:15:03 tjr Exp$ */ 3 4/*- 5 * Copyright (c) 2002 Tim J. Robbins. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30/* 31 * csplit -- split files based on context 32 * 33 * This utility splits its input into numbered output files by line number 34 * or by a regular expression. Regular expression matches have an optional 35 * offset with them, allowing the split to occur a specified number of 36 * lines before or after the match. 37 * 38 * To handle negative offsets, we stop reading when the match occurs and 39 * store the offset that the file should have been split at, then use 40 * this output file as input until all the "overflowed" lines have been read. 41 * The file is then closed and truncated to the correct length. 42 * 43 * We assume that the output files can be seeked upon (ie. they cannot be 44 * symlinks to named pipes or character devices), but make no such 45 * assumption about the input. 46 */ 47 48#include <sys/cdefs.h> 49#ifndef lint 50__RCSID("$NetBSD: csplit.c,v 1.5 2009/07/13 19:05:40 roy Exp $"); 51#endif 52 53#include <sys/types.h> 54 55#include <ctype.h> 56#include <err.h> 57#include <errno.h> 58#include <limits.h> 59#include <locale.h> 60#include <regex.h> 61#include <signal.h> 62#include <stdint.h> 63#include <stdio.h> 64#include <stdlib.h> 65#include <string.h> 66#include <unistd.h> 67#include <util.h> 68 69static void cleanup(void); 70static void do_lineno(const char *); 71static void do_rexp(const char *); 72static char *get_line(void); 73static void handlesig(int); 74static FILE *newfile(void); 75static void toomuch(FILE *, long); 76static void usage(void) __dead; 77 78/* 79 * Command line options 80 */ 81static const char *prefix; /* File name prefix */ 82static long sufflen; /* Number of decimal digits for suffix */ 83static int sflag; /* Suppress output of file names */ 84static int kflag; /* Keep output if error occurs */ 85 86/* 87 * Other miscellaneous globals (XXX too many) 88 */ 89static long lineno; /* Current line number in input file */ 90static long reps; /* Number of repetitions for this pattern */ 91static long nfiles; /* Number of files output so far */ 92static long maxfiles; /* Maximum number of files we can create */ 93static char currfile[PATH_MAX]; /* Current output file */ 94static const char *infn; /* Name of the input file */ 95static FILE *infile; /* Input file handle */ 96static FILE *overfile; /* Overflow file for toomuch() */ 97static off_t truncofs; /* Offset this file should be truncated at */ 98static int doclean; /* Should cleanup() remove output? */ 99 100int 101main(int argc, char *argv[]) 102{ 103 struct sigaction sa; 104 long i; 105 int ch; 106 const char *expr; 107 char *ep, *p; 108 FILE *ofp; 109 110 (void)setlocale(LC_ALL, ""); 111 112 kflag = sflag = 0; 113 prefix = "xx"; 114 sufflen = 2; 115 while ((ch = getopt(argc, argv, "ksf:n:")) > 0) { 116 switch (ch) { 117 case 'f': 118 prefix = optarg; 119 break; 120 case 'k': 121 kflag = 1; 122 break; 123 case 'n': 124 errno = 0; 125 sufflen = strtol(optarg, &ep, 10); 126 if (sufflen <= 0 || *ep != '\0' || errno != 0) 127 errx(1, "%s: bad suffix length", optarg); 128 break; 129 case 's': 130 sflag = 1; 131 break; 132 default: 133 usage(); 134 /*NOTREACHED*/ 135 } 136 } 137 138 if (sufflen + strlen(prefix) >= PATH_MAX) 139 errx(1, "name too long"); 140 141 argc -= optind; 142 argv += optind; 143 144 if ((infn = *argv++) == NULL) 145 usage(); 146 if (strcmp(infn, "-") == 0) { 147 infile = stdin; 148 infn = "stdin"; 149 } else if ((infile = fopen(infn, "r")) == NULL) 150 err(1, "%s", infn); 151 152 if (!kflag) { 153 doclean = 1; 154 (void)atexit(cleanup); 155 sa.sa_flags = 0; 156 sa.sa_handler = handlesig; 157 (void)sigemptyset(&sa.sa_mask); 158 (void)sigaddset(&sa.sa_mask, SIGHUP); 159 (void)sigaddset(&sa.sa_mask, SIGINT); 160 (void)sigaddset(&sa.sa_mask, SIGTERM); 161 (void)sigaction(SIGHUP, &sa, NULL); 162 (void)sigaction(SIGINT, &sa, NULL); 163 (void)sigaction(SIGTERM, &sa, NULL); 164 } 165 166 lineno = 0; 167 nfiles = 0; 168 truncofs = 0; 169 overfile = NULL; 170 171 /* Ensure 10^sufflen < LONG_MAX. */ 172 for (maxfiles = 1, i = 0; i < sufflen; i++) { 173 if (maxfiles > LONG_MAX / 10) 174 errx(1, "%ld: suffix too long (limit %ld)", 175 sufflen, i); 176 maxfiles *= 10; 177 } 178 179 /* Create files based on supplied patterns. */ 180 while (nfiles < maxfiles - 1 && (expr = *argv++) != NULL) { 181 /* Look ahead & see if this pattern has any repetitions. */ 182 if (*argv != NULL && **argv == '{') { 183 errno = 0; 184 reps = strtol(*argv + 1, &ep, 10); 185 if (reps < 0 || *ep != '}' || errno != 0) 186 errx(1, "%s: bad repetition count", *argv + 1); 187 argv++; 188 } else 189 reps = 0; 190 191 if (*expr == '/' || *expr == '%') { 192 do 193 do_rexp(expr); 194 while (reps-- != 0 && nfiles < maxfiles - 1); 195 } else if (isdigit((unsigned char)*expr)) 196 do_lineno(expr); 197 else 198 errx(1, "%s: unrecognised pattern", expr); 199 } 200 201 /* Copy the rest into a new file. */ 202 if (!feof(infile)) { 203 ofp = newfile(); 204 while ((p = get_line()) != NULL && fputs(p, ofp) == 0) 205 ; 206 if (!sflag) 207 (void)printf("%jd\n", (intmax_t)ftello(ofp)); 208 if (fclose(ofp) != 0) 209 err(1, "%s", currfile); 210 } 211 212 toomuch(NULL, 0L); 213 doclean = 0; 214 215 return (0); 216} 217 218static void 219usage(void) 220{ 221 222 (void)fprintf(stderr, 223"Usage: %s [-ks] [-f prefix] [-n number] file args ...\n", getprogname()); 224 exit(1); 225} 226 227__dead static void 228handlesig(int sig) 229{ 230 char msg[BUFSIZ]; 231 size_t len; 232 233 len = snprintf(msg, sizeof(msg), "%s: Caught %s, cleaning up\n", 234 getprogname(), strsignal(sig)); 235 if (len < sizeof(msg)) 236 (void)write(STDERR_FILENO, msg, len); 237 cleanup(); 238 (void)raise_default_signal(sig); 239 _exit(2); 240} 241 242/* Create a new output file. */ 243static FILE * 244newfile(void) 245{ 246 FILE *fp; 247 248 if ((size_t)snprintf(currfile, sizeof(currfile), "%s%0*ld", prefix, 249 (int)sufflen, nfiles) >= sizeof(currfile)) 250 errx(1, "%s: %s", currfile, strerror(ENAMETOOLONG)); 251 if ((fp = fopen(currfile, "w+")) == NULL) 252 err(1, "%s", currfile); 253 nfiles++; 254 255 return (fp); 256} 257 258/* Remove partial output, called before exiting. */ 259static void 260cleanup(void) 261{ 262 char fnbuf[PATH_MAX]; 263 long i; 264 265 if (!doclean) 266 return; 267 268 /* 269 * NOTE: One cannot portably assume to be able to call snprintf() 270 * from inside a signal handler. It does, however, appear to be safe 271 * to do on FreeBSD and NetBSD. The solution to this problem is worse 272 * than the problem itself. 273 */ 274 275 for (i = 0; i < nfiles; i++) { 276 (void)snprintf(fnbuf, sizeof(fnbuf), "%s%0*ld", prefix, 277 (int)sufflen, i); 278 (void)unlink(fnbuf); 279 } 280} 281 282/* Read a line from the input into a static buffer. */ 283static char * 284get_line(void) 285{ 286 static char lbuf[LINE_MAX]; 287 FILE *src; 288 289 src = overfile != NULL ? overfile : infile; 290 291again: if (fgets(lbuf, sizeof(lbuf), src) == NULL) { 292 if (src == overfile) { 293 src = infile; 294 goto again; 295 } 296 return (NULL); 297 } 298 if (ferror(src)) 299 err(1, "%s", infn); 300 lineno++; 301 302 return (lbuf); 303} 304 305/* Conceptually rewind the input (as obtained by get_line()) back `n' lines. */ 306static void 307toomuch(FILE *ofp, long n) 308{ 309 char buf[BUFSIZ]; 310 size_t i, nread; 311 312 if (overfile != NULL) { 313 /* 314 * Truncate the previous file we overflowed into back to 315 * the correct length, close it. 316 */ 317 if (fflush(overfile) != 0) 318 err(1, "overflow"); 319 if (ftruncate(fileno(overfile), truncofs) != 0) 320 err(1, "overflow"); 321 if (fclose(overfile) != 0) 322 err(1, "overflow"); 323 overfile = NULL; 324 } 325 326 if (n == 0) 327 /* Just tidying up */ 328 return; 329 330 lineno -= n; 331 332 /* 333 * Wind the overflow file backwards to `n' lines before the 334 * current one. 335 */ 336 do { 337 if (ftello(ofp) < (off_t)sizeof(buf)) 338 rewind(ofp); 339 else 340 (void)fseeko(ofp, -(off_t)sizeof(buf), SEEK_CUR); 341 if (ferror(ofp)) 342 errx(1, "%s: can't seek", currfile); 343 if ((nread = fread(buf, 1, sizeof(buf), ofp)) == 0) 344 errx(1, "can't read overflowed output"); 345 if (fseeko(ofp, -(off_t)nread, SEEK_CUR) != 0) 346 err(1, "%s", currfile); 347 for (i = 1; i <= nread; i++) 348 if (buf[nread - i] == '\n' && n-- == 0) 349 break; 350 if (ftello(ofp) == 0) 351 break; 352 } while (n > 0); 353 if (fseeko(ofp, (off_t)nread - i + 1, SEEK_CUR) != 0) 354 err(1, "%s", currfile); 355 356 /* 357 * get_line() will read from here. Next call will truncate to 358 * truncofs in this file. 359 */ 360 overfile = ofp; 361 truncofs = ftello(overfile); 362} 363 364/* Handle splits for /regexp/ and %regexp% patterns. */ 365static void 366do_rexp(const char *expr) 367{ 368 regex_t cre; 369 intmax_t nwritten; 370 long ofs; 371 int first; 372 char *ecopy, *ep, *p, *pofs, *re; 373 FILE *ofp; 374 375 if ((ecopy = strdup(expr)) == NULL) 376 err(1, "strdup"); 377 378 re = ecopy + 1; 379 if ((pofs = strrchr(ecopy, *expr)) == NULL || pofs[-1] == '\\') 380 errx(1, "%s: missing trailing %c", expr, *expr); 381 *pofs++ = '\0'; 382 383 if (*pofs != '\0') { 384 errno = 0; 385 ofs = strtol(pofs, &ep, 10); 386 if (*ep != '\0' || errno != 0) 387 errx(1, "%s: bad offset", pofs); 388 } else 389 ofs = 0; 390 391 if (regcomp(&cre, re, REG_BASIC|REG_NOSUB) != 0) 392 errx(1, "%s: bad regular expression", re); 393 394 if (*expr == '/') 395 /* /regexp/: Save results to a file. */ 396 ofp = newfile(); 397 else { 398 /* %regexp%: Make a temporary file for overflow. */ 399 if ((ofp = tmpfile()) == NULL) 400 err(1, "tmpfile"); 401 } 402 403 /* Read and output lines until we get a match. */ 404 first = 1; 405 while ((p = get_line()) != NULL) { 406 if (fputs(p, ofp) != 0) 407 break; 408 if (!first && regexec(&cre, p, 0, NULL, 0) == 0) 409 break; 410 first = 0; 411 } 412 413 if (p == NULL) 414 errx(1, "%s: no match", re); 415 416 if (ofs <= 0) { 417 /* 418 * Negative (or zero) offset: throw back any lines we should 419 * not have read yet. 420 */ 421 if (p != NULL) { 422 toomuch(ofp, -ofs + 1); 423 nwritten = (intmax_t)truncofs; 424 } else 425 nwritten = (intmax_t)ftello(ofp); 426 } else { 427 /* 428 * Positive offset: copy the requested number of lines 429 * after the match. 430 */ 431 while (--ofs > 0 && (p = get_line()) != NULL) 432 if (fputs(p, ofp) != 0) 433 break; 434 toomuch(NULL, 0L); 435 nwritten = (intmax_t)ftello(ofp); 436 if (fclose(ofp) != 0) 437 err(1, "%s", currfile); 438 } 439 440 if (!sflag && *expr == '/') 441 (void)printf("%jd\n", nwritten); 442 443 regfree(&cre); 444 free(ecopy); 445} 446 447/* Handle splits based on line number. */ 448static void 449do_lineno(const char *expr) 450{ 451 long lastline, tgtline; 452 char *ep, *p; 453 FILE *ofp; 454 455 errno = 0; 456 tgtline = strtol(expr, &ep, 10); 457 if (tgtline <= 0 || errno != 0 || *ep != '\0') 458 errx(1, "%s: bad line number", expr); 459 lastline = tgtline; 460 if (lastline <= lineno) 461 errx(1, "%s: can't go backwards", expr); 462 463 while (nfiles < maxfiles - 1) { 464 ofp = newfile(); 465 while (lineno + 1 != lastline) { 466 if ((p = get_line()) == NULL) 467 errx(1, "%ld: out of range", lastline); 468 if (fputs(p, ofp) != 0) 469 break; 470 } 471 if (!sflag) 472 (void)printf("%jd\n", (intmax_t)ftello(ofp)); 473 if (fclose(ofp) != 0) 474 err(1, "%s", currfile); 475 if (reps-- == 0) 476 break; 477 lastline += tgtline; 478 } 479} 480