join.c revision 19069
1/*-
2 * Copyright (c) 1991, 1993, 1994
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Steve Hayman of the Computer Science Department, Indiana University,
7 * Michiro Hikida and David Goodenough.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *	This product includes software developed by the University of
20 *	California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 *    may be used to endorse or promote products derived from this software
23 *    without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 */
37
38#ifndef lint
39static char copyright[] =
40"@(#) Copyright (c) 1991, 1993, 1994\n\
41	The Regents of the University of California.  All rights reserved.\n";
42#endif /* not lint */
43
44#ifndef lint
45static char sccsid[] = "@(#)join.c	8.3 (Berkeley) 4/16/94";
46#endif /* not lint */
47
48#include <sys/param.h>
49
50#include <ctype.h>
51#include <err.h>
52#include <errno.h>
53#include <stdio.h>
54#include <stdlib.h>
55#include <string.h>
56
57/*
58 * There's a structure per input file which encapsulates the state of the
59 * file.  We repeatedly read lines from each file until we've read in all
60 * the consecutive lines from the file with a common join field.  Then we
61 * compare the set of lines with an equivalent set from the other file.
62 */
63typedef struct {
64	char *line;		/* line */
65	u_long linealloc;	/* line allocated count */
66	char **fields;		/* line field(s) */
67	u_long fieldcnt;	/* line field(s) count */
68	u_long fieldalloc;	/* line field(s) allocated count */
69} LINE;
70
71typedef struct {
72	FILE *fp;		/* file descriptor */
73	u_long joinf;		/* join field (-1, -2, -j) */
74	int unpair;		/* output unpairable lines (-a) */
75	int number;		/* 1 for file 1, 2 for file 2 */
76
77	LINE *set;		/* set of lines with same field */
78	int pushbool;		/* if pushback is set */
79	u_long pushback;	/* line on the stack */
80	u_long setcnt;		/* set count */
81	u_long setalloc;	/* set allocated count */
82} INPUT;
83INPUT input1 = { NULL, 0, 0, 1, NULL, 0, 0, 0, },
84      input2 = { NULL, 0, 0, 2, NULL, 0, 0, 0, };
85
86typedef struct {
87	u_long	filenum;	/* file number */
88	u_long	fieldno;	/* field number */
89} OLIST;
90OLIST *olist;			/* output field list */
91u_long olistcnt;		/* output field list count */
92u_long olistalloc;		/* output field allocated count */
93
94int joinout = 1;		/* show lines with matched join fields (-v) */
95int needsep;			/* need separator character */
96int spans = 1;			/* span multiple delimiters (-t) */
97char *empty;			/* empty field replacement string (-e) */
98char *tabchar = " \t";		/* delimiter characters (-t) */
99
100int  cmp __P((LINE *, u_long, LINE *, u_long));
101void fieldarg __P((char *));
102void joinlines __P((INPUT *, INPUT *));
103void obsolete __P((char **));
104void outfield __P((LINE *, u_long, int));
105void outoneline __P((INPUT *, LINE *));
106void outtwoline __P((INPUT *, LINE *, INPUT *, LINE *));
107void slurp __P((INPUT *));
108void usage __P((void));
109
110int
111main(argc, argv)
112	int argc;
113	char *argv[];
114{
115	INPUT *F1, *F2;
116	int aflag, ch, cval, vflag;
117	char *end;
118
119	F1 = &input1;
120	F2 = &input2;
121
122	aflag = vflag = 0;
123	obsolete(argv);
124	while ((ch = getopt(argc, argv, "\01a:e:j:1:2:o:t:v:")) != EOF) {
125		switch (ch) {
126		case '\01':		/* See comment in obsolete(). */
127			aflag = 1;
128			F1->unpair = F2->unpair = 1;
129			break;
130		case '1':
131			if ((F1->joinf = strtol(optarg, &end, 10)) < 1)
132				errx(1, "-1 option field number less than 1");
133			if (*end)
134				errx(1, "illegal field number -- %s", optarg);
135			--F1->joinf;
136			break;
137		case '2':
138			if ((F2->joinf = strtol(optarg, &end, 10)) < 1)
139				errx(1, "-2 option field number less than 1");
140			if (*end)
141				errx(1, "illegal field number -- %s", optarg);
142			--F2->joinf;
143			break;
144		case 'a':
145			aflag = 1;
146			switch(strtol(optarg, &end, 10)) {
147			case 1:
148				F1->unpair = 1;
149				break;
150			case 2:
151				F2->unpair = 1;
152				break;
153			default:
154				errx(1, "-a option file number not 1 or 2");
155				break;
156			}
157			if (*end)
158				errx(1, "illegal file number -- %s", optarg);
159			break;
160		case 'e':
161			empty = optarg;
162			break;
163		case 'j':
164			if ((F1->joinf = F2->joinf =
165			    strtol(optarg, &end, 10)) < 1)
166				errx(1, "-j option field number less than 1");
167			if (*end)
168				errx(1, "illegal field number -- %s", optarg);
169			--F1->joinf;
170			--F2->joinf;
171			break;
172		case 'o':
173			fieldarg(optarg);
174			break;
175		case 't':
176			spans = 0;
177			if (strlen(tabchar = optarg) != 1)
178				errx(1, "illegal tab character specification");
179			break;
180		case 'v':
181			vflag = 1;
182			joinout = 0;
183			switch (strtol(optarg, &end, 10)) {
184			case 1:
185				F1->unpair = 1;
186				break;
187			case 2:
188				F2->unpair = 1;
189				break;
190			default:
191				errx(1, "-v option file number not 1 or 2");
192				break;
193			}
194			if (*end)
195				errx(1, "illegal file number -- %s", optarg);
196			break;
197		case '?':
198		default:
199			usage();
200		}
201	}
202	argc -= optind;
203	argv += optind;
204
205	if (aflag && vflag)
206		errx(1, "the -a and -v options are mutually exclusive");
207
208	if (argc != 2)
209		usage();
210
211	/* Open the files; "-" means stdin. */
212	if (!strcmp(*argv, "-"))
213		F1->fp = stdin;
214	else if ((F1->fp = fopen(*argv, "r")) == NULL)
215		err(1, "%s", *argv);
216	++argv;
217	if (!strcmp(*argv, "-"))
218		F2->fp = stdin;
219	else if ((F2->fp = fopen(*argv, "r")) == NULL)
220		err(1, "%s", *argv);
221	if (F1->fp == stdin && F2->fp == stdin)
222		errx(1, "only one input file may be stdin");
223
224	slurp(F1);
225	slurp(F2);
226	while (F1->setcnt && F2->setcnt) {
227		cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf);
228		if (cval == 0) {
229			/* Oh joy, oh rapture, oh beauty divine! */
230			if (joinout)
231				joinlines(F1, F2);
232			slurp(F1);
233			slurp(F2);
234		} else if (cval < 0) {
235			/* File 1 takes the lead... */
236			if (F1->unpair)
237				joinlines(F1, NULL);
238			slurp(F1);
239		} else {
240			/* File 2 takes the lead... */
241			if (F2->unpair)
242				joinlines(F2, NULL);
243			slurp(F2);
244		}
245	}
246
247	/*
248	 * Now that one of the files is used up, optionally output any
249	 * remaining lines from the other file.
250	 */
251	if (F1->unpair)
252		while (F1->setcnt) {
253			joinlines(F1, NULL);
254			slurp(F1);
255		}
256	if (F2->unpair)
257		while (F2->setcnt) {
258			joinlines(F2, NULL);
259			slurp(F2);
260		}
261	exit(0);
262}
263
264void
265slurp(F)
266	INPUT *F;
267{
268	LINE *lp, *lastlp, tmp;
269	size_t len;
270	int cnt;
271	char *bp, *fieldp;
272
273	/*
274	 * Read all of the lines from an input file that have the same
275	 * join field.
276	 */
277	F->setcnt = 0;
278	for (lastlp = NULL;; ++F->setcnt) {
279		/*
280		 * If we're out of space to hold line structures, allocate
281		 * more.  Initialize the structure so that we know that this
282		 * is new space.
283		 */
284		if (F->setcnt == F->setalloc) {
285			cnt = F->setalloc;
286			F->setalloc += 50;
287			if ((F->set = realloc(F->set,
288			    F->setalloc * sizeof(LINE))) == NULL)
289				err(1, NULL);
290			memset(F->set + cnt, 0, 50 * sizeof(LINE));
291		}
292
293		/*
294		 * Get any pushed back line, else get the next line.  Allocate
295		 * space as necessary.  If taking the line from the stack swap
296		 * the two structures so that we don't lose space allocated to
297		 * either structure.  This could be avoided by doing another
298		 * level of indirection, but it's probably okay as is.
299		 */
300		lp = &F->set[F->setcnt];
301		if (F->setcnt)
302			lastlp = &F->set[F->setcnt - 1];
303		if (F->pushbool) {
304			tmp = F->set[F->setcnt];
305			F->set[F->setcnt] = F->set[F->pushback];
306			F->set[F->pushback] = tmp;
307			F->pushbool = 0;
308			continue;
309		}
310		if ((bp = fgetln(F->fp, &len)) == NULL)
311			return;
312		if (lp->linealloc <= len + 1) {
313			lp->linealloc += MAX(100, len + 1);
314			if ((lp->line =
315			    realloc(lp->line, lp->linealloc)) == NULL)
316				err(1, NULL);
317		}
318		memmove(lp->line, bp, len);
319
320		/* Replace trailing newline, if it exists. */
321		if (bp[len - 1] == '\n')
322			lp->line[len - 1] = '\0';
323		else
324			lp->line[len] = '\0';
325		bp = lp->line;
326
327		/* Split the line into fields, allocate space as necessary. */
328		lp->fieldcnt = 0;
329		while ((fieldp = strsep(&bp, tabchar)) != NULL) {
330			if (spans && *fieldp == '\0')
331				continue;
332			if (lp->fieldcnt == lp->fieldalloc) {
333				lp->fieldalloc += 50;
334				if ((lp->fields = realloc(lp->fields,
335				    lp->fieldalloc * sizeof(char *))) == NULL)
336					err(1, NULL);
337			}
338			lp->fields[lp->fieldcnt++] = fieldp;
339		}
340
341		/* See if the join field value has changed. */
342		if (lastlp != NULL && cmp(lp, F->joinf, lastlp, F->joinf)) {
343			F->pushbool = 1;
344			F->pushback = F->setcnt;
345			break;
346		}
347	}
348}
349
350int
351cmp(lp1, fieldno1, lp2, fieldno2)
352	LINE *lp1, *lp2;
353	u_long fieldno1, fieldno2;
354{
355	if (lp1->fieldcnt < fieldno1)
356		return (lp2->fieldcnt < fieldno2 ? 0 : 1);
357	if (lp2->fieldcnt < fieldno2)
358		return (-1);
359	return (strcmp(lp1->fields[fieldno1], lp2->fields[fieldno2]));
360}
361
362void
363joinlines(F1, F2)
364	INPUT *F1, *F2;
365{
366	int cnt1, cnt2;
367
368	/*
369	 * Output the results of a join comparison.  The output may be from
370	 * either file 1 or file 2 (in which case the first argument is the
371	 * file from which to output) or from both.
372	 */
373	if (F2 == NULL) {
374		for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
375			outoneline(F1, &F1->set[cnt1]);
376		return;
377	}
378	for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
379		for (cnt2 = 0; cnt2 < F2->setcnt; ++cnt2)
380			outtwoline(F1, &F1->set[cnt1], F2, &F2->set[cnt2]);
381}
382
383void
384outoneline(F, lp)
385	INPUT *F;
386	LINE *lp;
387{
388	int cnt;
389
390	/*
391	 * Output a single line from one of the files, according to the
392	 * join rules.  This happens when we are writing unmatched single
393	 * lines.  Output empty fields in the right places.
394	 */
395	if (olist)
396		for (cnt = 0; cnt < olistcnt; ++cnt) {
397			if (olist[cnt].filenum == F->number)
398				outfield(lp, olist[cnt].fieldno, 0);
399			else
400				outfield(lp, 0, 1);
401		}
402	else
403		for (cnt = 0; cnt < lp->fieldcnt; ++cnt)
404			outfield(lp, cnt, 0);
405	(void)printf("\n");
406	if (ferror(stdout))
407		err(1, "stdout");
408	needsep = 0;
409}
410
411void
412outtwoline(F1, lp1, F2, lp2)
413	INPUT *F1, *F2;
414	LINE *lp1, *lp2;
415{
416	int cnt;
417
418	/* Output a pair of lines according to the join list (if any). */
419	if (olist)
420		for (cnt = 0; cnt < olistcnt; ++cnt)
421			if (olist[cnt].filenum == 1)
422				outfield(lp1, olist[cnt].fieldno, 0);
423			else /* if (olist[cnt].filenum == 2) */
424				outfield(lp2, olist[cnt].fieldno, 0);
425	else {
426		/*
427		 * Output the join field, then the remaining fields from F1
428		 * and F2.
429		 */
430		outfield(lp1, F1->joinf, 0);
431		for (cnt = 0; cnt < lp1->fieldcnt; ++cnt)
432			if (F1->joinf != cnt)
433				outfield(lp1, cnt, 0);
434		for (cnt = 0; cnt < lp2->fieldcnt; ++cnt)
435			if (F2->joinf != cnt)
436				outfield(lp2, cnt, 0);
437	}
438	(void)printf("\n");
439	if (ferror(stdout))
440		err(1, "stdout");
441	needsep = 0;
442}
443
444void
445outfield(lp, fieldno, out_empty)
446	LINE *lp;
447	u_long fieldno;
448	int out_empty;
449{
450	if (needsep++)
451		(void)printf("%c", *tabchar);
452	if (!ferror(stdout))
453		if (lp->fieldcnt < fieldno || out_empty) {
454			if (empty != NULL)
455				(void)printf("%s", empty);
456		} else {
457			if (*lp->fields[fieldno] == '\0')
458				return;
459			(void)printf("%s", lp->fields[fieldno]);
460		}
461	if (ferror(stdout))
462		err(1, "stdout");
463}
464
465/*
466 * Convert an output list argument "2.1, 1.3, 2.4" into an array of output
467 * fields.
468 */
469void
470fieldarg(option)
471	char *option;
472{
473	u_long fieldno;
474	char *end, *token;
475
476	while ((token = strsep(&option, " \t")) != NULL) {
477		if (*token == '\0')
478			continue;
479		if (token[0] != '1' && token[0] != '2' || token[1] != '.')
480			errx(1, "malformed -o option field");
481		fieldno = strtol(token + 2, &end, 10);
482		if (*end)
483			errx(1, "malformed -o option field");
484		if (fieldno == 0)
485			errx(1, "field numbers are 1 based");
486		if (olistcnt == olistalloc) {
487			olistalloc += 50;
488			if ((olist = realloc(olist,
489			    olistalloc * sizeof(OLIST))) == NULL)
490				err(1, NULL);
491		}
492		olist[olistcnt].filenum = token[0] - '0';
493		olist[olistcnt].fieldno = fieldno - 1;
494		++olistcnt;
495	}
496}
497
498void
499obsolete(argv)
500	char **argv;
501{
502	int len;
503	char **p, *ap, *t;
504
505	while ((ap = *++argv) != NULL) {
506		/* Return if "--". */
507		if (ap[0] == '-' && ap[1] == '-')
508			return;
509		switch (ap[1]) {
510		case 'a':
511			/*
512			 * The original join allowed "-a", which meant the
513			 * same as -a1 plus -a2.  POSIX 1003.2, Draft 11.2
514			 * only specifies this as "-a 1" and "a -2", so we
515			 * have to use another option flag, one that is
516			 * unlikely to ever be used or accidentally entered
517			 * on the command line.  (Well, we could reallocate
518			 * the argv array, but that hardly seems worthwhile.)
519			 */
520			if (ap[2] == '\0')
521				ap[1] = '\01';
522			break;
523		case 'j':
524			/*
525			 * The original join allowed "-j[12] arg" and "-j arg".
526			 * Convert the former to "-[12] arg".  Don't convert
527			 * the latter since getopt(3) can handle it.
528			 */
529			switch(ap[2]) {
530			case '1':
531				if (ap[3] != '\0')
532					goto jbad;
533				ap[1] = '1';
534				ap[2] = '\0';
535				break;
536			case '2':
537				if (ap[3] != '\0')
538					goto jbad;
539				ap[1] = '2';
540				ap[2] = '\0';
541				break;
542			case '\0':
543				break;
544			default:
545jbad:				errx(1, "illegal option -- %s", ap);
546				usage();
547			}
548			break;
549		case 'o':
550			/*
551			 * The original join allowed "-o arg arg".
552			 * Convert to "-o arg -o arg".
553			 */
554			if (ap[2] != '\0')
555				break;
556			for (p = argv + 2; *p; ++p) {
557				if (p[0][0] != '1' &&
558				    p[0][0] != '2' || p[0][1] != '.')
559					break;
560				len = strlen(*p);
561				if (len - 2 != strspn(*p + 2, "0123456789"))
562					break;
563				if ((t = malloc(len + 3)) == NULL)
564					err(1, NULL);
565				t[0] = '-';
566				t[1] = 'o';
567				memmove(t + 2, *p, len + 1);
568				*p = t;
569			}
570			argv = p - 1;
571			break;
572		}
573	}
574}
575
576void
577usage()
578{
579	(void)fprintf(stderr, "%s%s\n",
580	    "usage: join [-a fileno | -v fileno ] [-e string] [-1 field] ",
581	    "[-2 field]\n            [-o list] [-t char] file1 file2");
582	exit(1);
583}
584