vsort.c revision 330449
1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (C) 2012 Oleg Moskalenko <mom040267@gmail.com>
5 * Copyright (C) 2012 Gabor Kovesdan <gabor@FreeBSD.org>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: stable/11/usr.bin/sort/vsort.c 330449 2018-03-05 07:26:05Z eadler $");
32
33#include <sys/types.h>
34
35#include <ctype.h>
36#include <stdlib.h>
37#include <string.h>
38
39#include "sort.h"
40#include "vsort.h"
41
42static inline bool
43isdigit_clocale(wchar_t c)
44{
45
46	return (c >= L'0' && c <= L'9');
47}
48
49static inline bool
50isalpha_clocale(wchar_t c)
51{
52
53	return ((c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z'));
54}
55
56static inline bool
57isalnum_clocale(wchar_t c)
58{
59
60	return ((c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z') ||
61	    (c >= L'0' && c <= L'9'));
62}
63
64/*
65 * Find string suffix of format: (\.[A-Za-z~][A-Za-z0-9~]*)*$
66 * Set length of string before suffix.
67 */
68static void
69find_suffix(bwstring_iterator si, bwstring_iterator se, size_t *len)
70{
71	wchar_t c;
72	size_t clen;
73	bool expect_alpha, sfx;
74
75	sfx = false;
76	expect_alpha = false;
77	*len = 0;
78	clen = 0;
79
80	while ((si < se) && (c = bws_get_iter_value(si))) {
81		if (expect_alpha) {
82			expect_alpha = false;
83			if (!isalpha_clocale(c) && (c != L'~'))
84				sfx = false;
85		} else if (c == L'.') {
86			expect_alpha = true;
87			if (!sfx) {
88				sfx = true;
89				*len = clen;
90			}
91		} else if (!isalnum_clocale(c) && (c != L'~'))
92			sfx = false;
93
94		si = bws_iterator_inc(si, 1);
95		++clen;
96	}
97
98	/* This code must be here to make the implementation compatible
99	 * with WORDING of GNU sort documentation.
100	 * But the GNU sort implementation is not following its own
101	 * documentation.  GNU sort allows empty file extensions
102	 * (just dot with nothing after); but the regular expression in
103	 * their documentation does not allow empty file extensions.
104	 * We chose to make our implementation compatible with GNU sort
105	 * implementation.  If they will ever fix their bug, this code
106	 * must be uncommented. Or they may choose to fix the info page,
107	 * then the code stays commented.
108	 *
109	 if (expect_alpha)
110	 	sfx = false;
111	 */
112
113	if (!sfx)
114		*len = clen;
115}
116
117static inline int
118cmp_chars(wchar_t c1, wchar_t c2)
119{
120
121	if (c1 == c2)
122		return (0);
123
124	if (c1 == L'~')
125		return (-1);
126	if (c2 == L'~')
127		return (+1);
128
129	if (isdigit_clocale(c1) || !c1)
130		return ((isdigit_clocale(c2) || !c2) ? 0 : -1);
131
132	if (isdigit_clocale(c2) || !c2)
133		return (+1);
134
135	if (isalpha_clocale(c1))
136		return ((isalpha_clocale(c2)) ? ((int) c1 - (int) c2) : -1);
137
138	if (isalpha_clocale(c2))
139		return (+1);
140
141	return ((int) c1 - (int) c2);
142}
143
144static int
145cmpversions(bwstring_iterator si1, bwstring_iterator se1,
146    bwstring_iterator si2, bwstring_iterator se2)
147{
148	int cmp, diff;
149
150	while ((si1 < se1) || (si2 < se2)) {
151		diff = 0;
152
153		while (((si1 < se1) &&
154		    !isdigit_clocale(bws_get_iter_value(si1))) ||
155		    ((si2 < se2) && !isdigit_clocale(bws_get_iter_value(si2)))) {
156			wchar_t c1, c2;
157
158			c1 = (si1 < se1) ? bws_get_iter_value(si1) : 0;
159			c2 = (si2 < se2) ? bws_get_iter_value(si2) : 0;
160
161			cmp = cmp_chars(c1, c2);
162			if (cmp)
163				return (cmp);
164
165			if (si1 < se1)
166				si1 = bws_iterator_inc(si1, 1);
167			if (si2 < se2)
168				si2 = bws_iterator_inc(si2, 1);
169		}
170
171		while (bws_get_iter_value(si1) == L'0')
172			si1 = bws_iterator_inc(si1, 1);
173
174		while (bws_get_iter_value(si2) == L'0')
175			si2 = bws_iterator_inc(si2, 1);
176
177		while (isdigit_clocale(bws_get_iter_value(si1)) &&
178		    isdigit_clocale(bws_get_iter_value(si2))) {
179			if (!diff)
180				diff = ((int)bws_get_iter_value(si1) -
181				    (int)bws_get_iter_value(si2));
182			si1 = bws_iterator_inc(si1, 1);
183			si2 = bws_iterator_inc(si2, 1);
184		}
185
186		if (isdigit_clocale(bws_get_iter_value(si1)))
187			return (1);
188
189		if (isdigit_clocale(bws_get_iter_value(si2)))
190			return (-1);
191
192		if (diff)
193			return (diff);
194	}
195
196	return (0);
197}
198
199/*
200 * Compare two version strings
201 */
202int
203vcmp(struct bwstring *s1, struct bwstring *s2)
204{
205	bwstring_iterator si1, si2;
206	wchar_t c1, c2;
207	size_t len1, len2, slen1, slen2;
208	int cmp_bytes, cmp_res;
209
210	if (s1 == s2)
211		return (0);
212
213	cmp_bytes = bwscmp(s1, s2, 0);
214	if (cmp_bytes == 0)
215		return (0);
216
217	len1 = slen1 = BWSLEN(s1);
218	len2 = slen2 = BWSLEN(s2);
219
220	if (slen1 < 1)
221		return (-1);
222	if (slen2 < 1)
223		return (+1);
224
225	si1 = bws_begin(s1);
226	si2 = bws_begin(s2);
227
228	c1 = bws_get_iter_value(si1);
229	c2 = bws_get_iter_value(si2);
230
231	if (c1 == L'.' && (slen1 == 1))
232		return (-1);
233
234	if (c2 == L'.' && (slen2 == 1))
235		return (+1);
236
237	if (slen1 == 2 && c1 == L'.' &&
238	    bws_get_iter_value(bws_iterator_inc(si1, 1)) == L'.')
239		return (-1);
240	if (slen2 == 2 && c2 == L'.' &&
241	    bws_get_iter_value(bws_iterator_inc(si2, 1)) == L'.')
242		return (+1);
243
244	if (c1 == L'.' && c2 != L'.')
245		return (-1);
246	if (c1 != L'.' && c2 == L'.')
247		return (+1);
248
249	if (c1 == L'.' && c2 == L'.') {
250		si1 = bws_iterator_inc(si1, 1);
251		si2 = bws_iterator_inc(si2, 1);
252	}
253
254	find_suffix(si1, bws_end(s1), &len1);
255	find_suffix(si2, bws_end(s2), &len2);
256
257	if ((len1 == len2) && (bws_iterator_cmp(si1, si2, len1) == 0))
258		return (cmp_bytes);
259
260	cmp_res = cmpversions(si1, bws_iterator_inc(si1, len1), si2,
261	    bws_iterator_inc(si2, len2));
262
263	if (cmp_res == 0)
264	  cmp_res = cmp_bytes;
265
266	return (cmp_res);
267}
268