1/**
2 * Test the UTF-8 decoding routines
3 *
4 * author: Daniel Veillard
5 * copy: see Copyright for the status of this software.
6 */
7
8#include <stdio.h>
9#include <string.h>
10#include <libxml/parser.h>
11#include <libxml/parserInternals.h>
12
13#include "buf.h"
14
/*
 * Code of the first error reported to errorHandler() since a test last
 * cleared this variable; 0 means that no error has been recorded.
 */
int lastError = 0;
16
17static void errorHandler(void *unused, xmlErrorPtr err) {
18    if ((unused == NULL) && (err != NULL) && (lastError == 0)) {
19        lastError = err->code;
20    }
21}
22
/*
 * Template documents. The "XXXX" run is a 4 byte placeholder that
 * testDocumentRanges() overwrites with the byte sequences under test:
 * it starts at offset 5 in document1 (element content) and at offset 10
 * in document2 (attribute value).
 */
char document1[100] = "<doc>" "XXXX" "</doc>";
char document2[100] = "<doc foo='" "XXXX" "'/>";
25
26static void testDocumentRangeByte1(xmlParserCtxtPtr ctxt, char *document,
27                  int len,  char *data, int forbid1, int forbid2) {
28    int i;
29    xmlDocPtr res;
30
31    for (i = 0;i <= 0xFF;i++) {
32	lastError = 0;
33	xmlCtxtReset(ctxt);
34
35        data[0] = i;
36
37	res = xmlReadMemory(document, len, "test", NULL, 0);
38
39	if ((i == forbid1) || (i == forbid2)) {
40	    if ((lastError == 0) || (res != NULL))
41	        fprintf(stderr,
42		    "Failed to detect invalid char for Byte 0x%02X: %c\n",
43		        i, i);
44	}
45
46	else if ((i == '<') || (i == '&')) {
47	    if ((lastError == 0) || (res != NULL))
48	        fprintf(stderr,
49		    "Failed to detect illegal char %c for Byte 0x%02X\n", i, i);
50	}
51	else if (((i < 0x20) || (i >= 0x80)) &&
52	    (i != 0x9) && (i != 0xA) && (i != 0xD)) {
53	    if ((lastError != XML_ERR_INVALID_CHAR) && (res != NULL))
54	        fprintf(stderr,
55		    "Failed to detect invalid char for Byte 0x%02X\n", i);
56	}
57	else if (res == NULL) {
58	    fprintf(stderr,
59		"Failed to parse valid char for Byte 0x%02X : %c\n", i, i);
60	}
61	if (res != NULL)
62	    xmlFreeDoc(res);
63    }
64}
65
66static void testDocumentRangeByte2(xmlParserCtxtPtr ctxt, char *document,
67                  int len,  char *data) {
68    int i, j;
69    xmlDocPtr res;
70
71    for (i = 0x80;i <= 0xFF;i++) {
72    for (j = 0;j <= 0xFF;j++) {
73	lastError = 0;
74	xmlCtxtReset(ctxt);
75
76        data[0] = i;
77        data[1] = j;
78
79	res = xmlReadMemory(document, len, "test", NULL, 0);
80
81	/* if first bit of first char is set, then second bit must too */
82	if ((i & 0x80) && ((i & 0x40) == 0)) {
83	    if ((lastError == 0) || (res != NULL))
84		fprintf(stderr,
85		"Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
86			i, j);
87	}
88
89	/*
90	 * if first bit of first char is set, then second char first
91	 * bits must be 10
92	 */
93	else if ((i & 0x80) && ((j & 0xC0) != 0x80)) {
94	    if ((lastError == 0) || (res != NULL))
95		fprintf(stderr,
96	    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
97			i, j);
98	}
99
100	/*
101	 * if using a 2 byte encoding then the value must be greater
102	 * than 0x80, i.e. one of bits 5 to 1 of i must be set
103	 */
104	else if ((i & 0x80) && ((i & 0x1E) == 0)) {
105	    if ((lastError == 0) || (res != NULL))
106		fprintf(stderr,
107	    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
108			i, j);
109	}
110
111	/*
112	 * if third bit of first char is set, then the sequence would need
113	 * at least 3 bytes, but we give only 2 !
114	 */
115	else if ((i & 0xE0) == 0xE0) {
116	    if ((lastError == 0) || (res != NULL))
117		fprintf(stderr,
118	    "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
119			i, j);
120	}
121
122	/*
123	 * We should see no error in remaning cases
124	 */
125	else if ((lastError != 0) || (res == NULL)) {
126	    fprintf(stderr,
127		"Failed to parse document for Bytes 0x%02X 0x%02X\n", i, j);
128	}
129	if (res != NULL)
130	    xmlFreeDoc(res);
131    }
132    }
133}
134
135/**
136 * testDocumentRanges:
137 *
138 * Test the correct UTF8 character parsing in context of XML documents
139 * Those are in-context injection tests checking the parser behaviour on
140 * edge case values at different point in content, beginning and end of
141 * CDATA in text or in attribute values.
142 */
143
144static void testDocumentRanges(void) {
145    xmlParserCtxtPtr ctxt;
146    char *data;
147
148    /*
149     * Set up a parsing context using the first document as
150     * the current input source.
151     */
152    ctxt = xmlNewParserCtxt();
153    if (ctxt == NULL) {
154        fprintf(stderr, "Failed to allocate parser context\n");
155	return;
156    }
157
158    printf("testing 1 byte char in document: 1");
159    fflush(stdout);
160    data = &document1[5];
161    data[0] = ' ';
162    data[1] = ' ';
163    data[2] = ' ';
164    data[3] = ' ';
165    /* test 1 byte injection at beginning of area */
166    testDocumentRangeByte1(ctxt, &document1[0], strlen(document1),
167                           data, -1, -1);
168    printf(" 2");
169    fflush(stdout);
170    data[0] = ' ';
171    data[1] = ' ';
172    data[2] = ' ';
173    data[3] = ' ';
174    /* test 1 byte injection at end of area */
175    testDocumentRangeByte1(ctxt, &document1[0], strlen(document1),
176                           data + 3, -1, -1);
177
178    printf(" 3");
179    fflush(stdout);
180    data = &document2[10];
181    data[0] = ' ';
182    data[1] = ' ';
183    data[2] = ' ';
184    data[3] = ' ';
185    /* test 1 byte injection at beginning of area */
186    testDocumentRangeByte1(ctxt, &document2[0], strlen(document2),
187                           data, '\'', -1);
188    printf(" 4");
189    fflush(stdout);
190    data[0] = ' ';
191    data[1] = ' ';
192    data[2] = ' ';
193    data[3] = ' ';
194    /* test 1 byte injection at end of area */
195    testDocumentRangeByte1(ctxt, &document2[0], strlen(document2),
196                           data + 3, '\'', -1);
197    printf(" done\n");
198
199    printf("testing 2 byte char in document: 1");
200    fflush(stdout);
201    data = &document1[5];
202    data[0] = ' ';
203    data[1] = ' ';
204    data[2] = ' ';
205    data[3] = ' ';
206    /* test 2 byte injection at beginning of area */
207    testDocumentRangeByte2(ctxt, &document1[0], strlen(document1),
208                           data);
209    printf(" 2");
210    fflush(stdout);
211    data[0] = ' ';
212    data[1] = ' ';
213    data[2] = ' ';
214    data[3] = ' ';
215    /* test 2 byte injection at end of area */
216    testDocumentRangeByte2(ctxt, &document1[0], strlen(document1),
217                           data + 2);
218
219    printf(" 3");
220    fflush(stdout);
221    data = &document2[10];
222    data[0] = ' ';
223    data[1] = ' ';
224    data[2] = ' ';
225    data[3] = ' ';
226    /* test 2 byte injection at beginning of area */
227    testDocumentRangeByte2(ctxt, &document2[0], strlen(document2),
228                           data);
229    printf(" 4");
230    fflush(stdout);
231    data[0] = ' ';
232    data[1] = ' ';
233    data[2] = ' ';
234    data[3] = ' ';
235    /* test 2 byte injection at end of area */
236    testDocumentRangeByte2(ctxt, &document2[0], strlen(document2),
237                           data + 2);
238    printf(" done\n");
239
240    xmlFreeParserCtxt(ctxt);
241}
242
243static void testCharRangeByte1(xmlParserCtxtPtr ctxt, char *data) {
244    int i = 0;
245    int len, c;
246
247    data[1] = 0;
248    data[2] = 0;
249    data[3] = 0;
250    for (i = 0;i <= 0xFF;i++) {
251        data[0] = i;
252	ctxt->charset = XML_CHAR_ENCODING_UTF8;
253
254	lastError = 0;
255        c = xmlCurrentChar(ctxt, &len);
256	if ((i == 0) || (i >= 0x80)) {
257	    /* we must see an error there */
258	    if (lastError != XML_ERR_INVALID_CHAR)
259	        fprintf(stderr,
260		    "Failed to detect invalid char for Byte 0x%02X\n", i);
261	} else if (i == 0xD) {
262	    if ((c != 0xA) || (len != 1))
263		fprintf(stderr, "Failed to convert char for Byte 0x%02X\n", i);
264	} else if ((c != i) || (len != 1)) {
265	    fprintf(stderr, "Failed to parse char for Byte 0x%02X\n", i);
266	}
267    }
268}
269
270static void testCharRangeByte2(xmlParserCtxtPtr ctxt, char *data) {
271    int i, j;
272    int len, c;
273
274    data[2] = 0;
275    data[3] = 0;
276    for (i = 0x80;i <= 0xFF;i++) {
277	for (j = 0;j <= 0xFF;j++) {
278	    data[0] = i;
279	    data[1] = j;
280	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
281
282	    lastError = 0;
283	    c = xmlCurrentChar(ctxt, &len);
284
285	    /* if first bit of first char is set, then second bit must too */
286	    if ((i & 0x80) && ((i & 0x40) == 0)) {
287		if (lastError != XML_ERR_INVALID_CHAR)
288		    fprintf(stderr,
289		    "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
290		            i, j);
291	    }
292
293	    /*
294	     * if first bit of first char is set, then second char first
295	     * bits must be 10
296	     */
297	    else if ((i & 0x80) && ((j & 0xC0) != 0x80)) {
298		if (lastError != XML_ERR_INVALID_CHAR)
299		    fprintf(stderr,
300		"Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
301		            i, j, c);
302	    }
303
304	    /*
305	     * if using a 2 byte encoding then the value must be greater
306	     * than 0x80, i.e. one of bits 5 to 1 of i must be set
307	     */
308	    else if ((i & 0x80) && ((i & 0x1E) == 0)) {
309		if (lastError != XML_ERR_INVALID_CHAR)
310		    fprintf(stderr,
311		"Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
312		            i, j, c);
313	    }
314
315	    /*
316	     * if third bit of first char is set, then the sequence would need
317	     * at least 3 bytes, but we give only 2 !
318	     */
319	    else if ((i & 0xE0) == 0xE0) {
320		if (lastError != XML_ERR_INVALID_CHAR)
321		    fprintf(stderr,
322		"Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
323		            i, j);
324	    }
325
326            /*
327	     * We should see no error in remaning cases
328	     */
329	    else if ((lastError != 0) || (len != 2)) {
330		fprintf(stderr,
331		    "Failed to parse char for Bytes 0x%02X 0x%02X\n", i, j);
332	    }
333
334            /*
335	     * Finally check the value is right
336	     */
337	    else if (c != (j & 0x3F) + ((i & 0x1F) << 6)) {
338		fprintf(stderr,
339	"Failed to parse char for Bytes 0x%02X 0x%02X: expect %d got %d\n",
340	                i, j, ((j & 0x3F) + ((i & 0x1F) << 6)), c);
341	    }
342        }
343    }
344}
345
346static void testCharRangeByte3(xmlParserCtxtPtr ctxt, char *data) {
347    int i, j, k, K;
348    int len, c;
349    unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
350    int value;
351
352    data[3] = 0;
353    for (i = 0xE0;i <= 0xFF;i++) {
354    for (j = 0;j <= 0xFF;j++) {
355    for (k = 0;k < 6;k++) {
356	data[0] = i;
357	data[1] = j;
358	K = lows[k];
359	data[2] = (char) K;
360	value = (K & 0x3F) + ((j & 0x3F) << 6) + ((i & 0xF) << 12);
361	ctxt->charset = XML_CHAR_ENCODING_UTF8;
362
363	lastError = 0;
364	c = xmlCurrentChar(ctxt, &len);
365
366	/*
367	 * if fourth bit of first char is set, then the sequence would need
368	 * at least 4 bytes, but we give only 3 !
369	 */
370	if ((i & 0xF0) == 0xF0) {
371	    if (lastError != XML_ERR_INVALID_CHAR)
372		fprintf(stderr,
373	"Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
374			i, j, K, data[3]);
375	}
376
377        /*
378	 * The second and the third bytes must start with 10
379	 */
380	else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80)) {
381	    if (lastError != XML_ERR_INVALID_CHAR)
382		fprintf(stderr,
383	"Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
384			i, j, K);
385	}
386
387	/*
388	 * if using a 3 byte encoding then the value must be greater
389	 * than 0x800, i.e. one of bits 4 to 0 of i must be set or
390	 * the 6th byte of data[1] must be set
391	 */
392	else if (((i & 0xF) == 0) && ((j & 0x20) == 0)) {
393	    if (lastError != XML_ERR_INVALID_CHAR)
394		fprintf(stderr,
395	    "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
396			i, j, K);
397	}
398
399        /*
400	 * There are values in that range that are not allowed in XML-1.0
401	 */
402	else if (((value > 0xD7FF) && (value <0xE000)) ||
403	         ((value > 0xFFFD) && (value <0x10000))) {
404	    if (lastError != XML_ERR_INVALID_CHAR)
405		fprintf(stderr,
406	"Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X\n",
407			value, i, j, K);
408	}
409
410	/*
411	 * We should see no error in remaining cases
412	 */
413	else if ((lastError != 0) || (len != 3)) {
414	    fprintf(stderr,
415		"Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
416		    i, j, K);
417	}
418
419	/*
420	 * Finally check the value is right
421	 */
422	else if (c != value) {
423	    fprintf(stderr,
424    "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
425		i, j, data[2], value, c);
426	}
427    }
428    }
429    }
430}
431
432static void testCharRangeByte4(xmlParserCtxtPtr ctxt, char *data) {
433    int i, j, k, K, l, L;
434    int len, c;
435    unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
436    int value;
437
438    data[4] = 0;
439    for (i = 0xF0;i <= 0xFF;i++) {
440    for (j = 0;j <= 0xFF;j++) {
441    for (k = 0;k < 6;k++) {
442    for (l = 0;l < 6;l++) {
443	data[0] = i;
444	data[1] = j;
445	K = lows[k];
446	data[2] = (char) K;
447	L = lows[l];
448	data[3] = (char) L;
449	value = (L & 0x3F) + ((K & 0x3F) << 6) + ((j & 0x3F) << 12) +
450	        ((i & 0x7) << 18);
451	ctxt->charset = XML_CHAR_ENCODING_UTF8;
452
453	lastError = 0;
454	c = xmlCurrentChar(ctxt, &len);
455
456	/*
457	 * if fifth bit of first char is set, then the sequence would need
458	 * at least 5 bytes, but we give only 4 !
459	 */
460	if ((i & 0xF8) == 0xF8) {
461	    if (lastError != XML_ERR_INVALID_CHAR)
462		fprintf(stderr,
463  "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
464			i, j, K, data[3]);
465	}
466
467        /*
468	 * The second, third and fourth bytes must start with 10
469	 */
470	else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80) ||
471	         ((L & 0xC0) != 0x80)) {
472	    if (lastError != XML_ERR_INVALID_CHAR)
473		fprintf(stderr,
474	"Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
475			i, j, K, L);
476	}
477
478	/*
479	 * if using a 3 byte encoding then the value must be greater
480	 * than 0x10000, i.e. one of bits 3 to 0 of i must be set or
481	 * the 6 or 5th byte of j must be set
482	 */
483	else if (((i & 0x7) == 0) && ((j & 0x30) == 0)) {
484	    if (lastError != XML_ERR_INVALID_CHAR)
485		fprintf(stderr,
486	"Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
487			i, j, K, L);
488	}
489
490        /*
491	 * There are values in that range that are not allowed in XML-1.0
492	 */
493	else if (((value > 0xD7FF) && (value <0xE000)) ||
494	         ((value > 0xFFFD) && (value <0x10000)) ||
495		 (value > 0x10FFFF)) {
496	    if (lastError != XML_ERR_INVALID_CHAR)
497		fprintf(stderr,
498"Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
499			value, i, j, K, L);
500	}
501
502	/*
503	 * We should see no error in remaining cases
504	 */
505	else if ((lastError != 0) || (len != 4)) {
506	    fprintf(stderr,
507		"Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
508		    i, j, K);
509	}
510
511	/*
512	 * Finally check the value is right
513	 */
514	else if (c != value) {
515	    fprintf(stderr,
516    "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
517		i, j, data[2], value, c);
518	}
519    }
520    }
521    }
522    }
523}
524
525/**
526 * testCharRanges:
527 *
528 * Test the correct UTF8 character parsing in isolation i.e.
529 * not when parsing a full document, this is less expensive and we can
530 * cover the full range of UTF-8 chars accepted by XML-1.0
531 */
532
533static void testCharRanges(void) {
534    char data[5];
535    xmlParserCtxtPtr ctxt;
536    xmlParserInputBufferPtr buf;
537    xmlParserInputPtr input;
538
539    memset(data, 0, 5);
540
541    /*
542     * Set up a parsing context using the above data buffer as
543     * the current input source.
544     */
545    ctxt = xmlNewParserCtxt();
546    if (ctxt == NULL) {
547        fprintf(stderr, "Failed to allocate parser context\n");
548	return;
549    }
550    buf = xmlParserInputBufferCreateStatic(data, sizeof(data),
551                                           XML_CHAR_ENCODING_NONE);
552    if (buf == NULL) {
553        fprintf(stderr, "Failed to allocate input buffer\n");
554	goto error;
555    }
556    input = xmlNewInputStream(ctxt);
557    if (input == NULL) {
558        xmlFreeParserInputBuffer(buf);
559	goto error;
560    }
561    input->filename = NULL;
562    input->buf = buf;
563    input->cur =
564    input->base = xmlBufContent(input->buf->buffer);
565    input->end = input->base + 4;
566    inputPush(ctxt, input);
567
568    printf("testing char range: 1");
569    fflush(stdout);
570    testCharRangeByte1(ctxt, data);
571    printf(" 2");
572    fflush(stdout);
573    testCharRangeByte2(ctxt, data);
574    printf(" 3");
575    fflush(stdout);
576    testCharRangeByte3(ctxt, data);
577    printf(" 4");
578    fflush(stdout);
579    testCharRangeByte4(ctxt, data);
580    printf(" done\n");
581    fflush(stdout);
582
583error:
584    xmlFreeParserCtxt(ctxt);
585}
586
587int main(void) {
588
589    /*
590     * this initialize the library and check potential ABI mismatches
591     * between the version it was compiled for and the actual shared
592     * library used.
593     */
594    LIBXML_TEST_VERSION
595
596    /*
597     * Catch errors separately
598     */
599
600    xmlSetStructuredErrorFunc(NULL, errorHandler);
601
602    /*
603     * Run the tests
604     */
605    testCharRanges();
606    testDocumentRanges();
607
608    /*
609     * Cleanup function for the XML library.
610     */
611    xmlCleanupParser();
612    /*
613     * this is to debug memory for regression tests
614     */
615    xmlMemoryDump();
616    return(0);
617}
618