1/* This file is included (from xmltok.c, 1-3 times depending on XML_MIN_SIZE)!
2                            __  __            _
3                         ___\ \/ /_ __   __ _| |_
4                        / _ \\  /| '_ \ / _` | __|
5                       |  __//  \| |_) | (_| | |_
6                        \___/_/\_\ .__/ \__,_|\__|
7                                 |_| XML parser
8
9   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10   Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
11   Copyright (c) 2002      Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12   Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
13   Copyright (c) 2016-2022 Sebastian Pipping <sebastian@pipping.org>
14   Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
15   Copyright (c) 2018      Benjamin Peterson <benjamin@python.org>
16   Copyright (c) 2018      Anton Maklakov <antmak.pub@gmail.com>
17   Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
18   Copyright (c) 2020      Boris Kolpackov <boris@codesynthesis.com>
19   Copyright (c) 2022      Martin Ettl <ettl.martin78@googlemail.com>
20   Licensed under the MIT license:
21
22   Permission is  hereby granted,  free of charge,  to any  person obtaining
23   a  copy  of  this  software   and  associated  documentation  files  (the
24   "Software"),  to  deal in  the  Software  without restriction,  including
25   without  limitation the  rights  to use,  copy,  modify, merge,  publish,
26   distribute, sublicense, and/or sell copies of the Software, and to permit
27   persons  to whom  the Software  is  furnished to  do so,  subject to  the
28   following conditions:
29
30   The above copyright  notice and this permission notice  shall be included
31   in all copies or substantial portions of the Software.
32
33   THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
34   EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
35   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
36   NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
37   DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
38   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
39   USE OR OTHER DEALINGS IN THE SOFTWARE.
40*/
41
42#ifdef XML_TOK_IMPL_C
43
/* Fallback used when the including file has not supplied IS_INVALID_CHAR:
   no multi-byte sequence is ever reported as invalid by this check. */
#  ifndef IS_INVALID_CHAR // i.e. for UTF-16 and XML_MIN_SIZE not defined
#    define IS_INVALID_CHAR(enc, ptr, n) (0)
#  endif
47
/* Expands to the `case BT_LEAD<n>:` arm of a switch over BYTE_TYPE that
   accepts any well-formed n-byte sequence at ptr and steps over it.

   - Returns XML_TOK_PARTIAL_CHAR from the enclosing function when fewer
     than n bytes remain before `end`.
   - Stores ptr in *nextTokPtr and returns XML_TOK_INVALID when
     IS_INVALID_CHAR rejects the sequence.
   - Otherwise advances ptr by n and leaves the switch.

   `enc` and `end` are NOT parameters: they are taken from the scope of the
   function the macro is expanded in.  `n` is token-pasted onto BT_LEAD, so
   it must be a bare token (this file passes 2, 3 and 4).  It is handed to
   IS_INVALID_CHAR unparenthesized in case that macro pastes it as well (its
   non-fallback definition is not visible in this file).  Everywhere else the
   macro arguments are parenthesized so an expression argument cannot
   re-associate with the surrounding operators. */
#  define INVALID_LEAD_CASE(n, ptr, nextTokPtr)                                \
  case BT_LEAD##n:                                                             \
    if ((end) - (ptr) < (n))                                                   \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n)) {                                        \
      *(nextTokPtr) = (ptr);                                                   \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    (ptr) += (n);                                                              \
    break;
58
/* Expands to the switch arms shared by scanners that accept arbitrary
   character data: the BT_LEAD2..BT_LEAD4 arms from INVALID_LEAD_CASE, plus
   arms for BT_NONXML, BT_MALFORM and BT_TRAIL that store ptr in *nextTokPtr
   and return XML_TOK_INVALID from the enclosing function. */
#  define INVALID_CASES(ptr, nextTokPtr)                                       \
    INVALID_LEAD_CASE(2, ptr, nextTokPtr)                                      \
    INVALID_LEAD_CASE(3, ptr, nextTokPtr)                                      \
    INVALID_LEAD_CASE(4, ptr, nextTokPtr)                                      \
  case BT_NONXML:                                                              \
  case BT_MALFORM:                                                             \
  case BT_TRAIL:                                                               \
    *(nextTokPtr) = (ptr);                                                     \
    return XML_TOK_INVALID;
68
/* Expands to the `case BT_LEAD<n>:` arm that accepts one n-byte name
   character at ptr.

   - Returns XML_TOK_PARTIAL_CHAR from the enclosing function when fewer
     than n bytes remain before end.
   - Stores ptr in *nextTokPtr and returns XML_TOK_INVALID when the sequence
     is rejected by IS_INVALID_CHAR or is not accepted by IS_NAME_CHAR.
   - Otherwise advances ptr by n and leaves the switch.

   `n` is token-pasted onto BT_LEAD, so it must be a bare token (this file
   passes 2, 3 and 4); it is forwarded unparenthesized to IS_INVALID_CHAR and
   IS_NAME_CHAR in case those macros paste it too (their definitions are not
   visible in this file).  The remaining uses of the arguments are
   parenthesized, matching the bounds check in CHECK_NMSTRT_CASE. */
#  define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr)                        \
  case BT_LEAD##n:                                                             \
    if ((end) - (ptr) < (n))                                                   \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) {         \
      *(nextTokPtr) = (ptr);                                                   \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    (ptr) += (n);                                                              \
    break;
79
/* Expands to the switch arms that accept one name character at ptr and
   step over it:
   - BT_NONASCII is first checked with IS_NAME_CHAR_MINBPC; on failure ptr is
     stored in *nextTokPtr and the enclosing function returns
     XML_TOK_INVALID, otherwise control falls through to the next group.
   - BT_NMSTRT, BT_HEX, BT_DIGIT, BT_NAME and BT_MINUS advance ptr by
     MINBPC(enc) and leave the switch.
   - BT_LEAD2..BT_LEAD4 are handled by CHECK_NAME_CASE. */
#  define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)                          \
  case BT_NONASCII:                                                            \
    if (! IS_NAME_CHAR_MINBPC(enc, ptr)) {                                     \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    /* fall through */                                                         \
  case BT_NMSTRT:                                                              \
  case BT_HEX:                                                                 \
  case BT_DIGIT:                                                               \
  case BT_NAME:                                                                \
  case BT_MINUS:                                                               \
    ptr += MINBPC(enc);                                                        \
    break;                                                                     \
    CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr)                              \
    CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr)                              \
    CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
97
/* Expands to the `case BT_LEAD<n>:` arm that accepts one n-byte
   name-start character at ptr.  Returns XML_TOK_PARTIAL_CHAR from the
   enclosing function when fewer than n bytes remain before end; stores ptr
   in *nextTokPtr and returns XML_TOK_INVALID when the sequence is rejected
   by IS_INVALID_CHAR or not accepted by IS_NMSTRT_CHAR; otherwise advances
   ptr by n and leaves the switch.  `n` is token-pasted onto BT_LEAD. */
#  define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr)                      \
  case BT_LEAD##n:                                                             \
    if ((end) - (ptr) < (n))                                                   \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) {       \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    ptr += n;                                                                  \
    break;
108
/* Expands to the switch arms that accept one name-start character at ptr
   and step over it:
   - BT_NONASCII is first checked with IS_NMSTRT_CHAR_MINBPC; on failure ptr
     is stored in *nextTokPtr and the enclosing function returns
     XML_TOK_INVALID, otherwise control falls through to the next group.
   - BT_NMSTRT and BT_HEX advance ptr by MINBPC(enc) and leave the switch.
   - BT_LEAD2..BT_LEAD4 are handled by CHECK_NMSTRT_CASE.
   The last expanded statement is a `break`, so code placed after the
   enclosing switch runs exactly when a name-start character was consumed
   (assuming every other arm of that switch returns or jumps). */
#  define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)                        \
  case BT_NONASCII:                                                            \
    if (! IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {                                   \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    /* fall through */                                                         \
  case BT_NMSTRT:                                                              \
  case BT_HEX:                                                                 \
    ptr += MINBPC(enc);                                                        \
    break;                                                                     \
    CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr)                            \
    CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr)                            \
    CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
123
/* PREFIX(ident) names every function defined below.  When the including
   file has not defined it, the identifier is used unchanged. */
#  ifndef PREFIX
#    define PREFIX(ident) ident
#  endif
127
/* True when at least `count` minimum-size character units
   (count * MINBPC(enc) bytes) lie between ptr and end. */
#  define HAS_CHARS(enc, ptr, end, count)                                      \
    ((end) - (ptr) >= ((count) * MINBPC(enc)))

/* True when at least one minimum-size character unit is available. */
#  define HAS_CHAR(enc, ptr, end) HAS_CHARS(enc, ptr, end, 1)

/* Makes the enclosing function return XML_TOK_PARTIAL unless `count` units
   are available.
   NOTE(review): this expands to a bare brace block, not do { } while (0).
   Call sites in this file follow it with ';', which leaves an empty
   statement behind, so it must not be used as the unbraced body of an `if`
   that has an `else`. */
#  define REQUIRE_CHARS(enc, ptr, end, count)                                  \
    {                                                                          \
      if (! HAS_CHARS(enc, ptr, end, count)) {                                 \
        return XML_TOK_PARTIAL;                                                \
      }                                                                        \
    }

/* Single-unit form of REQUIRE_CHARS. */
#  define REQUIRE_CHAR(enc, ptr, end) REQUIRE_CHARS(enc, ptr, end, 1)
141
142/* ptr points to character following "<!-" */
143
144static int PTRCALL
145PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
146                    const char **nextTokPtr) {
147  if (HAS_CHAR(enc, ptr, end)) {
148    if (! CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
149      *nextTokPtr = ptr;
150      return XML_TOK_INVALID;
151    }
152    ptr += MINBPC(enc);
153    while (HAS_CHAR(enc, ptr, end)) {
154      switch (BYTE_TYPE(enc, ptr)) {
155        INVALID_CASES(ptr, nextTokPtr)
156      case BT_MINUS:
157        ptr += MINBPC(enc);
158        REQUIRE_CHAR(enc, ptr, end);
159        if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
160          ptr += MINBPC(enc);
161          REQUIRE_CHAR(enc, ptr, end);
162          if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
163            *nextTokPtr = ptr;
164            return XML_TOK_INVALID;
165          }
166          *nextTokPtr = ptr + MINBPC(enc);
167          return XML_TOK_COMMENT;
168        }
169        break;
170      default:
171        ptr += MINBPC(enc);
172        break;
173      }
174    }
175  }
176  return XML_TOK_PARTIAL;
177}
178
179/* ptr points to character following "<!" */
180
181static int PTRCALL
182PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
183                 const char **nextTokPtr) {
184  REQUIRE_CHAR(enc, ptr, end);
185  switch (BYTE_TYPE(enc, ptr)) {
186  case BT_MINUS:
187    return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
188  case BT_LSQB:
189    *nextTokPtr = ptr + MINBPC(enc);
190    return XML_TOK_COND_SECT_OPEN;
191  case BT_NMSTRT:
192  case BT_HEX:
193    ptr += MINBPC(enc);
194    break;
195  default:
196    *nextTokPtr = ptr;
197    return XML_TOK_INVALID;
198  }
199  while (HAS_CHAR(enc, ptr, end)) {
200    switch (BYTE_TYPE(enc, ptr)) {
201    case BT_PERCNT:
202      REQUIRE_CHARS(enc, ptr, end, 2);
203      /* don't allow <!ENTITY% foo "whatever"> */
204      switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
205      case BT_S:
206      case BT_CR:
207      case BT_LF:
208      case BT_PERCNT:
209        *nextTokPtr = ptr;
210        return XML_TOK_INVALID;
211      }
212      /* fall through */
213    case BT_S:
214    case BT_CR:
215    case BT_LF:
216      *nextTokPtr = ptr;
217      return XML_TOK_DECL_OPEN;
218    case BT_NMSTRT:
219    case BT_HEX:
220      ptr += MINBPC(enc);
221      break;
222    default:
223      *nextTokPtr = ptr;
224      return XML_TOK_INVALID;
225    }
226  }
227  return XML_TOK_PARTIAL;
228}
229
230static int PTRCALL
231PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end,
232                      int *tokPtr) {
233  int upper = 0;
234  UNUSED_P(enc);
235  *tokPtr = XML_TOK_PI;
236  if (end - ptr != MINBPC(enc) * 3)
237    return 1;
238  switch (BYTE_TO_ASCII(enc, ptr)) {
239  case ASCII_x:
240    break;
241  case ASCII_X:
242    upper = 1;
243    break;
244  default:
245    return 1;
246  }
247  ptr += MINBPC(enc);
248  switch (BYTE_TO_ASCII(enc, ptr)) {
249  case ASCII_m:
250    break;
251  case ASCII_M:
252    upper = 1;
253    break;
254  default:
255    return 1;
256  }
257  ptr += MINBPC(enc);
258  switch (BYTE_TO_ASCII(enc, ptr)) {
259  case ASCII_l:
260    break;
261  case ASCII_L:
262    upper = 1;
263    break;
264  default:
265    return 1;
266  }
267  if (upper)
268    return 0;
269  *tokPtr = XML_TOK_XML_DECL;
270  return 1;
271}
272
273/* ptr points to character following "<?" */
274
275static int PTRCALL
276PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
277               const char **nextTokPtr) {
278  int tok;
279  const char *target = ptr;
280  REQUIRE_CHAR(enc, ptr, end);
281  switch (BYTE_TYPE(enc, ptr)) {
282    CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
283  default:
284    *nextTokPtr = ptr;
285    return XML_TOK_INVALID;
286  }
287  while (HAS_CHAR(enc, ptr, end)) {
288    switch (BYTE_TYPE(enc, ptr)) {
289      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
290    case BT_S:
291    case BT_CR:
292    case BT_LF:
293      if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
294        *nextTokPtr = ptr;
295        return XML_TOK_INVALID;
296      }
297      ptr += MINBPC(enc);
298      while (HAS_CHAR(enc, ptr, end)) {
299        switch (BYTE_TYPE(enc, ptr)) {
300          INVALID_CASES(ptr, nextTokPtr)
301        case BT_QUEST:
302          ptr += MINBPC(enc);
303          REQUIRE_CHAR(enc, ptr, end);
304          if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
305            *nextTokPtr = ptr + MINBPC(enc);
306            return tok;
307          }
308          break;
309        default:
310          ptr += MINBPC(enc);
311          break;
312        }
313      }
314      return XML_TOK_PARTIAL;
315    case BT_QUEST:
316      if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
317        *nextTokPtr = ptr;
318        return XML_TOK_INVALID;
319      }
320      ptr += MINBPC(enc);
321      REQUIRE_CHAR(enc, ptr, end);
322      if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
323        *nextTokPtr = ptr + MINBPC(enc);
324        return tok;
325      }
326      /* fall through */
327    default:
328      *nextTokPtr = ptr;
329      return XML_TOK_INVALID;
330    }
331  }
332  return XML_TOK_PARTIAL;
333}
334
335static int PTRCALL
336PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
337                         const char **nextTokPtr) {
338  static const char CDATA_LSQB[]
339      = {ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB};
340  int i;
341  UNUSED_P(enc);
342  /* CDATA[ */
343  REQUIRE_CHARS(enc, ptr, end, 6);
344  for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
345    if (! CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
346      *nextTokPtr = ptr;
347      return XML_TOK_INVALID;
348    }
349  }
350  *nextTokPtr = ptr;
351  return XML_TOK_CDATA_SECT_OPEN;
352}
353
/* Returns the next token inside a CDATA section.
   - XML_TOK_NONE when ptr >= end.
   - XML_TOK_CDATA_SECT_CLOSE with *nextTokPtr past "]]>".
   - XML_TOK_DATA_NEWLINE for CR, LF or CR LF, with *nextTokPtr past it.
   - XML_TOK_DATA_CHARS for a run of ordinary characters, with *nextTokPtr at
     the first character that ends the run.
   - XML_TOK_INVALID with *nextTokPtr at an invalid first character.
   - XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when more input is needed to
     decide. */
static int PTRCALL
PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
                        const char **nextTokPtr) {
  if (ptr >= end)
    return XML_TOK_NONE;
  if (MINBPC(enc) > 1) {
    /* Drop trailing bytes that do not form a whole MINBPC(enc)-sized unit.
       The mask arithmetic rounds n down to a multiple of MINBPC(enc),
       assuming MINBPC(enc) is a power of two. */
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  /* The first character decides which kind of token this is. */
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_RSQB:
    /* Possible start of the "]]>" terminator. */
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
      break;
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
      /* "]]" without '>': step back onto the second ']' so the loop below
         ends the data run there and only the first ']' is reported. */
      ptr -= MINBPC(enc);
      break;
    }
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_CDATA_SECT_CLOSE;
  case BT_CR:
    /* The following character is needed to recognise CR LF as one newline. */
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    if (BYTE_TYPE(enc, ptr) == BT_LF)
      ptr += MINBPC(enc);
    *nextTokPtr = ptr;
    return XML_TOK_DATA_NEWLINE;
  case BT_LF:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DATA_NEWLINE;
    INVALID_CASES(ptr, nextTokPtr)
  default:
    ptr += MINBPC(enc);
    break;
  }
  /* Extend the data run until a character that must start its own token,
     an invalid or incomplete sequence, or the end of the input. */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
#  define LEAD_CASE(n)                                                         \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) {                       \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_DATA_CHARS;                                               \
    }                                                                          \
    ptr += n;                                                                  \
    break;
      LEAD_CASE(2)
      LEAD_CASE(3)
      LEAD_CASE(4)
#  undef LEAD_CASE
    case BT_NONXML:
    case BT_MALFORM:
    case BT_TRAIL:
    case BT_CR:
    case BT_LF:
    case BT_RSQB:
      /* End the run before this character; it is handled by the next call. */
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}
427
428/* ptr points to character following "</" */
429
430static int PTRCALL
431PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
432                   const char **nextTokPtr) {
433  REQUIRE_CHAR(enc, ptr, end);
434  switch (BYTE_TYPE(enc, ptr)) {
435    CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
436  default:
437    *nextTokPtr = ptr;
438    return XML_TOK_INVALID;
439  }
440  while (HAS_CHAR(enc, ptr, end)) {
441    switch (BYTE_TYPE(enc, ptr)) {
442      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
443    case BT_S:
444    case BT_CR:
445    case BT_LF:
446      for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
447        switch (BYTE_TYPE(enc, ptr)) {
448        case BT_S:
449        case BT_CR:
450        case BT_LF:
451          break;
452        case BT_GT:
453          *nextTokPtr = ptr + MINBPC(enc);
454          return XML_TOK_END_TAG;
455        default:
456          *nextTokPtr = ptr;
457          return XML_TOK_INVALID;
458        }
459      }
460      return XML_TOK_PARTIAL;
461#  ifdef XML_NS
462    case BT_COLON:
463      /* no need to check qname syntax here,
464         since end-tag must match exactly */
465      ptr += MINBPC(enc);
466      break;
467#  endif
468    case BT_GT:
469      *nextTokPtr = ptr + MINBPC(enc);
470      return XML_TOK_END_TAG;
471    default:
472      *nextTokPtr = ptr;
473      return XML_TOK_INVALID;
474    }
475  }
476  return XML_TOK_PARTIAL;
477}
478
/* ptr points to character following "&#x" */
480
481static int PTRCALL
482PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
483                       const char **nextTokPtr) {
484  if (HAS_CHAR(enc, ptr, end)) {
485    switch (BYTE_TYPE(enc, ptr)) {
486    case BT_DIGIT:
487    case BT_HEX:
488      break;
489    default:
490      *nextTokPtr = ptr;
491      return XML_TOK_INVALID;
492    }
493    for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
494      switch (BYTE_TYPE(enc, ptr)) {
495      case BT_DIGIT:
496      case BT_HEX:
497        break;
498      case BT_SEMI:
499        *nextTokPtr = ptr + MINBPC(enc);
500        return XML_TOK_CHAR_REF;
501      default:
502        *nextTokPtr = ptr;
503        return XML_TOK_INVALID;
504      }
505    }
506  }
507  return XML_TOK_PARTIAL;
508}
509
510/* ptr points to character following "&#" */
511
512static int PTRCALL
513PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
514                    const char **nextTokPtr) {
515  if (HAS_CHAR(enc, ptr, end)) {
516    if (CHAR_MATCHES(enc, ptr, ASCII_x))
517      return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
518    switch (BYTE_TYPE(enc, ptr)) {
519    case BT_DIGIT:
520      break;
521    default:
522      *nextTokPtr = ptr;
523      return XML_TOK_INVALID;
524    }
525    for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
526      switch (BYTE_TYPE(enc, ptr)) {
527      case BT_DIGIT:
528        break;
529      case BT_SEMI:
530        *nextTokPtr = ptr + MINBPC(enc);
531        return XML_TOK_CHAR_REF;
532      default:
533        *nextTokPtr = ptr;
534        return XML_TOK_INVALID;
535      }
536    }
537  }
538  return XML_TOK_PARTIAL;
539}
540
541/* ptr points to character following "&" */
542
543static int PTRCALL
544PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
545                const char **nextTokPtr) {
546  REQUIRE_CHAR(enc, ptr, end);
547  switch (BYTE_TYPE(enc, ptr)) {
548    CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
549  case BT_NUM:
550    return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
551  default:
552    *nextTokPtr = ptr;
553    return XML_TOK_INVALID;
554  }
555  while (HAS_CHAR(enc, ptr, end)) {
556    switch (BYTE_TYPE(enc, ptr)) {
557      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
558    case BT_SEMI:
559      *nextTokPtr = ptr + MINBPC(enc);
560      return XML_TOK_ENTITY_REF;
561    default:
562      *nextTokPtr = ptr;
563      return XML_TOK_INVALID;
564    }
565  }
566  return XML_TOK_PARTIAL;
567}
568
569/* ptr points to character following first character of attribute name */
570
/* Scans the rest of a start-tag once an attribute name has begun.
   Returns XML_TOK_START_TAG_WITH_ATTS or XML_TOK_EMPTY_ELEMENT_WITH_ATTS with
   *nextTokPtr just past the closing '>', XML_TOK_INVALID with *nextTokPtr at
   the offending character, or XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when the
   input ends too early.  A non-positive result from scanRef inside an
   attribute value is passed through unchanged. */
static int PTRCALL
PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
                 const char **nextTokPtr) {
#  ifdef XML_NS
  /* Set once the current attribute name has contained a ':'. */
  int hadColon = 0;
#  endif
  /* Each pass consumes one more character of the current attribute name,
     or, at '=' or whitespace, the whole "= value" part that follows. */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
#  ifdef XML_NS
    case BT_COLON:
      /* At most one ':' per name, and it must be followed by a name-start
         character. */
      if (hadColon) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      hadColon = 1;
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      switch (BYTE_TYPE(enc, ptr)) {
        CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      break;
#  endif
    case BT_S:
    case BT_CR:
    case BT_LF:
      /* Whitespace after the name: only more whitespace or '=' may follow. */
      for (;;) {
        int t;

        ptr += MINBPC(enc);
        REQUIRE_CHAR(enc, ptr, end);
        t = BYTE_TYPE(enc, ptr);
        if (t == BT_EQUALS)
          break;
        switch (t) {
        case BT_S:
        case BT_LF:
        case BT_CR:
          break;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
      }
      /* fall through */
    case BT_EQUALS: {
      /* Byte type of the opening quote (BT_QUOT or BT_APOS); the value ends
         at the next character of the same type. */
      int open;
#  ifdef XML_NS
      hadColon = 0;
#  endif
      /* Skip whitespace between '=' and the opening quote. */
      for (;;) {
        ptr += MINBPC(enc);
        REQUIRE_CHAR(enc, ptr, end);
        open = BYTE_TYPE(enc, ptr);
        if (open == BT_QUOT || open == BT_APOS)
          break;
        switch (open) {
        case BT_S:
        case BT_LF:
        case BT_CR:
          break;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
      }
      ptr += MINBPC(enc);
      /* in attribute value */
      for (;;) {
        int t;
        REQUIRE_CHAR(enc, ptr, end);
        t = BYTE_TYPE(enc, ptr);
        if (t == open)
          break;
        switch (t) {
          INVALID_CASES(ptr, nextTokPtr)
        case BT_AMP: {
          /* scanRef writes its end position straight into ptr: on success
             ptr ends up past the reference, on XML_TOK_INVALID it holds the
             offending position, which is then reported to the caller. */
          int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
          if (tok <= 0) {
            if (tok == XML_TOK_INVALID)
              *nextTokPtr = ptr;
            return tok;
          }
          break;
        }
        case BT_LT:
          /* '<' is not allowed inside an attribute value. */
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        default:
          ptr += MINBPC(enc);
          break;
        }
      }
      /* Step over the closing quote; whitespace, '/' or '>' must follow. */
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      switch (BYTE_TYPE(enc, ptr)) {
      case BT_S:
      case BT_CR:
      case BT_LF:
        break;
      case BT_SOL:
        goto sol;
      case BT_GT:
        goto gt;
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      /* ptr points to closing quote */
      for (;;) {
        ptr += MINBPC(enc);
        REQUIRE_CHAR(enc, ptr, end);
        switch (BYTE_TYPE(enc, ptr)) {
          /* A name-start character begins the next attribute: the arm ends
             in `break`, which leaves this switch, then the `break` after it
             leaves this for loop. */
          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
        case BT_S:
        case BT_CR:
        case BT_LF:
          continue;
        case BT_GT:
        gt:
          *nextTokPtr = ptr + MINBPC(enc);
          return XML_TOK_START_TAG_WITH_ATTS;
        case BT_SOL:
        sol:
          /* '/' must be immediately followed by '>'. */
          ptr += MINBPC(enc);
          REQUIRE_CHAR(enc, ptr, end);
          if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
          *nextTokPtr = ptr + MINBPC(enc);
          return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
        break;
      }
      /* Back to the outer loop to scan the rest of the next attribute name. */
      break;
    }
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
721
722/* ptr points to character following "<" */
723
/* Scans markup in content after its opening '<' has been consumed.
   - '!' followed by '-' is handed to scanComment and '!' followed by '[' to
     scanCdataSection; any other character after '!' is XML_TOK_INVALID.
   - '?' is handed to scanPi and '/' to scanEndTag.
   - A name-start character begins a start-tag, which is scanned here until
     its first attribute and then handed to scanAtts.
   Returns XML_TOK_START_TAG_NO_ATTS or XML_TOK_EMPTY_ELEMENT_NO_ATTS with
   *nextTokPtr just past the '>', the result of the delegated scanner,
   XML_TOK_INVALID with *nextTokPtr at the offending character, or
   XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when the input ends too early. */
static int PTRCALL
PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
               const char **nextTokPtr) {
#  ifdef XML_NS
  /* Set once the element name has contained a ':'. */
  int hadColon;
#  endif
  REQUIRE_CHAR(enc, ptr, end);
  switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  case BT_EXCL:
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_MINUS:
      return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    case BT_LSQB:
      return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    }
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  case BT_QUEST:
    return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_SOL:
    return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
#  ifdef XML_NS
  hadColon = 0;
#  endif
  /* we have a start-tag */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
#  ifdef XML_NS
    case BT_COLON:
      /* At most one ':' in the name, and it must be followed by a name-start
         character. */
      if (hadColon) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      hadColon = 1;
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      switch (BYTE_TYPE(enc, ptr)) {
        CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      break;
#  endif
    case BT_S:
    case BT_CR:
    case BT_LF: {
      /* Whitespace after the element name: skip it, then expect '>', "/>" or
         the first character of an attribute name. */
      ptr += MINBPC(enc);
      while (HAS_CHAR(enc, ptr, end)) {
        switch (BYTE_TYPE(enc, ptr)) {
          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
        case BT_GT:
          goto gt;
        case BT_SOL:
          goto sol;
        case BT_S:
        case BT_CR:
        case BT_LF:
          ptr += MINBPC(enc);
          continue;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
        /* Reached only through the `break` of a name-start arm above: ptr is
           now just past the first character of an attribute name. */
        return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
      }
      return XML_TOK_PARTIAL;
    }
    case BT_GT:
    gt:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_START_TAG_NO_ATTS;
    case BT_SOL:
    sol:
      /* '/' must be immediately followed by '>'. */
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
821
/* Returns the next token of element content.
   - XML_TOK_NONE when ptr >= end.
   - '<' is handed to scanLt and '&' to scanRef.
   - XML_TOK_DATA_NEWLINE for CR, LF or CR LF, with *nextTokPtr past it;
     XML_TOK_TRAILING_CR when a CR is the last available character.
   - XML_TOK_TRAILING_RSQB when the input ends right after a leading "]" or
     "]]"; *nextTokPtr is not written on the two TRAILING results.
   - XML_TOK_INVALID for "]]>" (with *nextTokPtr at the '>') and for an
     invalid first character.
   - XML_TOK_DATA_CHARS for a run of ordinary characters, with *nextTokPtr at
     the first character that ends the run.
   - XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when more input is needed. */
static int PTRCALL
PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
                   const char **nextTokPtr) {
  if (ptr >= end)
    return XML_TOK_NONE;
  if (MINBPC(enc) > 1) {
    /* Drop trailing bytes that do not form a whole MINBPC(enc)-sized unit.
       The mask arithmetic rounds n down to a multiple of MINBPC(enc),
       assuming MINBPC(enc) is a power of two. */
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  /* The first character decides which kind of token this is. */
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_LT:
    return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_AMP:
    return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_CR:
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return XML_TOK_TRAILING_CR;
    if (BYTE_TYPE(enc, ptr) == BT_LF)
      ptr += MINBPC(enc);
    *nextTokPtr = ptr;
    return XML_TOK_DATA_NEWLINE;
  case BT_LF:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DATA_NEWLINE;
  case BT_RSQB:
    /* Possible start of "]]>", which is rejected in content. */
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return XML_TOK_TRAILING_RSQB;
    if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
      break;
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return XML_TOK_TRAILING_RSQB;
    if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
      /* "]]" without '>': step back onto the second ']' so the loop below
         examines it again. */
      ptr -= MINBPC(enc);
      break;
    }
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
    INVALID_CASES(ptr, nextTokPtr)
  default:
    ptr += MINBPC(enc);
    break;
  }
  /* Extend the data run until a character that must start its own token,
     an invalid or incomplete sequence, or the end of the input. */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
#  define LEAD_CASE(n)                                                         \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) {                       \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_DATA_CHARS;                                               \
    }                                                                          \
    ptr += n;                                                                  \
    break;
      LEAD_CASE(2)
      LEAD_CASE(3)
      LEAD_CASE(4)
#  undef LEAD_CASE
    case BT_RSQB:
      /* With enough lookahead, a ']' that does not start "]]>" is ordinary
         data and "]]>" itself is an error reported at the '>'.  Without
         enough lookahead the run ends before the ']'. */
      if (HAS_CHARS(enc, ptr, end, 2)) {
        if (! CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
          ptr += MINBPC(enc);
          break;
        }
        if (HAS_CHARS(enc, ptr, end, 3)) {
          if (! CHAR_MATCHES(enc, ptr + 2 * MINBPC(enc), ASCII_GT)) {
            ptr += MINBPC(enc);
            break;
          }
          *nextTokPtr = ptr + 2 * MINBPC(enc);
          return XML_TOK_INVALID;
        }
      }
      /* fall through */
    case BT_AMP:
    case BT_LT:
    case BT_NONXML:
    case BT_MALFORM:
    case BT_TRAIL:
    case BT_CR:
    case BT_LF:
      /* End the run before this character; it is handled by the next call. */
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}
919
920/* ptr points to character following "%" */
921
922static int PTRCALL
923PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
924                    const char **nextTokPtr) {
925  REQUIRE_CHAR(enc, ptr, end);
926  switch (BYTE_TYPE(enc, ptr)) {
927    CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
928  case BT_S:
929  case BT_LF:
930  case BT_CR:
931  case BT_PERCNT:
932    *nextTokPtr = ptr;
933    return XML_TOK_PERCENT;
934  default:
935    *nextTokPtr = ptr;
936    return XML_TOK_INVALID;
937  }
938  while (HAS_CHAR(enc, ptr, end)) {
939    switch (BYTE_TYPE(enc, ptr)) {
940      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
941    case BT_SEMI:
942      *nextTokPtr = ptr + MINBPC(enc);
943      return XML_TOK_PARAM_ENTITY_REF;
944    default:
945      *nextTokPtr = ptr;
946      return XML_TOK_INVALID;
947    }
948  }
949  return XML_TOK_PARTIAL;
950}
951
952static int PTRCALL
953PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
954                      const char **nextTokPtr) {
955  REQUIRE_CHAR(enc, ptr, end);
956  switch (BYTE_TYPE(enc, ptr)) {
957    CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
958  default:
959    *nextTokPtr = ptr;
960    return XML_TOK_INVALID;
961  }
962  while (HAS_CHAR(enc, ptr, end)) {
963    switch (BYTE_TYPE(enc, ptr)) {
964      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
965    case BT_CR:
966    case BT_LF:
967    case BT_S:
968    case BT_RPAR:
969    case BT_GT:
970    case BT_PERCNT:
971    case BT_VERBAR:
972      *nextTokPtr = ptr;
973      return XML_TOK_POUND_NAME;
974    default:
975      *nextTokPtr = ptr;
976      return XML_TOK_INVALID;
977    }
978  }
979  return -XML_TOK_POUND_NAME;
980}
981
982static int PTRCALL
983PREFIX(scanLit)(int open, const ENCODING *enc, const char *ptr, const char *end,
984                const char **nextTokPtr) {
985  while (HAS_CHAR(enc, ptr, end)) {
986    int t = BYTE_TYPE(enc, ptr);
987    switch (t) {
988      INVALID_CASES(ptr, nextTokPtr)
989    case BT_QUOT:
990    case BT_APOS:
991      ptr += MINBPC(enc);
992      if (t != open)
993        break;
994      if (! HAS_CHAR(enc, ptr, end))
995        return -XML_TOK_LITERAL;
996      *nextTokPtr = ptr;
997      switch (BYTE_TYPE(enc, ptr)) {
998      case BT_S:
999      case BT_CR:
1000      case BT_LF:
1001      case BT_GT:
1002      case BT_PERCNT:
1003      case BT_LSQB:
1004        return XML_TOK_LITERAL;
1005      default:
1006        return XML_TOK_INVALID;
1007      }
1008    default:
1009      ptr += MINBPC(enc);
1010      break;
1011    }
1012  }
1013  return XML_TOK_PARTIAL;
1014}
1015
/* Scans one prolog token starting at ptr and classifies it by the byte
   type of its first character.

   Returns XML_TOK_NONE when ptr >= end, and XML_TOK_PARTIAL when a
   multi-byte-unit encoding has less than one whole unit available.
   Otherwise returns one of the XML_TOK_* codes below; wherever this
   function itself produces a positive code or XML_TOK_INVALID it first
   stores in *nextTokPtr the position where scanning should resume (for
   XML_TOK_INVALID: the offending character).

   A negated token code (-XML_TOK_PROLOG_S, -XML_TOK_CLOSE_BRACKET,
   -XML_TOK_CLOSE_PAREN, -tok for names) is returned when the input ended
   at a point where the token could still be extended by further input.
   Of these paths only the lone-CR case writes *nextTokPtr.

   Several branches delegate to scanLit, scanDecl, scanPi, scanPercent and
   scanPoundName and return their result unchanged.
*/
static int PTRCALL
PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
                  const char **nextTokPtr) {
  int tok;
  if (ptr >= end)
    return XML_TOK_NONE;
  if (MINBPC(enc) > 1) {
    /* Drop a trailing fragment shorter than one code unit so the scanning
       below only ever sees whole units. */
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_QUOT:
    return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_APOS:
    return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_LT: {
    /* "<" followed by "!" or "?" starts a declaration or PI; followed by
       a possible name start character it is reported as the start of the
       instance, with *nextTokPtr pointing back at the "<". */
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_EXCL:
      return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    case BT_QUEST:
      return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    case BT_NMSTRT:
    case BT_HEX:
    case BT_NONASCII:
    case BT_LEAD2:
    case BT_LEAD3:
    case BT_LEAD4:
      *nextTokPtr = ptr - MINBPC(enc);
      return XML_TOK_INSTANCE_START;
    }
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  case BT_CR:
    if (ptr + MINBPC(enc) == end) {
      *nextTokPtr = end;
      /* indicate that this might be part of a CR/LF pair */
      return -XML_TOK_PROLOG_S;
    }
    /* fall through */
  case BT_S:
  case BT_LF:
    /* Collect a run of white space into a single XML_TOK_PROLOG_S. */
    for (;;) {
      ptr += MINBPC(enc);
      if (! HAS_CHAR(enc, ptr, end))
        break;
      switch (BYTE_TYPE(enc, ptr)) {
      case BT_S:
      case BT_LF:
        break;
      case BT_CR:
        /* don't split CR/LF pair */
        if (ptr + MINBPC(enc) != end)
          break;
        /* fall through */
      default:
        *nextTokPtr = ptr;
        return XML_TOK_PROLOG_S;
      }
    }
    *nextTokPtr = ptr;
    return XML_TOK_PROLOG_S;
  case BT_PERCNT:
    return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_COMMA:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_COMMA;
  case BT_LSQB:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OPEN_BRACKET;
  case BT_RSQB:
    /* "]" alone is XML_TOK_CLOSE_BRACKET; "]]>" is XML_TOK_COND_SECT_CLOSE.
       One character of lookahead is needed to tell them apart, two more
       once a second "]" has been seen. */
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return -XML_TOK_CLOSE_BRACKET;
    if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
      REQUIRE_CHARS(enc, ptr, end, 2);
      if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
        *nextTokPtr = ptr + 2 * MINBPC(enc);
        return XML_TOK_COND_SECT_CLOSE;
      }
    }
    *nextTokPtr = ptr;
    return XML_TOK_CLOSE_BRACKET;
  case BT_LPAR:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OPEN_PAREN;
  case BT_RPAR:
    /* ")" may carry a "*", "?" or "+" suffix, which is folded into the
       token.  Any other following character must be one of the listed
       delimiters, which is left for the next token. */
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return -XML_TOK_CLOSE_PAREN;
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_AST:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_ASTERISK;
    case BT_QUEST:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_QUESTION;
    case BT_PLUS:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_PLUS;
    case BT_CR:
    case BT_LF:
    case BT_S:
    case BT_GT:
    case BT_COMMA:
    case BT_VERBAR:
    case BT_RPAR:
      *nextTokPtr = ptr;
      return XML_TOK_CLOSE_PAREN;
    }
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  case BT_VERBAR:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OR;
  case BT_GT:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DECL_CLOSE;
  case BT_NUM:
    return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  /* Multi-byte lead bytes: the character is checked for validity and then
     classified as a name start character (XML_TOK_NAME), another name
     character (XML_TOK_NMTOKEN), or rejected. */
#  define LEAD_CASE(n)                                                         \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n)                                                         \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n)) {                                        \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    if (IS_NMSTRT_CHAR(enc, ptr, n)) {                                         \
      ptr += n;                                                                \
      tok = XML_TOK_NAME;                                                      \
      break;                                                                   \
    }                                                                          \
    if (IS_NAME_CHAR(enc, ptr, n)) {                                           \
      ptr += n;                                                                \
      tok = XML_TOK_NMTOKEN;                                                   \
      break;                                                                   \
    }                                                                          \
    *nextTokPtr = ptr;                                                         \
    return XML_TOK_INVALID;
    LEAD_CASE(2)
    LEAD_CASE(3)
    LEAD_CASE(4)
#  undef LEAD_CASE
  case BT_NMSTRT:
  case BT_HEX:
    tok = XML_TOK_NAME;
    ptr += MINBPC(enc);
    break;
  case BT_DIGIT:
  case BT_NAME:
  case BT_MINUS:
#  ifdef XML_NS
  case BT_COLON:
#  endif
    tok = XML_TOK_NMTOKEN;
    ptr += MINBPC(enc);
    break;
  case BT_NONASCII:
    if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
      ptr += MINBPC(enc);
      tok = XML_TOK_NAME;
      break;
    }
    if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
      ptr += MINBPC(enc);
      tok = XML_TOK_NMTOKEN;
      break;
    }
    /* fall through */
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  /* Only the name / nmtoken branches reach this point.  tok holds the
     classification so far and ptr is just past the first character. */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    case BT_GT:
    case BT_RPAR:
    case BT_COMMA:
    case BT_VERBAR:
    case BT_LSQB:
    case BT_PERCNT:
    case BT_S:
    case BT_CR:
    case BT_LF:
      /* A delimiter ends the token and is left for the next call. */
      *nextTokPtr = ptr;
      return tok;
#  ifdef XML_NS
    case BT_COLON:
      /* The first colon turns a NAME into a PREFIXED_NAME, provided a
         name character follows it.  A colon not followed by a name
         character, or a second colon, demotes the token to NMTOKEN. */
      ptr += MINBPC(enc);
      switch (tok) {
      case XML_TOK_NAME:
        REQUIRE_CHAR(enc, ptr, end);
        tok = XML_TOK_PREFIXED_NAME;
        switch (BYTE_TYPE(enc, ptr)) {
          CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
        default:
          tok = XML_TOK_NMTOKEN;
          break;
        }
        break;
      case XML_TOK_PREFIXED_NAME:
        tok = XML_TOK_NMTOKEN;
        break;
      }
      break;
#  endif
    /* An occurrence suffix ("+", "*", "?") is folded into the token, but
       is rejected after an NMTOKEN. */
    case BT_PLUS:
      if (tok == XML_TOK_NMTOKEN) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_PLUS;
    case BT_AST:
      if (tok == XML_TOK_NMTOKEN) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_ASTERISK;
    case BT_QUEST:
      if (tok == XML_TOK_NMTOKEN) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_QUESTION;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  /* Input ran out inside the name: report the token negated, since more
     input could still extend it. */
  return -tok;
}
1259
1260static int PTRCALL
1261PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1262                          const char **nextTokPtr) {
1263  const char *start;
1264  if (ptr >= end)
1265    return XML_TOK_NONE;
1266  else if (! HAS_CHAR(enc, ptr, end)) {
1267    /* This line cannot be executed.  The incoming data has already
1268     * been tokenized once, so incomplete characters like this have
1269     * already been eliminated from the input.  Retaining the paranoia
1270     * check is still valuable, however.
1271     */
1272    return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1273  }
1274  start = ptr;
1275  while (HAS_CHAR(enc, ptr, end)) {
1276    switch (BYTE_TYPE(enc, ptr)) {
1277#  define LEAD_CASE(n)                                                         \
1278  case BT_LEAD##n:                                                             \
1279    ptr += n; /* NOTE: The encoding has already been validated. */             \
1280    break;
1281      LEAD_CASE(2)
1282      LEAD_CASE(3)
1283      LEAD_CASE(4)
1284#  undef LEAD_CASE
1285    case BT_AMP:
1286      if (ptr == start)
1287        return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1288      *nextTokPtr = ptr;
1289      return XML_TOK_DATA_CHARS;
1290    case BT_LT:
1291      /* this is for inside entity references */
1292      *nextTokPtr = ptr;
1293      return XML_TOK_INVALID;
1294    case BT_LF:
1295      if (ptr == start) {
1296        *nextTokPtr = ptr + MINBPC(enc);
1297        return XML_TOK_DATA_NEWLINE;
1298      }
1299      *nextTokPtr = ptr;
1300      return XML_TOK_DATA_CHARS;
1301    case BT_CR:
1302      if (ptr == start) {
1303        ptr += MINBPC(enc);
1304        if (! HAS_CHAR(enc, ptr, end))
1305          return XML_TOK_TRAILING_CR;
1306        if (BYTE_TYPE(enc, ptr) == BT_LF)
1307          ptr += MINBPC(enc);
1308        *nextTokPtr = ptr;
1309        return XML_TOK_DATA_NEWLINE;
1310      }
1311      *nextTokPtr = ptr;
1312      return XML_TOK_DATA_CHARS;
1313    case BT_S:
1314      if (ptr == start) {
1315        *nextTokPtr = ptr + MINBPC(enc);
1316        return XML_TOK_ATTRIBUTE_VALUE_S;
1317      }
1318      *nextTokPtr = ptr;
1319      return XML_TOK_DATA_CHARS;
1320    default:
1321      ptr += MINBPC(enc);
1322      break;
1323    }
1324  }
1325  *nextTokPtr = ptr;
1326  return XML_TOK_DATA_CHARS;
1327}
1328
1329static int PTRCALL
1330PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1331                       const char **nextTokPtr) {
1332  const char *start;
1333  if (ptr >= end)
1334    return XML_TOK_NONE;
1335  else if (! HAS_CHAR(enc, ptr, end)) {
1336    /* This line cannot be executed.  The incoming data has already
1337     * been tokenized once, so incomplete characters like this have
1338     * already been eliminated from the input.  Retaining the paranoia
1339     * check is still valuable, however.
1340     */
1341    return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1342  }
1343  start = ptr;
1344  while (HAS_CHAR(enc, ptr, end)) {
1345    switch (BYTE_TYPE(enc, ptr)) {
1346#  define LEAD_CASE(n)                                                         \
1347  case BT_LEAD##n:                                                             \
1348    ptr += n; /* NOTE: The encoding has already been validated. */             \
1349    break;
1350      LEAD_CASE(2)
1351      LEAD_CASE(3)
1352      LEAD_CASE(4)
1353#  undef LEAD_CASE
1354    case BT_AMP:
1355      if (ptr == start)
1356        return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1357      *nextTokPtr = ptr;
1358      return XML_TOK_DATA_CHARS;
1359    case BT_PERCNT:
1360      if (ptr == start) {
1361        int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1362        return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1363      }
1364      *nextTokPtr = ptr;
1365      return XML_TOK_DATA_CHARS;
1366    case BT_LF:
1367      if (ptr == start) {
1368        *nextTokPtr = ptr + MINBPC(enc);
1369        return XML_TOK_DATA_NEWLINE;
1370      }
1371      *nextTokPtr = ptr;
1372      return XML_TOK_DATA_CHARS;
1373    case BT_CR:
1374      if (ptr == start) {
1375        ptr += MINBPC(enc);
1376        if (! HAS_CHAR(enc, ptr, end))
1377          return XML_TOK_TRAILING_CR;
1378        if (BYTE_TYPE(enc, ptr) == BT_LF)
1379          ptr += MINBPC(enc);
1380        *nextTokPtr = ptr;
1381        return XML_TOK_DATA_NEWLINE;
1382      }
1383      *nextTokPtr = ptr;
1384      return XML_TOK_DATA_CHARS;
1385    default:
1386      ptr += MINBPC(enc);
1387      break;
1388    }
1389  }
1390  *nextTokPtr = ptr;
1391  return XML_TOK_DATA_CHARS;
1392}
1393
1394#  ifdef XML_DTD
1395
1396static int PTRCALL
1397PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
1398                         const char **nextTokPtr) {
1399  int level = 0;
1400  if (MINBPC(enc) > 1) {
1401    size_t n = end - ptr;
1402    if (n & (MINBPC(enc) - 1)) {
1403      n &= ~(MINBPC(enc) - 1);
1404      end = ptr + n;
1405    }
1406  }
1407  while (HAS_CHAR(enc, ptr, end)) {
1408    switch (BYTE_TYPE(enc, ptr)) {
1409      INVALID_CASES(ptr, nextTokPtr)
1410    case BT_LT:
1411      ptr += MINBPC(enc);
1412      REQUIRE_CHAR(enc, ptr, end);
1413      if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1414        ptr += MINBPC(enc);
1415        REQUIRE_CHAR(enc, ptr, end);
1416        if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1417          ++level;
1418          ptr += MINBPC(enc);
1419        }
1420      }
1421      break;
1422    case BT_RSQB:
1423      ptr += MINBPC(enc);
1424      REQUIRE_CHAR(enc, ptr, end);
1425      if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1426        ptr += MINBPC(enc);
1427        REQUIRE_CHAR(enc, ptr, end);
1428        if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1429          ptr += MINBPC(enc);
1430          if (level == 0) {
1431            *nextTokPtr = ptr;
1432            return XML_TOK_IGNORE_SECT;
1433          }
1434          --level;
1435        }
1436      }
1437      break;
1438    default:
1439      ptr += MINBPC(enc);
1440      break;
1441    }
1442  }
1443  return XML_TOK_PARTIAL;
1444}
1445
1446#  endif /* XML_DTD */
1447
1448static int PTRCALL
1449PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1450                   const char **badPtr) {
1451  ptr += MINBPC(enc);
1452  end -= MINBPC(enc);
1453  for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1454    switch (BYTE_TYPE(enc, ptr)) {
1455    case BT_DIGIT:
1456    case BT_HEX:
1457    case BT_MINUS:
1458    case BT_APOS:
1459    case BT_LPAR:
1460    case BT_RPAR:
1461    case BT_PLUS:
1462    case BT_COMMA:
1463    case BT_SOL:
1464    case BT_EQUALS:
1465    case BT_QUEST:
1466    case BT_CR:
1467    case BT_LF:
1468    case BT_SEMI:
1469    case BT_EXCL:
1470    case BT_AST:
1471    case BT_PERCNT:
1472    case BT_NUM:
1473#  ifdef XML_NS
1474    case BT_COLON:
1475#  endif
1476      break;
1477    case BT_S:
1478      if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1479        *badPtr = ptr;
1480        return 0;
1481      }
1482      break;
1483    case BT_NAME:
1484    case BT_NMSTRT:
1485      if (! (BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1486        break;
1487      /* fall through */
1488    default:
1489      switch (BYTE_TO_ASCII(enc, ptr)) {
1490      case 0x24: /* $ */
1491      case 0x40: /* @ */
1492        break;
1493      default:
1494        *badPtr = ptr;
1495        return 0;
1496      }
1497      break;
1498    }
1499  }
1500  return 1;
1501}
1502
/* This must only be called for a well-formed start-tag or empty
   element tag.  Returns the number of attributes.  Pointers to the
   first attsMax attributes are stored in atts.

   The tag is walked with a three-state machine (other / inName /
   inValue).  Scanning starts one character past ptr in state inName, so
   the name at the start of the tag is never recorded as an attribute.
   For each attribute the start of its name, the start and end of its
   value (without the quotes) and a "normalized" flag are stored.  The
   flag starts out as 1 and is cleared as soon as something is seen in the
   value that the checks below treat as needing normalization: "&", CR,
   LF, or white space other than single interior ASCII spaces.

   Attributes beyond attsMax are still counted but not stored.  The loop
   has no bounds check of its own; it ends at the first ">" or "/" seen
   outside an attribute value, which is why the input must be well-formed.
*/

static int PTRCALL
PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax,
                ATTRIBUTE *atts) {
  enum { other, inName, inValue } state = inName;
  int nAtts = 0;
  int open = 0; /* defined when state == inValue;
                   initialization just to shut up compilers */

  for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
    switch (BYTE_TYPE(enc, ptr)) {
    /* START_NAME: a name character seen in state "other" begins a new
       attribute name. */
#  define START_NAME                                                           \
    if (state == other) {                                                      \
      if (nAtts < attsMax) {                                                   \
        atts[nAtts].name = ptr;                                                \
        atts[nAtts].normalized = 1;                                            \
      }                                                                        \
      state = inName;                                                          \
    }
    /* LEAD_CASE: multi-byte characters advance by n - MINBPC(enc) here
       because the loop header adds the remaining MINBPC(enc). */
#  define LEAD_CASE(n)                                                         \
  case BT_LEAD##n: /* NOTE: The encoding has already been validated. */        \
    START_NAME ptr += (n - MINBPC(enc));                                       \
    break;
      LEAD_CASE(2)
      LEAD_CASE(3)
      LEAD_CASE(4)
#  undef LEAD_CASE
    case BT_NONASCII:
    case BT_NMSTRT:
    case BT_HEX:
      START_NAME
      break;
#  undef START_NAME
    case BT_QUOT:
      /* Outside a value a quote opens one; inside a value only the same
         kind of quote closes it and completes the attribute. */
      if (state != inValue) {
        if (nAtts < attsMax)
          atts[nAtts].valuePtr = ptr + MINBPC(enc);
        state = inValue;
        open = BT_QUOT;
      } else if (open == BT_QUOT) {
        state = other;
        if (nAtts < attsMax)
          atts[nAtts].valueEnd = ptr;
        nAtts++;
      }
      break;
    case BT_APOS:
      /* Same as BT_QUOT, for single-quoted values. */
      if (state != inValue) {
        if (nAtts < attsMax)
          atts[nAtts].valuePtr = ptr + MINBPC(enc);
        state = inValue;
        open = BT_APOS;
      } else if (open == BT_APOS) {
        state = other;
        if (nAtts < attsMax)
          atts[nAtts].valueEnd = ptr;
        nAtts++;
      }
      break;
    case BT_AMP:
      /* A reference means the value cannot be used as-is. */
      if (nAtts < attsMax)
        atts[nAtts].normalized = 0;
      break;
    case BT_S:
      /* Outside a value white space just ends the current name.  Inside
         a value it clears the normalized flag when it is the first
         character of the value, is not an ASCII space, is followed by
         another ASCII space, or is followed by the closing quote. */
      if (state == inName)
        state = other;
      else if (state == inValue && nAtts < attsMax && atts[nAtts].normalized
               && (ptr == atts[nAtts].valuePtr
                   || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
                   || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
                   || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
        atts[nAtts].normalized = 0;
      break;
    case BT_CR:
    case BT_LF:
      /* This case ensures that the first attribute name is counted
         Apart from that we could just change state on the quote. */
      if (state == inName)
        state = other;
      else if (state == inValue && nAtts < attsMax)
        atts[nAtts].normalized = 0;
      break;
    case BT_GT:
    case BT_SOL:
      /* ">" or "/" outside a value marks the end of the tag. */
      if (state != inValue)
        return nAtts;
      break;
    default:
      break;
    }
  }
  /* not reached */
}
1600
1601static int PTRFASTCALL
1602PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) {
1603  int result = 0;
1604  /* skip &# */
1605  UNUSED_P(enc);
1606  ptr += 2 * MINBPC(enc);
1607  if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1608    for (ptr += MINBPC(enc); ! CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1609         ptr += MINBPC(enc)) {
1610      int c = BYTE_TO_ASCII(enc, ptr);
1611      switch (c) {
1612      case ASCII_0:
1613      case ASCII_1:
1614      case ASCII_2:
1615      case ASCII_3:
1616      case ASCII_4:
1617      case ASCII_5:
1618      case ASCII_6:
1619      case ASCII_7:
1620      case ASCII_8:
1621      case ASCII_9:
1622        result <<= 4;
1623        result |= (c - ASCII_0);
1624        break;
1625      case ASCII_A:
1626      case ASCII_B:
1627      case ASCII_C:
1628      case ASCII_D:
1629      case ASCII_E:
1630      case ASCII_F:
1631        result <<= 4;
1632        result += 10 + (c - ASCII_A);
1633        break;
1634      case ASCII_a:
1635      case ASCII_b:
1636      case ASCII_c:
1637      case ASCII_d:
1638      case ASCII_e:
1639      case ASCII_f:
1640        result <<= 4;
1641        result += 10 + (c - ASCII_a);
1642        break;
1643      }
1644      if (result >= 0x110000)
1645        return -1;
1646    }
1647  } else {
1648    for (; ! CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1649      int c = BYTE_TO_ASCII(enc, ptr);
1650      result *= 10;
1651      result += (c - ASCII_0);
1652      if (result >= 0x110000)
1653        return -1;
1654    }
1655  }
1656  return checkCharRefNumber(result);
1657}
1658
1659static int PTRCALL
1660PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
1661                             const char *end) {
1662  UNUSED_P(enc);
1663  switch ((end - ptr) / MINBPC(enc)) {
1664  case 2:
1665    if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1666      switch (BYTE_TO_ASCII(enc, ptr)) {
1667      case ASCII_l:
1668        return ASCII_LT;
1669      case ASCII_g:
1670        return ASCII_GT;
1671      }
1672    }
1673    break;
1674  case 3:
1675    if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1676      ptr += MINBPC(enc);
1677      if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1678        ptr += MINBPC(enc);
1679        if (CHAR_MATCHES(enc, ptr, ASCII_p))
1680          return ASCII_AMP;
1681      }
1682    }
1683    break;
1684  case 4:
1685    switch (BYTE_TO_ASCII(enc, ptr)) {
1686    case ASCII_q:
1687      ptr += MINBPC(enc);
1688      if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1689        ptr += MINBPC(enc);
1690        if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1691          ptr += MINBPC(enc);
1692          if (CHAR_MATCHES(enc, ptr, ASCII_t))
1693            return ASCII_QUOT;
1694        }
1695      }
1696      break;
1697    case ASCII_a:
1698      ptr += MINBPC(enc);
1699      if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1700        ptr += MINBPC(enc);
1701        if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1702          ptr += MINBPC(enc);
1703          if (CHAR_MATCHES(enc, ptr, ASCII_s))
1704            return ASCII_APOS;
1705        }
1706      }
1707      break;
1708    }
1709  }
1710  return 0;
1711}
1712
1713static int PTRCALL
1714PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1715                         const char *end1, const char *ptr2) {
1716  UNUSED_P(enc);
1717  for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1718    if (end1 - ptr1 < MINBPC(enc)) {
1719      /* This line cannot be executed.  The incoming data has already
1720       * been tokenized once, so incomplete characters like this have
1721       * already been eliminated from the input.  Retaining the
1722       * paranoia check is still valuable, however.
1723       */
1724      return 0; /* LCOV_EXCL_LINE */
1725    }
1726    if (! CHAR_MATCHES(enc, ptr1, *ptr2))
1727      return 0;
1728  }
1729  return ptr1 == end1;
1730}
1731
1732static int PTRFASTCALL
1733PREFIX(nameLength)(const ENCODING *enc, const char *ptr) {
1734  const char *start = ptr;
1735  for (;;) {
1736    switch (BYTE_TYPE(enc, ptr)) {
1737#  define LEAD_CASE(n)                                                         \
1738  case BT_LEAD##n:                                                             \
1739    ptr += n; /* NOTE: The encoding has already been validated. */             \
1740    break;
1741      LEAD_CASE(2)
1742      LEAD_CASE(3)
1743      LEAD_CASE(4)
1744#  undef LEAD_CASE
1745    case BT_NONASCII:
1746    case BT_NMSTRT:
1747#  ifdef XML_NS
1748    case BT_COLON:
1749#  endif
1750    case BT_HEX:
1751    case BT_DIGIT:
1752    case BT_NAME:
1753    case BT_MINUS:
1754      ptr += MINBPC(enc);
1755      break;
1756    default:
1757      return (int)(ptr - start);
1758    }
1759  }
1760}
1761
1762static const char *PTRFASTCALL
1763PREFIX(skipS)(const ENCODING *enc, const char *ptr) {
1764  for (;;) {
1765    switch (BYTE_TYPE(enc, ptr)) {
1766    case BT_LF:
1767    case BT_CR:
1768    case BT_S:
1769      ptr += MINBPC(enc);
1770      break;
1771    default:
1772      return ptr;
1773    }
1774  }
1775}
1776
1777static void PTRCALL
1778PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end,
1779                       POSITION *pos) {
1780  while (HAS_CHAR(enc, ptr, end)) {
1781    switch (BYTE_TYPE(enc, ptr)) {
1782#  define LEAD_CASE(n)                                                         \
1783  case BT_LEAD##n:                                                             \
1784    ptr += n; /* NOTE: The encoding has already been validated. */             \
1785    pos->columnNumber++;                                                       \
1786    break;
1787      LEAD_CASE(2)
1788      LEAD_CASE(3)
1789      LEAD_CASE(4)
1790#  undef LEAD_CASE
1791    case BT_LF:
1792      pos->columnNumber = 0;
1793      pos->lineNumber++;
1794      ptr += MINBPC(enc);
1795      break;
1796    case BT_CR:
1797      pos->lineNumber++;
1798      ptr += MINBPC(enc);
1799      if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1800        ptr += MINBPC(enc);
1801      pos->columnNumber = 0;
1802      break;
1803    default:
1804      ptr += MINBPC(enc);
1805      pos->columnNumber++;
1806      break;
1807    }
1808  }
1809}
1810
1811#  undef DO_LEAD_CASE
1812#  undef MULTIBYTE_CASES
1813#  undef INVALID_CASES
1814#  undef CHECK_NAME_CASE
1815#  undef CHECK_NAME_CASES
1816#  undef CHECK_NMSTRT_CASE
1817#  undef CHECK_NMSTRT_CASES
1818
1819#endif /* XML_TOK_IMPL_C */
1820