/* xmltok_impl.c revision 302385 */
/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
   See the file COPYING for copying permission.
*/
4
/* This file is not compiled on its own; it is #included by another file. */
6#ifdef XML_TOK_IMPL_C
7
/* Fallback used when the including file supplies no validity check of its
   own: every multi-byte sequence is then treated as valid. */
#ifndef IS_INVALID_CHAR
#define IS_INVALID_CHAR(enc, ptr, n) (0)
#endif
11
/* Expands to the `case BT_LEAD<n>:` arm of a byte-type switch for an n-byte
   character.  `enc` and `end` are taken from the scope of the expansion site.
   - fewer than n bytes left: returns XML_TOK_PARTIAL_CHAR;
   - IS_INVALID_CHAR rejects the sequence: stores ptr in *nextTokPtr and
     returns XML_TOK_INVALID;
   - otherwise: advances ptr by n and leaves the switch.
   ptr must be a modifiable lvalue. */
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
    case BT_LEAD ## n: \
      if (end - (ptr) < (n)) \
        return XML_TOK_PARTIAL_CHAR; \
      if (IS_INVALID_CHAR(enc, ptr, n)) { \
        *(nextTokPtr) = (ptr); \
        return XML_TOK_INVALID; \
      } \
      (ptr) += (n); \
      break;
22
/* Switch arms shared by scanners that accept arbitrary character data:
   multi-byte lead bytes are length- and validity-checked through
   INVALID_LEAD_CASE, while BT_TRAIL, BT_MALFORM and BT_NONXML bytes are
   reported as XML_TOK_INVALID at ptr. */
#define INVALID_CASES(ptr, nextTokPtr) \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  case BT_TRAIL: \
  case BT_MALFORM: \
  case BT_NONXML: \
    *(nextTokPtr) = (ptr); \
    return XML_TOK_INVALID;
32
/* Expands to the `case BT_LEAD<n>:` arm for an n-byte character that must be
   a name character.
   - fewer than n bytes left: returns XML_TOK_PARTIAL_CHAR;
   - the sequence is malformed for the encoding (IS_INVALID_CHAR) or is not a
     name character: stores ptr in *nextTokPtr and returns XML_TOK_INVALID;
   - otherwise: advances ptr by n and leaves the switch.
   The IS_INVALID_CHAR test comes first so that IS_NAME_CHAR is only ever
   asked about well-formed sequences.  ptr must be a modifiable lvalue. */
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if ((end) - (ptr) < (n)) \
       return XML_TOK_PARTIAL_CHAR; \
     if (IS_INVALID_CHAR(enc, ptr, n) || !IS_NAME_CHAR(enc, ptr, n)) { \
       *(nextTokPtr) = (ptr); \
       return XML_TOK_INVALID; \
     } \
     (ptr) += (n); \
     break;
43
/* Switch arms that consume one name character at ptr.
   BT_NONASCII is accepted only when IS_NAME_CHAR_MINBPC agrees (otherwise
   XML_TOK_INVALID is returned with *nextTokPtr = ptr) and then shares the
   single-unit advance with the plain name byte types.  Multi-byte lead
   bytes are delegated to CHECK_NAME_CASE. */
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_MINUS: \
  case BT_NAME: \
  case BT_DIGIT: \
  case BT_HEX: \
  case BT_NMSTRT: \
    ptr += MINBPC(enc); \
    break; \
  CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
60
/* Expands to the `case BT_LEAD<n>:` arm for an n-byte character that must be
   a name-start character.
   - fewer than n bytes left: returns XML_TOK_PARTIAL_CHAR;
   - the sequence is malformed for the encoding (IS_INVALID_CHAR) or is not a
     name-start character: stores ptr in *nextTokPtr and returns
     XML_TOK_INVALID;
   - otherwise: advances ptr by n and leaves the switch.
   The IS_INVALID_CHAR test comes first so that IS_NMSTRT_CHAR is only ever
   asked about well-formed sequences.  ptr must be a modifiable lvalue. */
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if ((end) - (ptr) < (n)) \
       return XML_TOK_PARTIAL_CHAR; \
     if (IS_INVALID_CHAR(enc, ptr, n) || !IS_NMSTRT_CHAR(enc, ptr, n)) { \
       *(nextTokPtr) = (ptr); \
       return XML_TOK_INVALID; \
     } \
     (ptr) += (n); \
     break;
71
/* Switch arms that consume one name-start character at ptr.
   BT_NONASCII is accepted only when IS_NMSTRT_CHAR_MINBPC agrees (otherwise
   XML_TOK_INVALID is returned with *nextTokPtr = ptr) and then shares the
   single-unit advance with BT_NMSTRT and BT_HEX.  Multi-byte lead bytes are
   delegated to CHECK_NMSTRT_CASE. */
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_HEX: \
  case BT_NMSTRT: \
    ptr += MINBPC(enc); \
    break; \
  CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
85
/* Name-decoration hook applied to every function defined below.  The
   including file may pre-define PREFIX to give each inclusion its own set of
   identifiers; by default identifiers are left unchanged. */
#ifndef PREFIX
#define PREFIX(ident) ident
#endif
89
90
/* True when at least `count` minimum-size characters (count * MINBPC(enc)
   bytes) lie between ptr and end.  All pointer and count arguments are
   parenthesised so that expressions such as `ptr + k` or `a + b` can be
   passed safely. */
#define HAS_CHARS(enc, ptr, end, count) \
    ((end) - (ptr) >= ((count) * MINBPC(enc)))

/* True when at least one minimum-size character is available. */
#define HAS_CHAR(enc, ptr, end) \
    HAS_CHARS(enc, ptr, end, 1)

/* Makes the enclosing function return XML_TOK_PARTIAL unless `count`
   characters are available.  Deliberately a plain brace block, as before,
   so that every existing call site keeps compiling unchanged. */
#define REQUIRE_CHARS(enc, ptr, end, count) \
    { \
      if (! HAS_CHARS(enc, ptr, end, count)) { \
        return XML_TOK_PARTIAL; \
      } \
    }

/* Single-character form of REQUIRE_CHARS. */
#define REQUIRE_CHAR(enc, ptr, end) \
    REQUIRE_CHARS(enc, ptr, end, 1)
106
107
108/* ptr points to character following "<!-" */
109
110static int PTRCALL
111PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
112                    const char *end, const char **nextTokPtr)
113{
114  if (HAS_CHAR(enc, ptr, end)) {
115    if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
116      *nextTokPtr = ptr;
117      return XML_TOK_INVALID;
118    }
119    ptr += MINBPC(enc);
120    while (HAS_CHAR(enc, ptr, end)) {
121      switch (BYTE_TYPE(enc, ptr)) {
122      INVALID_CASES(ptr, nextTokPtr)
123      case BT_MINUS:
124        ptr += MINBPC(enc);
125        REQUIRE_CHAR(enc, ptr, end);
126        if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
127          ptr += MINBPC(enc);
128          REQUIRE_CHAR(enc, ptr, end);
129          if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
130            *nextTokPtr = ptr;
131            return XML_TOK_INVALID;
132          }
133          *nextTokPtr = ptr + MINBPC(enc);
134          return XML_TOK_COMMENT;
135        }
136        break;
137      default:
138        ptr += MINBPC(enc);
139        break;
140      }
141    }
142  }
143  return XML_TOK_PARTIAL;
144}
145
146/* ptr points to character following "<!" */
147
148static int PTRCALL
149PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
150                 const char *end, const char **nextTokPtr)
151{
152  REQUIRE_CHAR(enc, ptr, end);
153  switch (BYTE_TYPE(enc, ptr)) {
154  case BT_MINUS:
155    return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
156  case BT_LSQB:
157    *nextTokPtr = ptr + MINBPC(enc);
158    return XML_TOK_COND_SECT_OPEN;
159  case BT_NMSTRT:
160  case BT_HEX:
161    ptr += MINBPC(enc);
162    break;
163  default:
164    *nextTokPtr = ptr;
165    return XML_TOK_INVALID;
166  }
167  while (HAS_CHAR(enc, ptr, end)) {
168    switch (BYTE_TYPE(enc, ptr)) {
169    case BT_PERCNT:
170      REQUIRE_CHARS(enc, ptr, end, 2);
171      /* don't allow <!ENTITY% foo "whatever"> */
172      switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
173      case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
174        *nextTokPtr = ptr;
175        return XML_TOK_INVALID;
176      }
177      /* fall through */
178    case BT_S: case BT_CR: case BT_LF:
179      *nextTokPtr = ptr;
180      return XML_TOK_DECL_OPEN;
181    case BT_NMSTRT:
182    case BT_HEX:
183      ptr += MINBPC(enc);
184      break;
185    default:
186      *nextTokPtr = ptr;
187      return XML_TOK_INVALID;
188    }
189  }
190  return XML_TOK_PARTIAL;
191}
192
193static int PTRCALL
194PREFIX(checkPiTarget)(const ENCODING *UNUSED_P(enc), const char *ptr,
195                      const char *end, int *tokPtr)
196{
197  int upper = 0;
198  *tokPtr = XML_TOK_PI;
199  if (end - ptr != MINBPC(enc)*3)
200    return 1;
201  switch (BYTE_TO_ASCII(enc, ptr)) {
202  case ASCII_x:
203    break;
204  case ASCII_X:
205    upper = 1;
206    break;
207  default:
208    return 1;
209  }
210  ptr += MINBPC(enc);
211  switch (BYTE_TO_ASCII(enc, ptr)) {
212  case ASCII_m:
213    break;
214  case ASCII_M:
215    upper = 1;
216    break;
217  default:
218    return 1;
219  }
220  ptr += MINBPC(enc);
221  switch (BYTE_TO_ASCII(enc, ptr)) {
222  case ASCII_l:
223    break;
224  case ASCII_L:
225    upper = 1;
226    break;
227  default:
228    return 1;
229  }
230  if (upper)
231    return 0;
232  *tokPtr = XML_TOK_XML_DECL;
233  return 1;
234}
235
236/* ptr points to character following "<?" */
237
238static int PTRCALL
239PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
240               const char *end, const char **nextTokPtr)
241{
242  int tok;
243  const char *target = ptr;
244  REQUIRE_CHAR(enc, ptr, end);
245  switch (BYTE_TYPE(enc, ptr)) {
246  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
247  default:
248    *nextTokPtr = ptr;
249    return XML_TOK_INVALID;
250  }
251  while (HAS_CHAR(enc, ptr, end)) {
252    switch (BYTE_TYPE(enc, ptr)) {
253    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
254    case BT_S: case BT_CR: case BT_LF:
255      if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
256        *nextTokPtr = ptr;
257        return XML_TOK_INVALID;
258      }
259      ptr += MINBPC(enc);
260      while (HAS_CHAR(enc, ptr, end)) {
261        switch (BYTE_TYPE(enc, ptr)) {
262        INVALID_CASES(ptr, nextTokPtr)
263        case BT_QUEST:
264          ptr += MINBPC(enc);
265          REQUIRE_CHAR(enc, ptr, end);
266          if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
267            *nextTokPtr = ptr + MINBPC(enc);
268            return tok;
269          }
270          break;
271        default:
272          ptr += MINBPC(enc);
273          break;
274        }
275      }
276      return XML_TOK_PARTIAL;
277    case BT_QUEST:
278      if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
279        *nextTokPtr = ptr;
280        return XML_TOK_INVALID;
281      }
282      ptr += MINBPC(enc);
283      REQUIRE_CHAR(enc, ptr, end);
284      if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
285        *nextTokPtr = ptr + MINBPC(enc);
286        return tok;
287      }
288      /* fall through */
289    default:
290      *nextTokPtr = ptr;
291      return XML_TOK_INVALID;
292    }
293  }
294  return XML_TOK_PARTIAL;
295}
296
297static int PTRCALL
298PREFIX(scanCdataSection)(const ENCODING *UNUSED_P(enc), const char *ptr,
299                         const char *end, const char **nextTokPtr)
300{
301  static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
302                                     ASCII_T, ASCII_A, ASCII_LSQB };
303  int i;
304  /* CDATA[ */
305  REQUIRE_CHARS(enc, ptr, end, 6);
306  for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
307    if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
308      *nextTokPtr = ptr;
309      return XML_TOK_INVALID;
310    }
311  }
312  *nextTokPtr = ptr;
313  return XML_TOK_CDATA_SECT_OPEN;
314}
315
316static int PTRCALL
317PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
318                        const char *end, const char **nextTokPtr)
319{
320  if (ptr >= end)
321    return XML_TOK_NONE;
322  if (MINBPC(enc) > 1) {
323    size_t n = end - ptr;
324    if (n & (MINBPC(enc) - 1)) {
325      n &= ~(MINBPC(enc) - 1);
326      if (n == 0)
327        return XML_TOK_PARTIAL;
328      end = ptr + n;
329    }
330  }
331  switch (BYTE_TYPE(enc, ptr)) {
332  case BT_RSQB:
333    ptr += MINBPC(enc);
334    REQUIRE_CHAR(enc, ptr, end);
335    if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
336      break;
337    ptr += MINBPC(enc);
338    REQUIRE_CHAR(enc, ptr, end);
339    if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
340      ptr -= MINBPC(enc);
341      break;
342    }
343    *nextTokPtr = ptr + MINBPC(enc);
344    return XML_TOK_CDATA_SECT_CLOSE;
345  case BT_CR:
346    ptr += MINBPC(enc);
347    REQUIRE_CHAR(enc, ptr, end);
348    if (BYTE_TYPE(enc, ptr) == BT_LF)
349      ptr += MINBPC(enc);
350    *nextTokPtr = ptr;
351    return XML_TOK_DATA_NEWLINE;
352  case BT_LF:
353    *nextTokPtr = ptr + MINBPC(enc);
354    return XML_TOK_DATA_NEWLINE;
355  INVALID_CASES(ptr, nextTokPtr)
356  default:
357    ptr += MINBPC(enc);
358    break;
359  }
360  while (HAS_CHAR(enc, ptr, end)) {
361    switch (BYTE_TYPE(enc, ptr)) {
362#define LEAD_CASE(n) \
363    case BT_LEAD ## n: \
364      if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
365        *nextTokPtr = ptr; \
366        return XML_TOK_DATA_CHARS; \
367      } \
368      ptr += n; \
369      break;
370    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
371#undef LEAD_CASE
372    case BT_NONXML:
373    case BT_MALFORM:
374    case BT_TRAIL:
375    case BT_CR:
376    case BT_LF:
377    case BT_RSQB:
378      *nextTokPtr = ptr;
379      return XML_TOK_DATA_CHARS;
380    default:
381      ptr += MINBPC(enc);
382      break;
383    }
384  }
385  *nextTokPtr = ptr;
386  return XML_TOK_DATA_CHARS;
387}
388
389/* ptr points to character following "</" */
390
391static int PTRCALL
392PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
393                   const char *end, const char **nextTokPtr)
394{
395  REQUIRE_CHAR(enc, ptr, end);
396  switch (BYTE_TYPE(enc, ptr)) {
397  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
398  default:
399    *nextTokPtr = ptr;
400    return XML_TOK_INVALID;
401  }
402  while (HAS_CHAR(enc, ptr, end)) {
403    switch (BYTE_TYPE(enc, ptr)) {
404    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
405    case BT_S: case BT_CR: case BT_LF:
406      for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
407        switch (BYTE_TYPE(enc, ptr)) {
408        case BT_S: case BT_CR: case BT_LF:
409          break;
410        case BT_GT:
411          *nextTokPtr = ptr + MINBPC(enc);
412          return XML_TOK_END_TAG;
413        default:
414          *nextTokPtr = ptr;
415          return XML_TOK_INVALID;
416        }
417      }
418      return XML_TOK_PARTIAL;
419#ifdef XML_NS
420    case BT_COLON:
421      /* no need to check qname syntax here,
422         since end-tag must match exactly */
423      ptr += MINBPC(enc);
424      break;
425#endif
426    case BT_GT:
427      *nextTokPtr = ptr + MINBPC(enc);
428      return XML_TOK_END_TAG;
429    default:
430      *nextTokPtr = ptr;
431      return XML_TOK_INVALID;
432    }
433  }
434  return XML_TOK_PARTIAL;
435}
436
437/* ptr points to character following "&#X" */
438
439static int PTRCALL
440PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
441                       const char *end, const char **nextTokPtr)
442{
443  if (HAS_CHAR(enc, ptr, end)) {
444    switch (BYTE_TYPE(enc, ptr)) {
445    case BT_DIGIT:
446    case BT_HEX:
447      break;
448    default:
449      *nextTokPtr = ptr;
450      return XML_TOK_INVALID;
451    }
452    for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
453      switch (BYTE_TYPE(enc, ptr)) {
454      case BT_DIGIT:
455      case BT_HEX:
456        break;
457      case BT_SEMI:
458        *nextTokPtr = ptr + MINBPC(enc);
459        return XML_TOK_CHAR_REF;
460      default:
461        *nextTokPtr = ptr;
462        return XML_TOK_INVALID;
463      }
464    }
465  }
466  return XML_TOK_PARTIAL;
467}
468
469/* ptr points to character following "&#" */
470
471static int PTRCALL
472PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
473                    const char *end, const char **nextTokPtr)
474{
475  if (HAS_CHAR(enc, ptr, end)) {
476    if (CHAR_MATCHES(enc, ptr, ASCII_x))
477      return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
478    switch (BYTE_TYPE(enc, ptr)) {
479    case BT_DIGIT:
480      break;
481    default:
482      *nextTokPtr = ptr;
483      return XML_TOK_INVALID;
484    }
485    for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
486      switch (BYTE_TYPE(enc, ptr)) {
487      case BT_DIGIT:
488        break;
489      case BT_SEMI:
490        *nextTokPtr = ptr + MINBPC(enc);
491        return XML_TOK_CHAR_REF;
492      default:
493        *nextTokPtr = ptr;
494        return XML_TOK_INVALID;
495      }
496    }
497  }
498  return XML_TOK_PARTIAL;
499}
500
501/* ptr points to character following "&" */
502
503static int PTRCALL
504PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
505                const char **nextTokPtr)
506{
507  REQUIRE_CHAR(enc, ptr, end);
508  switch (BYTE_TYPE(enc, ptr)) {
509  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
510  case BT_NUM:
511    return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
512  default:
513    *nextTokPtr = ptr;
514    return XML_TOK_INVALID;
515  }
516  while (HAS_CHAR(enc, ptr, end)) {
517    switch (BYTE_TYPE(enc, ptr)) {
518    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
519    case BT_SEMI:
520      *nextTokPtr = ptr + MINBPC(enc);
521      return XML_TOK_ENTITY_REF;
522    default:
523      *nextTokPtr = ptr;
524      return XML_TOK_INVALID;
525    }
526  }
527  return XML_TOK_PARTIAL;
528}
529
530/* ptr points to character following first character of attribute name */
531
532static int PTRCALL
533PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
534                 const char **nextTokPtr)
535{
536#ifdef XML_NS
537  int hadColon = 0;
538#endif
539  while (HAS_CHAR(enc, ptr, end)) {
540    switch (BYTE_TYPE(enc, ptr)) {
541    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
542#ifdef XML_NS
543    case BT_COLON:
544      if (hadColon) {
545        *nextTokPtr = ptr;
546        return XML_TOK_INVALID;
547      }
548      hadColon = 1;
549      ptr += MINBPC(enc);
550      REQUIRE_CHAR(enc, ptr, end);
551      switch (BYTE_TYPE(enc, ptr)) {
552      CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
553      default:
554        *nextTokPtr = ptr;
555        return XML_TOK_INVALID;
556      }
557      break;
558#endif
559    case BT_S: case BT_CR: case BT_LF:
560      for (;;) {
561        int t;
562
563        ptr += MINBPC(enc);
564        REQUIRE_CHAR(enc, ptr, end);
565        t = BYTE_TYPE(enc, ptr);
566        if (t == BT_EQUALS)
567          break;
568        switch (t) {
569        case BT_S:
570        case BT_LF:
571        case BT_CR:
572          break;
573        default:
574          *nextTokPtr = ptr;
575          return XML_TOK_INVALID;
576        }
577      }
578    /* fall through */
579    case BT_EQUALS:
580      {
581        int open;
582#ifdef XML_NS
583        hadColon = 0;
584#endif
585        for (;;) {
586          ptr += MINBPC(enc);
587          REQUIRE_CHAR(enc, ptr, end);
588          open = BYTE_TYPE(enc, ptr);
589          if (open == BT_QUOT || open == BT_APOS)
590            break;
591          switch (open) {
592          case BT_S:
593          case BT_LF:
594          case BT_CR:
595            break;
596          default:
597            *nextTokPtr = ptr;
598            return XML_TOK_INVALID;
599          }
600        }
601        ptr += MINBPC(enc);
602        /* in attribute value */
603        for (;;) {
604          int t;
605          REQUIRE_CHAR(enc, ptr, end);
606          t = BYTE_TYPE(enc, ptr);
607          if (t == open)
608            break;
609          switch (t) {
610          INVALID_CASES(ptr, nextTokPtr)
611          case BT_AMP:
612            {
613              int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
614              if (tok <= 0) {
615                if (tok == XML_TOK_INVALID)
616                  *nextTokPtr = ptr;
617                return tok;
618              }
619              break;
620            }
621          case BT_LT:
622            *nextTokPtr = ptr;
623            return XML_TOK_INVALID;
624          default:
625            ptr += MINBPC(enc);
626            break;
627          }
628        }
629        ptr += MINBPC(enc);
630        REQUIRE_CHAR(enc, ptr, end);
631        switch (BYTE_TYPE(enc, ptr)) {
632        case BT_S:
633        case BT_CR:
634        case BT_LF:
635          break;
636        case BT_SOL:
637          goto sol;
638        case BT_GT:
639          goto gt;
640        default:
641          *nextTokPtr = ptr;
642          return XML_TOK_INVALID;
643        }
644        /* ptr points to closing quote */
645        for (;;) {
646          ptr += MINBPC(enc);
647          REQUIRE_CHAR(enc, ptr, end);
648          switch (BYTE_TYPE(enc, ptr)) {
649          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
650          case BT_S: case BT_CR: case BT_LF:
651            continue;
652          case BT_GT:
653          gt:
654            *nextTokPtr = ptr + MINBPC(enc);
655            return XML_TOK_START_TAG_WITH_ATTS;
656          case BT_SOL:
657          sol:
658            ptr += MINBPC(enc);
659            REQUIRE_CHAR(enc, ptr, end);
660            if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
661              *nextTokPtr = ptr;
662              return XML_TOK_INVALID;
663            }
664            *nextTokPtr = ptr + MINBPC(enc);
665            return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
666          default:
667            *nextTokPtr = ptr;
668            return XML_TOK_INVALID;
669          }
670          break;
671        }
672        break;
673      }
674    default:
675      *nextTokPtr = ptr;
676      return XML_TOK_INVALID;
677    }
678  }
679  return XML_TOK_PARTIAL;
680}
681
682/* ptr points to character following "<" */
683
684static int PTRCALL
685PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
686               const char **nextTokPtr)
687{
688#ifdef XML_NS
689  int hadColon;
690#endif
691  REQUIRE_CHAR(enc, ptr, end);
692  switch (BYTE_TYPE(enc, ptr)) {
693  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
694  case BT_EXCL:
695    ptr += MINBPC(enc);
696    REQUIRE_CHAR(enc, ptr, end);
697    switch (BYTE_TYPE(enc, ptr)) {
698    case BT_MINUS:
699      return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
700    case BT_LSQB:
701      return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
702                                      end, nextTokPtr);
703    }
704    *nextTokPtr = ptr;
705    return XML_TOK_INVALID;
706  case BT_QUEST:
707    return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
708  case BT_SOL:
709    return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
710  default:
711    *nextTokPtr = ptr;
712    return XML_TOK_INVALID;
713  }
714#ifdef XML_NS
715  hadColon = 0;
716#endif
717  /* we have a start-tag */
718  while (HAS_CHAR(enc, ptr, end)) {
719    switch (BYTE_TYPE(enc, ptr)) {
720    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
721#ifdef XML_NS
722    case BT_COLON:
723      if (hadColon) {
724        *nextTokPtr = ptr;
725        return XML_TOK_INVALID;
726      }
727      hadColon = 1;
728      ptr += MINBPC(enc);
729      REQUIRE_CHAR(enc, ptr, end);
730      switch (BYTE_TYPE(enc, ptr)) {
731      CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
732      default:
733        *nextTokPtr = ptr;
734        return XML_TOK_INVALID;
735      }
736      break;
737#endif
738    case BT_S: case BT_CR: case BT_LF:
739      {
740        ptr += MINBPC(enc);
741        while (HAS_CHAR(enc, ptr, end)) {
742          switch (BYTE_TYPE(enc, ptr)) {
743          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
744          case BT_GT:
745            goto gt;
746          case BT_SOL:
747            goto sol;
748          case BT_S: case BT_CR: case BT_LF:
749            ptr += MINBPC(enc);
750            continue;
751          default:
752            *nextTokPtr = ptr;
753            return XML_TOK_INVALID;
754          }
755          return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
756        }
757        return XML_TOK_PARTIAL;
758      }
759    case BT_GT:
760    gt:
761      *nextTokPtr = ptr + MINBPC(enc);
762      return XML_TOK_START_TAG_NO_ATTS;
763    case BT_SOL:
764    sol:
765      ptr += MINBPC(enc);
766      REQUIRE_CHAR(enc, ptr, end);
767      if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
768        *nextTokPtr = ptr;
769        return XML_TOK_INVALID;
770      }
771      *nextTokPtr = ptr + MINBPC(enc);
772      return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
773    default:
774      *nextTokPtr = ptr;
775      return XML_TOK_INVALID;
776    }
777  }
778  return XML_TOK_PARTIAL;
779}
780
781static int PTRCALL
782PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
783                   const char **nextTokPtr)
784{
785  if (ptr >= end)
786    return XML_TOK_NONE;
787  if (MINBPC(enc) > 1) {
788    size_t n = end - ptr;
789    if (n & (MINBPC(enc) - 1)) {
790      n &= ~(MINBPC(enc) - 1);
791      if (n == 0)
792        return XML_TOK_PARTIAL;
793      end = ptr + n;
794    }
795  }
796  switch (BYTE_TYPE(enc, ptr)) {
797  case BT_LT:
798    return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
799  case BT_AMP:
800    return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
801  case BT_CR:
802    ptr += MINBPC(enc);
803    if (! HAS_CHAR(enc, ptr, end))
804      return XML_TOK_TRAILING_CR;
805    if (BYTE_TYPE(enc, ptr) == BT_LF)
806      ptr += MINBPC(enc);
807    *nextTokPtr = ptr;
808    return XML_TOK_DATA_NEWLINE;
809  case BT_LF:
810    *nextTokPtr = ptr + MINBPC(enc);
811    return XML_TOK_DATA_NEWLINE;
812  case BT_RSQB:
813    ptr += MINBPC(enc);
814    if (! HAS_CHAR(enc, ptr, end))
815      return XML_TOK_TRAILING_RSQB;
816    if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
817      break;
818    ptr += MINBPC(enc);
819    if (! HAS_CHAR(enc, ptr, end))
820      return XML_TOK_TRAILING_RSQB;
821    if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
822      ptr -= MINBPC(enc);
823      break;
824    }
825    *nextTokPtr = ptr;
826    return XML_TOK_INVALID;
827  INVALID_CASES(ptr, nextTokPtr)
828  default:
829    ptr += MINBPC(enc);
830    break;
831  }
832  while (HAS_CHAR(enc, ptr, end)) {
833    switch (BYTE_TYPE(enc, ptr)) {
834#define LEAD_CASE(n) \
835    case BT_LEAD ## n: \
836      if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
837        *nextTokPtr = ptr; \
838        return XML_TOK_DATA_CHARS; \
839      } \
840      ptr += n; \
841      break;
842    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
843#undef LEAD_CASE
844    case BT_RSQB:
845      if (HAS_CHARS(enc, ptr, end, 2)) {
846         if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
847           ptr += MINBPC(enc);
848           break;
849         }
850         if (HAS_CHARS(enc, ptr, end, 3)) {
851           if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
852             ptr += MINBPC(enc);
853             break;
854           }
855           *nextTokPtr = ptr + 2*MINBPC(enc);
856           return XML_TOK_INVALID;
857         }
858      }
859      /* fall through */
860    case BT_AMP:
861    case BT_LT:
862    case BT_NONXML:
863    case BT_MALFORM:
864    case BT_TRAIL:
865    case BT_CR:
866    case BT_LF:
867      *nextTokPtr = ptr;
868      return XML_TOK_DATA_CHARS;
869    default:
870      ptr += MINBPC(enc);
871      break;
872    }
873  }
874  *nextTokPtr = ptr;
875  return XML_TOK_DATA_CHARS;
876}
877
878/* ptr points to character following "%" */
879
880static int PTRCALL
881PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
882                    const char **nextTokPtr)
883{
884  REQUIRE_CHAR(enc, ptr, end);
885  switch (BYTE_TYPE(enc, ptr)) {
886  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
887  case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
888    *nextTokPtr = ptr;
889    return XML_TOK_PERCENT;
890  default:
891    *nextTokPtr = ptr;
892    return XML_TOK_INVALID;
893  }
894  while (HAS_CHAR(enc, ptr, end)) {
895    switch (BYTE_TYPE(enc, ptr)) {
896    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
897    case BT_SEMI:
898      *nextTokPtr = ptr + MINBPC(enc);
899      return XML_TOK_PARAM_ENTITY_REF;
900    default:
901      *nextTokPtr = ptr;
902      return XML_TOK_INVALID;
903    }
904  }
905  return XML_TOK_PARTIAL;
906}
907
908static int PTRCALL
909PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
910                      const char **nextTokPtr)
911{
912  REQUIRE_CHAR(enc, ptr, end);
913  switch (BYTE_TYPE(enc, ptr)) {
914  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
915  default:
916    *nextTokPtr = ptr;
917    return XML_TOK_INVALID;
918  }
919  while (HAS_CHAR(enc, ptr, end)) {
920    switch (BYTE_TYPE(enc, ptr)) {
921    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
922    case BT_CR: case BT_LF: case BT_S:
923    case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
924      *nextTokPtr = ptr;
925      return XML_TOK_POUND_NAME;
926    default:
927      *nextTokPtr = ptr;
928      return XML_TOK_INVALID;
929    }
930  }
931  return -XML_TOK_POUND_NAME;
932}
933
934static int PTRCALL
935PREFIX(scanLit)(int open, const ENCODING *enc,
936                const char *ptr, const char *end,
937                const char **nextTokPtr)
938{
939  while (HAS_CHAR(enc, ptr, end)) {
940    int t = BYTE_TYPE(enc, ptr);
941    switch (t) {
942    INVALID_CASES(ptr, nextTokPtr)
943    case BT_QUOT:
944    case BT_APOS:
945      ptr += MINBPC(enc);
946      if (t != open)
947        break;
948      if (! HAS_CHAR(enc, ptr, end))
949        return -XML_TOK_LITERAL;
950      *nextTokPtr = ptr;
951      switch (BYTE_TYPE(enc, ptr)) {
952      case BT_S: case BT_CR: case BT_LF:
953      case BT_GT: case BT_PERCNT: case BT_LSQB:
954        return XML_TOK_LITERAL;
955      default:
956        return XML_TOK_INVALID;
957      }
958    default:
959      ptr += MINBPC(enc);
960      break;
961    }
962  }
963  return XML_TOK_PARTIAL;
964}
965
966static int PTRCALL
967PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
968                  const char **nextTokPtr)
969{
970  int tok;
971  if (ptr >= end)
972    return XML_TOK_NONE;
973  if (MINBPC(enc) > 1) {
974    size_t n = end - ptr;
975    if (n & (MINBPC(enc) - 1)) {
976      n &= ~(MINBPC(enc) - 1);
977      if (n == 0)
978        return XML_TOK_PARTIAL;
979      end = ptr + n;
980    }
981  }
982  switch (BYTE_TYPE(enc, ptr)) {
983  case BT_QUOT:
984    return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
985  case BT_APOS:
986    return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
987  case BT_LT:
988    {
989      ptr += MINBPC(enc);
990      REQUIRE_CHAR(enc, ptr, end);
991      switch (BYTE_TYPE(enc, ptr)) {
992      case BT_EXCL:
993        return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
994      case BT_QUEST:
995        return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
996      case BT_NMSTRT:
997      case BT_HEX:
998      case BT_NONASCII:
999      case BT_LEAD2:
1000      case BT_LEAD3:
1001      case BT_LEAD4:
1002        *nextTokPtr = ptr - MINBPC(enc);
1003        return XML_TOK_INSTANCE_START;
1004      }
1005      *nextTokPtr = ptr;
1006      return XML_TOK_INVALID;
1007    }
1008  case BT_CR:
1009    if (ptr + MINBPC(enc) == end) {
1010      *nextTokPtr = end;
1011      /* indicate that this might be part of a CR/LF pair */
1012      return -XML_TOK_PROLOG_S;
1013    }
1014    /* fall through */
1015  case BT_S: case BT_LF:
1016    for (;;) {
1017      ptr += MINBPC(enc);
1018      if (! HAS_CHAR(enc, ptr, end))
1019        break;
1020      switch (BYTE_TYPE(enc, ptr)) {
1021      case BT_S: case BT_LF:
1022        break;
1023      case BT_CR:
1024        /* don't split CR/LF pair */
1025        if (ptr + MINBPC(enc) != end)
1026          break;
1027        /* fall through */
1028      default:
1029        *nextTokPtr = ptr;
1030        return XML_TOK_PROLOG_S;
1031      }
1032    }
1033    *nextTokPtr = ptr;
1034    return XML_TOK_PROLOG_S;
1035  case BT_PERCNT:
1036    return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1037  case BT_COMMA:
1038    *nextTokPtr = ptr + MINBPC(enc);
1039    return XML_TOK_COMMA;
1040  case BT_LSQB:
1041    *nextTokPtr = ptr + MINBPC(enc);
1042    return XML_TOK_OPEN_BRACKET;
1043  case BT_RSQB:
1044    ptr += MINBPC(enc);
1045    if (! HAS_CHAR(enc, ptr, end))
1046      return -XML_TOK_CLOSE_BRACKET;
1047    if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1048      REQUIRE_CHARS(enc, ptr, end, 2);
1049      if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1050        *nextTokPtr = ptr + 2*MINBPC(enc);
1051        return XML_TOK_COND_SECT_CLOSE;
1052      }
1053    }
1054    *nextTokPtr = ptr;
1055    return XML_TOK_CLOSE_BRACKET;
1056  case BT_LPAR:
1057    *nextTokPtr = ptr + MINBPC(enc);
1058    return XML_TOK_OPEN_PAREN;
1059  case BT_RPAR:
1060    ptr += MINBPC(enc);
1061    if (! HAS_CHAR(enc, ptr, end))
1062      return -XML_TOK_CLOSE_PAREN;
1063    switch (BYTE_TYPE(enc, ptr)) {
1064    case BT_AST:
1065      *nextTokPtr = ptr + MINBPC(enc);
1066      return XML_TOK_CLOSE_PAREN_ASTERISK;
1067    case BT_QUEST:
1068      *nextTokPtr = ptr + MINBPC(enc);
1069      return XML_TOK_CLOSE_PAREN_QUESTION;
1070    case BT_PLUS:
1071      *nextTokPtr = ptr + MINBPC(enc);
1072      return XML_TOK_CLOSE_PAREN_PLUS;
1073    case BT_CR: case BT_LF: case BT_S:
1074    case BT_GT: case BT_COMMA: case BT_VERBAR:
1075    case BT_RPAR:
1076      *nextTokPtr = ptr;
1077      return XML_TOK_CLOSE_PAREN;
1078    }
1079    *nextTokPtr = ptr;
1080    return XML_TOK_INVALID;
1081  case BT_VERBAR:
1082    *nextTokPtr = ptr + MINBPC(enc);
1083    return XML_TOK_OR;
1084  case BT_GT:
1085    *nextTokPtr = ptr + MINBPC(enc);
1086    return XML_TOK_DECL_CLOSE;
1087  case BT_NUM:
1088    return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1089#define LEAD_CASE(n) \
1090  case BT_LEAD ## n: \
1091    if (end - ptr < n) \
1092      return XML_TOK_PARTIAL_CHAR; \
1093    if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1094      ptr += n; \
1095      tok = XML_TOK_NAME; \
1096      break; \
1097    } \
1098    if (IS_NAME_CHAR(enc, ptr, n)) { \
1099      ptr += n; \
1100      tok = XML_TOK_NMTOKEN; \
1101      break; \
1102    } \
1103    *nextTokPtr = ptr; \
1104    return XML_TOK_INVALID;
1105    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1106#undef LEAD_CASE
1107  case BT_NMSTRT:
1108  case BT_HEX:
1109    tok = XML_TOK_NAME;
1110    ptr += MINBPC(enc);
1111    break;
1112  case BT_DIGIT:
1113  case BT_NAME:
1114  case BT_MINUS:
1115#ifdef XML_NS
1116  case BT_COLON:
1117#endif
1118    tok = XML_TOK_NMTOKEN;
1119    ptr += MINBPC(enc);
1120    break;
1121  case BT_NONASCII:
1122    if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1123      ptr += MINBPC(enc);
1124      tok = XML_TOK_NAME;
1125      break;
1126    }
1127    if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1128      ptr += MINBPC(enc);
1129      tok = XML_TOK_NMTOKEN;
1130      break;
1131    }
1132    /* fall through */
1133  default:
1134    *nextTokPtr = ptr;
1135    return XML_TOK_INVALID;
1136  }
1137  while (HAS_CHAR(enc, ptr, end)) {
1138    switch (BYTE_TYPE(enc, ptr)) {
1139    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1140    case BT_GT: case BT_RPAR: case BT_COMMA:
1141    case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1142    case BT_S: case BT_CR: case BT_LF:
1143      *nextTokPtr = ptr;
1144      return tok;
1145#ifdef XML_NS
1146    case BT_COLON:
1147      ptr += MINBPC(enc);
1148      switch (tok) {
1149      case XML_TOK_NAME:
1150        REQUIRE_CHAR(enc, ptr, end);
1151        tok = XML_TOK_PREFIXED_NAME;
1152        switch (BYTE_TYPE(enc, ptr)) {
1153        CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1154        default:
1155          tok = XML_TOK_NMTOKEN;
1156          break;
1157        }
1158        break;
1159      case XML_TOK_PREFIXED_NAME:
1160        tok = XML_TOK_NMTOKEN;
1161        break;
1162      }
1163      break;
1164#endif
1165    case BT_PLUS:
1166      if (tok == XML_TOK_NMTOKEN)  {
1167        *nextTokPtr = ptr;
1168        return XML_TOK_INVALID;
1169      }
1170      *nextTokPtr = ptr + MINBPC(enc);
1171      return XML_TOK_NAME_PLUS;
1172    case BT_AST:
1173      if (tok == XML_TOK_NMTOKEN)  {
1174        *nextTokPtr = ptr;
1175        return XML_TOK_INVALID;
1176      }
1177      *nextTokPtr = ptr + MINBPC(enc);
1178      return XML_TOK_NAME_ASTERISK;
1179    case BT_QUEST:
1180      if (tok == XML_TOK_NMTOKEN)  {
1181        *nextTokPtr = ptr;
1182        return XML_TOK_INVALID;
1183      }
1184      *nextTokPtr = ptr + MINBPC(enc);
1185      return XML_TOK_NAME_QUESTION;
1186    default:
1187      *nextTokPtr = ptr;
1188      return XML_TOK_INVALID;
1189    }
1190  }
1191  return -tok;
1192}
1193
1194static int PTRCALL
1195PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
1196                          const char *end, const char **nextTokPtr)
1197{
1198  const char *start;
1199  if (ptr >= end)
1200    return XML_TOK_NONE;
1201  else if (! HAS_CHAR(enc, ptr, end))
1202    return XML_TOK_PARTIAL;
1203  start = ptr;
1204  while (HAS_CHAR(enc, ptr, end)) {
1205    switch (BYTE_TYPE(enc, ptr)) {
1206#define LEAD_CASE(n) \
1207    case BT_LEAD ## n: ptr += n; break;
1208    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1209#undef LEAD_CASE
1210    case BT_AMP:
1211      if (ptr == start)
1212        return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1213      *nextTokPtr = ptr;
1214      return XML_TOK_DATA_CHARS;
1215    case BT_LT:
1216      /* this is for inside entity references */
1217      *nextTokPtr = ptr;
1218      return XML_TOK_INVALID;
1219    case BT_LF:
1220      if (ptr == start) {
1221        *nextTokPtr = ptr + MINBPC(enc);
1222        return XML_TOK_DATA_NEWLINE;
1223      }
1224      *nextTokPtr = ptr;
1225      return XML_TOK_DATA_CHARS;
1226    case BT_CR:
1227      if (ptr == start) {
1228        ptr += MINBPC(enc);
1229        if (! HAS_CHAR(enc, ptr, end))
1230          return XML_TOK_TRAILING_CR;
1231        if (BYTE_TYPE(enc, ptr) == BT_LF)
1232          ptr += MINBPC(enc);
1233        *nextTokPtr = ptr;
1234        return XML_TOK_DATA_NEWLINE;
1235      }
1236      *nextTokPtr = ptr;
1237      return XML_TOK_DATA_CHARS;
1238    case BT_S:
1239      if (ptr == start) {
1240        *nextTokPtr = ptr + MINBPC(enc);
1241        return XML_TOK_ATTRIBUTE_VALUE_S;
1242      }
1243      *nextTokPtr = ptr;
1244      return XML_TOK_DATA_CHARS;
1245    default:
1246      ptr += MINBPC(enc);
1247      break;
1248    }
1249  }
1250  *nextTokPtr = ptr;
1251  return XML_TOK_DATA_CHARS;
1252}
1253
1254static int PTRCALL
1255PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
1256                       const char *end, const char **nextTokPtr)
1257{
1258  const char *start;
1259  if (ptr >= end)
1260    return XML_TOK_NONE;
1261  else if (! HAS_CHAR(enc, ptr, end))
1262    return XML_TOK_PARTIAL;
1263  start = ptr;
1264  while (HAS_CHAR(enc, ptr, end)) {
1265    switch (BYTE_TYPE(enc, ptr)) {
1266#define LEAD_CASE(n) \
1267    case BT_LEAD ## n: ptr += n; break;
1268    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1269#undef LEAD_CASE
1270    case BT_AMP:
1271      if (ptr == start)
1272        return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1273      *nextTokPtr = ptr;
1274      return XML_TOK_DATA_CHARS;
1275    case BT_PERCNT:
1276      if (ptr == start) {
1277        int tok =  PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
1278                                       end, nextTokPtr);
1279        return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1280      }
1281      *nextTokPtr = ptr;
1282      return XML_TOK_DATA_CHARS;
1283    case BT_LF:
1284      if (ptr == start) {
1285        *nextTokPtr = ptr + MINBPC(enc);
1286        return XML_TOK_DATA_NEWLINE;
1287      }
1288      *nextTokPtr = ptr;
1289      return XML_TOK_DATA_CHARS;
1290    case BT_CR:
1291      if (ptr == start) {
1292        ptr += MINBPC(enc);
1293        if (! HAS_CHAR(enc, ptr, end))
1294          return XML_TOK_TRAILING_CR;
1295        if (BYTE_TYPE(enc, ptr) == BT_LF)
1296          ptr += MINBPC(enc);
1297        *nextTokPtr = ptr;
1298        return XML_TOK_DATA_NEWLINE;
1299      }
1300      *nextTokPtr = ptr;
1301      return XML_TOK_DATA_CHARS;
1302    default:
1303      ptr += MINBPC(enc);
1304      break;
1305    }
1306  }
1307  *nextTokPtr = ptr;
1308  return XML_TOK_DATA_CHARS;
1309}
1310
1311#ifdef XML_DTD
1312
1313static int PTRCALL
1314PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
1315                         const char *end, const char **nextTokPtr)
1316{
1317  int level = 0;
1318  if (MINBPC(enc) > 1) {
1319    size_t n = end - ptr;
1320    if (n & (MINBPC(enc) - 1)) {
1321      n &= ~(MINBPC(enc) - 1);
1322      end = ptr + n;
1323    }
1324  }
1325  while (HAS_CHAR(enc, ptr, end)) {
1326    switch (BYTE_TYPE(enc, ptr)) {
1327    INVALID_CASES(ptr, nextTokPtr)
1328    case BT_LT:
1329      ptr += MINBPC(enc);
1330      REQUIRE_CHAR(enc, ptr, end);
1331      if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1332        ptr += MINBPC(enc);
1333        REQUIRE_CHAR(enc, ptr, end);
1334        if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1335          ++level;
1336          ptr += MINBPC(enc);
1337        }
1338      }
1339      break;
1340    case BT_RSQB:
1341      ptr += MINBPC(enc);
1342      REQUIRE_CHAR(enc, ptr, end);
1343      if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1344        ptr += MINBPC(enc);
1345        REQUIRE_CHAR(enc, ptr, end);
1346        if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1347          ptr += MINBPC(enc);
1348          if (level == 0) {
1349            *nextTokPtr = ptr;
1350            return XML_TOK_IGNORE_SECT;
1351          }
1352          --level;
1353        }
1354      }
1355      break;
1356    default:
1357      ptr += MINBPC(enc);
1358      break;
1359    }
1360  }
1361  return XML_TOK_PARTIAL;
1362}
1363
1364#endif /* XML_DTD */
1365
1366static int PTRCALL
1367PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1368                   const char **badPtr)
1369{
1370  ptr += MINBPC(enc);
1371  end -= MINBPC(enc);
1372  for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1373    switch (BYTE_TYPE(enc, ptr)) {
1374    case BT_DIGIT:
1375    case BT_HEX:
1376    case BT_MINUS:
1377    case BT_APOS:
1378    case BT_LPAR:
1379    case BT_RPAR:
1380    case BT_PLUS:
1381    case BT_COMMA:
1382    case BT_SOL:
1383    case BT_EQUALS:
1384    case BT_QUEST:
1385    case BT_CR:
1386    case BT_LF:
1387    case BT_SEMI:
1388    case BT_EXCL:
1389    case BT_AST:
1390    case BT_PERCNT:
1391    case BT_NUM:
1392#ifdef XML_NS
1393    case BT_COLON:
1394#endif
1395      break;
1396    case BT_S:
1397      if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1398        *badPtr = ptr;
1399        return 0;
1400      }
1401      break;
1402    case BT_NAME:
1403    case BT_NMSTRT:
1404      if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1405        break;
1406    default:
1407      switch (BYTE_TO_ASCII(enc, ptr)) {
1408      case 0x24: /* $ */
1409      case 0x40: /* @ */
1410        break;
1411      default:
1412        *badPtr = ptr;
1413        return 0;
1414      }
1415      break;
1416    }
1417  }
1418  return 1;
1419}
1420
1421/* This must only be called for a well-formed start-tag or empty
1422   element tag.  Returns the number of attributes.  Pointers to the
1423   first attsMax attributes are stored in atts.
1424*/
1425
1426static int PTRCALL
1427PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1428                int attsMax, ATTRIBUTE *atts)
1429{
1430  enum { other, inName, inValue } state = inName;
1431  int nAtts = 0;
1432  int open = 0; /* defined when state == inValue;
1433                   initialization just to shut up compilers */
1434
1435  for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1436    switch (BYTE_TYPE(enc, ptr)) {
1437#define START_NAME \
1438      if (state == other) { \
1439        if (nAtts < attsMax) { \
1440          atts[nAtts].name = ptr; \
1441          atts[nAtts].normalized = 1; \
1442        } \
1443        state = inName; \
1444      }
1445#define LEAD_CASE(n) \
1446    case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1447    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1448#undef LEAD_CASE
1449    case BT_NONASCII:
1450    case BT_NMSTRT:
1451    case BT_HEX:
1452      START_NAME
1453      break;
1454#undef START_NAME
1455    case BT_QUOT:
1456      if (state != inValue) {
1457        if (nAtts < attsMax)
1458          atts[nAtts].valuePtr = ptr + MINBPC(enc);
1459        state = inValue;
1460        open = BT_QUOT;
1461      }
1462      else if (open == BT_QUOT) {
1463        state = other;
1464        if (nAtts < attsMax)
1465          atts[nAtts].valueEnd = ptr;
1466        nAtts++;
1467      }
1468      break;
1469    case BT_APOS:
1470      if (state != inValue) {
1471        if (nAtts < attsMax)
1472          atts[nAtts].valuePtr = ptr + MINBPC(enc);
1473        state = inValue;
1474        open = BT_APOS;
1475      }
1476      else if (open == BT_APOS) {
1477        state = other;
1478        if (nAtts < attsMax)
1479          atts[nAtts].valueEnd = ptr;
1480        nAtts++;
1481      }
1482      break;
1483    case BT_AMP:
1484      if (nAtts < attsMax)
1485        atts[nAtts].normalized = 0;
1486      break;
1487    case BT_S:
1488      if (state == inName)
1489        state = other;
1490      else if (state == inValue
1491               && nAtts < attsMax
1492               && atts[nAtts].normalized
1493               && (ptr == atts[nAtts].valuePtr
1494                   || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1495                   || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1496                   || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1497        atts[nAtts].normalized = 0;
1498      break;
1499    case BT_CR: case BT_LF:
1500      /* This case ensures that the first attribute name is counted
1501         Apart from that we could just change state on the quote. */
1502      if (state == inName)
1503        state = other;
1504      else if (state == inValue && nAtts < attsMax)
1505        atts[nAtts].normalized = 0;
1506      break;
1507    case BT_GT:
1508    case BT_SOL:
1509      if (state != inValue)
1510        return nAtts;
1511      break;
1512    default:
1513      break;
1514    }
1515  }
1516  /* not reached */
1517}
1518
1519static int PTRFASTCALL
1520PREFIX(charRefNumber)(const ENCODING *UNUSED_P(enc), const char *ptr)
1521{
1522  int result = 0;
1523  /* skip &# */
1524  ptr += 2*MINBPC(enc);
1525  if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1526    for (ptr += MINBPC(enc);
1527         !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1528         ptr += MINBPC(enc)) {
1529      int c = BYTE_TO_ASCII(enc, ptr);
1530      switch (c) {
1531      case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1532      case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
1533        result <<= 4;
1534        result |= (c - ASCII_0);
1535        break;
1536      case ASCII_A: case ASCII_B: case ASCII_C:
1537      case ASCII_D: case ASCII_E: case ASCII_F:
1538        result <<= 4;
1539        result += 10 + (c - ASCII_A);
1540        break;
1541      case ASCII_a: case ASCII_b: case ASCII_c:
1542      case ASCII_d: case ASCII_e: case ASCII_f:
1543        result <<= 4;
1544        result += 10 + (c - ASCII_a);
1545        break;
1546      }
1547      if (result >= 0x110000)
1548        return -1;
1549    }
1550  }
1551  else {
1552    for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1553      int c = BYTE_TO_ASCII(enc, ptr);
1554      result *= 10;
1555      result += (c - ASCII_0);
1556      if (result >= 0x110000)
1557        return -1;
1558    }
1559  }
1560  return checkCharRefNumber(result);
1561}
1562
1563static int PTRCALL
1564PREFIX(predefinedEntityName)(const ENCODING *UNUSED_P(enc), const char *ptr,
1565                             const char *end)
1566{
1567  switch ((end - ptr)/MINBPC(enc)) {
1568  case 2:
1569    if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1570      switch (BYTE_TO_ASCII(enc, ptr)) {
1571      case ASCII_l:
1572        return ASCII_LT;
1573      case ASCII_g:
1574        return ASCII_GT;
1575      }
1576    }
1577    break;
1578  case 3:
1579    if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1580      ptr += MINBPC(enc);
1581      if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1582        ptr += MINBPC(enc);
1583        if (CHAR_MATCHES(enc, ptr, ASCII_p))
1584          return ASCII_AMP;
1585      }
1586    }
1587    break;
1588  case 4:
1589    switch (BYTE_TO_ASCII(enc, ptr)) {
1590    case ASCII_q:
1591      ptr += MINBPC(enc);
1592      if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1593        ptr += MINBPC(enc);
1594        if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1595          ptr += MINBPC(enc);
1596          if (CHAR_MATCHES(enc, ptr, ASCII_t))
1597            return ASCII_QUOT;
1598        }
1599      }
1600      break;
1601    case ASCII_a:
1602      ptr += MINBPC(enc);
1603      if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1604        ptr += MINBPC(enc);
1605        if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1606          ptr += MINBPC(enc);
1607          if (CHAR_MATCHES(enc, ptr, ASCII_s))
1608            return ASCII_APOS;
1609        }
1610      }
1611      break;
1612    }
1613  }
1614  return 0;
1615}
1616
1617static int PTRCALL
1618PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
1619{
1620  for (;;) {
1621    switch (BYTE_TYPE(enc, ptr1)) {
1622#define LEAD_CASE(n) \
1623    case BT_LEAD ## n: \
1624      if (*ptr1++ != *ptr2++) \
1625        return 0;
1626    LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
1627#undef LEAD_CASE
1628      /* fall through */
1629      if (*ptr1++ != *ptr2++)
1630        return 0;
1631      break;
1632    case BT_NONASCII:
1633    case BT_NMSTRT:
1634#ifdef XML_NS
1635    case BT_COLON:
1636#endif
1637    case BT_HEX:
1638    case BT_DIGIT:
1639    case BT_NAME:
1640    case BT_MINUS:
1641      if (*ptr2++ != *ptr1++)
1642        return 0;
1643      if (MINBPC(enc) > 1) {
1644        if (*ptr2++ != *ptr1++)
1645          return 0;
1646        if (MINBPC(enc) > 2) {
1647          if (*ptr2++ != *ptr1++)
1648            return 0;
1649          if (MINBPC(enc) > 3) {
1650            if (*ptr2++ != *ptr1++)
1651              return 0;
1652          }
1653        }
1654      }
1655      break;
1656    default:
1657      if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
1658        return 1;
1659      switch (BYTE_TYPE(enc, ptr2)) {
1660      case BT_LEAD2:
1661      case BT_LEAD3:
1662      case BT_LEAD4:
1663      case BT_NONASCII:
1664      case BT_NMSTRT:
1665#ifdef XML_NS
1666      case BT_COLON:
1667#endif
1668      case BT_HEX:
1669      case BT_DIGIT:
1670      case BT_NAME:
1671      case BT_MINUS:
1672        return 0;
1673      default:
1674        return 1;
1675      }
1676    }
1677  }
1678  /* not reached */
1679}
1680
1681static int PTRCALL
1682PREFIX(nameMatchesAscii)(const ENCODING *UNUSED_P(enc), const char *ptr1,
1683                         const char *end1, const char *ptr2)
1684{
1685  for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1686    if (end1 - ptr1 < MINBPC(enc))
1687      return 0;
1688    if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1689      return 0;
1690  }
1691  return ptr1 == end1;
1692}
1693
1694static int PTRFASTCALL
1695PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1696{
1697  const char *start = ptr;
1698  for (;;) {
1699    switch (BYTE_TYPE(enc, ptr)) {
1700#define LEAD_CASE(n) \
1701    case BT_LEAD ## n: ptr += n; break;
1702    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1703#undef LEAD_CASE
1704    case BT_NONASCII:
1705    case BT_NMSTRT:
1706#ifdef XML_NS
1707    case BT_COLON:
1708#endif
1709    case BT_HEX:
1710    case BT_DIGIT:
1711    case BT_NAME:
1712    case BT_MINUS:
1713      ptr += MINBPC(enc);
1714      break;
1715    default:
1716      return (int)(ptr - start);
1717    }
1718  }
1719}
1720
1721static const char * PTRFASTCALL
1722PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1723{
1724  for (;;) {
1725    switch (BYTE_TYPE(enc, ptr)) {
1726    case BT_LF:
1727    case BT_CR:
1728    case BT_S:
1729      ptr += MINBPC(enc);
1730      break;
1731    default:
1732      return ptr;
1733    }
1734  }
1735}
1736
1737static void PTRCALL
1738PREFIX(updatePosition)(const ENCODING *enc,
1739                       const char *ptr,
1740                       const char *end,
1741                       POSITION *pos)
1742{
1743  while (HAS_CHAR(enc, ptr, end)) {
1744    switch (BYTE_TYPE(enc, ptr)) {
1745#define LEAD_CASE(n) \
1746    case BT_LEAD ## n: \
1747      ptr += n; \
1748      break;
1749    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1750#undef LEAD_CASE
1751    case BT_LF:
1752      pos->columnNumber = (XML_Size)-1;
1753      pos->lineNumber++;
1754      ptr += MINBPC(enc);
1755      break;
1756    case BT_CR:
1757      pos->lineNumber++;
1758      ptr += MINBPC(enc);
1759      if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1760        ptr += MINBPC(enc);
1761      pos->columnNumber = (XML_Size)-1;
1762      break;
1763    default:
1764      ptr += MINBPC(enc);
1765      break;
1766    }
1767    pos->columnNumber++;
1768  }
1769}
1770
1771#undef DO_LEAD_CASE
1772#undef MULTIBYTE_CASES
1773#undef INVALID_CASES
1774#undef CHECK_NAME_CASE
1775#undef CHECK_NAME_CASES
1776#undef CHECK_NMSTRT_CASE
1777#undef CHECK_NMSTRT_CASES
1778
1779#endif /* XML_TOK_IMPL_C */
1780