1/*
2 *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
3 *  Copyright (C) 2006, 2007, 2008, 2009, 2011, 2012, 2013 Apple Inc. All Rights Reserved.
4 *  Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
5 *  Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu)
6 *  Copyright (C) 2012 Mathias Bynens (mathias@qiwi.be)
7 *
8 *  This library is free software; you can redistribute it and/or
9 *  modify it under the terms of the GNU Library General Public
10 *  License as published by the Free Software Foundation; either
11 *  version 2 of the License, or (at your option) any later version.
12 *
13 *  This library is distributed in the hope that it will be useful,
14 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
15 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16 *  Library General Public License for more details.
17 *
18 *  You should have received a copy of the GNU Library General Public License
19 *  along with this library; see the file COPYING.LIB.  If not, write to
20 *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21 *  Boston, MA 02110-1301, USA.
22 *
23 */
24
25#include "config.h"
26#include "Lexer.h"
27
28#include "JSFunctionInlines.h"
29
30#include "BuiltinNames.h"
31#include "JSGlobalObjectFunctions.h"
32#include "Identifier.h"
33#include "NodeInfo.h"
34#include "Nodes.h"
35#include "JSCInlines.h"
36#include <wtf/dtoa.h>
37#include <ctype.h>
38#include <limits.h>
39#include <string.h>
40#include <wtf/Assertions.h>
41
42#include "KeywordLookup.h"
43#include "Lexer.lut.h"
44#include "Parser.h"
45
46namespace JSC {
47
48Keywords::Keywords(VM& vm)
49    : m_vm(vm)
50    , m_keywordTable(JSC::mainTable)
51{
52}
53
// Classification assigned to each Latin-1 character by typesOfLatin1Characters.
// The lexer's main switch dispatches on these values.
enum CharacterType {
    // These three must stay first and in this order: isIdentStart() tests for
    // equality with CharacterIdentifierStart, and isIdentPart() tests for
    // "<= CharacterNumber", so everything that may appear inside an identifier
    // has to sort before every other type.
    CharacterIdentifierStart,
    CharacterZero,
    CharacterNumber,

    // Characters with no meaning of their own, and line breaks.
    CharacterInvalid,
    CharacterLineTerminator,

    // Punctuators.
    CharacterExclamationMark,
    CharacterOpenParen, CharacterCloseParen,
    CharacterOpenBracket, CharacterCloseBracket,
    CharacterComma,
    CharacterColon,
    CharacterQuestion,
    CharacterTilde,
    CharacterQuote,
    CharacterDot,
    CharacterSlash,
    CharacterBackSlash,
    CharacterSemicolon,
    CharacterOpenBrace, CharacterCloseBrace,

    // Characters that start arithmetic, bitwise, relational or assignment operators.
    CharacterAdd,
    CharacterSub,
    CharacterMultiply,
    CharacterModulo,
    CharacterAnd,
    CharacterXor,
    CharacterOr,
    CharacterLess,
    CharacterGreater,
    CharacterEqual,

    // Remaining types: white space, and '@' (see the table entry for code 64).
    CharacterWhiteSpace,
    CharacterPrivateIdentifierStart
};
97
98// 256 Latin-1 codes
99static const unsigned short typesOfLatin1Characters[256] = {
100/*   0 - Null               */ CharacterInvalid,
101/*   1 - Start of Heading   */ CharacterInvalid,
102/*   2 - Start of Text      */ CharacterInvalid,
103/*   3 - End of Text        */ CharacterInvalid,
104/*   4 - End of Transm.     */ CharacterInvalid,
105/*   5 - Enquiry            */ CharacterInvalid,
106/*   6 - Acknowledgment     */ CharacterInvalid,
107/*   7 - Bell               */ CharacterInvalid,
108/*   8 - Back Space         */ CharacterInvalid,
109/*   9 - Horizontal Tab     */ CharacterWhiteSpace,
110/*  10 - Line Feed          */ CharacterLineTerminator,
111/*  11 - Vertical Tab       */ CharacterWhiteSpace,
112/*  12 - Form Feed          */ CharacterWhiteSpace,
113/*  13 - Carriage Return    */ CharacterLineTerminator,
114/*  14 - Shift Out          */ CharacterInvalid,
115/*  15 - Shift In           */ CharacterInvalid,
116/*  16 - Data Line Escape   */ CharacterInvalid,
117/*  17 - Device Control 1   */ CharacterInvalid,
118/*  18 - Device Control 2   */ CharacterInvalid,
119/*  19 - Device Control 3   */ CharacterInvalid,
120/*  20 - Device Control 4   */ CharacterInvalid,
121/*  21 - Negative Ack.      */ CharacterInvalid,
122/*  22 - Synchronous Idle   */ CharacterInvalid,
123/*  23 - End of Transmit    */ CharacterInvalid,
124/*  24 - Cancel             */ CharacterInvalid,
125/*  25 - End of Medium      */ CharacterInvalid,
126/*  26 - Substitute         */ CharacterInvalid,
127/*  27 - Escape             */ CharacterInvalid,
128/*  28 - File Separator     */ CharacterInvalid,
129/*  29 - Group Separator    */ CharacterInvalid,
130/*  30 - Record Separator   */ CharacterInvalid,
131/*  31 - Unit Separator     */ CharacterInvalid,
132/*  32 - Space              */ CharacterWhiteSpace,
133/*  33 - !                  */ CharacterExclamationMark,
134/*  34 - "                  */ CharacterQuote,
135/*  35 - #                  */ CharacterInvalid,
136/*  36 - $                  */ CharacterIdentifierStart,
137/*  37 - %                  */ CharacterModulo,
138/*  38 - &                  */ CharacterAnd,
139/*  39 - '                  */ CharacterQuote,
140/*  40 - (                  */ CharacterOpenParen,
141/*  41 - )                  */ CharacterCloseParen,
142/*  42 - *                  */ CharacterMultiply,
143/*  43 - +                  */ CharacterAdd,
144/*  44 - ,                  */ CharacterComma,
145/*  45 - -                  */ CharacterSub,
146/*  46 - .                  */ CharacterDot,
147/*  47 - /                  */ CharacterSlash,
148/*  48 - 0                  */ CharacterZero,
149/*  49 - 1                  */ CharacterNumber,
150/*  50 - 2                  */ CharacterNumber,
151/*  51 - 3                  */ CharacterNumber,
152/*  52 - 4                  */ CharacterNumber,
153/*  53 - 5                  */ CharacterNumber,
154/*  54 - 6                  */ CharacterNumber,
155/*  55 - 7                  */ CharacterNumber,
156/*  56 - 8                  */ CharacterNumber,
157/*  57 - 9                  */ CharacterNumber,
158/*  58 - :                  */ CharacterColon,
159/*  59 - ;                  */ CharacterSemicolon,
160/*  60 - <                  */ CharacterLess,
161/*  61 - =                  */ CharacterEqual,
162/*  62 - >                  */ CharacterGreater,
163/*  63 - ?                  */ CharacterQuestion,
164/*  64 - @                  */ CharacterPrivateIdentifierStart,
165/*  65 - A                  */ CharacterIdentifierStart,
166/*  66 - B                  */ CharacterIdentifierStart,
167/*  67 - C                  */ CharacterIdentifierStart,
168/*  68 - D                  */ CharacterIdentifierStart,
169/*  69 - E                  */ CharacterIdentifierStart,
170/*  70 - F                  */ CharacterIdentifierStart,
171/*  71 - G                  */ CharacterIdentifierStart,
172/*  72 - H                  */ CharacterIdentifierStart,
173/*  73 - I                  */ CharacterIdentifierStart,
174/*  74 - J                  */ CharacterIdentifierStart,
175/*  75 - K                  */ CharacterIdentifierStart,
176/*  76 - L                  */ CharacterIdentifierStart,
177/*  77 - M                  */ CharacterIdentifierStart,
178/*  78 - N                  */ CharacterIdentifierStart,
179/*  79 - O                  */ CharacterIdentifierStart,
180/*  80 - P                  */ CharacterIdentifierStart,
181/*  81 - Q                  */ CharacterIdentifierStart,
182/*  82 - R                  */ CharacterIdentifierStart,
183/*  83 - S                  */ CharacterIdentifierStart,
184/*  84 - T                  */ CharacterIdentifierStart,
185/*  85 - U                  */ CharacterIdentifierStart,
186/*  86 - V                  */ CharacterIdentifierStart,
187/*  87 - W                  */ CharacterIdentifierStart,
188/*  88 - X                  */ CharacterIdentifierStart,
189/*  89 - Y                  */ CharacterIdentifierStart,
190/*  90 - Z                  */ CharacterIdentifierStart,
191/*  91 - [                  */ CharacterOpenBracket,
192/*  92 - \                  */ CharacterBackSlash,
193/*  93 - ]                  */ CharacterCloseBracket,
194/*  94 - ^                  */ CharacterXor,
195/*  95 - _                  */ CharacterIdentifierStart,
196/*  96 - `                  */ CharacterInvalid,
197/*  97 - a                  */ CharacterIdentifierStart,
198/*  98 - b                  */ CharacterIdentifierStart,
199/*  99 - c                  */ CharacterIdentifierStart,
200/* 100 - d                  */ CharacterIdentifierStart,
201/* 101 - e                  */ CharacterIdentifierStart,
202/* 102 - f                  */ CharacterIdentifierStart,
203/* 103 - g                  */ CharacterIdentifierStart,
204/* 104 - h                  */ CharacterIdentifierStart,
205/* 105 - i                  */ CharacterIdentifierStart,
206/* 106 - j                  */ CharacterIdentifierStart,
207/* 107 - k                  */ CharacterIdentifierStart,
208/* 108 - l                  */ CharacterIdentifierStart,
209/* 109 - m                  */ CharacterIdentifierStart,
210/* 110 - n                  */ CharacterIdentifierStart,
211/* 111 - o                  */ CharacterIdentifierStart,
212/* 112 - p                  */ CharacterIdentifierStart,
213/* 113 - q                  */ CharacterIdentifierStart,
214/* 114 - r                  */ CharacterIdentifierStart,
215/* 115 - s                  */ CharacterIdentifierStart,
216/* 116 - t                  */ CharacterIdentifierStart,
217/* 117 - u                  */ CharacterIdentifierStart,
218/* 118 - v                  */ CharacterIdentifierStart,
219/* 119 - w                  */ CharacterIdentifierStart,
220/* 120 - x                  */ CharacterIdentifierStart,
221/* 121 - y                  */ CharacterIdentifierStart,
222/* 122 - z                  */ CharacterIdentifierStart,
223/* 123 - {                  */ CharacterOpenBrace,
224/* 124 - |                  */ CharacterOr,
225/* 125 - }                  */ CharacterCloseBrace,
226/* 126 - ~                  */ CharacterTilde,
227/* 127 - Delete             */ CharacterInvalid,
228/* 128 - Cc category        */ CharacterInvalid,
229/* 129 - Cc category        */ CharacterInvalid,
230/* 130 - Cc category        */ CharacterInvalid,
231/* 131 - Cc category        */ CharacterInvalid,
232/* 132 - Cc category        */ CharacterInvalid,
233/* 133 - Cc category        */ CharacterInvalid,
234/* 134 - Cc category        */ CharacterInvalid,
235/* 135 - Cc category        */ CharacterInvalid,
236/* 136 - Cc category        */ CharacterInvalid,
237/* 137 - Cc category        */ CharacterInvalid,
238/* 138 - Cc category        */ CharacterInvalid,
239/* 139 - Cc category        */ CharacterInvalid,
240/* 140 - Cc category        */ CharacterInvalid,
241/* 141 - Cc category        */ CharacterInvalid,
242/* 142 - Cc category        */ CharacterInvalid,
243/* 143 - Cc category        */ CharacterInvalid,
244/* 144 - Cc category        */ CharacterInvalid,
245/* 145 - Cc category        */ CharacterInvalid,
246/* 146 - Cc category        */ CharacterInvalid,
247/* 147 - Cc category        */ CharacterInvalid,
248/* 148 - Cc category        */ CharacterInvalid,
249/* 149 - Cc category        */ CharacterInvalid,
250/* 150 - Cc category        */ CharacterInvalid,
251/* 151 - Cc category        */ CharacterInvalid,
252/* 152 - Cc category        */ CharacterInvalid,
253/* 153 - Cc category        */ CharacterInvalid,
254/* 154 - Cc category        */ CharacterInvalid,
255/* 155 - Cc category        */ CharacterInvalid,
256/* 156 - Cc category        */ CharacterInvalid,
257/* 157 - Cc category        */ CharacterInvalid,
258/* 158 - Cc category        */ CharacterInvalid,
259/* 159 - Cc category        */ CharacterInvalid,
260/* 160 - Zs category (nbsp) */ CharacterWhiteSpace,
261/* 161 - Po category        */ CharacterInvalid,
262/* 162 - Sc category        */ CharacterInvalid,
263/* 163 - Sc category        */ CharacterInvalid,
264/* 164 - Sc category        */ CharacterInvalid,
265/* 165 - Sc category        */ CharacterInvalid,
266/* 166 - So category        */ CharacterInvalid,
267/* 167 - So category        */ CharacterInvalid,
268/* 168 - Sk category        */ CharacterInvalid,
269/* 169 - So category        */ CharacterInvalid,
270/* 170 - Ll category        */ CharacterIdentifierStart,
271/* 171 - Pi category        */ CharacterInvalid,
272/* 172 - Sm category        */ CharacterInvalid,
273/* 173 - Cf category        */ CharacterInvalid,
274/* 174 - So category        */ CharacterInvalid,
275/* 175 - Sk category        */ CharacterInvalid,
276/* 176 - So category        */ CharacterInvalid,
277/* 177 - Sm category        */ CharacterInvalid,
278/* 178 - No category        */ CharacterInvalid,
279/* 179 - No category        */ CharacterInvalid,
280/* 180 - Sk category        */ CharacterInvalid,
281/* 181 - Ll category        */ CharacterIdentifierStart,
282/* 182 - So category        */ CharacterInvalid,
283/* 183 - Po category        */ CharacterInvalid,
284/* 184 - Sk category        */ CharacterInvalid,
285/* 185 - No category        */ CharacterInvalid,
286/* 186 - Ll category        */ CharacterIdentifierStart,
287/* 187 - Pf category        */ CharacterInvalid,
288/* 188 - No category        */ CharacterInvalid,
289/* 189 - No category        */ CharacterInvalid,
290/* 190 - No category        */ CharacterInvalid,
291/* 191 - Po category        */ CharacterInvalid,
292/* 192 - Lu category        */ CharacterIdentifierStart,
293/* 193 - Lu category        */ CharacterIdentifierStart,
294/* 194 - Lu category        */ CharacterIdentifierStart,
295/* 195 - Lu category        */ CharacterIdentifierStart,
296/* 196 - Lu category        */ CharacterIdentifierStart,
297/* 197 - Lu category        */ CharacterIdentifierStart,
298/* 198 - Lu category        */ CharacterIdentifierStart,
299/* 199 - Lu category        */ CharacterIdentifierStart,
300/* 200 - Lu category        */ CharacterIdentifierStart,
301/* 201 - Lu category        */ CharacterIdentifierStart,
302/* 202 - Lu category        */ CharacterIdentifierStart,
303/* 203 - Lu category        */ CharacterIdentifierStart,
304/* 204 - Lu category        */ CharacterIdentifierStart,
305/* 205 - Lu category        */ CharacterIdentifierStart,
306/* 206 - Lu category        */ CharacterIdentifierStart,
307/* 207 - Lu category        */ CharacterIdentifierStart,
308/* 208 - Lu category        */ CharacterIdentifierStart,
309/* 209 - Lu category        */ CharacterIdentifierStart,
310/* 210 - Lu category        */ CharacterIdentifierStart,
311/* 211 - Lu category        */ CharacterIdentifierStart,
312/* 212 - Lu category        */ CharacterIdentifierStart,
313/* 213 - Lu category        */ CharacterIdentifierStart,
314/* 214 - Lu category        */ CharacterIdentifierStart,
315/* 215 - Sm category        */ CharacterInvalid,
316/* 216 - Lu category        */ CharacterIdentifierStart,
317/* 217 - Lu category        */ CharacterIdentifierStart,
318/* 218 - Lu category        */ CharacterIdentifierStart,
319/* 219 - Lu category        */ CharacterIdentifierStart,
320/* 220 - Lu category        */ CharacterIdentifierStart,
321/* 221 - Lu category        */ CharacterIdentifierStart,
322/* 222 - Lu category        */ CharacterIdentifierStart,
323/* 223 - Ll category        */ CharacterIdentifierStart,
324/* 224 - Ll category        */ CharacterIdentifierStart,
325/* 225 - Ll category        */ CharacterIdentifierStart,
326/* 226 - Ll category        */ CharacterIdentifierStart,
327/* 227 - Ll category        */ CharacterIdentifierStart,
328/* 228 - Ll category        */ CharacterIdentifierStart,
329/* 229 - Ll category        */ CharacterIdentifierStart,
330/* 230 - Ll category        */ CharacterIdentifierStart,
331/* 231 - Ll category        */ CharacterIdentifierStart,
332/* 232 - Ll category        */ CharacterIdentifierStart,
333/* 233 - Ll category        */ CharacterIdentifierStart,
334/* 234 - Ll category        */ CharacterIdentifierStart,
335/* 235 - Ll category        */ CharacterIdentifierStart,
336/* 236 - Ll category        */ CharacterIdentifierStart,
337/* 237 - Ll category        */ CharacterIdentifierStart,
338/* 238 - Ll category        */ CharacterIdentifierStart,
339/* 239 - Ll category        */ CharacterIdentifierStart,
340/* 240 - Ll category        */ CharacterIdentifierStart,
341/* 241 - Ll category        */ CharacterIdentifierStart,
342/* 242 - Ll category        */ CharacterIdentifierStart,
343/* 243 - Ll category        */ CharacterIdentifierStart,
344/* 244 - Ll category        */ CharacterIdentifierStart,
345/* 245 - Ll category        */ CharacterIdentifierStart,
346/* 246 - Ll category        */ CharacterIdentifierStart,
347/* 247 - Sm category        */ CharacterInvalid,
348/* 248 - Ll category        */ CharacterIdentifierStart,
349/* 249 - Ll category        */ CharacterIdentifierStart,
350/* 250 - Ll category        */ CharacterIdentifierStart,
351/* 251 - Ll category        */ CharacterIdentifierStart,
352/* 252 - Ll category        */ CharacterIdentifierStart,
353/* 253 - Ll category        */ CharacterIdentifierStart,
354/* 254 - Ll category        */ CharacterIdentifierStart,
355/* 255 - Ll category        */ CharacterIdentifierStart
356};
357
358// This table provides the character that results from \X where X is the index in the table beginning
359// with SPACE. A table value of 0 means that more processing needs to be done.
360static const LChar singleCharacterEscapeValuesForASCII[128] = {
361/*   0 - Null               */ 0,
362/*   1 - Start of Heading   */ 0,
363/*   2 - Start of Text      */ 0,
364/*   3 - End of Text        */ 0,
365/*   4 - End of Transm.     */ 0,
366/*   5 - Enquiry            */ 0,
367/*   6 - Acknowledgment     */ 0,
368/*   7 - Bell               */ 0,
369/*   8 - Back Space         */ 0,
370/*   9 - Horizontal Tab     */ 0,
371/*  10 - Line Feed          */ 0,
372/*  11 - Vertical Tab       */ 0,
373/*  12 - Form Feed          */ 0,
374/*  13 - Carriage Return    */ 0,
375/*  14 - Shift Out          */ 0,
376/*  15 - Shift In           */ 0,
377/*  16 - Data Line Escape   */ 0,
378/*  17 - Device Control 1   */ 0,
379/*  18 - Device Control 2   */ 0,
380/*  19 - Device Control 3   */ 0,
381/*  20 - Device Control 4   */ 0,
382/*  21 - Negative Ack.      */ 0,
383/*  22 - Synchronous Idle   */ 0,
384/*  23 - End of Transmit    */ 0,
385/*  24 - Cancel             */ 0,
386/*  25 - End of Medium      */ 0,
387/*  26 - Substitute         */ 0,
388/*  27 - Escape             */ 0,
389/*  28 - File Separator     */ 0,
390/*  29 - Group Separator    */ 0,
391/*  30 - Record Separator   */ 0,
392/*  31 - Unit Separator     */ 0,
393/*  32 - Space              */ ' ',
394/*  33 - !                  */ '!',
395/*  34 - "                  */ '"',
396/*  35 - #                  */ '#',
397/*  36 - $                  */ '$',
398/*  37 - %                  */ '%',
399/*  38 - &                  */ '&',
400/*  39 - '                  */ '\'',
401/*  40 - (                  */ '(',
402/*  41 - )                  */ ')',
403/*  42 - *                  */ '*',
404/*  43 - +                  */ '+',
405/*  44 - ,                  */ ',',
406/*  45 - -                  */ '-',
407/*  46 - .                  */ '.',
408/*  47 - /                  */ '/',
409/*  48 - 0                  */ 0,
410/*  49 - 1                  */ 0,
411/*  50 - 2                  */ 0,
412/*  51 - 3                  */ 0,
413/*  52 - 4                  */ 0,
414/*  53 - 5                  */ 0,
415/*  54 - 6                  */ 0,
416/*  55 - 7                  */ 0,
417/*  56 - 8                  */ 0,
418/*  57 - 9                  */ 0,
419/*  58 - :                  */ ':',
420/*  59 - ;                  */ ';',
421/*  60 - <                  */ '<',
422/*  61 - =                  */ '=',
423/*  62 - >                  */ '>',
424/*  63 - ?                  */ '?',
425/*  64 - @                  */ '@',
426/*  65 - A                  */ 'A',
427/*  66 - B                  */ 'B',
428/*  67 - C                  */ 'C',
429/*  68 - D                  */ 'D',
430/*  69 - E                  */ 'E',
431/*  70 - F                  */ 'F',
432/*  71 - G                  */ 'G',
433/*  72 - H                  */ 'H',
434/*  73 - I                  */ 'I',
435/*  74 - J                  */ 'J',
436/*  75 - K                  */ 'K',
437/*  76 - L                  */ 'L',
438/*  77 - M                  */ 'M',
439/*  78 - N                  */ 'N',
440/*  79 - O                  */ 'O',
441/*  80 - P                  */ 'P',
442/*  81 - Q                  */ 'Q',
443/*  82 - R                  */ 'R',
444/*  83 - S                  */ 'S',
445/*  84 - T                  */ 'T',
446/*  85 - U                  */ 'U',
447/*  86 - V                  */ 'V',
448/*  87 - W                  */ 'W',
449/*  88 - X                  */ 'X',
450/*  89 - Y                  */ 'Y',
451/*  90 - Z                  */ 'Z',
452/*  91 - [                  */ '[',
453/*  92 - \                  */ '\\',
454/*  93 - ]                  */ ']',
455/*  94 - ^                  */ '^',
456/*  95 - _                  */ '_',
457/*  96 - `                  */ '`',
458/*  97 - a                  */ 'a',
459/*  98 - b                  */ 0x08,
460/*  99 - c                  */ 'c',
461/* 100 - d                  */ 'd',
462/* 101 - e                  */ 'e',
463/* 102 - f                  */ 0x0C,
464/* 103 - g                  */ 'g',
465/* 104 - h                  */ 'h',
466/* 105 - i                  */ 'i',
467/* 106 - j                  */ 'j',
468/* 107 - k                  */ 'k',
469/* 108 - l                  */ 'l',
470/* 109 - m                  */ 'm',
471/* 110 - n                  */ 0x0A,
472/* 111 - o                  */ 'o',
473/* 112 - p                  */ 'p',
474/* 113 - q                  */ 'q',
475/* 114 - r                  */ 0x0D,
476/* 115 - s                  */ 's',
477/* 116 - t                  */ 0x09,
478/* 117 - u                  */ 0,
479/* 118 - v                  */ 0x0B,
480/* 119 - w                  */ 'w',
481/* 120 - x                  */ 0,
482/* 121 - y                  */ 'y',
483/* 122 - z                  */ 'z',
484/* 123 - {                  */ '{',
485/* 124 - |                  */ '|',
486/* 125 - }                  */ '}',
487/* 126 - ~                  */ '~',
488/* 127 - Delete             */ 0
489};
490
491template <typename T>
492Lexer<T>::Lexer(VM* vm, JSParserStrictness strictness)
493    : m_isReparsing(false)
494    , m_vm(vm)
495    , m_parsingBuiltinFunction(strictness == JSParseBuiltin)
496{
497}
498
499template <typename T>
500Lexer<T>::~Lexer()
501{
502}
503
504template <typename T>
505String Lexer<T>::invalidCharacterMessage() const
506{
507    switch (m_current) {
508    case 0:
509        return "Invalid character: '\\0'";
510    case 10:
511        return "Invalid character: '\\n'";
512    case 11:
513        return "Invalid character: '\\v'";
514    case 13:
515        return "Invalid character: '\\r'";
516    case 35:
517        return "Invalid character: '#'";
518    case 64:
519        return "Invalid character: '@'";
520    case 96:
521        return "Invalid character: '`'";
522    default:
523        return String::format("Invalid character '\\u%04u'", static_cast<unsigned>(m_current)).impl();
524    }
525}
526
527template <typename T>
528ALWAYS_INLINE const T* Lexer<T>::currentSourcePtr() const
529{
530    ASSERT(m_code <= m_codeEnd);
531    return m_code;
532}
533
534template <typename T>
535void Lexer<T>::setCode(const SourceCode& source, ParserArena* arena)
536{
537    m_arena = &arena->identifierArena();
538
539    m_lineNumber = source.firstLine();
540    m_lastToken = -1;
541
542    const String& sourceString = source.provider()->source();
543
544    if (!sourceString.isNull())
545        setCodeStart(sourceString.impl());
546    else
547        m_codeStart = 0;
548
549    m_source = &source;
550    m_sourceOffset = source.startOffset();
551    m_codeStartPlusOffset = m_codeStart + source.startOffset();
552    m_code = m_codeStartPlusOffset;
553    m_codeEnd = m_codeStart + source.endOffset();
554    m_error = false;
555    m_atLineStart = true;
556    m_lineStart = m_code;
557    m_lexErrorMessage = String();
558
559    m_buffer8.reserveInitialCapacity(initialReadBufferCapacity);
560    m_buffer16.reserveInitialCapacity((m_codeEnd - m_code) / 2);
561
562    if (LIKELY(m_code < m_codeEnd))
563        m_current = *m_code;
564    else
565        m_current = 0;
566    ASSERT(currentOffset() == source.startOffset());
567}
568
569template <typename T>
570template <int shiftAmount> ALWAYS_INLINE void Lexer<T>::internalShift()
571{
572    m_code += shiftAmount;
573    ASSERT(currentOffset() >= currentLineStartOffset());
574    m_current = *m_code;
575}
576
577template <typename T>
578ALWAYS_INLINE void Lexer<T>::shift()
579{
580    // At one point timing showed that setting m_current to 0 unconditionally was faster than an if-else sequence.
581    m_current = 0;
582    ++m_code;
583    if (LIKELY(m_code < m_codeEnd))
584        m_current = *m_code;
585}
586
587template <typename T>
588ALWAYS_INLINE bool Lexer<T>::atEnd() const
589{
590    ASSERT(!m_current || m_code < m_codeEnd);
591    return UNLIKELY(UNLIKELY(!m_current) && m_code == m_codeEnd);
592}
593
594template <typename T>
595ALWAYS_INLINE T Lexer<T>::peek(int offset) const
596{
597    ASSERT(offset > 0 && offset < 5);
598    const T* code = m_code + offset;
599    return (code < m_codeEnd) ? *code : 0;
600}
601
602template <typename T>
603typename Lexer<T>::UnicodeHexValue Lexer<T>::parseFourDigitUnicodeHex()
604{
605    T char1 = peek(1);
606    T char2 = peek(2);
607    T char3 = peek(3);
608
609    if (UNLIKELY(!isASCIIHexDigit(m_current) || !isASCIIHexDigit(char1) || !isASCIIHexDigit(char2) || !isASCIIHexDigit(char3)))
610        return UnicodeHexValue((m_code + 4) >= m_codeEnd ? UnicodeHexValue::IncompleteHex : UnicodeHexValue::InvalidHex);
611
612    int result = convertUnicode(m_current, char1, char2, char3);
613    shift();
614    shift();
615    shift();
616    shift();
617    return UnicodeHexValue(result);
618}
619
620template <typename T>
621void Lexer<T>::shiftLineTerminator()
622{
623    ASSERT(isLineTerminator(m_current));
624
625    m_positionBeforeLastNewline = currentPosition();
626    T prev = m_current;
627    shift();
628
629    // Allow both CRLF and LFCR.
630    if (prev + m_current == '\n' + '\r')
631        shift();
632
633    ++m_lineNumber;
634}
635
636template <typename T>
637ALWAYS_INLINE bool Lexer<T>::lastTokenWasRestrKeyword() const
638{
639    return m_lastToken == CONTINUE || m_lastToken == BREAK || m_lastToken == RETURN || m_lastToken == THROW;
640}
641
642static NEVER_INLINE bool isNonLatin1IdentStart(UChar c)
643{
644    return U_GET_GC_MASK(c) & U_GC_L_MASK;
645}
646
647static ALWAYS_INLINE bool isLatin1(LChar)
648{
649    return true;
650}
651
652static ALWAYS_INLINE bool isLatin1(UChar c)
653{
654    return c < 256;
655}
656
657static inline bool isIdentStart(LChar c)
658{
659    return typesOfLatin1Characters[c] == CharacterIdentifierStart;
660}
661
662static inline bool isIdentStart(UChar c)
663{
664    return isLatin1(c) ? isIdentStart(static_cast<LChar>(c)) : isNonLatin1IdentStart(c);
665}
666
667static NEVER_INLINE bool isNonLatin1IdentPart(int c)
668{
669    return (U_GET_GC_MASK(c) & (U_GC_L_MASK | U_GC_MN_MASK | U_GC_MC_MASK | U_GC_ND_MASK | U_GC_PC_MASK)) || c == 0x200C || c == 0x200D;
670}
671
672static ALWAYS_INLINE bool isIdentPart(LChar c)
673{
674    // Character types are divided into two groups depending on whether they can be part of an
675    // identifier or not. Those whose type value is less or equal than CharacterNumber can be
676    // part of an identifier. (See the CharacterType definition for more details.)
677    return typesOfLatin1Characters[c] <= CharacterNumber;
678}
679
680static ALWAYS_INLINE bool isIdentPart(UChar c)
681{
682    return isLatin1(c) ? isIdentPart(static_cast<LChar>(c)) : isNonLatin1IdentPart(c);
683}
684
685static inline LChar singleEscape(int c)
686{
687    if (c < 128) {
688        ASSERT(static_cast<size_t>(c) < ARRAY_SIZE(singleCharacterEscapeValuesForASCII));
689        return singleCharacterEscapeValuesForASCII[c];
690    }
691    return 0;
692}
693
694template <typename T>
695inline void Lexer<T>::record8(int c)
696{
697    ASSERT(c >= 0);
698    ASSERT(c <= 0xFF);
699    m_buffer8.append(static_cast<LChar>(c));
700}
701
702template <typename T>
703inline void assertCharIsIn8BitRange(T c)
704{
705    UNUSED_PARAM(c);
706    ASSERT(c >= 0);
707    ASSERT(c <= 0xFF);
708}
709
710template <>
711inline void assertCharIsIn8BitRange(UChar c)
712{
713    UNUSED_PARAM(c);
714    ASSERT(c <= 0xFF);
715}
716
717template <>
718inline void assertCharIsIn8BitRange(LChar)
719{
720}
721
722template <typename T>
723inline void Lexer<T>::append8(const T* p, size_t length)
724{
725    size_t currentSize = m_buffer8.size();
726    m_buffer8.grow(currentSize + length);
727    LChar* rawBuffer = m_buffer8.data() + currentSize;
728
729    for (size_t i = 0; i < length; i++) {
730        T c = p[i];
731        assertCharIsIn8BitRange(c);
732        rawBuffer[i] = c;
733    }
734}
735
736template <typename T>
737inline void Lexer<T>::append16(const LChar* p, size_t length)
738{
739    size_t currentSize = m_buffer16.size();
740    m_buffer16.grow(currentSize + length);
741    UChar* rawBuffer = m_buffer16.data() + currentSize;
742
743    for (size_t i = 0; i < length; i++)
744        rawBuffer[i] = p[i];
745}
746
747template <typename T>
748inline void Lexer<T>::record16(T c)
749{
750    m_buffer16.append(c);
751}
752
753template <typename T>
754inline void Lexer<T>::record16(int c)
755{
756    ASSERT(c >= 0);
757    ASSERT(c <= static_cast<int>(USHRT_MAX));
758    m_buffer16.append(static_cast<UChar>(c));
759}
760
761#if !ASSERT_DISABLED
762bool isSafeBuiltinIdentifier(VM& vm, const Identifier* ident)
763{
764    if (!ident)
765        return true;
766    /* Just block any use of suspicious identifiers.  This is intended to
767     * be used as a safety net while implementing builtins.
768     */
769    if (*ident == vm.propertyNames->builtinNames().callPublicName())
770        return false;
771    if (*ident == vm.propertyNames->builtinNames().applyPublicName())
772        return false;
773    if (*ident == vm.propertyNames->eval)
774        return false;
775    if (*ident == vm.propertyNames->Function)
776        return false;
777    return true;
778}
779#endif
780
781template <>
782template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType Lexer<LChar>::parseIdentifier(JSTokenData* tokenData, unsigned lexerFlags, bool strictMode)
783{
784    const ptrdiff_t remaining = m_codeEnd - m_code;
785    if ((remaining >= maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords)) {
786        JSTokenType keyword = parseKeyword<shouldCreateIdentifier>(tokenData);
787        if (keyword != IDENT) {
788            ASSERT((!shouldCreateIdentifier) || tokenData->ident);
789            return keyword == RESERVED_IF_STRICT && !strictMode ? IDENT : keyword;
790        }
791    }
792
793    bool isPrivateName = m_current == '@' && m_parsingBuiltinFunction;
794    if (isPrivateName)
795        shift();
796
797    const LChar* identifierStart = currentSourcePtr();
798    unsigned identifierLineStart = currentLineStartOffset();
799
800    while (isIdentPart(m_current))
801        shift();
802
803    if (UNLIKELY(m_current == '\\')) {
804        setOffsetFromSourcePtr(identifierStart, identifierLineStart);
805        return parseIdentifierSlowCase<shouldCreateIdentifier>(tokenData, lexerFlags, strictMode);
806    }
807
808    const Identifier* ident = 0;
809
810    if (shouldCreateIdentifier || m_parsingBuiltinFunction) {
811        int identifierLength = currentSourcePtr() - identifierStart;
812        ident = makeIdentifier(identifierStart, identifierLength);
813        if (m_parsingBuiltinFunction) {
814            if (!isSafeBuiltinIdentifier(*m_vm, ident) && !isPrivateName) {
815                m_lexErrorMessage = makeString("The use of '", ident->string(), "' is disallowed in builtin functions.");
816                return ERRORTOK;
817            }
818            if (isPrivateName)
819                ident = m_vm->propertyNames->getPrivateName(*ident);
820            else if (*ident == m_vm->propertyNames->undefinedKeyword)
821                tokenData->ident = &m_vm->propertyNames->undefinedPrivateName;
822            if (!ident)
823                return INVALID_PRIVATE_NAME_ERRORTOK;
824        }
825        tokenData->ident = ident;
826    } else
827        tokenData->ident = 0;
828
829    if (UNLIKELY((remaining < maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords)) && !isPrivateName) {
830        ASSERT(shouldCreateIdentifier);
831        if (remaining < maxTokenLength) {
832            const HashTableValue* entry = m_vm->keywords->getKeyword(*ident);
833            ASSERT((remaining < maxTokenLength) || !entry);
834            if (!entry)
835                return IDENT;
836            JSTokenType token = static_cast<JSTokenType>(entry->lexerValue());
837            return (token != RESERVED_IF_STRICT) || strictMode ? token : IDENT;
838        }
839        return IDENT;
840    }
841
842    return IDENT;
843}
844
845template <>
846template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType Lexer<UChar>::parseIdentifier(JSTokenData* tokenData, unsigned lexerFlags, bool strictMode)
847{
848    const ptrdiff_t remaining = m_codeEnd - m_code;
849    if ((remaining >= maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords)) {
850        JSTokenType keyword = parseKeyword<shouldCreateIdentifier>(tokenData);
851        if (keyword != IDENT) {
852            ASSERT((!shouldCreateIdentifier) || tokenData->ident);
853            return keyword == RESERVED_IF_STRICT && !strictMode ? IDENT : keyword;
854        }
855    }
856
857    bool isPrivateName = m_current == '@' && m_parsingBuiltinFunction;
858    if (isPrivateName)
859        shift();
860
861    const UChar* identifierStart = currentSourcePtr();
862    int identifierLineStart = currentLineStartOffset();
863
864    UChar orAllChars = 0;
865
866    while (isIdentPart(m_current)) {
867        orAllChars |= m_current;
868        shift();
869    }
870
871    if (UNLIKELY(m_current == '\\')) {
872        ASSERT(!isPrivateName);
873        setOffsetFromSourcePtr(identifierStart, identifierLineStart);
874        return parseIdentifierSlowCase<shouldCreateIdentifier>(tokenData, lexerFlags, strictMode);
875    }
876
877    bool isAll8Bit = false;
878
879    if (!(orAllChars & ~0xff))
880        isAll8Bit = true;
881
882    const Identifier* ident = 0;
883
884    if (shouldCreateIdentifier || m_parsingBuiltinFunction) {
885        int identifierLength = currentSourcePtr() - identifierStart;
886        if (isAll8Bit)
887            ident = makeIdentifierLCharFromUChar(identifierStart, identifierLength);
888        else
889            ident = makeIdentifier(identifierStart, identifierLength);
890        if (m_parsingBuiltinFunction) {
891            if (!isSafeBuiltinIdentifier(*m_vm, ident) && !isPrivateName) {
892                m_lexErrorMessage = makeString("The use of '", ident->string(), "' is disallowed in builtin functions.");
893                return ERRORTOK;
894            }
895            if (isPrivateName)
896                ident = m_vm->propertyNames->getPrivateName(*ident);
897            else if (*ident == m_vm->propertyNames->undefinedKeyword)
898                tokenData->ident = &m_vm->propertyNames->undefinedPrivateName;
899            if (!ident)
900                return INVALID_PRIVATE_NAME_ERRORTOK;
901        }
902        tokenData->ident = ident;
903    } else
904        tokenData->ident = 0;
905
906    if (UNLIKELY((remaining < maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords)) && !isPrivateName) {
907        ASSERT(shouldCreateIdentifier);
908        if (remaining < maxTokenLength) {
909            const HashTableValue* entry = m_vm->keywords->getKeyword(*ident);
910            ASSERT((remaining < maxTokenLength) || !entry);
911            if (!entry)
912                return IDENT;
913            JSTokenType token = static_cast<JSTokenType>(entry->lexerValue());
914            return (token != RESERVED_IF_STRICT) || strictMode ? token : IDENT;
915        }
916        return IDENT;
917    }
918
919    return IDENT;
920}
921
922template <typename T>
923template <bool shouldCreateIdentifier> JSTokenType Lexer<T>::parseIdentifierSlowCase(JSTokenData* tokenData, unsigned lexerFlags, bool strictMode)
924{
925    const ptrdiff_t remaining = m_codeEnd - m_code;
926    const T* identifierStart = currentSourcePtr();
927    bool bufferRequired = false;
928
929    while (true) {
930        if (LIKELY(isIdentPart(m_current))) {
931            shift();
932            continue;
933        }
934        if (LIKELY(m_current != '\\'))
935            break;
936
937        // \uXXXX unicode characters.
938        bufferRequired = true;
939        if (identifierStart != currentSourcePtr())
940            m_buffer16.append(identifierStart, currentSourcePtr() - identifierStart);
941        shift();
942        if (UNLIKELY(m_current != 'u'))
943            return atEnd() ? UNTERMINATED_IDENTIFIER_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_ESCAPE_ERRORTOK;
944        shift();
945        UnicodeHexValue character = parseFourDigitUnicodeHex();
946        if (UNLIKELY(!character.isValid()))
947            return character.valueType() == UnicodeHexValue::IncompleteHex ? UNTERMINATED_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
948        UChar ucharacter = static_cast<UChar>(character.value());
949        if (UNLIKELY(m_buffer16.size() ? !isIdentPart(ucharacter) : !isIdentStart(ucharacter)))
950            return INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
951        if (shouldCreateIdentifier)
952            record16(ucharacter);
953        identifierStart = currentSourcePtr();
954    }
955
956    int identifierLength;
957    const Identifier* ident = 0;
958    if (shouldCreateIdentifier) {
959        if (!bufferRequired) {
960            identifierLength = currentSourcePtr() - identifierStart;
961            ident = makeIdentifier(identifierStart, identifierLength);
962        } else {
963            if (identifierStart != currentSourcePtr())
964                m_buffer16.append(identifierStart, currentSourcePtr() - identifierStart);
965            ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
966        }
967
968        tokenData->ident = ident;
969    } else
970        tokenData->ident = 0;
971
972    if (LIKELY(!bufferRequired && !(lexerFlags & LexerFlagsIgnoreReservedWords))) {
973        ASSERT(shouldCreateIdentifier);
974        // Keywords must not be recognized if there was an \uXXXX in the identifier.
975        if (remaining < maxTokenLength) {
976            const HashTableValue* entry = m_vm->keywords->getKeyword(*ident);
977            ASSERT((remaining < maxTokenLength) || !entry);
978            if (!entry)
979                return IDENT;
980            JSTokenType token = static_cast<JSTokenType>(entry->lexerValue());
981            return (token != RESERVED_IF_STRICT) || strictMode ? token : IDENT;
982        }
983        return IDENT;
984    }
985
986    m_buffer16.resize(0);
987    return IDENT;
988}
989
990static ALWAYS_INLINE bool characterRequiresParseStringSlowCase(LChar character)
991{
992    return character < 0xE;
993}
994
995static ALWAYS_INLINE bool characterRequiresParseStringSlowCase(UChar character)
996{
997    return character < 0xE || character > 0xFF;
998}
999
1000template <typename T>
1001template <bool shouldBuildStrings> ALWAYS_INLINE typename Lexer<T>::StringParseResult Lexer<T>::parseString(JSTokenData* tokenData, bool strictMode)
1002{
1003    int startingOffset = currentOffset();
1004    int startingLineStartOffset = currentLineStartOffset();
1005    int startingLineNumber = lineNumber();
1006    T stringQuoteCharacter = m_current;
1007    shift();
1008
1009    const T* stringStart = currentSourcePtr();
1010
1011    while (m_current != stringQuoteCharacter) {
1012        if (UNLIKELY(m_current == '\\')) {
1013            if (stringStart != currentSourcePtr() && shouldBuildStrings)
1014                append8(stringStart, currentSourcePtr() - stringStart);
1015            shift();
1016
1017            LChar escape = singleEscape(m_current);
1018
1019            // Most common escape sequences first
1020            if (escape) {
1021                if (shouldBuildStrings)
1022                    record8(escape);
1023                shift();
1024            } else if (UNLIKELY(isLineTerminator(m_current)))
1025                shiftLineTerminator();
1026            else if (m_current == 'x') {
1027                shift();
1028                if (!isASCIIHexDigit(m_current) || !isASCIIHexDigit(peek(1))) {
1029                    m_lexErrorMessage = "\\x can only be followed by a hex character sequence";
1030                    return (atEnd() || (isASCIIHexDigit(m_current) && (m_code + 1 == m_codeEnd))) ? StringUnterminated : StringCannotBeParsed;
1031                }
1032                T prev = m_current;
1033                shift();
1034                if (shouldBuildStrings)
1035                    record8(convertHex(prev, m_current));
1036                shift();
1037            } else {
1038                setOffset(startingOffset, startingLineStartOffset);
1039                setLineNumber(startingLineNumber);
1040                m_buffer8.resize(0);
1041                return parseStringSlowCase<shouldBuildStrings>(tokenData, strictMode);
1042            }
1043            stringStart = currentSourcePtr();
1044            continue;
1045        }
1046
1047        if (UNLIKELY(characterRequiresParseStringSlowCase(m_current))) {
1048            setOffset(startingOffset, startingLineStartOffset);
1049            setLineNumber(startingLineNumber);
1050            m_buffer8.resize(0);
1051            return parseStringSlowCase<shouldBuildStrings>(tokenData, strictMode);
1052        }
1053
1054        shift();
1055    }
1056
1057    if (currentSourcePtr() != stringStart && shouldBuildStrings)
1058        append8(stringStart, currentSourcePtr() - stringStart);
1059    if (shouldBuildStrings) {
1060        tokenData->ident = makeIdentifier(m_buffer8.data(), m_buffer8.size());
1061        m_buffer8.resize(0);
1062    } else
1063        tokenData->ident = 0;
1064
1065    return StringParsedSuccessfully;
1066}
1067
1068template <typename T>
1069template <bool shouldBuildStrings> typename Lexer<T>::StringParseResult Lexer<T>::parseStringSlowCase(JSTokenData* tokenData, bool strictMode)
1070{
1071    T stringQuoteCharacter = m_current;
1072    shift();
1073
1074    const T* stringStart = currentSourcePtr();
1075
1076    while (m_current != stringQuoteCharacter) {
1077        if (UNLIKELY(m_current == '\\')) {
1078            if (stringStart != currentSourcePtr() && shouldBuildStrings)
1079                append16(stringStart, currentSourcePtr() - stringStart);
1080            shift();
1081
1082            LChar escape = singleEscape(m_current);
1083
1084            // Most common escape sequences first
1085            if (escape) {
1086                if (shouldBuildStrings)
1087                    record16(escape);
1088                shift();
1089            } else if (UNLIKELY(isLineTerminator(m_current)))
1090                shiftLineTerminator();
1091            else if (m_current == 'x') {
1092                shift();
1093                if (!isASCIIHexDigit(m_current) || !isASCIIHexDigit(peek(1))) {
1094                    m_lexErrorMessage = "\\x can only be followed by a hex character sequence";
1095                    return StringCannotBeParsed;
1096                }
1097                T prev = m_current;
1098                shift();
1099                if (shouldBuildStrings)
1100                    record16(convertHex(prev, m_current));
1101                shift();
1102            } else if (m_current == 'u') {
1103                shift();
1104                UnicodeHexValue character = parseFourDigitUnicodeHex();
1105                if (character.isValid()) {
1106                    if (shouldBuildStrings)
1107                        record16(character.value());
1108                } else if (m_current == stringQuoteCharacter) {
1109                    if (shouldBuildStrings)
1110                        record16('u');
1111                } else {
1112                    m_lexErrorMessage = "\\u can only be followed by a Unicode character sequence";
1113                    return character.valueType() == UnicodeHexValue::IncompleteHex ? StringUnterminated : StringCannotBeParsed;
1114                }
1115            } else if (strictMode && isASCIIDigit(m_current)) {
1116                // The only valid numeric escape in strict mode is '\0', and this must not be followed by a decimal digit.
1117                int character1 = m_current;
1118                shift();
1119                if (character1 != '0' || isASCIIDigit(m_current)) {
1120                    m_lexErrorMessage = "The only valid numeric escape in strict mode is '\\0'";
1121                    return StringCannotBeParsed;
1122                }
1123                if (shouldBuildStrings)
1124                    record16(0);
1125            } else if (!strictMode && isASCIIOctalDigit(m_current)) {
1126                // Octal character sequences
1127                T character1 = m_current;
1128                shift();
1129                if (isASCIIOctalDigit(m_current)) {
1130                    // Two octal characters
1131                    T character2 = m_current;
1132                    shift();
1133                    if (character1 >= '0' && character1 <= '3' && isASCIIOctalDigit(m_current)) {
1134                        if (shouldBuildStrings)
1135                            record16((character1 - '0') * 64 + (character2 - '0') * 8 + m_current - '0');
1136                        shift();
1137                    } else {
1138                        if (shouldBuildStrings)
1139                            record16((character1 - '0') * 8 + character2 - '0');
1140                    }
1141                } else {
1142                    if (shouldBuildStrings)
1143                        record16(character1 - '0');
1144                }
1145            } else if (!atEnd()) {
1146                if (shouldBuildStrings)
1147                    record16(m_current);
1148                shift();
1149            } else {
1150                m_lexErrorMessage = "Unterminated string constant";
1151                return StringUnterminated;
1152            }
1153
1154            stringStart = currentSourcePtr();
1155            continue;
1156        }
1157        // Fast check for characters that require special handling.
1158        // Catches 0, \n, \r, 0x2028, and 0x2029 as efficiently
1159        // as possible, and lets through all common ASCII characters.
1160        if (UNLIKELY(((static_cast<unsigned>(m_current) - 0xE) & 0x2000))) {
1161            // New-line or end of input is not allowed
1162            if (atEnd() || isLineTerminator(m_current)) {
1163                m_lexErrorMessage = "Unexpected EOF";
1164                return atEnd() ? StringUnterminated : StringCannotBeParsed;
1165            }
1166            // Anything else is just a normal character
1167        }
1168        shift();
1169    }
1170
1171    if (currentSourcePtr() != stringStart && shouldBuildStrings)
1172        append16(stringStart, currentSourcePtr() - stringStart);
1173    if (shouldBuildStrings)
1174        tokenData->ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
1175    else
1176        tokenData->ident = 0;
1177
1178    m_buffer16.resize(0);
1179    return StringParsedSuccessfully;
1180}
1181
1182template <typename T>
1183ALWAYS_INLINE void Lexer<T>::parseHex(double& returnValue)
1184{
1185    // Optimization: most hexadecimal values fit into 4 bytes.
1186    uint32_t hexValue = 0;
1187    int maximumDigits = 7;
1188
1189    // Shift out the 'x' prefix.
1190    shift();
1191
1192    do {
1193        hexValue = (hexValue << 4) + toASCIIHexValue(m_current);
1194        shift();
1195        --maximumDigits;
1196    } while (isASCIIHexDigit(m_current) && maximumDigits >= 0);
1197
1198    if (maximumDigits >= 0) {
1199        returnValue = hexValue;
1200        return;
1201    }
1202
1203    // No more place in the hexValue buffer.
1204    // The values are shifted out and placed into the m_buffer8 vector.
1205    for (int i = 0; i < 8; ++i) {
1206         int digit = hexValue >> 28;
1207         if (digit < 10)
1208             record8(digit + '0');
1209         else
1210             record8(digit - 10 + 'a');
1211         hexValue <<= 4;
1212    }
1213
1214    while (isASCIIHexDigit(m_current)) {
1215        record8(m_current);
1216        shift();
1217    }
1218
1219    returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 16);
1220}
1221
1222template <typename T>
1223ALWAYS_INLINE bool Lexer<T>::parseOctal(double& returnValue)
1224{
1225    // Optimization: most octal values fit into 4 bytes.
1226    uint32_t octalValue = 0;
1227    int maximumDigits = 9;
1228    // Temporary buffer for the digits. Makes easier
1229    // to reconstruct the input characters when needed.
1230    LChar digits[10];
1231
1232    do {
1233        octalValue = octalValue * 8 + (m_current - '0');
1234        digits[maximumDigits] = m_current;
1235        shift();
1236        --maximumDigits;
1237    } while (isASCIIOctalDigit(m_current) && maximumDigits >= 0);
1238
1239    if (!isASCIIDigit(m_current) && maximumDigits >= 0) {
1240        returnValue = octalValue;
1241        return true;
1242    }
1243
1244    for (int i = 9; i > maximumDigits; --i)
1245         record8(digits[i]);
1246
1247    while (isASCIIOctalDigit(m_current)) {
1248        record8(m_current);
1249        shift();
1250    }
1251
1252    if (isASCIIDigit(m_current))
1253        return false;
1254
1255    returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 8);
1256    return true;
1257}
1258
1259template <typename T>
1260ALWAYS_INLINE bool Lexer<T>::parseDecimal(double& returnValue)
1261{
1262    // Optimization: most decimal values fit into 4 bytes.
1263    uint32_t decimalValue = 0;
1264
1265    // Since parseOctal may be executed before parseDecimal,
1266    // the m_buffer8 may hold ascii digits.
1267    if (!m_buffer8.size()) {
1268        int maximumDigits = 9;
1269        // Temporary buffer for the digits. Makes easier
1270        // to reconstruct the input characters when needed.
1271        LChar digits[10];
1272
1273        do {
1274            decimalValue = decimalValue * 10 + (m_current - '0');
1275            digits[maximumDigits] = m_current;
1276            shift();
1277            --maximumDigits;
1278        } while (isASCIIDigit(m_current) && maximumDigits >= 0);
1279
1280        if (maximumDigits >= 0 && m_current != '.' && (m_current | 0x20) != 'e') {
1281            returnValue = decimalValue;
1282            return true;
1283        }
1284
1285        for (int i = 9; i > maximumDigits; --i)
1286            record8(digits[i]);
1287    }
1288
1289    while (isASCIIDigit(m_current)) {
1290        record8(m_current);
1291        shift();
1292    }
1293
1294    return false;
1295}
1296
1297template <typename T>
1298ALWAYS_INLINE void Lexer<T>::parseNumberAfterDecimalPoint()
1299{
1300    record8('.');
1301    while (isASCIIDigit(m_current)) {
1302        record8(m_current);
1303        shift();
1304    }
1305}
1306
1307template <typename T>
1308ALWAYS_INLINE bool Lexer<T>::parseNumberAfterExponentIndicator()
1309{
1310    record8('e');
1311    shift();
1312    if (m_current == '+' || m_current == '-') {
1313        record8(m_current);
1314        shift();
1315    }
1316
1317    if (!isASCIIDigit(m_current))
1318        return false;
1319
1320    do {
1321        record8(m_current);
1322        shift();
1323    } while (isASCIIDigit(m_current));
1324    return true;
1325}
1326
1327template <typename T>
1328ALWAYS_INLINE bool Lexer<T>::parseMultilineComment()
1329{
1330    while (true) {
1331        while (UNLIKELY(m_current == '*')) {
1332            shift();
1333            if (m_current == '/') {
1334                shift();
1335                return true;
1336            }
1337        }
1338
1339        if (atEnd())
1340            return false;
1341
1342        if (isLineTerminator(m_current)) {
1343            shiftLineTerminator();
1344            m_terminator = true;
1345        } else
1346            shift();
1347    }
1348}
1349
1350template <typename T>
1351bool Lexer<T>::nextTokenIsColon()
1352{
1353    const T* code = m_code;
1354    while (code < m_codeEnd && (isWhiteSpace(*code) || isLineTerminator(*code)))
1355        code++;
1356
1357    return code < m_codeEnd && *code == ':';
1358}
1359
1360template <typename T>
1361JSTokenType Lexer<T>::lex(JSToken* tokenRecord, unsigned lexerFlags, bool strictMode)
1362{
1363    JSTokenData* tokenData = &tokenRecord->m_data;
1364    JSTokenLocation* tokenLocation = &tokenRecord->m_location;
1365    ASSERT(!m_error);
1366    ASSERT(m_buffer8.isEmpty());
1367    ASSERT(m_buffer16.isEmpty());
1368
1369    JSTokenType token = ERRORTOK;
1370    m_terminator = false;
1371
1372start:
1373    while (isWhiteSpace(m_current))
1374        shift();
1375
1376    if (atEnd())
1377        return EOFTOK;
1378
1379    tokenLocation->startOffset = currentOffset();
1380    ASSERT(currentOffset() >= currentLineStartOffset());
1381    tokenRecord->m_startPosition = currentPosition();
1382
1383    CharacterType type;
1384    if (LIKELY(isLatin1(m_current)))
1385        type = static_cast<CharacterType>(typesOfLatin1Characters[m_current]);
1386    else if (isNonLatin1IdentStart(m_current))
1387        type = CharacterIdentifierStart;
1388    else if (isLineTerminator(m_current))
1389        type = CharacterLineTerminator;
1390    else
1391        type = CharacterInvalid;
1392
1393    switch (type) {
1394    case CharacterGreater:
1395        shift();
1396        if (m_current == '>') {
1397            shift();
1398            if (m_current == '>') {
1399                shift();
1400                if (m_current == '=') {
1401                    shift();
1402                    token = URSHIFTEQUAL;
1403                    break;
1404                }
1405                token = URSHIFT;
1406                break;
1407            }
1408            if (m_current == '=') {
1409                shift();
1410                token = RSHIFTEQUAL;
1411                break;
1412            }
1413            token = RSHIFT;
1414            break;
1415        }
1416        if (m_current == '=') {
1417            shift();
1418            token = GE;
1419            break;
1420        }
1421        token = GT;
1422        break;
1423    case CharacterEqual:
1424        shift();
1425        if (m_current == '=') {
1426            shift();
1427            if (m_current == '=') {
1428                shift();
1429                token = STREQ;
1430                break;
1431            }
1432            token = EQEQ;
1433            break;
1434        }
1435        token = EQUAL;
1436        break;
1437    case CharacterLess:
1438        shift();
1439        if (m_current == '!' && peek(1) == '-' && peek(2) == '-') {
1440            // <!-- marks the beginning of a line comment (for www usage)
1441            goto inSingleLineComment;
1442        }
1443        if (m_current == '<') {
1444            shift();
1445            if (m_current == '=') {
1446                shift();
1447                token = LSHIFTEQUAL;
1448                break;
1449            }
1450            token = LSHIFT;
1451            break;
1452        }
1453        if (m_current == '=') {
1454            shift();
1455            token = LE;
1456            break;
1457        }
1458        token = LT;
1459        break;
1460    case CharacterExclamationMark:
1461        shift();
1462        if (m_current == '=') {
1463            shift();
1464            if (m_current == '=') {
1465                shift();
1466                token = STRNEQ;
1467                break;
1468            }
1469            token = NE;
1470            break;
1471        }
1472        token = EXCLAMATION;
1473        break;
1474    case CharacterAdd:
1475        shift();
1476        if (m_current == '+') {
1477            shift();
1478            token = (!m_terminator) ? PLUSPLUS : AUTOPLUSPLUS;
1479            break;
1480        }
1481        if (m_current == '=') {
1482            shift();
1483            token = PLUSEQUAL;
1484            break;
1485        }
1486        token = PLUS;
1487        break;
1488    case CharacterSub:
1489        shift();
1490        if (m_current == '-') {
1491            shift();
1492            if (m_atLineStart && m_current == '>') {
1493                shift();
1494                goto inSingleLineComment;
1495            }
1496            token = (!m_terminator) ? MINUSMINUS : AUTOMINUSMINUS;
1497            break;
1498        }
1499        if (m_current == '=') {
1500            shift();
1501            token = MINUSEQUAL;
1502            break;
1503        }
1504        token = MINUS;
1505        break;
1506    case CharacterMultiply:
1507        shift();
1508        if (m_current == '=') {
1509            shift();
1510            token = MULTEQUAL;
1511            break;
1512        }
1513        token = TIMES;
1514        break;
1515    case CharacterSlash:
1516        shift();
1517        if (m_current == '/') {
1518            shift();
1519            goto inSingleLineComment;
1520        }
1521        if (m_current == '*') {
1522            shift();
1523            if (parseMultilineComment())
1524                goto start;
1525            m_lexErrorMessage = "Multiline comment was not closed properly";
1526            token = UNTERMINATED_MULTILINE_COMMENT_ERRORTOK;
1527            goto returnError;
1528        }
1529        if (m_current == '=') {
1530            shift();
1531            token = DIVEQUAL;
1532            break;
1533        }
1534        token = DIVIDE;
1535        break;
1536    case CharacterAnd:
1537        shift();
1538        if (m_current == '&') {
1539            shift();
1540            token = AND;
1541            break;
1542        }
1543        if (m_current == '=') {
1544            shift();
1545            token = ANDEQUAL;
1546            break;
1547        }
1548        token = BITAND;
1549        break;
1550    case CharacterXor:
1551        shift();
1552        if (m_current == '=') {
1553            shift();
1554            token = XOREQUAL;
1555            break;
1556        }
1557        token = BITXOR;
1558        break;
1559    case CharacterModulo:
1560        shift();
1561        if (m_current == '=') {
1562            shift();
1563            token = MODEQUAL;
1564            break;
1565        }
1566        token = MOD;
1567        break;
1568    case CharacterOr:
1569        shift();
1570        if (m_current == '=') {
1571            shift();
1572            token = OREQUAL;
1573            break;
1574        }
1575        if (m_current == '|') {
1576            shift();
1577            token = OR;
1578            break;
1579        }
1580        token = BITOR;
1581        break;
1582    case CharacterOpenParen:
1583        token = OPENPAREN;
1584        shift();
1585        break;
1586    case CharacterCloseParen:
1587        token = CLOSEPAREN;
1588        shift();
1589        break;
1590    case CharacterOpenBracket:
1591        token = OPENBRACKET;
1592        shift();
1593        break;
1594    case CharacterCloseBracket:
1595        token = CLOSEBRACKET;
1596        shift();
1597        break;
1598    case CharacterComma:
1599        token = COMMA;
1600        shift();
1601        break;
1602    case CharacterColon:
1603        token = COLON;
1604        shift();
1605        break;
1606    case CharacterQuestion:
1607        token = QUESTION;
1608        shift();
1609        break;
1610    case CharacterTilde:
1611        token = TILDE;
1612        shift();
1613        break;
1614    case CharacterSemicolon:
1615        shift();
1616        token = SEMICOLON;
1617        break;
1618    case CharacterOpenBrace:
1619        tokenData->line = lineNumber();
1620        tokenData->offset = currentOffset();
1621        tokenData->lineStartOffset = currentLineStartOffset();
1622        ASSERT(tokenData->offset >= tokenData->lineStartOffset);
1623        shift();
1624        token = OPENBRACE;
1625        break;
1626    case CharacterCloseBrace:
1627        tokenData->line = lineNumber();
1628        tokenData->offset = currentOffset();
1629        tokenData->lineStartOffset = currentLineStartOffset();
1630        ASSERT(tokenData->offset >= tokenData->lineStartOffset);
1631        shift();
1632        token = CLOSEBRACE;
1633        break;
1634    case CharacterDot:
1635        shift();
1636        if (!isASCIIDigit(m_current)) {
1637            if (UNLIKELY((m_current == '.') && (peek(1) == '.'))) {
1638                shift();
1639                shift();
1640                token = DOTDOTDOT;
1641                break;
1642            }
1643            token = DOT;
1644            break;
1645        }
1646        goto inNumberAfterDecimalPoint;
1647    case CharacterZero:
1648        shift();
1649        if ((m_current | 0x20) == 'x') {
1650            if (!isASCIIHexDigit(peek(1))) {
1651                m_lexErrorMessage = "No hexadecimal digits after '0x'";
1652                token = INVALID_HEX_NUMBER_ERRORTOK;
1653                goto returnError;
1654            }
1655            parseHex(tokenData->doubleValue);
1656            if (isIdentStart(m_current)) {
1657                m_lexErrorMessage = "No space between hexadecimal literal and identifier";
1658                token = INVALID_HEX_NUMBER_ERRORTOK;
1659                goto returnError;
1660            }
1661            token = NUMBER;
1662            m_buffer8.resize(0);
1663            break;
1664        }
1665
1666        record8('0');
1667        if (isASCIIOctalDigit(m_current)) {
1668            if (parseOctal(tokenData->doubleValue)) {
1669                if (strictMode) {
1670                    m_lexErrorMessage = "Octal escapes are forbidden in strict mode";
1671                    token = INVALID_OCTAL_NUMBER_ERRORTOK;
1672                    goto returnError;
1673                }
1674                token = NUMBER;
1675            }
1676        }
1677        FALLTHROUGH;
1678    case CharacterNumber:
1679        if (LIKELY(token != NUMBER)) {
1680            if (!parseDecimal(tokenData->doubleValue)) {
1681                if (m_current == '.') {
1682                    shift();
1683inNumberAfterDecimalPoint:
1684                    parseNumberAfterDecimalPoint();
1685                }
1686                if ((m_current | 0x20) == 'e') {
1687                    if (!parseNumberAfterExponentIndicator()) {
1688                        m_lexErrorMessage = "Non-number found after exponent indicator";
1689                        token = atEnd() ? UNTERMINATED_NUMERIC_LITERAL_ERRORTOK : INVALID_NUMERIC_LITERAL_ERRORTOK;
1690                        goto returnError;
1691                    }
1692                }
1693                size_t parsedLength;
1694                tokenData->doubleValue = parseDouble(m_buffer8.data(), m_buffer8.size(), parsedLength);
1695            }
1696            token = NUMBER;
1697        }
1698
1699        // No identifiers allowed directly after numeric literal, e.g. "3in" is bad.
1700        if (UNLIKELY(isIdentStart(m_current))) {
1701            m_lexErrorMessage = "At least one digit must occur after a decimal point";
1702            token = atEnd() ? UNTERMINATED_NUMERIC_LITERAL_ERRORTOK : INVALID_NUMERIC_LITERAL_ERRORTOK;
1703            goto returnError;
1704        }
1705        m_buffer8.resize(0);
1706        break;
1707    case CharacterQuote:
1708        if (lexerFlags & LexerFlagsDontBuildStrings) {
1709            StringParseResult result = parseString<false>(tokenData, strictMode);
1710            if (UNLIKELY(result != StringParsedSuccessfully)) {
1711                token = result == StringUnterminated ? UNTERMINATED_STRING_LITERAL_ERRORTOK : INVALID_STRING_LITERAL_ERRORTOK;
1712                goto returnError;
1713            }
1714        } else {
1715            StringParseResult result = parseString<true>(tokenData, strictMode);
1716            if (UNLIKELY(result != StringParsedSuccessfully)) {
1717                token = result == StringUnterminated ? UNTERMINATED_STRING_LITERAL_ERRORTOK : INVALID_STRING_LITERAL_ERRORTOK;
1718                goto returnError;
1719            }
1720        }
1721        shift();
1722        token = STRING;
1723        break;
1724    case CharacterIdentifierStart:
1725        ASSERT(isIdentStart(m_current));
1726        FALLTHROUGH;
1727    case CharacterBackSlash:
1728        parseIdent:
1729        if (lexerFlags & LexexFlagsDontBuildKeywords)
1730            token = parseIdentifier<false>(tokenData, lexerFlags, strictMode);
1731        else
1732            token = parseIdentifier<true>(tokenData, lexerFlags, strictMode);
1733        break;
1734    case CharacterLineTerminator:
1735        ASSERT(isLineTerminator(m_current));
1736        shiftLineTerminator();
1737        m_atLineStart = true;
1738        m_terminator = true;
1739        m_lineStart = m_code;
1740        goto start;
1741    case CharacterPrivateIdentifierStart:
1742        if (m_parsingBuiltinFunction)
1743            goto parseIdent;
1744
1745        FALLTHROUGH;
1746    case CharacterInvalid:
1747        m_lexErrorMessage = invalidCharacterMessage();
1748        token = ERRORTOK;
1749        goto returnError;
1750    default:
1751        RELEASE_ASSERT_NOT_REACHED();
1752        m_lexErrorMessage = "Internal Error";
1753        token = ERRORTOK;
1754        goto returnError;
1755    }
1756
1757    m_atLineStart = false;
1758    goto returnToken;
1759
1760inSingleLineComment:
1761    while (!isLineTerminator(m_current)) {
1762        if (atEnd())
1763            return EOFTOK;
1764        shift();
1765    }
1766    shiftLineTerminator();
1767    m_atLineStart = true;
1768    m_terminator = true;
1769    m_lineStart = m_code;
1770    if (!lastTokenWasRestrKeyword())
1771        goto start;
1772
1773    token = SEMICOLON;
1774    // Fall through into returnToken.
1775
1776returnToken:
1777    tokenLocation->line = m_lineNumber;
1778    tokenLocation->endOffset = currentOffset();
1779    tokenLocation->lineStartOffset = currentLineStartOffset();
1780    ASSERT(tokenLocation->endOffset >= tokenLocation->lineStartOffset);
1781    tokenRecord->m_endPosition = currentPosition();
1782    m_lastToken = token;
1783    return token;
1784
1785returnError:
1786    m_error = true;
1787    tokenLocation->line = m_lineNumber;
1788    tokenLocation->endOffset = currentOffset();
1789    tokenLocation->lineStartOffset = currentLineStartOffset();
1790    ASSERT(tokenLocation->endOffset >= tokenLocation->lineStartOffset);
1791    tokenRecord->m_endPosition = currentPosition();
1792    RELEASE_ASSERT(token & ErrorTokenFlag);
1793    return token;
1794}
1795
1796template <typename T>
1797static inline void orCharacter(UChar&, UChar);
1798
1799template <>
1800inline void orCharacter<LChar>(UChar&, UChar) { }
1801
1802template <>
1803inline void orCharacter<UChar>(UChar& orAccumulator, UChar character)
1804{
1805    orAccumulator |= character;
1806}
1807
1808template <typename T>
1809bool Lexer<T>::scanRegExp(const Identifier*& pattern, const Identifier*& flags, UChar patternPrefix)
1810{
1811    ASSERT(m_buffer16.isEmpty());
1812
1813    bool lastWasEscape = false;
1814    bool inBrackets = false;
1815    UChar charactersOredTogether = 0;
1816
1817    if (patternPrefix) {
1818        ASSERT(!isLineTerminator(patternPrefix));
1819        ASSERT(patternPrefix != '/');
1820        ASSERT(patternPrefix != '[');
1821        record16(patternPrefix);
1822    }
1823
1824    while (true) {
1825        if (isLineTerminator(m_current) || atEnd()) {
1826            m_buffer16.resize(0);
1827            return false;
1828        }
1829
1830        T prev = m_current;
1831
1832        shift();
1833
1834        if (prev == '/' && !lastWasEscape && !inBrackets)
1835            break;
1836
1837        record16(prev);
1838        orCharacter<T>(charactersOredTogether, prev);
1839
1840        if (lastWasEscape) {
1841            lastWasEscape = false;
1842            continue;
1843        }
1844
1845        switch (prev) {
1846        case '[':
1847            inBrackets = true;
1848            break;
1849        case ']':
1850            inBrackets = false;
1851            break;
1852        case '\\':
1853            lastWasEscape = true;
1854            break;
1855        }
1856    }
1857
1858    pattern = makeRightSizedIdentifier(m_buffer16.data(), m_buffer16.size(), charactersOredTogether);
1859
1860    m_buffer16.resize(0);
1861    charactersOredTogether = 0;
1862
1863    while (isIdentPart(m_current)) {
1864        record16(m_current);
1865        orCharacter<T>(charactersOredTogether, m_current);
1866        shift();
1867    }
1868
1869    flags = makeRightSizedIdentifier(m_buffer16.data(), m_buffer16.size(), charactersOredTogether);
1870    m_buffer16.resize(0);
1871
1872    return true;
1873}
1874
1875template <typename T>
1876bool Lexer<T>::skipRegExp()
1877{
1878    bool lastWasEscape = false;
1879    bool inBrackets = false;
1880
1881    while (true) {
1882        if (isLineTerminator(m_current) || atEnd())
1883            return false;
1884
1885        T prev = m_current;
1886
1887        shift();
1888
1889        if (prev == '/' && !lastWasEscape && !inBrackets)
1890            break;
1891
1892        if (lastWasEscape) {
1893            lastWasEscape = false;
1894            continue;
1895        }
1896
1897        switch (prev) {
1898        case '[':
1899            inBrackets = true;
1900            break;
1901        case ']':
1902            inBrackets = false;
1903            break;
1904        case '\\':
1905            lastWasEscape = true;
1906            break;
1907        }
1908    }
1909
1910    while (isIdentPart(m_current))
1911        shift();
1912
1913    return true;
1914}
1915
1916template <typename T>
1917void Lexer<T>::clear()
1918{
1919    m_arena = 0;
1920
1921    Vector<LChar> newBuffer8;
1922    m_buffer8.swap(newBuffer8);
1923
1924    Vector<UChar> newBuffer16;
1925    m_buffer16.swap(newBuffer16);
1926
1927    m_isReparsing = false;
1928}
1929
// Explicitly instantiate the two flavors of Lexer we need (LChar and UChar
// source characters) instead of putting most of this file in Lexer.h.
template class Lexer<LChar>;
template class Lexer<UChar>;
1933
1934} // namespace JSC
1935