Lexer.java revision 1571:fd97b9047199
1151497Sru/* 2151497Sru * Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved. 318099Spst * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 418099Spst * 518099Spst * This code is free software; you can redistribute it and/or modify it 618099Spst * under the terms of the GNU General Public License version 2 only, as 718099Spst * published by the Free Software Foundation. Oracle designates this 818099Spst * particular file as subject to the "Classpath" exception as provided 918099Spst * by Oracle in the LICENSE file that accompanied this code. 1018099Spst * 1118099Spst * This code is distributed in the hope that it will be useful, but WITHOUT 1218099Spst * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 1318099Spst * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 1418099Spst * version 2 for more details (a copy is included in the LICENSE file that 1518099Spst * accompanied this code). 1618099Spst * 1718099Spst * You should have received a copy of the GNU General Public License version 1818099Spst * 2 along with this work; if not, write to the Free Software Foundation, 19151497Sru * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 2018099Spst * 21151497Sru * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22151497Sru * or visit www.oracle.com if you need additional information or have any 2375584Sru * questions. 
24151497Sru */ 2575584Sru 26151497Srupackage jdk.nashorn.internal.parser; 2775584Sru 28151497Sruimport static jdk.nashorn.internal.parser.TokenType.ADD; 29151497Sruimport static jdk.nashorn.internal.parser.TokenType.BINARY_NUMBER; 30151497Sruimport static jdk.nashorn.internal.parser.TokenType.COMMENT; 31151497Sruimport static jdk.nashorn.internal.parser.TokenType.DECIMAL; 32151497Sruimport static jdk.nashorn.internal.parser.TokenType.DIRECTIVE_COMMENT; 33151497Sruimport static jdk.nashorn.internal.parser.TokenType.EOF; 34151497Sruimport static jdk.nashorn.internal.parser.TokenType.EOL; 35151497Sruimport static jdk.nashorn.internal.parser.TokenType.ERROR; 36151497Sruimport static jdk.nashorn.internal.parser.TokenType.ESCSTRING; 3775584Sruimport static jdk.nashorn.internal.parser.TokenType.EXECSTRING; 38104862Sruimport static jdk.nashorn.internal.parser.TokenType.FLOATING; 3975584Sruimport static jdk.nashorn.internal.parser.TokenType.FUNCTION; 4075584Sruimport static jdk.nashorn.internal.parser.TokenType.HEXADECIMAL; 4118099Spstimport static jdk.nashorn.internal.parser.TokenType.LBRACE; 4218099Spstimport static jdk.nashorn.internal.parser.TokenType.LPAREN; 4318099Spstimport static jdk.nashorn.internal.parser.TokenType.OCTAL; 4418099Spstimport static jdk.nashorn.internal.parser.TokenType.OCTAL_LEGACY; 4518099Spstimport static jdk.nashorn.internal.parser.TokenType.RBRACE; 46151497Sruimport static jdk.nashorn.internal.parser.TokenType.REGEX; 47151497Sruimport static jdk.nashorn.internal.parser.TokenType.RPAREN; 48151497Sruimport static jdk.nashorn.internal.parser.TokenType.STRING; 49151497Sruimport static jdk.nashorn.internal.parser.TokenType.TEMPLATE; 50151497Sruimport static jdk.nashorn.internal.parser.TokenType.TEMPLATE_HEAD; 51151497Sruimport static jdk.nashorn.internal.parser.TokenType.TEMPLATE_MIDDLE; 52151497Sruimport static jdk.nashorn.internal.parser.TokenType.TEMPLATE_TAIL; 53151497Sruimport static jdk.nashorn.internal.parser.TokenType.XML; 54151497Sru 
55151497Sruimport java.io.Serializable; 56151497Sru 57151497Sruimport jdk.nashorn.internal.runtime.ECMAErrors; 5818099Spstimport jdk.nashorn.internal.runtime.ErrorManager; 5918099Spstimport jdk.nashorn.internal.runtime.JSErrorType; 6018099Spstimport jdk.nashorn.internal.runtime.JSType; 6118099Spstimport jdk.nashorn.internal.runtime.ParserException; 6218099Spstimport jdk.nashorn.internal.runtime.Source; 6318099Spstimport jdk.nashorn.internal.runtime.options.Options; 6418099Spst 6518099Spst/** 6618099Spst * Responsible for converting source content into a stream of tokens. 6718099Spst * 6818099Spst */ 6918099Spst@SuppressWarnings("fallthrough") 7018099Spstpublic class Lexer extends Scanner { 7118099Spst private static final long MIN_INT_L = Integer.MIN_VALUE; 7218099Spst private static final long MAX_INT_L = Integer.MAX_VALUE; 7318099Spst 7418099Spst private static final boolean XML_LITERALS = Options.getBooleanProperty("nashorn.lexer.xmlliterals"); 7518099Spst 7618099Spst /** Content source. */ 7718099Spst private final Source source; 7818099Spst 7918099Spst /** Buffered stream for tokens. */ 8018099Spst private final TokenStream stream; 81104862Sru 8218099Spst /** True if here and edit strings are supported. */ 8318099Spst private final boolean scripting; 84151497Sru 85151497Sru /** True if parsing in ECMAScript 6 mode. */ 86151497Sru private final boolean es6; 87151497Sru 88151497Sru /** True if a nested scan. (scan to completion, no EOF.) */ 89151497Sru private final boolean nested; 90151497Sru 91151497Sru /** Pending new line number and position. */ 92151497Sru int pendingLine; 93104862Sru 94104862Sru /** Position of last EOL + 1. */ 95104862Sru private int linePosition; 9669626Sru 97104862Sru /** Type of last token added. 
*/ 98104862Sru private TokenType last; 99104862Sru 100104862Sru private final boolean pauseOnFunctionBody; 101104862Sru private boolean pauseOnNextLeftBrace; 10218099Spst 10318099Spst private int templateExpressionOpenBraces; 10418099Spst 105104862Sru private static final String SPACETAB = " \t"; // ASCII space and tab 10618099Spst private static final String LFCR = "\n\r"; // line feed and carriage return (ctrl-m) 10718099Spst 10818099Spst private static final String JAVASCRIPT_WHITESPACE_EOL = 10918099Spst LFCR + 11018099Spst "\u2028" + // line separator 11118099Spst "\u2029" // paragraph separator 11218099Spst ; 11318099Spst private static final String JAVASCRIPT_WHITESPACE = 11418099Spst SPACETAB + 11518099Spst JAVASCRIPT_WHITESPACE_EOL + 11618099Spst "\u000b" + // tabulation line 11718099Spst "\u000c" + // ff (ctrl-l) 11818099Spst "\u00a0" + // Latin-1 space 11918099Spst "\u1680" + // Ogham space mark 12018099Spst "\u180e" + // separator, Mongolian vowel 12118099Spst "\u2000" + // en quad 122104862Sru "\u2001" + // em quad 12375584Sru "\u2002" + // en space 12418099Spst "\u2003" + // em space 125104862Sru "\u2004" + // three-per-em space 12675584Sru "\u2005" + // four-per-em space 12775584Sru "\u2006" + // six-per-em space 12875584Sru "\u2007" + // figure space 129104862Sru "\u2008" + // punctuation space 13069626Sru "\u2009" + // thin space 13175584Sru "\u200a" + // hair space 13275584Sru "\u202f" + // narrow no-break space 13318099Spst "\u205f" + // medium mathematical space 134104862Sru "\u3000" + // ideographic space 135104862Sru "\ufeff" // byte order mark 136104862Sru ; 137104862Sru 138104862Sru private static final String JAVASCRIPT_WHITESPACE_IN_REGEXP = 139104862Sru "\\u000a" + // line feed 140104862Sru "\\u000d" + // carriage return (ctrl-m) 141104862Sru "\\u2028" + // line separator 142104862Sru "\\u2029" + // paragraph separator 143104862Sru "\\u0009" + // tab 144104862Sru "\\u0020" + // ASCII space 145104862Sru "\\u000b" + // tabulation line 
146151497Sru "\\u000c" + // ff (ctrl-l) 147151497Sru "\\u00a0" + // Latin-1 space 148151497Sru "\\u1680" + // Ogham space mark 149104862Sru "\\u180e" + // separator, Mongolian vowel 15018099Spst "\\u2000" + // en quad 15118099Spst "\\u2001" + // em quad 152104862Sru "\\u2002" + // en space 153104862Sru "\\u2003" + // em space 15418099Spst "\\u2004" + // three-per-em space 155104862Sru "\\u2005" + // four-per-em space 156104862Sru "\\u2006" + // six-per-em space 157104862Sru "\\u2007" + // figure space 158104862Sru "\\u2008" + // punctuation space 159151497Sru "\\u2009" + // thin space 160104862Sru "\\u200a" + // hair space 161104862Sru "\\u202f" + // narrow no-break space 16218099Spst "\\u205f" + // medium mathematical space 16318099Spst "\\u3000" + // ideographic space 164104862Sru "\\ufeff" // byte order mark 16575584Sru ; 16675584Sru 167104862Sru static String unicodeEscape(final char ch) { 16875584Sru final StringBuilder sb = new StringBuilder(); 16975584Sru 170151497Sru sb.append("\\u"); 171151497Sru 172151497Sru final String hex = Integer.toHexString(ch); 173151497Sru for (int i = hex.length(); i < 4; i++) { 174104862Sru sb.append('0'); 17575584Sru } 17675584Sru sb.append(hex); 17775584Sru 17875584Sru return sb.toString(); 179151497Sru } 18018099Spst 181104862Sru /** 18218099Spst * Constructor 18318099Spst * 184114402Sru * @param source the source 185114402Sru * @param stream the token stream to lex 186114402Sru */ 187114402Sru public Lexer(final Source source, final TokenStream stream) { 188114402Sru this(source, stream, false, false); 189104862Sru } 19018099Spst 19118099Spst /** 19218099Spst * Constructor 19318099Spst * 19418099Spst * @param source the source 19518099Spst * @param stream the token stream to lex 19618099Spst * @param scripting are we in scripting mode 19718099Spst * @param es6 are we in ECMAScript 6 mode 19818099Spst */ 19918099Spst public Lexer(final Source source, final TokenStream stream, final boolean scripting, final boolean es6) { 
20069626Sru this(source, 0, source.getLength(), stream, scripting, es6, false); 20118099Spst } 20218099Spst 20369626Sru /** 20418099Spst * Constructor 20518099Spst * 20669626Sru * @param source the source 20718099Spst * @param start start position in source from which to start lexing 20818099Spst * @param len length of source segment to lex 20918099Spst * @param stream token stream to lex 21018099Spst * @param scripting are we in scripting mode 21118099Spst * @param es6 are we in ECMAScript 6 mode 21218099Spst * @param pauseOnFunctionBody if true, lexer will return from {@link #lexify()} when it encounters a 21318099Spst * function body. This is used with the feature where the parser is skipping nested function bodies to 21418099Spst * avoid reading ahead unnecessarily when we skip the function bodies. 21518099Spst */ 21618099Spst public Lexer(final Source source, final int start, final int len, final TokenStream stream, final boolean scripting, final boolean es6, final boolean pauseOnFunctionBody) { 21718099Spst super(source.getContent(), 1, start, len); 21818099Spst this.source = source; 21918099Spst this.stream = stream; 22018099Spst this.scripting = scripting; 22118099Spst this.es6 = es6; 22218099Spst this.nested = false; 22318099Spst this.pendingLine = 1; 22418099Spst this.last = EOL; 22518099Spst 22618099Spst this.pauseOnFunctionBody = pauseOnFunctionBody; 227104862Sru } 22879543Sru 22979543Sru private Lexer(final Lexer lexer, final State state) { 23018099Spst super(lexer, state); 231104862Sru 23218099Spst source = lexer.source; 23318099Spst stream = lexer.stream; 23418099Spst scripting = lexer.scripting; 235104862Sru es6 = lexer.es6; 23618099Spst nested = true; 23718099Spst 23818099Spst pendingLine = state.pendingLine; 239104862Sru linePosition = state.linePosition; 24018099Spst last = EOL; 24118099Spst pauseOnFunctionBody = false; 24218099Spst } 243104862Sru 244104862Sru static class State extends Scanner.State { 245104862Sru /** Pending new line number and 
position. */ 246104862Sru public final int pendingLine; 247104862Sru 248151497Sru /** Position of last EOL + 1. */ 249151497Sru public final int linePosition; 250151497Sru 251151497Sru /** Type of last token added. */ 252151497Sru public final TokenType last; 25318099Spst 25418099Spst /* 25518099Spst * Constructor. 256104862Sru */ 25718099Spst 25818099Spst State(final int position, final int limit, final int line, final int pendingLine, final int linePosition, final TokenType last) { 25918099Spst super(position, limit, line); 26018099Spst 261104862Sru this.pendingLine = pendingLine; 26218099Spst this.linePosition = linePosition; 26318099Spst this.last = last; 26418099Spst } 265104862Sru } 266104862Sru 267151497Sru /** 26818099Spst * Save the state of the scan. 269104862Sru * 27069626Sru * @return Captured state. 271104862Sru */ 272104862Sru @Override 273104862Sru State saveState() { 274104862Sru return new State(position, limit, line, pendingLine, linePosition, last); 275104862Sru } 276104862Sru 277104862Sru /** 278104862Sru * Restore the state of the scan. 279151497Sru * 280104862Sru * @param state 281104862Sru * Captured state. 282104862Sru */ 28318099Spst void restoreState(final State state) { 284114402Sru super.restoreState(state); 285104862Sru 286104862Sru pendingLine = state.pendingLine; 287104862Sru linePosition = state.linePosition; 288104862Sru last = state.last; 28969626Sru } 290104862Sru 291151497Sru /** 292104862Sru * Add a new token to the stream. 293104862Sru * 294104862Sru * @param type 295104862Sru * Token type. 29669626Sru * @param start 297104862Sru * Start position. 298104862Sru * @param end 299151497Sru * End position. 300151497Sru */ 301114402Sru protected void add(final TokenType type, final int start, final int end) { 302151497Sru // Record last token. 303151497Sru last = type; 304114402Sru 305104862Sru // Only emit the last EOL in a cluster. 
306104862Sru if (type == EOL) { 30769626Sru pendingLine = end; 30869626Sru linePosition = start; 30969626Sru } else { 310151497Sru // Write any pending EOL to stream. 311151497Sru if (pendingLine != -1) { 312151497Sru stream.put(Token.toDesc(EOL, linePosition, pendingLine)); 313151497Sru pendingLine = -1; 314104862Sru } 315104862Sru 31655839Sasmodai // Write token to stream. 31755839Sasmodai stream.put(Token.toDesc(type, start, end - start)); 31855839Sasmodai } 31918099Spst } 32055839Sasmodai 32118099Spst /** 32218099Spst * Add a new token to the stream. 32318099Spst * 32469626Sru * @param type 325151497Sru * Token type. 326151497Sru * @param start 327151497Sru * Start position. 32818099Spst */ 32918099Spst protected void add(final TokenType type, final int start) { 330104862Sru add(type, start, position); 331104862Sru } 332114402Sru 333114402Sru /** 334104862Sru * Return the String of valid whitespace characters for regular 335104862Sru * expressions in JavaScript 336151497Sru * @return regexp whitespace string 33718099Spst */ 33818099Spst public static String getWhitespaceRegExp() { 339114402Sru return JAVASCRIPT_WHITESPACE_IN_REGEXP; 34018099Spst } 34118099Spst 34218099Spst /** 34318099Spst * Skip end of line. 34418099Spst * 34518099Spst * @param addEOL true if EOL token should be recorded. 34618099Spst */ 34718099Spst private void skipEOL(final boolean addEOL) { 34818099Spst 349151497Sru if (ch0 == '\r') { // detect \r\n pattern 35055839Sasmodai skip(1); 351151497Sru if (ch0 == '\n') { 352151497Sru skip(1); 353151497Sru } 354151497Sru } else { // all other space, ch0 is guaranteed to be EOL or \0 355151497Sru skip(1); 356151497Sru } 35718099Spst 35818099Spst // bump up line count 35918099Spst line++; 360104862Sru 361104862Sru if (addEOL) { 362114402Sru // Add an EOL token. 363114402Sru add(EOL, position, line); 36418099Spst } 36555839Sasmodai } 36618099Spst 36718099Spst /** 36818099Spst * Skip over rest of line including end of line. 
36969626Sru * 37018099Spst * @param addEOL true if EOL token should be recorded. 371104862Sru */ 37218099Spst private void skipLine(final boolean addEOL) { 37318099Spst // Ignore characters. 37418099Spst while (!isEOL(ch0) && !atEOF()) { 37518099Spst skip(1); 37618099Spst } 37718099Spst // Skip over end of line. 37818099Spst skipEOL(addEOL); 37918099Spst } 38018099Spst 38118099Spst /** 382151497Sru * Test whether a char is valid JavaScript whitespace 383151497Sru * @param ch a char 38418099Spst * @return true if valid JavaScript whitespace 385104862Sru */ 386151497Sru public static boolean isJSWhitespace(final char ch) { 38769626Sru return JAVASCRIPT_WHITESPACE.indexOf(ch) != -1; 38818099Spst } 38918099Spst 39018099Spst /** 39118099Spst * Test whether a char is valid JavaScript end of line 39218099Spst * @param ch a char 39318099Spst * @return true if valid JavaScript end of line 39418099Spst */ 39518099Spst public static boolean isJSEOL(final char ch) { 39669626Sru return JAVASCRIPT_WHITESPACE_EOL.indexOf(ch) != -1; 397151497Sru } 398151497Sru 399114402Sru /** 40069626Sru * Test if char is a string delimiter, e.g. '\' or '"'. 401114402Sru * @param ch a char 402114402Sru * @return true if string delimiter 403114402Sru */ 404114402Sru protected boolean isStringDelimiter(final char ch) { 405114402Sru return ch == '\'' || ch == '"'; 406114402Sru } 407151497Sru 40869626Sru /** 409114402Sru * Test if char is a template literal delimiter ('`'). 
410114402Sru */ 411114402Sru private static boolean isTemplateDelimiter(char ch) { 412114402Sru return ch == '`'; 413114402Sru } 414114402Sru 415114402Sru /** 416114402Sru * Test whether a char is valid JavaScript whitespace 417114402Sru * @param ch a char 418114402Sru * @return true if valid JavaScript whitespace 419114402Sru */ 420114402Sru protected boolean isWhitespace(final char ch) { 421114402Sru return Lexer.isJSWhitespace(ch); 422114402Sru } 423114402Sru 424114402Sru /** 425104862Sru * Test whether a char is valid JavaScript end of line 426114402Sru * @param ch a char 427151497Sru * @return true if valid JavaScript end of line 428114402Sru */ 42969626Sru protected boolean isEOL(final char ch) { 430114402Sru return Lexer.isJSEOL(ch); 431114402Sru } 432114402Sru 433151497Sru /** 434151497Sru * Skip over whitespace and detect end of line, adding EOL tokens if 435151497Sru * encountered. 436114402Sru * 437151497Sru * @param addEOL true if EOL tokens should be recorded. 438114402Sru */ 439151497Sru private void skipWhitespace(final boolean addEOL) { 440151497Sru while (isWhitespace(ch0)) { 441151497Sru if (isEOL(ch0)) { 442151497Sru skipEOL(addEOL); 443151497Sru } else { 444151497Sru skip(1); 445151497Sru } 446114402Sru } 447114402Sru } 448151497Sru 449114402Sru /** 450114402Sru * Skip over comments. 45169626Sru * 45275584Sru * @return True if a comment. 45369626Sru */ 454104862Sru protected boolean skipComments() { 455104862Sru // Save the current position. 456114402Sru final int start = position; 457114402Sru 458114402Sru if (ch0 == '/') { 459114402Sru // Is it a // comment. 460104862Sru if (ch1 == '/') { 461151497Sru // Skip over //. 462114402Sru skip(2); 463114402Sru 464114402Sru boolean directiveComment = false; 465114402Sru if ((ch0 == '#' || ch0 == '@') && (ch1 == ' ')) { 466114402Sru directiveComment = true; 46775584Sru } 46875584Sru 469104862Sru // Scan for EOL. 
47075584Sru while (!atEOF() && !isEOL(ch0)) { 471114402Sru skip(1); 472114402Sru } 473151497Sru // Did detect a comment. 474151497Sru add(directiveComment? DIRECTIVE_COMMENT : COMMENT, start); 475114402Sru return true; 47669626Sru } else if (ch1 == '*') { 477114402Sru // Skip over /*. 47869626Sru skip(2); 479114402Sru // Scan for */. 48069626Sru while (!atEOF() && !(ch0 == '*' && ch1 == '/')) { 481114402Sru // If end of line handle else skip character. 48269626Sru if (isEOL(ch0)) { 483114402Sru skipEOL(true); 484114402Sru } else { 48569626Sru skip(1); 486114402Sru } 487114402Sru } 48818099Spst 489114402Sru if (atEOF()) { 49069626Sru // TODO - Report closing */ missing in parser. 491114402Sru add(ERROR, start); 492114402Sru } else { 493114402Sru // Skip */. 494114402Sru skip(2); 495114402Sru } 496114402Sru 49718099Spst // Did detect a comment. 49869626Sru add(COMMENT, start); 49969626Sru return true; 50069626Sru } 50169626Sru } else if (ch0 == '#') { 502151497Sru assert scripting; 503151497Sru // shell style comment 50469626Sru // Skip over #. 50569626Sru skip(1); 50669626Sru // Scan for EOL. 50769626Sru while (!atEOF() && !isEOL(ch0)) { 50869626Sru skip(1); 50969626Sru } 51069626Sru // Did detect a comment. 51169626Sru add(COMMENT, start); 51269626Sru return true; 51375584Sru } 51469626Sru 51569626Sru // Not a comment. 51669626Sru return false; 51769626Sru } 51869626Sru 51969626Sru /** 52069626Sru * Convert a regex token to a token object. 52169626Sru * 52269626Sru * @param start Position in source content. 52369626Sru * @param length Length of regex token. 52469626Sru * @return Regex token object. 52569626Sru */ 526151497Sru public RegexToken valueOfPattern(final int start, final int length) { 527151497Sru // Save the current position. 528151497Sru final int savePosition = position; 529151497Sru // Reset to beginning of content. 530151497Sru reset(start); 53169626Sru // Buffer for recording characters. 
53269626Sru final StringBuilder sb = new StringBuilder(length); 53369626Sru 534104862Sru // Skip /. 53569626Sru skip(1); 53669626Sru boolean inBrackets = false; 53769626Sru // Scan for closing /, stopping at end of line. 53869626Sru while (!atEOF() && ch0 != '/' && !isEOL(ch0) || inBrackets) { 53969626Sru // Skip over escaped character. 540151497Sru if (ch0 == '\\') { 54169626Sru sb.append(ch0); 54269626Sru sb.append(ch1); 54369626Sru skip(2); 54469626Sru } else { 54569626Sru if (ch0 == '[') { 54669626Sru inBrackets = true; 547151497Sru } else if (ch0 == ']') { 548104862Sru inBrackets = false; 549104862Sru } 550104862Sru 551151497Sru // Skip literal character. 552104862Sru sb.append(ch0); 553104862Sru skip(1); 554151497Sru } 555151497Sru } 55669626Sru 557151497Sru // Get pattern as string. 55869626Sru final String regex = sb.toString(); 55969626Sru 56069626Sru // Skip /. 561104862Sru skip(1); 56269626Sru 563104862Sru // Options as string. 564104862Sru final String options = source.getString(position, scanIdentifier()); 565104862Sru 566104862Sru reset(savePosition); 567104862Sru 568104862Sru // Compile the pattern. 569151497Sru return new RegexToken(regex, options); 570104862Sru } 571104862Sru 572151497Sru /** 573104862Sru * Return true if the given token can be the beginning of a literal. 57418099Spst * 57518099Spst * @param token a token 57618099Spst * @return true if token can start a literal. 57769626Sru */ 57869626Sru public boolean canStartLiteral(final TokenType token) { 57969626Sru return token.startsWith('/') || ((scripting || XML_LITERALS) && token.startsWith('<')); 58069626Sru } 581151497Sru 582151497Sru /** 58369626Sru * interface to receive line information for multi-line literals. 
58469626Sru */ 58569626Sru protected interface LineInfoReceiver { 58618099Spst /** 58718099Spst * Receives line information 58818099Spst * @param line last line number 58969626Sru * @param linePosition position of last line 59018099Spst */ 59118099Spst public void lineInfo(int line, int linePosition); 59218099Spst } 59369626Sru 59418099Spst /** 59518099Spst * Check whether the given token represents the beginning of a literal. If so scan 59669626Sru * the literal and return <tt>true</tt>, otherwise return false. 59769626Sru * 59869626Sru * @param token the token. 59969626Sru * @param startTokenType the token type. 60055839Sasmodai * @param lir LineInfoReceiver that receives line info for multi-line string literals. 60118099Spst * @return True if a literal beginning with startToken was found and scanned. 60269626Sru */ 603151497Sru protected boolean scanLiteral(final long token, final TokenType startTokenType, final LineInfoReceiver lir) { 604151497Sru // Check if it can be a literal. 605151497Sru if (!canStartLiteral(startTokenType)) { 606151497Sru return false; 60718099Spst } 60869626Sru // We break on ambiguous tokens so if we already moved on it can't be a literal. 60918099Spst if (stream.get(stream.last()) != token) { 61069626Sru return false; 61118099Spst } 61269626Sru // Rewind to token start position 61369626Sru reset(Token.descPosition(token)); 61469626Sru 61569626Sru if (ch0 == '/') { 61618099Spst return scanRegEx(); 61718099Spst } else if (ch0 == '<') { 61869626Sru if (ch1 == '<') { 619151497Sru return scanHereString(lir); 620151497Sru } else if (Character.isJavaIdentifierStart(ch1)) { 621151497Sru return scanXMLLiteral(); 622151497Sru } 62318099Spst } 62469626Sru 62518099Spst return false; 62669626Sru } 62718099Spst 62869626Sru /** 62969626Sru * Scan over regex literal. 63069626Sru * 63169626Sru * @return True if a regex literal. 
63218099Spst */ 63318099Spst private boolean scanRegEx() { 63469626Sru assert ch0 == '/'; 635151497Sru // Make sure it's not a comment. 636151497Sru if (ch1 != '/' && ch1 != '*') { 637151497Sru // Record beginning of literal. 638151497Sru final int start = position; 63918099Spst // Skip /. 64069626Sru skip(1); 64118099Spst boolean inBrackets = false; 64269626Sru 64318099Spst // Scan for closing /, stopping at end of line. 64469626Sru while (!atEOF() && (ch0 != '/' || inBrackets) && !isEOL(ch0)) { 64569626Sru // Skip over escaped character. 64669626Sru if (ch0 == '\\') { 64769626Sru skip(1); 64818099Spst if (isEOL(ch0)) { 649151497Sru reset(start); 65069626Sru return false; 651151497Sru } 652151497Sru skip(1); 653151497Sru } else { 654151497Sru if (ch0 == '[') { 65518099Spst inBrackets = true; 65669626Sru } else if (ch0 == ']') { 65718099Spst inBrackets = false; 65818099Spst } 65969626Sru 66069626Sru // Skip literal character. 66169626Sru skip(1); 66218099Spst } 663151497Sru } 66469626Sru 665151497Sru // If regex literal. 666151497Sru if (ch0 == '/') { 667151497Sru // Skip /. 668151497Sru skip(1); 66918099Spst 67069626Sru // Skip over options. 67118099Spst while (!atEOF() && Character.isJavaIdentifierPart(ch0) || ch0 == '\\' && ch1 == 'u') { 67218099Spst skip(1); 67369626Sru } 67469626Sru 67569626Sru // Add regex token. 67618099Spst add(REGEX, start); 67718099Spst // Regex literal detected. 67818099Spst return true; 67918099Spst } 68069626Sru 68118099Spst // False start try again. 68218099Spst reset(start); 68369626Sru } 68418099Spst 68518099Spst // Regex literal not detected. 68618099Spst return false; 68718099Spst } 68818099Spst 689104862Sru /** 69018099Spst * Convert a digit to a integer. Can't use Character.digit since we are 691104862Sru * restricted to ASCII by the spec. 692104862Sru * 693104862Sru * @param ch Character to convert. 69418099Spst * @param base Numeric base. 695104862Sru * 696104862Sru * @return The converted digit or -1 if invalid. 
69718099Spst */ 698151497Sru protected static int convertDigit(final char ch, final int base) { 699151497Sru int digit; 700151497Sru 701151497Sru if ('0' <= ch && ch <= '9') { 702151497Sru digit = ch - '0'; 703151497Sru } else if ('A' <= ch && ch <= 'Z') { 70418099Spst digit = ch - 'A' + 10; 70518099Spst } else if ('a' <= ch && ch <= 'z') { 70618099Spst digit = ch - 'a' + 10; 70769626Sru } else { 70869626Sru return -1; 70969626Sru } 71069626Sru 71118099Spst return digit < base ? digit : -1; 71218099Spst } 71318099Spst 71418099Spst 71518099Spst /** 71618099Spst * Get the value of a hexadecimal numeric sequence. 71718099Spst * 71818099Spst * @param length Number of digits. 719104862Sru * @param type Type of token to report against. 720104862Sru * @return Value of sequence or < 0 if no digits. 72118099Spst */ 72218099Spst private int hexSequence(final int length, final TokenType type) { 72318099Spst int value = 0; 72418099Spst 72518099Spst for (int i = 0; i < length; i++) { 72618099Spst final int digit = convertDigit(ch0, 16); 72718099Spst 72818099Spst if (digit == -1) { 72918099Spst error(Lexer.message("invalid.hex"), type, position, limit); 73018099Spst return i == 0 ? -1 : value; 73118099Spst } 73218099Spst 73318099Spst value = digit | value << 4; 73418099Spst skip(1); 73518099Spst } 73618099Spst 73718099Spst return value; 738104862Sru } 73918099Spst 740104862Sru /** 741104862Sru * Get the value of an octal numeric sequence. This parses up to 3 digits with a maximum value of 255. 742151497Sru * 743151497Sru * @return Value of sequence. 
744151497Sru */ 745151497Sru private int octalSequence() { 74618099Spst int value = 0; 747151497Sru 748151497Sru for (int i = 0; i < 3; i++) { 74918099Spst final int digit = convertDigit(ch0, 8); 750151497Sru 751151497Sru if (digit == -1) { 752151497Sru break; 753151497Sru } 754151497Sru value = digit | value << 3; 755151497Sru skip(1); 75618099Spst 757151497Sru if (i == 1 && value >= 32) { 758151497Sru break; 759151497Sru } 760151497Sru } 761151497Sru return value; 762151497Sru } 763151497Sru 764151497Sru /** 765151497Sru * Convert a string to a JavaScript identifier. 766151497Sru * 767151497Sru * @param start Position in source content. 768151497Sru * @param length Length of token. 769151497Sru * @return Ident string or null if an error. 770151497Sru */ 771151497Sru private String valueOfIdent(final int start, final int length) throws RuntimeException { 772151497Sru // Save the current position. 773151497Sru final int savePosition = position; 77418099Spst // End of scan. 77518099Spst final int end = start + length; 77618099Spst // Reset to beginning of content. 777 reset(start); 778 // Buffer for recording characters. 779 final StringBuilder sb = new StringBuilder(length); 780 781 // Scan until end of line or end of file. 782 while (!atEOF() && position < end && !isEOL(ch0)) { 783 // If escape character. 784 if (ch0 == '\\' && ch1 == 'u') { 785 skip(2); 786 final int ch = hexSequence(4, TokenType.IDENT); 787 if (isWhitespace((char)ch)) { 788 return null; 789 } 790 if (ch < 0) { 791 sb.append('\\'); 792 sb.append('u'); 793 } else { 794 sb.append((char)ch); 795 } 796 } else { 797 // Add regular character. 798 sb.append(ch0); 799 skip(1); 800 } 801 } 802 803 // Restore position. 804 reset(savePosition); 805 806 return sb.toString(); 807 } 808 809 /** 810 * Scan over and identifier or keyword. Handles identifiers containing 811 * encoded Unicode chars. 
*
     * Example:
     *
     * var \u0042 = 44;
     */
    private void scanIdentifierOrKeyword() {
        final int identStart = position;
        final int identLength = scanIdentifier();

        // The lookup tells a keyword from an ordinary identifier.
        final TokenType kind = TokenLookup.lookupKeyword(content, identStart, identLength);

        // When pausing on function bodies, arm the pause for the next left brace.
        if (pauseOnFunctionBody && kind == FUNCTION) {
            pauseOnNextLeftBrace = true;
        }

        add(kind, identStart);
    }

    /**
     * Build the string value of a string token: escape sequences are
     * replaced by the characters they denote, line continuations are
     * dropped, and a carriage return (with or without a following line
     * feed) becomes a single line feed. The scan position is moved to the
     * token for the duration of the call and put back before returning.
     *
     * @param start  Position in source content.
     * @param length Length of token.
     * @param strict True if octal escapes other than a lone "\0" are to be reported as errors.
     * @return JavaScript string object.
     */
    private String valueOfString(final int start, final int length, final boolean strict) throws RuntimeException {
        // Where to put the scanner back once the value has been built.
        final int resumeAt = position;
        final int stop = start + length;
        reset(start);

        final StringBuilder text = new StringBuilder(length);

        while (position < stop) {
            if (ch0 != '\\') {
                if (ch0 == '\r') {
                    // Convert CR-LF or CR to LF line terminator.
                    text.append('\n');
                    if (ch1 == '\n') {
                        skip(2);
                    } else {
                        skip(1);
                    }
                } else {
                    // Ordinary character, copied as is.
                    text.append(ch0);
                    skip(1);
                }
                continue;
            }

            // Escape sequence: step over the backslash and the character after it.
            skip(1);

            final char escaped = ch0;
            final int escapedAt = position;

            skip(1);

            switch (escaped) {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7': {
                if (strict) {
                    // "\0" itself is allowed in strict mode. Only other 'real'
                    // octal escape sequences are not allowed (eg. "\02", "\31").
                    // See section 7.8.4 String literals production EscapeSequence
                    if (escaped != '0' || (ch0 >= '0' && ch0 <= '9')) {
                        error(Lexer.message("strict.no.octal"), STRING, position, limit);
                    }
                }
                // Go back to the first digit and read the whole octal sequence.
                reset(escapedAt);
                final int code = octalSequence();

                if (code < 0) {
                    text.append('\\').append('x');
                } else {
                    text.append((char)code);
                }
                break;
            }
            case 'n':
                text.append('\n');
                break;
            case 't':
                text.append('\t');
                break;
            case 'b':
                text.append('\b');
                break;
            case 'f':
                text.append('\f');
                break;
            case 'r':
                text.append('\r');
                break;
            case '\'':
                text.append('\'');
                break;
            case '\"':
                text.append('\"');
                break;
            case '\\':
                text.append('\\');
                break;
            case '\r': // CR | CRLF
                if (ch0 == '\n') {
                    skip(1);
                }
                // fall through
            case '\n': // LF
            case '\u2028': // LS
            case '\u2029': // PS
                // Backslash followed by a line terminator continues the literal
                // on the next line; nothing is added to the value.
                break;
            case 'x': {
                // Two hex digits.
                final int code = hexSequence(2, STRING);

                if (code < 0) {
                    text.append('\\').append('x');
                } else {
                    text.append((char)code);
                }
                break;
            }
            case 'u': {
                // Four hex digits.
                final int code = hexSequence(4, STRING);

                if (code < 0) {
                    text.append('\\').append('u');
                } else {
                    text.append((char)code);
                }
                break;
            }
            case 'v':
                text.append('\u000B');
                break;
            default:
                // Any other escaped character stands for itself.
                text.append(escaped);
                break;
            }
        }

        // Restore position.
        reset(resumeAt);

        return text.toString();
    }

    /**
     * Scan over a string literal. The current character is taken as the
     * opening quote; the scan stops at the matching quote, at an unescaped
     * end of line or at the end of input.
     *
     * @param add true if we are not just scanning but should actually modify the token stream
     */
    protected void scanString(final boolean add) {
        // Plain until an escape is seen.
        TokenType type = STRING;

        // The opening quote decides what closes the literal.
        final char quote = ch0;
        skip(1);

        // Captured at the first character of the content.
        final State contentState = saveState();

        while (!atEOF() && ch0 != quote && !isEOL(ch0)) {
            if (ch0 == '\\') {
                type = ESCSTRING;
                skip(1);
                if (!isEscapeCharacter(ch0)) {
                    error(Lexer.message("invalid.escape.char"), STRING, position, limit);
                }
                if (isEOL(ch0)) {
                    // Backslash + line terminator: the literal continues on the next line.
                    skipEOL(false);
                    continue;
                }
            }
            // Step over the literal (or escaped) character.
            skip(1);
        }

        if (ch0 == quote) {
            // Step over the closing quote.
            skip(1);
        } else {
            error(Lexer.message("missing.close.quote"), STRING, position, limit);
        }

        // Scanning only: leave the token stream alone.
        if (!add) {
            return;
        }

        // The content ends one character before the current position.
        contentState.setLimit(position - 1);

        if (!scripting || contentState.isEmpty()) {
            // Add string token without editing.
            add(type, contentState.position, contentState.limit);
            return;
        }

        switch (quote) {
        case '`':
            // Mark the beginning of an exec string.
            add(EXECSTRING, contentState.position, contentState.limit);
            // Frame edit string with left brace.
            add(LBRACE, contentState.position, contentState.position);
            // Process edit string.
            editString(type, contentState);
            // Frame edit string with right brace.
            add(RBRACE, contentState.limit, contentState.limit);
            break;
        case '"':
            // Only edit double quoted strings.
            editString(type, contentState);
            break;
        case '\'':
            // Add string token without editing.
            add(type, contentState.position, contentState.limit);
            break;
        default:
            break;
        }
    }

    /**
     * Scan over a template string literal. Literal text is added as a
     * TEMPLATE token when the literal contains no substitution, otherwise
     * as TEMPLATE_HEAD, TEMPLATE_MIDDLE and TEMPLATE_TAIL tokens, with each
     * substitution expression lexed by a nested lexer in between.
     */
    private void scanTemplate() {
        assert ch0 == '`';
        TokenType type = TEMPLATE;

        // Step over the opening quote; the content starts right after it.
        skip(1);
        State chunkState = saveState();

        // Scan until the closing quote.
        while (!atEOF()) {
            if (ch0 == '`') {
                // Closing quote: emit the final chunk and stop.
                skip(1);
                chunkState.setLimit(position - 1);
                final TokenType closing = type == TEMPLATE ? TEMPLATE : TEMPLATE_TAIL;
                add(closing, chunkState.position, chunkState.limit);
                return;
            } else if (ch0 == '$' && ch1 == '{') {
                // Start of a substitution: emit the chunk in front of it.
                skip(2);
                chunkState.setLimit(position - 2);
                final TokenType opening = type == TEMPLATE ? TEMPLATE_HEAD : type;
                add(opening, chunkState.position, chunkState.limit);

                // Let a nested lexer scan the expression up to its right brace,
                // then carry on from wherever it stopped.
                final Lexer nestedLexer = new Lexer(this, saveState());
                nestedLexer.templateExpressionOpenBraces = 1;
                nestedLexer.lexify();
                restoreState(nestedLexer.saveState());

                // What follows is a middle or the tail of the template literal.
                assert ch0 == '}';
                type = TEMPLATE_MIDDLE;

                // Step over the right brace; the next chunk starts right after it.
                skip(1);
                chunkState = saveState();

                continue;
            } else if (ch0 == '\\') {
                skip(1);
                // EscapeSequence
                if (!isEscapeCharacter(ch0)) {
                    error(Lexer.message("invalid.escape.char"), TEMPLATE, position, limit);
                }
                if (isEOL(ch0)) {
                    // LineContinuation
                    skipEOL(false);
                    continue;
                }
            } else if (isEOL(ch0)) {
                // LineTerminatorSequence
                skipEOL(false);
                continue;
            }

            // Step over the literal (or escaped) character.
            skip(1);
        }

        // End of input reached without a closing quote.
        error(Lexer.message("missing.close.quote"), TEMPLATE, position, limit);
    }

    /**
     * Is the given character a valid escape char after "\" ? This
     * implementation accepts every character.
     *
     * @param ch character to be checked
     * @return if the given character is valid after "\"
     */
    protected boolean isEscapeCharacter(final char ch) {
        return true;
    }

    /**
     * Convert string to number.
     *
     * @param valueString  String to convert.
     * @param radix        Numeric base.
     * @return Converted number.
1131 */ 1132 private static Number valueOf(final String valueString, final int radix) throws NumberFormatException { 1133 try { 1134 return Integer.parseInt(valueString, radix); 1135 } catch (final NumberFormatException e) { 1136 if (radix == 10) { 1137 return Double.valueOf(valueString); 1138 } 1139 1140 double value = 0.0; 1141 1142 for (int i = 0; i < valueString.length(); i++) { 1143 final char ch = valueString.charAt(i); 1144 // Preverified, should always be a valid digit. 1145 final int digit = convertDigit(ch, radix); 1146 value *= radix; 1147 value += digit; 1148 } 1149 1150 return value; 1151 } 1152 } 1153 1154 /** 1155 * Scan a number. 1156 */ 1157 protected void scanNumber() { 1158 // Record beginning of number. 1159 final int start = position; 1160 // Assume value is a decimal. 1161 TokenType type = DECIMAL; 1162 1163 // First digit of number. 1164 int digit = convertDigit(ch0, 10); 1165 1166 // If number begins with 0x. 1167 if (digit == 0 && (ch1 == 'x' || ch1 == 'X') && convertDigit(ch2, 16) != -1) { 1168 // Skip over 0xN. 1169 skip(3); 1170 // Skip over remaining digits. 1171 while (convertDigit(ch0, 16) != -1) { 1172 skip(1); 1173 } 1174 1175 type = HEXADECIMAL; 1176 } else if (digit == 0 && es6 && (ch1 == 'o' || ch1 == 'O') && convertDigit(ch2, 8) != -1) { 1177 // Skip over 0oN. 1178 skip(3); 1179 // Skip over remaining digits. 1180 while (convertDigit(ch0, 8) != -1) { 1181 skip(1); 1182 } 1183 1184 type = OCTAL; 1185 } else if (digit == 0 && es6 && (ch1 == 'b' || ch1 == 'B') && convertDigit(ch2, 2) != -1) { 1186 // Skip over 0bN. 1187 skip(3); 1188 // Skip over remaining digits. 1189 while (convertDigit(ch0, 2) != -1) { 1190 skip(1); 1191 } 1192 1193 type = BINARY_NUMBER; 1194 } else { 1195 // Check for possible octal constant. 1196 boolean octal = digit == 0; 1197 // Skip first digit if not leading '.'. 1198 if (digit != -1) { 1199 skip(1); 1200 } 1201 1202 // Skip remaining digits. 
1203 while ((digit = convertDigit(ch0, 10)) != -1) { 1204 // Check octal only digits. 1205 octal = octal && digit < 8; 1206 // Skip digit. 1207 skip(1); 1208 } 1209 1210 if (octal && position - start > 1) { 1211 type = OCTAL_LEGACY; 1212 } else if (ch0 == '.' || ch0 == 'E' || ch0 == 'e') { 1213 // Must be a double. 1214 if (ch0 == '.') { 1215 // Skip period. 1216 skip(1); 1217 // Skip mantissa. 1218 while (convertDigit(ch0, 10) != -1) { 1219 skip(1); 1220 } 1221 } 1222 1223 // Detect exponent. 1224 if (ch0 == 'E' || ch0 == 'e') { 1225 // Skip E. 1226 skip(1); 1227 // Detect and skip exponent sign. 1228 if (ch0 == '+' || ch0 == '-') { 1229 skip(1); 1230 } 1231 // Skip exponent. 1232 while (convertDigit(ch0, 10) != -1) { 1233 skip(1); 1234 } 1235 } 1236 1237 type = FLOATING; 1238 } 1239 } 1240 1241 if (Character.isJavaIdentifierStart(ch0)) { 1242 error(Lexer.message("missing.space.after.number"), type, position, 1); 1243 } 1244 1245 // Add number token. 1246 add(type, start); 1247 } 1248 1249 /** 1250 * Convert a regex token to a token object. 1251 * 1252 * @param start Position in source content. 1253 * @param length Length of regex token. 1254 * @return Regex token object. 1255 */ 1256 XMLToken valueOfXML(final int start, final int length) { 1257 return new XMLToken(source.getString(start, length)); 1258 } 1259 1260 /** 1261 * Scan over a XML token. 1262 * 1263 * @return TRUE if is an XML literal. 1264 */ 1265 private boolean scanXMLLiteral() { 1266 assert ch0 == '<' && Character.isJavaIdentifierStart(ch1); 1267 if (XML_LITERALS) { 1268 // Record beginning of xml expression. 1269 final int start = position; 1270 1271 int openCount = 0; 1272 1273 do { 1274 if (ch0 == '<') { 1275 if (ch1 == '/' && Character.isJavaIdentifierStart(ch2)) { 1276 skip(3); 1277 openCount--; 1278 } else if (Character.isJavaIdentifierStart(ch1)) { 1279 skip(2); 1280 openCount++; 1281 } else if (ch1 == '?') { 1282 skip(2); 1283 } else if (ch1 == '!' 
&& ch2 == '-' && ch3 == '-') { 1284 skip(4); 1285 } else { 1286 reset(start); 1287 return false; 1288 } 1289 1290 while (!atEOF() && ch0 != '>') { 1291 if (ch0 == '/' && ch1 == '>') { 1292 openCount--; 1293 skip(1); 1294 break; 1295 } else if (ch0 == '\"' || ch0 == '\'') { 1296 scanString(false); 1297 } else { 1298 skip(1); 1299 } 1300 } 1301 1302 if (ch0 != '>') { 1303 reset(start); 1304 return false; 1305 } 1306 1307 skip(1); 1308 } else if (atEOF()) { 1309 reset(start); 1310 return false; 1311 } else { 1312 skip(1); 1313 } 1314 } while (openCount > 0); 1315 1316 add(XML, start); 1317 return true; 1318 } 1319 1320 return false; 1321 } 1322 1323 /** 1324 * Scan over identifier characters. 1325 * 1326 * @return Length of identifier or zero if none found. 1327 */ 1328 private int scanIdentifier() { 1329 final int start = position; 1330 1331 // Make sure first character is valid start character. 1332 if (ch0 == '\\' && ch1 == 'u') { 1333 skip(2); 1334 final int ch = hexSequence(4, TokenType.IDENT); 1335 1336 if (!Character.isJavaIdentifierStart(ch)) { 1337 error(Lexer.message("illegal.identifier.character"), TokenType.IDENT, start, position); 1338 } 1339 } else if (!Character.isJavaIdentifierStart(ch0)) { 1340 // Not an identifier. 1341 return 0; 1342 } 1343 1344 // Make sure remaining characters are valid part characters. 1345 while (!atEOF()) { 1346 if (ch0 == '\\' && ch1 == 'u') { 1347 skip(2); 1348 final int ch = hexSequence(4, TokenType.IDENT); 1349 1350 if (!Character.isJavaIdentifierPart(ch)) { 1351 error(Lexer.message("illegal.identifier.character"), TokenType.IDENT, start, position); 1352 } 1353 } else if (Character.isJavaIdentifierPart(ch0)) { 1354 skip(1); 1355 } else { 1356 break; 1357 } 1358 } 1359 1360 // Length of identifier sequence. 1361 return position - start; 1362 } 1363 1364 /** 1365 * Compare two identifiers (in content) for equality. 1366 * 1367 * @param aStart Start of first identifier. 1368 * @param aLength Length of first identifier. 
1369 * @param bStart Start of second identifier. 1370 * @param bLength Length of second identifier. 1371 * @return True if equal. 1372 */ 1373 private boolean identifierEqual(final int aStart, final int aLength, final int bStart, final int bLength) { 1374 if (aLength == bLength) { 1375 for (int i = 0; i < aLength; i++) { 1376 if (content[aStart + i] != content[bStart + i]) { 1377 return false; 1378 } 1379 } 1380 1381 return true; 1382 } 1383 1384 return false; 1385 } 1386 1387 /** 1388 * Detect if a line starts with a marker identifier. 1389 * 1390 * @param identStart Start of identifier. 1391 * @param identLength Length of identifier. 1392 * @return True if detected. 1393 */ 1394 private boolean hasHereMarker(final int identStart, final int identLength) { 1395 // Skip any whitespace. 1396 skipWhitespace(false); 1397 1398 return identifierEqual(identStart, identLength, position, scanIdentifier()); 1399 } 1400 1401 /** 1402 * Lexer to service edit strings. 1403 */ 1404 private static class EditStringLexer extends Lexer { 1405 /** Type of string literals to emit. */ 1406 final TokenType stringType; 1407 1408 /* 1409 * Constructor. 1410 */ 1411 1412 EditStringLexer(final Lexer lexer, final TokenType stringType, final State stringState) { 1413 super(lexer, stringState); 1414 1415 this.stringType = stringType; 1416 } 1417 1418 /** 1419 * Lexify the contents of the string. 1420 */ 1421 @Override 1422 public void lexify() { 1423 // Record start of string position. 1424 int stringStart = position; 1425 // Indicate that the priming first string has not been emitted. 1426 boolean primed = false; 1427 1428 while (true) { 1429 // Detect end of content. 1430 if (atEOF()) { 1431 break; 1432 } 1433 1434 // Honour escapes (should be well formed.) 1435 if (ch0 == '\\' && stringType == ESCSTRING) { 1436 skip(2); 1437 1438 continue; 1439 } 1440 1441 // If start of expression. 
1442 if (ch0 == '$' && ch1 == '{') { 1443 if (!primed || stringStart != position) { 1444 if (primed) { 1445 add(ADD, stringStart, stringStart + 1); 1446 } 1447 1448 add(stringType, stringStart, position); 1449 primed = true; 1450 } 1451 1452 // Skip ${ 1453 skip(2); 1454 1455 // Save expression state. 1456 final State expressionState = saveState(); 1457 1458 // Start with one open brace. 1459 int braceCount = 1; 1460 1461 // Scan for the rest of the string. 1462 while (!atEOF()) { 1463 // If closing brace. 1464 if (ch0 == '}') { 1465 // Break only only if matching brace. 1466 if (--braceCount == 0) { 1467 break; 1468 } 1469 } else if (ch0 == '{') { 1470 // Bump up the brace count. 1471 braceCount++; 1472 } 1473 1474 // Skip to next character. 1475 skip(1); 1476 } 1477 1478 // If braces don't match then report an error. 1479 if (braceCount != 0) { 1480 error(Lexer.message("edit.string.missing.brace"), LBRACE, expressionState.position - 1, 1); 1481 } 1482 1483 // Mark end of expression. 1484 expressionState.setLimit(position); 1485 // Skip closing brace. 1486 skip(1); 1487 1488 // Start next string. 1489 stringStart = position; 1490 1491 // Concatenate expression. 1492 add(ADD, expressionState.position, expressionState.position + 1); 1493 add(LPAREN, expressionState.position, expressionState.position + 1); 1494 1495 // Scan expression. 1496 final Lexer lexer = new Lexer(this, expressionState); 1497 lexer.lexify(); 1498 1499 // Close out expression parenthesis. 1500 add(RPAREN, position - 1, position); 1501 1502 continue; 1503 } 1504 1505 // Next character in string. 1506 skip(1); 1507 } 1508 1509 // If there is any unemitted string portion. 1510 if (stringStart != limit) { 1511 // Concatenate remaining string. 1512 if (primed) { 1513 add(ADD, stringStart, 1); 1514 } 1515 1516 add(stringType, stringStart, limit); 1517 } 1518 } 1519 1520 } 1521 1522 /** 1523 * Edit string for nested expressions. 1524 * 1525 * @param stringType Type of string literals to emit. 
1526 * @param stringState State of lexer at start of string. 1527 */ 1528 private void editString(final TokenType stringType, final State stringState) { 1529 // Use special lexer to scan string. 1530 final EditStringLexer lexer = new EditStringLexer(this, stringType, stringState); 1531 lexer.lexify(); 1532 1533 // Need to keep lexer informed. 1534 last = stringType; 1535 } 1536 1537 /** 1538 * Scan over a here string. 1539 * 1540 * @return TRUE if is a here string. 1541 */ 1542 private boolean scanHereString(final LineInfoReceiver lir) { 1543 assert ch0 == '<' && ch1 == '<'; 1544 if (scripting) { 1545 // Record beginning of here string. 1546 final State saved = saveState(); 1547 1548 // << or <<< 1549 final boolean excludeLastEOL = ch2 != '<'; 1550 1551 if (excludeLastEOL) { 1552 skip(2); 1553 } else { 1554 skip(3); 1555 } 1556 1557 // Scan identifier. It might be quoted, indicating that no string editing should take place. 1558 final char quoteChar = ch0; 1559 final boolean noStringEditing = quoteChar == '"' || quoteChar == '\''; 1560 if (noStringEditing) { 1561 skip(1); 1562 } 1563 final int identStart = position; 1564 final int identLength = scanIdentifier(); 1565 if (noStringEditing) { 1566 if (ch0 != quoteChar) { 1567 error(Lexer.message("here.non.matching.delimiter"), last, position, position); 1568 restoreState(saved); 1569 return false; 1570 } 1571 skip(1); 1572 } 1573 1574 // Check for identifier. 1575 if (identLength == 0) { 1576 // Treat as shift. 1577 restoreState(saved); 1578 1579 return false; 1580 } 1581 1582 // Record rest of line. 1583 final State restState = saveState(); 1584 // keep line number updated 1585 int lastLine = line; 1586 1587 skipLine(false); 1588 lastLine++; 1589 int lastLinePosition = position; 1590 restState.setLimit(position); 1591 1592 // Record beginning of string. 1593 final State stringState = saveState(); 1594 int stringEnd = position; 1595 1596 // Hunt down marker. 1597 while (!atEOF()) { 1598 // Skip any whitespace. 
1599 skipWhitespace(false); 1600 1601 if (hasHereMarker(identStart, identLength)) { 1602 break; 1603 } 1604 1605 skipLine(false); 1606 lastLine++; 1607 lastLinePosition = position; 1608 stringEnd = position; 1609 } 1610 1611 // notify last line information 1612 lir.lineInfo(lastLine, lastLinePosition); 1613 1614 // Record end of string. 1615 stringState.setLimit(stringEnd); 1616 1617 // If marker is missing. 1618 if (stringState.isEmpty() || atEOF()) { 1619 error(Lexer.message("here.missing.end.marker", source.getString(identStart, identLength)), last, position, position); 1620 restoreState(saved); 1621 1622 return false; 1623 } 1624 1625 // Remove last end of line if specified. 1626 if (excludeLastEOL) { 1627 // Handles \n. 1628 if (content[stringEnd - 1] == '\n') { 1629 stringEnd--; 1630 } 1631 1632 // Handles \r and \r\n. 1633 if (content[stringEnd - 1] == '\r') { 1634 stringEnd--; 1635 } 1636 1637 // Update end of string. 1638 stringState.setLimit(stringEnd); 1639 } 1640 1641 // Edit string if appropriate. 1642 if (!noStringEditing && !stringState.isEmpty()) { 1643 editString(STRING, stringState); 1644 } else { 1645 // Add here string. 1646 add(STRING, stringState.position, stringState.limit); 1647 } 1648 1649 // Scan rest of original line. 1650 final Lexer restLexer = new Lexer(this, restState); 1651 1652 restLexer.lexify(); 1653 1654 return true; 1655 } 1656 1657 return false; 1658 } 1659 1660 /** 1661 * Breaks source content down into lex units, adding tokens to the token 1662 * stream. The routine scans until the stream buffer is full. Can be called 1663 * repeatedly until EOF is detected. 1664 */ 1665 public void lexify() { 1666 while (!stream.isFull() || nested) { 1667 // Skip over whitespace. 1668 skipWhitespace(true); 1669 1670 // Detect end of file. 1671 if (atEOF()) { 1672 if (!nested) { 1673 // Add an EOF token at the end. 1674 add(EOF, position); 1675 } 1676 1677 break; 1678 } 1679 1680 // Check for comments. 
Note that we don't scan for regexp and other literals here as 1681 // we may not have enough context to distinguish them from similar looking operators. 1682 // Instead we break on ambiguous operators below and let the parser decide. 1683 if (ch0 == '/' && skipComments()) { 1684 continue; 1685 } 1686 1687 if (scripting && ch0 == '#' && skipComments()) { 1688 continue; 1689 } 1690 1691 // TokenType for lookup of delimiter or operator. 1692 TokenType type; 1693 1694 if (ch0 == '.' && convertDigit(ch1, 10) != -1) { 1695 // '.' followed by digit. 1696 // Scan and add a number. 1697 scanNumber(); 1698 } else if ((type = TokenLookup.lookupOperator(ch0, ch1, ch2, ch3)) != null) { 1699 if (templateExpressionOpenBraces > 0) { 1700 if (type == LBRACE) { 1701 templateExpressionOpenBraces++; 1702 } else if (type == RBRACE) { 1703 if (--templateExpressionOpenBraces == 0) { 1704 break; 1705 } 1706 } 1707 } 1708 1709 // Get the number of characters in the token. 1710 final int typeLength = type.getLength(); 1711 // Skip that many characters. 1712 skip(typeLength); 1713 // Add operator token. 1714 add(type, position - typeLength); 1715 // Some operator tokens also mark the beginning of regexp, XML, or here string literals. 1716 // We break to let the parser decide what it is. 1717 if (canStartLiteral(type)) { 1718 break; 1719 } else if (type == LBRACE && pauseOnNextLeftBrace) { 1720 pauseOnNextLeftBrace = false; 1721 break; 1722 } 1723 } else if (Character.isJavaIdentifierStart(ch0) || ch0 == '\\' && ch1 == 'u') { 1724 // Scan and add identifier or keyword. 1725 scanIdentifierOrKeyword(); 1726 } else if (isStringDelimiter(ch0)) { 1727 // Scan and add a string. 1728 scanString(true); 1729 } else if (Character.isDigit(ch0)) { 1730 // Scan and add a number. 1731 scanNumber(); 1732 } else if (isTemplateDelimiter(ch0) && es6) { 1733 // Scan and add template in ES6 mode. 
1734 scanTemplate(); 1735 } else if (isTemplateDelimiter(ch0) && scripting) { 1736 // Scan and add an exec string ('`') in scripting mode. 1737 scanString(true); 1738 } else { 1739 // Don't recognize this character. 1740 skip(1); 1741 add(ERROR, position - 1); 1742 } 1743 } 1744 } 1745 1746 /** 1747 * Return value of token given its token descriptor. 1748 * 1749 * @param token Token descriptor. 1750 * @return JavaScript value. 1751 */ 1752 Object getValueOf(final long token, final boolean strict) { 1753 final int start = Token.descPosition(token); 1754 final int len = Token.descLength(token); 1755 1756 switch (Token.descType(token)) { 1757 case DECIMAL: 1758 return Lexer.valueOf(source.getString(start, len), 10); // number 1759 case HEXADECIMAL: 1760 return Lexer.valueOf(source.getString(start + 2, len - 2), 16); // number 1761 case OCTAL_LEGACY: 1762 return Lexer.valueOf(source.getString(start, len), 8); // number 1763 case OCTAL: 1764 return Lexer.valueOf(source.getString(start + 2, len - 2), 8); // number 1765 case BINARY_NUMBER: 1766 return Lexer.valueOf(source.getString(start + 2, len - 2), 2); // number 1767 case FLOATING: 1768 final String str = source.getString(start, len); 1769 final double value = Double.valueOf(str); 1770 if (str.indexOf('.') != -1) { 1771 return value; //number 1772 } 1773 //anything without an explicit decimal point is still subject to a 1774 //"representable as int or long" check. Then the programmer does not 1775 //explicitly code something as a double. For example new Color(int, int, int) 1776 //and new Color(float, float, float) will get ambiguous for cases like 1777 //new Color(1.0, 1.5, 1.5) if we don't respect the decimal point. 1778 //yet we don't want e.g. 
1e6 to be a double unnecessarily 1779 if (JSType.isStrictlyRepresentableAsInt(value)) { 1780 return (int)value; 1781 } 1782 return value; 1783 case STRING: 1784 return source.getString(start, len); // String 1785 case ESCSTRING: 1786 return valueOfString(start, len, strict); // String 1787 case IDENT: 1788 return valueOfIdent(start, len); // String 1789 case REGEX: 1790 return valueOfPattern(start, len); // RegexToken::LexerToken 1791 case TEMPLATE: 1792 case TEMPLATE_HEAD: 1793 case TEMPLATE_MIDDLE: 1794 case TEMPLATE_TAIL: 1795 return valueOfString(start, len, true); // String 1796 case XML: 1797 return valueOfXML(start, len); // XMLToken::LexerToken 1798 case DIRECTIVE_COMMENT: 1799 return source.getString(start, len); 1800 default: 1801 break; 1802 } 1803 1804 return null; 1805 } 1806 1807 /** 1808 * Get the raw string value of a template literal string part. 1809 * 1810 * @param token template string token 1811 * @return raw string 1812 */ 1813 public String valueOfRawString(final long token) { 1814 final int start = Token.descPosition(token); 1815 final int length = Token.descLength(token); 1816 1817 // Save the current position. 1818 final int savePosition = position; 1819 // Calculate the end position. 1820 final int end = start + length; 1821 // Reset to beginning of string. 1822 reset(start); 1823 1824 // Buffer for recording characters. 1825 final StringBuilder sb = new StringBuilder(length); 1826 1827 // Scan until end of string. 1828 while (position < end) { 1829 if (ch0 == '\r') { 1830 // Convert CR-LF or CR to LF line terminator. 1831 sb.append('\n'); 1832 skip(ch1 == '\n' ? 2 : 1); 1833 } else { 1834 // Add regular character. 1835 sb.append(ch0); 1836 skip(1); 1837 } 1838 } 1839 1840 // Restore position. 
1841 reset(savePosition); 1842 1843 return sb.toString(); 1844 } 1845 1846 /** 1847 * Get the correctly localized error message for a given message id format arguments 1848 * @param msgId message id 1849 * @param args format arguments 1850 * @return message 1851 */ 1852 protected static String message(final String msgId, final String... args) { 1853 return ECMAErrors.getMessage("lexer.error." + msgId, args); 1854 } 1855 1856 /** 1857 * Generate a runtime exception 1858 * 1859 * @param message error message 1860 * @param type token type 1861 * @param start start position of lexed error 1862 * @param length length of lexed error 1863 * @throws ParserException unconditionally 1864 */ 1865 protected void error(final String message, final TokenType type, final int start, final int length) throws ParserException { 1866 final long token = Token.toDesc(type, start, length); 1867 final int pos = Token.descPosition(token); 1868 final int lineNum = source.getLine(pos); 1869 final int columnNum = source.getColumn(pos); 1870 final String formatted = ErrorManager.format(message, source, lineNum, columnNum, token); 1871 throw new ParserException(JSErrorType.SYNTAX_ERROR, formatted, source, lineNum, columnNum, token); 1872 } 1873 1874 /** 1875 * Helper class for Lexer tokens, e.g XML or RegExp tokens. 1876 * This is the abstract superclass 1877 */ 1878 public static abstract class LexerToken implements Serializable { 1879 private static final long serialVersionUID = 1L; 1880 1881 private final String expression; 1882 1883 /** 1884 * Constructor 1885 * @param expression token expression 1886 */ 1887 protected LexerToken(final String expression) { 1888 this.expression = expression; 1889 } 1890 1891 /** 1892 * Get the expression 1893 * @return expression 1894 */ 1895 public String getExpression() { 1896 return expression; 1897 } 1898 } 1899 1900 /** 1901 * Temporary container for regular expressions. 
*/
    public static class RegexToken extends LexerToken {
        private static final long serialVersionUID = 1L;

        /** Options. */
        private final String options;

        /**
         * Constructor.
         *
         * @param expression  regexp expression
         * @param options     regexp options
         */
        public RegexToken(final String expression, final String options) {
            super(expression);
            this.options = options;
        }

        /**
         * Get regexp options
         * @return options
         */
        public String getOptions() {
            return options;
        }

        /**
         * Render the token as the expression between slashes, followed by the options.
         */
        @Override
        public String toString() {
            return '/' + getExpression() + '/' + options;
        }
    }

    /**
     * Temporary container for XML expression.
     */
    public static class XMLToken extends LexerToken {
        private static final long serialVersionUID = 1L;

        /**
         * Constructor.
         *
         * @param expression  XML expression
         */
        public XMLToken(final String expression) {
            super(expression);
        }
    }
}