Source code for tic.utils.jsparser

#!/usr/bin/python2.5

# ***** BEGIN LICENSE BLOCK *****
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is the Narcissus JavaScript engine, written in Javascript.
#
# The Initial Developer of the Original Code is
# Brendan Eich <brendan@mozilla.org>.
# Portions created by the Initial Developer are Copyright (C) 2004
# the Initial Developer. All Rights Reserved.
#
# The Python version of the code was created by JT Olds <jtolds@xnet5.com>,
# and is a direct translation from the Javascript version.
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ***** END LICENSE BLOCK ***** */

"""
 PyNarcissus

 A lexical scanner and parser. JS implemented in JS, ported to Python.
"""

__author__ = "JT Olds"
__author_email__ = "jtolds@xnet5.com"
__date__ = "2009-03-24"
__all__ = ["ParseError", "parse", "tokens"]

import re, sys, types

class Object: pass
class Error_(Exception): pass
[docs]class ParseError(Error_): pass
tokens = dict(enumerate(( # End of source. "END", # Operators and punctuators. Some pair-wise order matters, e.g. (+, -) # and (UNARY_PLUS, UNARY_MINUS). "\n", ";", ",", "=", "?", ":", "CONDITIONAL", "||", "&&", "|", "^", "&", "==", "!=", "===", "!==", "<", "<=", ">=", ">", "<<", ">>", ">>>", "+", "-", "*", "/", "%", "!", "~", "UNARY_PLUS", "UNARY_MINUS", "++", "--", ".", "[", "]", "{", "}", "(", ")", # Nonterminal tree node type codes. "SCRIPT", "BLOCK", "LABEL", "FOR_IN", "CALL", "NEW_WITH_ARGS", "INDEX", "ARRAY_INIT", "OBJECT_INIT", "PROPERTY_INIT", "GETTER", "SETTER", "GROUP", "LIST", # Terminals. "IDENTIFIER", "NUMBER", "STRING", "REGEXP", # Keywords. "break", "case", "catch", "const", "continue", "debugger", "default", "delete", "do", "else", "enum", "false", "finally", "for", "function", "if", "in", "instanceof", "new", "null", "return", "switch", "this", "throw", "true", "try", "typeof", "var", "void", "while", "with"))) # Operator and punctuator mapping from token to tree node type name. # NB: superstring tokens (e.g., ++) must come before their substring token # counterparts (+ in the example), so that the opRegExp regular expression # synthesized from this list makes the longest possible match. opTypeNames = [ ('\n', "NEWLINE"), (';', "SEMICOLON"), (',', "COMMA"), ('?', "HOOK"), (':', "COLON"), ('||', "OR"), ('&&', "AND"), ('|', "BITWISE_OR"), ('^', "BITWISE_XOR"), ('&', "BITWISE_AND"), ('===', "STRICT_EQ"), ('==', "EQ"), ('=', "ASSIGN"), ('!==', "STRICT_NE"), ('!=', "NE"), ('<<', "LSH"), ('<=', "LE"), ('<', "LT"), ('>>>', "URSH"), ('>>', "RSH"), ('>=', "GE"), ('>', "GT"), ('++', "INCREMENT"), ('--', "DECREMENT"), ('+', "PLUS"), ('-', "MINUS"), ('*', "MUL"), ('/', "DIV"), ('%', "MOD"), ('!', "NOT"), ('~', "BITWISE_NOT"), ('.', "DOT"), ('[', "LEFT_BRACKET"), (']', "RIGHT_BRACKET"), ('{', "LEFT_CURLY"), ('}', "RIGHT_CURLY"), ('(', "LEFT_PAREN"), (')', "RIGHT_PAREN"), ] keywords = {} # Define const END, etc., based on the token names. Also map name to index. for i, t in tokens.copy().iteritems(): if re.match(r'^[a-z]', t): const_name = t.upper() keywords[t] = i elif re.match(r'^\W', t): const_name = dict(opTypeNames)[t] else: const_name = t globals()[const_name] = i tokens[t] = i assignOps = {} # Map assignment operators to their indexes in the tokens array. for i, t in enumerate(['|', '^', '&', '<<', '>>', '>>>', '+', '-', '*', '/', '%']): assignOps[t] = tokens[t] assignOps[i] = t # Build a regexp that recognizes operators and punctuators (except newline). opRegExpSrc = "^" for i, j in opTypeNames: if i == "\n": continue if opRegExpSrc != "^": opRegExpSrc += "|^" opRegExpSrc += re.sub(r'[?|^&(){}\[\]+\-*\/\.]', lambda x: "\\%s" % x.group(0), i) opRegExp = re.compile(opRegExpSrc) # Convert opTypeNames to an actual dictionary now that we don't care about ordering opTypeNames = dict(opTypeNames) # A regexp to match floating point literals (but not integer literals). fpRegExp = re.compile(r'^\d+\.\d*(?:[eE][-+]?\d+)?|^\d+(?:\.\d*)?[eE][-+]?\d+|^\.\d+(?:[eE][-+]?\d+)?') # A regexp to match regexp literals. reRegExp = re.compile(r'^\/((?:\\.|\[(?:\\.|[^\]])*\]|[^\/])+)\/([gimy]*)') class SyntaxError_(ParseError): def __init__(self, message, filename, lineno): ParseError.__init__(self, "Syntax error: %s\n%s:%s" % (message, filename, lineno)) class Tokenizer(object): def __init__(self, s, f, l): self.cursor = 0 self.source = str(s) self.tokens = {} self.tokenIndex = 0 self.lookahead = 0 self.scanNewlines = False self.scanOperand = True self.filename = f self.lineno = l input_ = property(lambda self: self.source[self.cursor:]) done = property(lambda self: self.peek() == END) token = property(lambda self: self.tokens.get(self.tokenIndex)) def match(self, tt): return self.get() == tt or self.unget() def mustMatch(self, tt): if not self.match(tt): raise self.newSyntaxError("Missing " + tokens.get(tt).lower()) return self.token def peek(self): if self.lookahead: next = self.tokens.get((self.tokenIndex + self.lookahead) & 3) if self.scanNewlines and (getattr(next, "lineno", None) != getattr(self, "lineno", None)): tt = NEWLINE else: tt = getattr(next, "type_", None) else: tt = self.get() self.unget() return tt def peekOnSameLine(self): self.scanNewlines = True tt = self.peek() self.scanNewlines = False return tt def get(self): while self.lookahead: self.lookahead -= 1 self.tokenIndex = (self.tokenIndex + 1) & 3 token = self.tokens.get(self.tokenIndex) if getattr(token, "type_", None) != NEWLINE or self.scanNewlines: return getattr(token, "type_", None) while True: input__ = self.input_ if self.scanNewlines: match = re.match(r'^[ \t]+', input__) else: match = re.match(r'^\s+', input__) if match: spaces = match.group(0) self.cursor += len(spaces) newlines = re.findall(r'\n', spaces) if newlines: self.lineno += len(newlines) input__ = self.input_ match = re.match(r'^\/(?:\*(?:.|\n)*?\*\/|\/.*)', input__) if not match: break comment = match.group(0) self.cursor += len(comment) newlines = re.findall(r'\n', comment) if newlines: self.lineno += len(newlines) self.tokenIndex = (self.tokenIndex + 1) & 3 token = self.tokens.get(self.tokenIndex) if not token: token = Object() self.tokens[self.tokenIndex] = token if not input__: token.type_ = END return END def matchInput(): match = fpRegExp.match(input__) if match: token.type_ = NUMBER token.value = float(match.group(0)) return match.group(0) match = re.match(r'^0[xX][\da-fA-F]+|^0[0-7]*|^\d+', input__) if match: token.type_ = NUMBER token.value = eval(match.group(0)) return match.group(0) match = re.match(r'^[$_\w]+', input__) # FIXME no ES3 unicode if match: id_ = match.group(0) token.type_ = keywords.get(id_, IDENTIFIER) token.value = id_ return match.group(0) match = re.match(r'^"(?:\\.|[^"])*"|^\'(?:\\.|[^\'])*\'', input__) if match: token.type_ = STRING token.value = eval(match.group(0)) return match.group(0) if self.scanOperand: match = reRegExp.match(input__) if match: token.type_ = REGEXP token.value = {"regexp": match.group(1), "modifiers": match.group(2)} return match.group(0) match = opRegExp.match(input__) if match: op = match.group(0) if assignOps.has_key(op) and input__[len(op)] == '=': token.type_ = ASSIGN token.assignOp = globals()[opTypeNames[op]] token.value = op return match.group(0) + "=" token.type_ = globals()[opTypeNames[op]] if self.scanOperand and (token.type_ in (PLUS, MINUS)): token.type_ += UNARY_PLUS - PLUS token.assignOp = None token.value = op return match.group(0) if self.scanNewlines: match = re.match(r'^\n', input__) if match: token.type_ = NEWLINE return match.group(0) raise self.newSyntaxError("Illegal token") token.start = self.cursor self.cursor += len(matchInput()) token.end = self.cursor token.lineno = self.lineno return getattr(token, "type_", None) def unget(self): self.lookahead += 1 if self.lookahead == 4: raise "PANIC: too much lookahead!" self.tokenIndex = (self.tokenIndex - 1) & 3 def newSyntaxError(self, m): return SyntaxError_(m, self.filename, self.lineno) class CompilerContext(object): def __init__(self, inFunction): self.inFunction = inFunction self.stmtStack = [] self.funDecls = [] self.varDecls = [] self.bracketLevel = 0 self.curlyLevel = 0 self.parenLevel = 0 self.hookLevel = 0 self.ecmaStrictMode = False self.inForLoopInit = False def Script(t, x): n = Statements(t, x) n.type_ = SCRIPT n.funDecls = x.funDecls n.varDecls = x.varDecls return n class Node(list): def __init__(self, t, type_=None, args=[]): list.__init__(self) token = t.token if token: if type_: self.type_ = type_ else: self.type_ = getattr(token, "type_", None) self.value = token.value self.lineno = token.lineno self.start = token.start self.end = token.end else: self.type_ = type_ self.lineno = t.lineno self.tokenizer = t for arg in args: self.append(arg) type = property(lambda self: tokenstr(self.type_)) # Always use push to add operands to an expression, to update start and end. def append(self, kid, numbers=[]): if kid: if hasattr(self, "start") and kid.start < self.start: self.start = kid.start if hasattr(self, "end") and self.end < kid.end: self.end = kid.end return list.append(self, kid) indentLevel = 0 def __str__(self): a = list((str(i), v) for i, v in enumerate(self)) for attr in dir(self): if attr[0] == "_": continue elif attr == "tokenizer": a.append((attr, "[object Object]")) elif attr in ("append", "count", "extend", "getSource", "index", "insert", "pop", "remove", "reverse", "sort", "type_", "target", "filename", "indentLevel", "type"): continue else: a.append((attr, getattr(self, attr))) if len(self): a.append(("length", len(self))) a.sort(lambda a, b: cmp(a[0], b[0])) INDENTATION = " " Node.indentLevel += 1 n = Node.indentLevel s = "{\n%stype: %s" % ((INDENTATION * n), tokenstr(self.type_)) for i, value in a: s += ",\n%s%s: " % ((INDENTATION * n), i) if i == "value" and self.type_ == REGEXP: s += "/%s/%s" % (value["regexp"], value["modifiers"]) elif value is None: s += "null" elif value is False: s += "false" elif value is True: s += "true" elif type(value) == list: s += ','.join((str(x) for x in value)) else: s += str(value) Node.indentLevel -= 1 n = Node.indentLevel s += "\n%s}" % (INDENTATION * n) return s __repr__ = __str__ def getSource(self): if getattr(self, "start", None) is not None: if getattr(self, "end", None) is not None: return self.tokenizer.source[self.start:self.end] return self.tokenizer.source[self.start:] if getattr(self, "end", None) is not None: return self.tokenizer.source[:self.end] return self.tokenizer.source[:] filename = property(lambda self: self.tokenizer.filename) def __nonzero__(self): return True # Statement stack and nested statement handler. def nest(t, x, node, func, end=None): x.stmtStack.append(node) n = func(t, x) x.stmtStack.pop() if end: t.mustMatch(end) return n def tokenstr(tt): t = tokens[tt] if re.match(r'^\W', t): return opTypeNames[t] return t.upper() def Statements(t, x): n = Node(t, BLOCK) x.stmtStack.append(n) while not t.done and t.peek() != RIGHT_CURLY: n.append(Statement(t, x)) x.stmtStack.pop() return n def Block(t, x): t.mustMatch(LEFT_CURLY) n = Statements(t, x) t.mustMatch(RIGHT_CURLY) return n DECLARED_FORM = 0 EXPRESSED_FORM = 1 STATEMENT_FORM = 2 def Statement(t, x): tt = t.get() # Cases for statements ending in a right curly return early, avoiding the # common semicolon insertion magic after this switch. if tt == FUNCTION: if len(x.stmtStack) > 1: type_ = STATEMENT_FORM else: type_ = DECLARED_FORM return FunctionDefinition(t, x, True, type_) elif tt == LEFT_CURLY: n = Statements(t, x) t.mustMatch(RIGHT_CURLY) return n elif tt == IF: n = Node(t) n.condition = ParenExpression(t, x) x.stmtStack.append(n) n.thenPart = Statement(t, x) if t.match(ELSE): n.elsePart = Statement(t, x) else: n.elsePart = None x.stmtStack.pop() return n elif tt == SWITCH: n = Node(t) t.mustMatch(LEFT_PAREN) n.discriminant = Expression(t, x) t.mustMatch(RIGHT_PAREN) n.cases = [] n.defaultIndex = -1 x.stmtStack.append(n) t.mustMatch(LEFT_CURLY) while True: tt = t.get() if tt == RIGHT_CURLY: break if tt in (DEFAULT, CASE): if tt == DEFAULT and n.defaultIndex >= 0: raise t.newSyntaxError("More than one switch default") n2 = Node(t) if tt == DEFAULT: n.defaultIndex = len(n.cases) else: n2.caseLabel = Expression(t, x, COLON) else: raise t.newSyntaxError("Invalid switch case") t.mustMatch(COLON) n2.statements = Node(t, BLOCK) while True: tt = t.peek() if(tt == CASE or tt == DEFAULT or tt == RIGHT_CURLY): break n2.statements.append(Statement(t, x)) n.cases.append(n2) x.stmtStack.pop() return n elif tt == FOR: n = Node(t) n2 = None n.isLoop = True t.mustMatch(LEFT_PAREN) tt = t.peek() if tt != SEMICOLON: x.inForLoopInit = True if tt == VAR or tt == CONST: t.get() n2 = Variables(t, x) else: n2 = Expression(t, x) x.inForLoopInit = False if n2 and t.match(IN): n.type_ = FOR_IN if n2.type_ == VAR: if len(n2) != 1: raise SyntaxError("Invalid for..in left-hand side", t.filename, n2.lineno) # NB: n2[0].type_ == INDENTIFIER and n2[0].value == n2[0].name n.iterator = n2[0] n.varDecl = n2 else: n.iterator = n2 n.varDecl = None n.object = Expression(t, x) else: if n2: n.setup = n2 else: n.setup = None t.mustMatch(SEMICOLON) if t.peek() == SEMICOLON: n.condition = None else: n.condition = Expression(t, x) t.mustMatch(SEMICOLON) if t.peek() == RIGHT_PAREN: n.update = None else: n.update = Expression(t, x) t.mustMatch(RIGHT_PAREN) n.body = nest(t, x, n, Statement) return n elif tt == WHILE: n = Node(t) n.isLoop = True n.condition = ParenExpression(t, x) n.body = nest(t, x, n, Statement) return n elif tt == DO: n = Node(t) n.isLoop = True n.body = nest(t, x, n, Statement, WHILE) n.condition = ParenExpression(t, x) if not x.ecmaStrictMode: # <script language="JavaScript"> (without version hints) may need # automatic semicolon insertion without a newline after do-while. # See http://bugzilla.mozilla.org/show_bug.cgi?id=238945. t.match(SEMICOLON) return n elif tt in (BREAK, CONTINUE): n = Node(t) if t.peekOnSameLine() == IDENTIFIER: t.get() n.label = t.token.value ss = x.stmtStack i = len(ss) label = getattr(n, "label", None) if label: while True: i -= 1 if i < 0: raise t.newSyntaxError("Label not found") if getattr(ss[i], "label", None) == label: break else: while True: i -= 1 if i < 0: if tt == BREAK: raise t.newSyntaxError("Invalid break") else: raise t.newSyntaxError("Invalid continue") if (getattr(ss[i], "isLoop", None) or (tt == BREAK and ss[i].type_ == SWITCH)): break n.target = ss[i] elif tt == TRY: n = Node(t) n.tryBlock = Block(t, x) n.catchClauses = [] while t.match(CATCH): n2 = Node(t) t.mustMatch(LEFT_PAREN) n2.varName = t.mustMatch(IDENTIFIER).value if t.match(IF): if x.ecmaStrictMode: raise t.newSyntaxError("Illegal catch guard") if n.catchClauses and not n.catchClauses[-1].guard: raise t.newSyntaxError("Gaurded catch after unguarded") n2.guard = Expression(t, x) else: n2.guard = None t.mustMatch(RIGHT_PAREN) n2.block = Block(t, x) n.catchClauses.append(n2) if t.match(FINALLY): n.finallyBlock = Block(t, x) if not n.catchClauses and not getattr(n, "finallyBlock", None): raise t.newSyntaxError("Invalid try statement") return n elif tt in (CATCH, FINALLY): raise t.newSyntaxError(tokens[tt] + " without preceding try") elif tt == THROW: n = Node(t) n.exception = Expression(t, x) elif tt == RETURN: if not x.inFunction: raise t.newSyntaxError("Invalid return") n = Node(t) tt = t.peekOnSameLine() if tt not in (END, NEWLINE, SEMICOLON, RIGHT_CURLY): n.value = Expression(t, x) elif tt == WITH: n = Node(t) n.object = ParenExpression(t, x) n.body = nest(t, x, n, Statement) return n elif tt in (VAR, CONST): n = Variables(t, x) elif tt == DEBUGGER: n = Node(t) elif tt in (NEWLINE, SEMICOLON): n = Node(t, SEMICOLON) n.expression = None return n else: if tt == IDENTIFIER: t.scanOperand = False tt = t.peek() t.scanOperand = True if tt == COLON: label = t.token.value ss = x.stmtStack i = len(ss) - 1 while i >= 0: if getattr(ss[i], "label", None) == label: raise t.newSyntaxError("Duplicate label") i -= 1 t.get() n = Node(t, LABEL) n.label = label n.statement = nest(t, x, n, Statement) return n n = Node(t, SEMICOLON) t.unget() n.expression = Expression(t, x) n.end = n.expression.end if t.lineno == t.token.lineno: tt = t.peekOnSameLine() if tt not in (END, NEWLINE, SEMICOLON, RIGHT_CURLY): raise t.newSyntaxError("Missing ; before statement") t.match(SEMICOLON) return n def FunctionDefinition(t, x, requireName, functionForm): f = Node(t) if f.type_ != FUNCTION: if f.value == "get": f.type_ = GETTER else: f.type_ = SETTER if t.match(IDENTIFIER): f.name = t.token.value elif requireName: raise t.newSyntaxError("Missing function identifier") t.mustMatch(LEFT_PAREN) f.params = [] while True: tt = t.get() if tt == RIGHT_PAREN: break if tt != IDENTIFIER: raise t.newSyntaxError("Missing formal parameter") f.params.append(t.token.value) if t.peek() != RIGHT_PAREN: t.mustMatch(COMMA) t.mustMatch(LEFT_CURLY) x2 = CompilerContext(True) f.body = Script(t, x2) t.mustMatch(RIGHT_CURLY) f.end = t.token.end f.functionForm = functionForm if functionForm == DECLARED_FORM: x.funDecls.append(f) return f def Variables(t, x): n = Node(t) while True: t.mustMatch(IDENTIFIER) n2 = Node(t) n2.name = n2.value if t.match(ASSIGN): if t.token.assignOp: raise t.newSyntaxError("Invalid variable initialization") n2.initializer = Expression(t, x, COMMA) n2.readOnly = not not (n.type_ == CONST) n.append(n2) x.varDecls.append(n2) if not t.match(COMMA): break return n def ParenExpression(t, x): t.mustMatch(LEFT_PAREN) n = Expression(t, x) t.mustMatch(RIGHT_PAREN) return n opPrecedence = { "SEMICOLON": 0, "COMMA": 1, "ASSIGN": 2, "HOOK": 2, "COLON": 2, # The above all have to have the same precedence, see bug 330975. "OR": 4, "AND": 5, "BITWISE_OR": 6, "BITWISE_XOR": 7, "BITWISE_AND": 8, "EQ": 9, "NE": 9, "STRICT_EQ": 9, "STRICT_NE": 9, "LT": 10, "LE": 10, "GE": 10, "GT": 10, "IN": 10, "INSTANCEOF": 10, "LSH": 11, "RSH": 11, "URSH": 11, "PLUS": 12, "MINUS": 12, "MUL": 13, "DIV": 13, "MOD": 13, "DELETE": 14, "VOID": 14, "TYPEOF": 14, # "PRE_INCREMENT": 14, "PRE_DECREMENT": 14, "NOT": 14, "BITWISE_NOT": 14, "UNARY_PLUS": 14, "UNARY_MINUS": 14, "INCREMENT": 15, "DECREMENT": 15, # postfix "NEW": 16, "DOT": 17 } # Map operator type code to precedence for i in opPrecedence.copy(): opPrecedence[globals()[i]] = opPrecedence[i] opArity = { "COMMA": -2, "ASSIGN": 2, "HOOK": 3, "OR": 2, "AND": 2, "BITWISE_OR": 2, "BITWISE_XOR": 2, "BITWISE_AND": 2, "EQ": 2, "NE": 2, "STRICT_EQ": 2, "STRICT_NE": 2, "LT": 2, "LE": 2, "GE": 2, "GT": 2, "IN": 2, "INSTANCEOF": 2, "LSH": 2, "RSH": 2, "URSH": 2, "PLUS": 2, "MINUS": 2, "MUL": 2, "DIV": 2, "MOD": 2, "DELETE": 1, "VOID": 1, "TYPEOF": 1, # "PRE_INCREMENT": 1, "PRE_DECREMENT": 1, "NOT": 1, "BITWISE_NOT": 1, "UNARY_PLUS": 1, "UNARY_MINUS": 1, "INCREMENT": 1, "DECREMENT": 1, # postfix "NEW": 1, "NEW_WITH_ARGS": 2, "DOT": 2, "INDEX": 2, "CALL": 2, "ARRAY_INIT": 1, "OBJECT_INIT": 1, "GROUP": 1 } # Map operator type code to arity. for i in opArity.copy(): opArity[globals()[i]] = opArity[i] def Expression(t, x, stop=None): operators = [] operands = [] bl = x.bracketLevel cl = x.curlyLevel pl = x.parenLevel hl = x.hookLevel def reduce_(): n = operators.pop() op = n.type_ arity = opArity[op] if arity == -2: # Flatten left-associative trees. left = (len(operands) >= 2 and operands[-2]) if left.type_ == op: right = operands.pop() left.append(right) return left arity = 2 # Always use append to add operands to n, to update start and end. a = operands[-arity:] del operands[-arity:] for operand in a: n.append(operand) # Include closing bracket or postfix operator in [start,end). if n.end < t.token.end: n.end = t.token.end operands.append(n) return n class BreakOutOfLoops(Exception): pass try: while True: tt = t.get() if tt == END: break if (tt == stop and x.bracketLevel == bl and x.curlyLevel == cl and x.parenLevel == pl and x.hookLevel == hl): # Stop only if tt matches the optional stop parameter, and that # token is not quoted by some kind of bracket. break if tt == SEMICOLON: # NB: cannot be empty, Statement handled that. raise BreakOutOfLoops elif tt in (ASSIGN, HOOK, COLON): if t.scanOperand: raise BreakOutOfLoops while ((operators and opPrecedence.get(operators[-1].type_, None) > opPrecedence.get(tt)) or (tt == COLON and operators and operators[-1].type_ == ASSIGN)): reduce_() if tt == COLON: if operators: n = operators[-1] if not operators or n.type_ != HOOK: raise t.newSyntaxError("Invalid label") x.hookLevel -= 1 else: operators.append(Node(t)) if tt == ASSIGN: operands[-1].assignOp = t.token.assignOp else: x.hookLevel += 1 t.scanOperand = True elif tt in (IN, COMMA, OR, AND, BITWISE_OR, BITWISE_XOR, BITWISE_AND, EQ, NE, STRICT_EQ, STRICT_NE, LT, LE, GE, GT, INSTANCEOF, LSH, RSH, URSH, PLUS, MINUS, MUL, DIV, MOD, DOT): # We're treating comma as left-associative so reduce can fold # left-heavy COMMA trees into a single array. if tt == IN: # An in operator should not be parsed if we're parsing the # head of a for (...) loop, unless it is in the then part of # a conditional expression, or parenthesized somehow. if (x.inForLoopInit and not x.hookLevel and not x.bracketLevel and not x.curlyLevel and not x.parenLevel): raise BreakOutOfLoops if t.scanOperand: raise BreakOutOfLoops while (operators and opPrecedence.get(operators[-1].type_) >= opPrecedence.get(tt)): reduce_() if tt == DOT: t.mustMatch(IDENTIFIER) operands.append(Node(t, DOT, [operands.pop(), Node(t)])) else: operators.append(Node(t)) t.scanOperand = True elif tt in (DELETE, VOID, TYPEOF, NOT, BITWISE_NOT, UNARY_PLUS, UNARY_MINUS, NEW): if not t.scanOperand: raise BreakOutOfLoops operators.append(Node(t)) elif tt in (INCREMENT, DECREMENT): if t.scanOperand: operators.append(Node(t)) # prefix increment or decrement else: # Don't cross a line boundary for postfix {in,de}crement. if (t.tokens.get((t.tokenIndex + t.lookahead - 1) & 3).lineno != t.lineno): raise BreakOutOfLoops # Use >, not >=, so postfix has higher precedence than # prefix. while (operators and opPrecedence.get(operators[-1].type_, None) > opPrecedence.get(tt)): reduce_() n = Node(t, tt, [operands.pop()]) n.postfix = True operands.append(n) elif tt == FUNCTION: if not t.scanOperand: raise BreakOutOfLoops operands.append(FunctionDefinition(t, x, False, EXPRESSED_FORM)) t.scanOperand = False elif tt in (NULL, THIS, TRUE, FALSE, IDENTIFIER, NUMBER, STRING, REGEXP): if not t.scanOperand: raise BreakOutOfLoops operands.append(Node(t)) t.scanOperand = False elif tt == LEFT_BRACKET: if t.scanOperand: # Array initializer. Parse using recursive descent, as the # sub-grammer here is not an operator grammar. n = Node(t, ARRAY_INIT) while True: tt = t.peek() if tt == RIGHT_BRACKET: break if tt == COMMA: t.get() n.append(None) continue n.append(Expression(t, x, COMMA)) if not t.match(COMMA): break t.mustMatch(RIGHT_BRACKET) operands.append(n) t.scanOperand = False else: operators.append(Node(t, INDEX)) t.scanOperand = True x.bracketLevel += 1 elif tt == RIGHT_BRACKET: if t.scanOperand or x.bracketLevel == bl: raise BreakOutOfLoops while reduce_().type_ != INDEX: continue x.bracketLevel -= 1 elif tt == LEFT_CURLY: if not t.scanOperand: raise BreakOutOfLoops # Object initializer. As for array initializers (see above), # parse using recursive descent. x.curlyLevel += 1 n = Node(t, OBJECT_INIT) class BreakOutOfObjectInit(Exception): pass try: if not t.match(RIGHT_CURLY): while True: tt = t.get() if ((t.token.value == "get" or t.token.value == "set") and t.peek == IDENTIFIER): if x.ecmaStrictMode: raise t.newSyntaxError("Illegal property " "accessor") n.append(FunctionDefinition(t, x, True, EXPRESSED_FORM)) else: if tt in (IDENTIFIER, NUMBER, STRING): id_ = Node(t) elif tt == RIGHT_CURLY: if x.ecmaStrictMode: raise t.newSyntaxError("Illegal " "trailing ,") raise BreakOutOfObjectInit else: raise t.newSyntaxError("Invalid property " "name") t.mustMatch(COLON) n.append(Node(t, PROPERTY_INIT, [id_, Expression(t, x, COMMA)])) if not t.match(COMMA): break t.mustMatch(RIGHT_CURLY) except BreakOutOfObjectInit, e: pass operands.append(n) t.scanOperand = False x.curlyLevel -= 1 elif tt == RIGHT_CURLY: if not t.scanOperand and x.curlyLevel != cl: raise ParseError("PANIC: right curly botch") raise BreakOutOfLoops elif tt == LEFT_PAREN: if t.scanOperand: operators.append(Node(t, GROUP)) x.parenLevel += 1 else: while (operators and opPrecedence.get(operators[-1].type_) > opPrecedence[NEW]): reduce_() # Handle () now, to regularize the n-ary case for n > 0. # We must set scanOperand in case there are arguments and # the first one is a regexp or unary+/-. if operators: n = operators[-1] else: n = Object() n.type_ = None t.scanOperand = True if t.match(RIGHT_PAREN): if n.type_ == NEW: operators.pop() n.append(operands.pop()) else: n = Node(t, CALL, [operands.pop(), Node(t, LIST)]) operands.append(n) t.scanOperand = False else: if n.type_ == NEW: n.type_ = NEW_WITH_ARGS else: operators.append(Node(t, CALL)) x.parenLevel += 1 elif tt == RIGHT_PAREN: if t.scanOperand or x.parenLevel == pl: raise BreakOutOfLoops while True: tt = reduce_().type_ if tt in (GROUP, CALL, NEW_WITH_ARGS): break if tt != GROUP: if operands: n = operands[-1] if n[1].type_ != COMMA: n[1] = Node(t, LIST, [n[1]]) else: n[1].type_ = LIST else: raise ParseError, "Unexpected amount of operands" x.parenLevel -= 1 # Automatic semicolon insertion means we may scan across a newline # and into the beginning of another statement. If so, break out of # the while loop and let the t.scanOperand logic handle errors. else: raise BreakOutOfLoops except BreakOutOfLoops, e: pass if x.hookLevel != hl: raise t.newSyntaxError("Missing : after ?") if x.parenLevel != pl: raise t.newSyntaxError("Missing ) in parenthetical") if x.bracketLevel != bl: raise t.newSyntaxError("Missing ] in index expression") if t.scanOperand: raise t.newSyntaxError("Missing operand") t.scanOperand = True t.unget() while operators: reduce_() return operands.pop()
[docs]def parse(source, filename=None, starting_line_number=1): """Parse some Javascript Args: source: the Javascript source, as a string filename: the filename to include in messages starting_line_number: the line number of the first line of the passed in source, for output messages Returns: the parsed source code data structure Raises: ParseError """ t = Tokenizer(source, filename, starting_line_number) x = CompilerContext(False) n = Script(t, x) if not t.done: raise t.newSyntaxError("Syntax error") return n
if __name__ == "__main__": print str(parse(file(sys.argv[1]).read(),sys.argv[1]))