// Java 1.1 Recognizer Grammar
//
// A note about conflicts.  There are four points in this grammar where
// two tokens of lookahead cannot predict which alternative to select.  These
// points are:
//   1) standard "else" ambiguity
//   2) index reference after array creation:
//        new int[1][2][3]  // is the "3" an index or dim spec?
//   3) {1,2,3,}     // does the last comma start a new init or is it garbage?
//   4) ((caseLabel|"default")+ (statement)* )*
//      nasty conflict, but proper code is generated
//
// Each of these conflicts is noted in the grammar where it occurs, and they
// are no worry as long as they are the only conflicts reported by ANTLR
//
// Run 'java JavaParser '
//
// Terence Parr tested this grammar with antlr 2.20b7 and Scott
// said he tested it under 2.11.
//
// Authors:
//      John Mitchell       johnm@non.net
//      Terence Parr        parrt@magelang.com
//      John Lilley         jlilley@empathy.com
//      Scott Stanchfield   thetick@magelang.com
//
// Version 1.00 December 9, 1997 -- initial release
// Version 1.01 December 10, 1997
//      fixed bug in octal def (0..7 not 0..8)
//
// This grammar is in the PUBLIC DOMAIN
//

// Package of the generated classes.
header {
package java;
}

// tell ANTLR that we want to generate Java source code
options {
    language="Java";
}

// Import the necessary classes
{
import java.io.*;
import aioli.vtp.*;
}

//-----------------------------------------------------------------------------
// Define a Parser, calling it JavaParser
//-----------------------------------------------------------------------------
class JavaParser extends Parser;
options {
    k = 2;                           // two token lookahead
    tokenVocabulary=Java;            // Call its vocabulary "Java"
    codeGenMakeSwitchThreshold = 2;  // Some optimizations
    codeGenBitsetTestThreshold = 3;
    vtpLanguage = "java";            // NOTE(review): not a stock ANTLR option; presumably consumed by the tree-building extension imported above -- confirm
    defaultErrorHandler = false;     // Don't generate parser error handlers
}

// Define some methods and variables to use in the generated parser.
{
    // Formalism looked up under the name "java"; the rule actions use it
    // (fm.operator(...).tree()) to create tree nodes.
    static Formalism fm = FManager.getFormalism("java");

    /**
     * Command-line entry point: parses every file or directory named on the
     * command line.  Any exception is reported on System.err together with
     * its stack trace; it is not rethrown.
     *
     * @param args files and/or directories to parse
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            // The generated class is JavaParser, so that is what the user must run.
            System.err.println("Usage: java JavaParser <file or directory>...");
            return;
        }
        // Use a try/catch block for parser exceptions
        try {
            System.err.println("Parsing...");
            // for each directory/file specified on the command line
            for (int i = 0; i < args.length; i++) {
                doFile(new File(args[i])); // parse it
            }
        }
        catch (Exception e) {
            System.err.println("exception: "+e);
            e.printStackTrace(System.err);   // so we can get stack trace
        }
    }

    /**
     * Decides what to do with f: directories are walked recursively, files
     * whose name ends in ".java" (and is longer than the extension itself)
     * are parsed, everything else is ignored.
     *
     * @param f file or directory to process
     * @throws Exception if the file cannot be opened or closed
     */
    public static void doFile(File f) throws Exception {
        // If this is a directory, walk each file/dir in that directory
        if (f.isDirectory()) {
            String files[] = f.list();
            // File.list() returns null when the directory cannot be read;
            // report it instead of failing with a NullPointerException.
            if (files == null) {
                System.err.println("cannot list directory: "+f.getAbsolutePath());
                return;
            }
            for (int i = 0; i < files.length; i++) {
                doFile(new File(f, files[i]));
            }
        }
        // otherwise, if this is a java file, parse it!
        else if ((f.getName().length()>5) && f.getName().endsWith(".java")) {
            System.err.println(" "+f.getAbsolutePath());
            InputStream in = new FileInputStream(f);
            try {
                parse(in);
            }
            finally {
                // Always release the file handle; walking a large tree would
                // otherwise leak one descriptor per source file.
                in.close();
            }
        }
    }

    /**
     * Runs the lexer and parser over s, starting at the compilationUnit rule.
     * The stream is not closed here; that is left to the caller.
     *
     * @param s source text to parse
     * @return the tree returned by compilationUnit, or null if parsing threw
     *         (the exception is printed, not propagated)
     */
    public static Tree parse(InputStream s) throws Exception {
        try {
            // Create a scanner that reads from the input stream passed to us
            JavaLexer lexer = new JavaLexer(s);

            // Create a parser that reads from the scanner
            JavaParser parser = new JavaParser(lexer);

            // start parsing at the compilationUnit rule
            return parser.compilationUnit();
        }
        catch (Exception e) {
            System.err.println("parser exception: "+e);
            e.printStackTrace();   // so we can get stack trace
        }
        return null;
    }
}

// Compilation Unit: In Java, this is a single file.
// This is the start
//   rule for this parser
//
// NOTE(review): a bare `<>` in the rules below is kept exactly as found; it
// looks like a `<< ... >>` action whose text is missing from this copy of the
// grammar -- TODO confirm against the original file.
compilationUnit returns [Tree tree]
<< pd == none(); id == *; ids == importDeclarations[]; td == *; tds == typeDeclarations[];
   tree = compilationUnit(*pd,*ids,*tds);
>>
    :   // A compilation unit starts with an optional package definition
        (   pd=packageDefinition << tree.1 = *pd;>>
        |   /* nothing */
        )

        // Next we have a series of zero or more import statements
        ( id=importDefinition << ids =+ *id;>> )*

        // Wrapping things up with any number of class or interface
        //    definitions
        ( td=typeDefinition << tds =+ *td;>> )*

        EOF
    ;


// Package statement: "package" followed by an identifier.
packageDefinition returns [Tree tree]
    options {defaultErrorHandler = true;} // let ANTLR handle errors
<>
    :   "package" tree=identifier SEMI
    ;


// Import statement: import followed by a package or class name
importDefinition returns [Tree tree]
    options {defaultErrorHandler = true;}
<>
    :   "import" tree=identifierStar SEMI
    ;


// A type definition in a file is either a class or interface definition.
// A stray SEMI at this level is also accepted; the modifiers parsed first
// are handed to the class/interface rule as its argument.
typeDefinition returns [Tree tree]
    options {defaultErrorHandler = true;}
<< tree = *; tm == *; >>
    :   tm=modifiers
        ( tree=classDefinition[tm]
        | tree=interfaceDefinition[tm]
        )
    |   SEMI
    ;


// A declaration is the creation of a reference or primitive-type variable
declaration returns [Tree tree]
<>
    :   ("final" <>)? typ=typeSpec var=variableDefinitions
        <>
    ;


// A list of zero or more modifiers.  We could have used (modifier)* in
//   place of a call to modifiers, but I thought it was a good idea to keep
//   this rule separate so they can easily be collected in a Vector if
//   someone so desires
modifiers returns [Tree tree]
<< tree = modifiers[]; md == *;>>
    :   ( md=modifier << tree =+ *md; >> )*
    ;


// A type specification is a type name with possible brackets afterwards
//   (which would make it an array type).
typeSpec returns [Tree tree]
<< tmp == *; tree = *; >>
    :   tree=type (LBRACK <> RBRACK )*
    ;


// A type name.
// which is either a (possibly qualified) class name or
//   a primitive (builtin) type
//
// NOTE(review): a bare `<>` in the rules below is kept exactly as found; it
// looks like a `<< ... >>` action whose text is missing from this copy of the
// grammar -- TODO confirm against the original file.
type returns [Tree tree]
<>
    :   tree=identifier
    |   tree=builtInType
    ;


// The primitive type keywords (plus "void").
builtInType returns [Tree tree]
<>
    :   "void" <>
    |   "boolean" <>
    |   "byte" <>
    |   "char" <>
    |   "short" <>
    |   "int" <>
    |   "float" <>
    |   "long" <>
    |   "double" <>
    ;


// A (possibly-qualified) java identifier.  We start with the first IDENT
//   and expand its name by adding dots and following IDENTS
identifier returns [Tree tree]
<>
    :   id:IDENT <>
        (DOT id1:IDENT <> )*
    ;


// Like identifier, but may end in ".*" (used by import statements); the
// trailing DOT STAR wraps the name built so far in onDemand(...).
identifierStar returns [Tree tree]
<>
    :   id:IDENT <>
        ( DOT id1:IDENT <> )*
        ( DOT STAR << tree = onDemand(*tree);>> )?
    ;


// modifiers for Java classes, interfaces, class/instance vars and methods
// NOTE(review): "volatile" is not among the alternatives, while "threadsafe"
// and "const" are -- confirm this is intended.
modifier returns [Tree tree]
<< tree = *;>>
    :   "private" <>
    |   "public" <>
    |   "protected" <>
    |   "static" <>
    |   "transient" <>
    |   "final" <>
    |   "abstract" <>
    |   "native" <>
    |   "threadsafe" <>
    |   "synchronized" <>
    |   "const" <>
    ;


// Definition of a Java class
// tm is the modifier list already parsed by the caller; it becomes the first
// argument of the classDeclaration node.
classDefinition [Tree tm] returns [Tree tree]
<< tree = *; cb == fieldDeclarations[]; ic == typeNames[]; ec == none(); >>
    :   "class" id:IDENT        // aha! a class!
        // it _might_ have a superclass...
        ( "extends" ec=identifier )?
        // it might implement some interfaces...
        ( ic=implementsClause )?
        // now parse the body of the class
        classBlock[cb]
        << tree = classDeclaration(*tm,identifier *id^,*ec,*ic,*cb);>>
    ;


// Definition of a Java Interface
// tm is the modifier list already parsed by the caller.
interfaceDefinition [Tree tm] returns [Tree tree]
<< tree = *; cb == interfaceFieldDeclarations[]; ie == typeNames[]; >>
    :   "interface" id:IDENT    // aha! an interface!
        // it might extend some other interfaces
        (ie=interfaceExtends)?
        // now parse the body of the interface (looks like a class...)
        classBlock[cb]
        << tree = interfaceDeclaration(*tm,identifier *id^,*ie,*cb);>>
    ;


// This is the body of a class.  You can have fields and extra semicolons,
// That's about it (until you see what a field is...)
// Body of a class or interface: fields and stray semicolons between braces.
// The tree argument is the list node supplied by the caller (see the
// classBlock[cb] calls).
// NOTE(review): a bare `<>` in the rules below is kept exactly as found; it
// looks like a `<< ... >>` action whose text is missing from this copy of the
// grammar -- TODO confirm against the original file.
classBlock [Tree tree]
{Tree fd;}
    :   LCURLY
            ( fd=field <> | SEMI )*
        RCURLY
    ;


// An interface can extend several other interfaces...
interfaceExtends returns [Tree tree]
<>
    :   "extends" id=identifier <>
        (COMMA id=identifier <> )*
    ;


// A class can implement several interfaces...
implementsClause returns [Tree tree]
<< tree = typeNames[]; id == *; >>
    :   "implements" id=identifier <>
        (COMMA id=identifier <> )*
    ;


// Now the various things that can be defined inside a class or interface...
// Note that not all of these are really valid in an interface (constructors,
//   for example), and if this grammar were used for a compiler there would
//   need to be some semantic checks to make sure we're doing the right thing...
field returns [Tree tree]
<< tree = none(); md == *; mds == *; typ == *; tmp == *; tmp1 == *; vd == *; cb == *; >>
    :   // method, constructor, or variable declaration
        mds=modifiers
        (   tree=methodHead[mds]
            // A method head with no return type in front of it is a
            // constructor.  The action below re-shapes the tree returned by
            // methodHead into a "constructorDeclaration" node:
            //   son 1 = the modifiers,
            //   son 2 = a new "constructor" node holding sons 1 and 2 of the
            //           head's third son,
            //   son 3 = the head's fourth son,
            //   son 4 = the head's fifth son.
            {
             tmp=fm.operator("constructorDeclaration").tree();
             tmp.changeSon(mds,1);
             tmp1=fm.operator("constructor").tree();
             tmp1.changeSon(tree.down(3).down(1),1);
             tmp1.changeSon(tree.down(3).down(2),2);
             tmp.changeSon(tmp1,2);
             tmp.changeSon(tree.down(4),3);
             tmp.changeSon(tree.down(5),4);
             tree=tmp;
            }
            cb=compoundStatement    // constructor
            <>

        |   tree=classDefinition[mds]       // inner class

        |   tree=interfaceDefinition[mds]   // inner interface

        |   typ=typeSpec  // method or variable declaration(s)
            (   tree=methodHead[mds] <>
                ( cb=compoundStatement <> | SEMI )
            |   vd=variableDefinitions <> SEMI
            )
        )

    // "static { ... }" class initializer
    |   "static" tree=compoundStatement
        <>

    // "{ ... }" instance initializer
    |   tree=compoundStatement
        <>
    ;


// One or more comma-separated variable declarators.
variableDefinitions returns [Tree tree]
<< vd == *; tree=variableDeclarators[]; >>
    :   vd=variableDeclarator <>
        (COMMA vd=variableDeclarator <> )*
    ;


// Declaration of a variable.  This can be a class/instance variable,
//   or a local variable in a method
// It can also include possible initialization.
// A single declarator: a name, optional trailing "[]" pairs, and an optional
// "= initializer".
// NOTE(review): a bare `<>` in the rules below is kept exactly as found; it
// looks like a `<< ... >>` action whose text is missing from this copy of the
// grammar -- TODO confirm against the original file.
variableDeclarator returns [Tree tree]
<>
    :   id:IDENT <>
        (LBRACK <> RBRACK )*
        (ASSIGN init=initializer <> )?
    ;


// This is an initializer used to set up an array.
// CONFLICT: does a COMMA after an initializer start a new
//           (...)* or start the (...)?
//           ANTLR generates proper code due to LA(2)
arrayInitializer returns [Tree tree]
<>
    :   LCURLY <>
            (init=initializer <>
                ( COMMA init=initializer <> )*
                (COMMA)?
            )?
        RCURLY
    ;


// The two "things" that can initialize an array element are an expression
//   and another (nested) array initializer.
initializer returns [Tree tree]
    :   tree=expression
    |   tree=arrayInitializer
    ;


// This is the header of a method.  It includes the name and parameters
//   for the method.
//   This also watches for a list of exception classes in a "throws" clause.
// tm is the modifier list parsed by the caller; it is the first argument of
// the methodDeclaration node created in the init action.
methodHead [Tree tm] returns [Tree tree]
<< tree = methodDeclaration(*tm,identifier "id",*,*,none());
   mc == methodDeclarator(*,*);
   pm == parameters[];
   th == typeNames[];
>>
    :   i:IDENT  // the name of the method
        <>

        // parse the formal parameter declarations.
        LPAREN (pm=parameterDeclarationList)? RPAREN
        << mc.2 = *pm; >>

        // again, the array specification is skipped...
        (LBRACK <> RBRACK)*

        // get the list of exceptions that this method is declared to throw
        (th=throwsClause | )
        <>
    ;


// This is a list of exception classes that the method is declared to throw
throwsClause returns [Tree tree]
<>
    :   "throws" i1=identifier <>
        ( COMMA i1=identifier <> )*
    ;


// A list of formal parameters
parameterDeclarationList returns [Tree tree]
<>
    :   pd=parameterDeclaration <>
        ( COMMA pd=parameterDeclaration <> )*
    ;


// A formal parameter.
parameterDeclaration returns [Tree tree]
<>
    :   ("final" <> )? typ=typeSpec id:IDENT <>
        (LBRACK <> RBRACK)*
        <>
    ;


// Compound statement.
// This is used in many contexts:
//   Inside a class definition prefixed with "static":
//      it is a class initializer
//   Inside a class definition without "static":
//      it is an instance initializer
//   As the body of a method
//   As a completely independent braced block of code inside a method
//      it starts a new scope for variable definitions
//
// NOTE(review): a bare `<>` in the rules below is kept exactly as found; it
// looks like a `<< ... >>` action whose text is missing from this copy of the
// grammar -- TODO confirm against the original file.
compoundStatement returns [Tree tree]
<>
    :   LCURLY
            // include the (possibly-empty) list of statements
            (st=statement <> )*
        RCURLY
    ;


// Here are all the wonderful Java statements...
statement returns [Tree tree]
{Tree st1=null,st2=null,id,exp=null,tmp,block=null,block1=null; tree=null;}
    // A list of statements in curly braces -- start a new scope!
    :   tree=compoundStatement

    // If it _looks_ like a decl, it's a decl...
    |   (declaration)=> tree=declaration SEMI

    // Attach a label to the front of a statement
    |   id1:IDENT COLON st1=statement
        <>

    // An expression statement.  This could be a method call, assignment
    //   statement, or any other expression evaluated for side-effects.
    |   tree=expression SEMI

    // If-else statement
    // CONFLICT: the old "dangling-else" problem...
    //           ANTLR generates proper code by just making the "else"
    //           optional!
    |   "if" LPAREN exp=expression RPAREN st1=statement
        <>
        ( "else" st2=statement )?
        <>
        // the "else" part above is ambiguous.  The intent
        // is to keep it as close to the corresponding "if"
        // as possible.  The generated code will do this,
        // so we can live with the ambiguity.  We could do
        //      (   ("else")=> "else" statement
        //      |   // no else clause
        //      )
        // instead, but that's less efficient...

    // For statement
    |   "for" <>
            LPAREN
                (st1=forInit)? SEMI        // initializer
                (st2=expression)? SEMI     // condition test
                (exp=expressionList)?      // updater
            RPAREN
            tree=statement                 // statement to loop over
            <>

    // While statement
    |   "while" LPAREN exp=expression RPAREN st1=statement
        <>

    // do-while statement
    |   "do" st1=statement "while" LPAREN exp=expression RPAREN SEMI
        <>

    // get out of a loop (or switch)
    |   "break" <> (id2:IDENT <> )? SEMI

    // do next iteration of a loop
    |   "continue" <> (id3:IDENT <> )? SEMI

    // Return an expression
    |   "return" <> (exp=expression <> )? SEMI

    // switch/case statement
    // CONFLICT: to which "cases" does the statement bind?
    //           ANTLR generates proper code as it groups as
    //           many "case"/"default" labels together then
    //           follows them with the statements
    //
    // Label handling in the two Java actions below: st2 is the label node
    // currently being built.  When st2 is null the new "case"/"default" node
    // gets block1 as its statement son and is handed to block.adopt(..., -1)
    // (presumably an append -- confirm).  Otherwise a "case" is hung under
    // the current label as its son 2 and becomes the new st2.
    // NOTE(review): where block, block1 and st2 are (re)initialised is not
    // visible; presumably in the emptied `<>` actions -- confirm.
    |   "switch" LPAREN exp=expression RPAREN
        << tree=switch(*exp,*);>>
        LCURLY <>
            (   (   (   "case" exp=expression
                        {if (st2==null) {
                            st2=fm.operator("case").tree();
                            st2.changeSon(exp,1);
                            st2.changeSon(block1,2);
                            block.adopt(st2,-1);
                         }else{
                            st2.changeSon(fm.operator("case").tree(),2);
                            st2=st2.down(2);
                            st2.changeSon(exp,1);
                         }
                        }
                    // NOTE(review): unlike the "case" branch, the else-branch
                    // here makes the new "default" node the parent of the
                    // current label (st2 becomes its son 1) and does not
                    // adopt it into block -- confirm this asymmetry is intended.
                    |   "default"
                        {if (st2==null) {
                            st2=fm.operator("default").tree();
                            st2.changeSon(block1,1);
                            block.adopt(st2,-1);
                         }else{
                            st1=fm.operator("default").tree();
                            st1.changeSon(st2,1);
                            st2=st1;
                         }
                        }
                    )
                    COLON <>
                )+
                (st1=statement <> )*
                // ambiguous but proper code will be generated...
                <>
            )*
        RCURLY

    // exception try-catch block
    |   tree=tryBlock

    // throw an exception
    |   "throw" exp=expression SEMI
        <>

    // synchronize a statement
    |   "synchronized" LPAREN exp=expression RPAREN st2=statement
        <>

    // empty statement
    |   SEMI <>
    ;


// The initializer for a for loop
forInit returns[Tree tree]
        // if it looks like a declaration, it is
    :   (declaration)=> tree=declaration
        // otherwise it could be an expression list...
    |   tree=expressionList
    ;


// an exception handler try/catch block
// The closing Java action fills the try node: son 1 = the guarded block,
// son 2 = the handlers (when exactly one handler was collected, the list is
// replaced by its single element), son 3 = the finally block (none() when
// absent).
// NOTE(review): the node is created as try(*,*) with two placeholders but a
// third son is set below -- confirm the operator's arity.
tryBlock returns [Tree tree]
<< tree=try(*,*); fin == none(); hds == catches[]; st == *; hd == *; >>
    :   "try" st=compoundStatement
        (hd=handler <> )*
        ( "finally" fin=compoundStatement )?
        {if (hds.length() == 1) hds=hds.down(1);
         tree.changeSon(st,1);
         tree.changeSon(hds,2);
         tree.changeSon(fin,3);
        }
    ;


// an exception handler
handler returns [Tree tree]
<< tree=*; par== *; st ==*; >>
    :   "catch" LPAREN par=parameterDeclaration RPAREN st=compoundStatement
        <>
    ;


// expressions -- the FUN stuff!
// Note that most of these expressions follow the pattern
//   thisLevelExpression :
//       nextHigherPrecedenceExpression
//           (OPERATOR nextHigherPrecedenceExpression)*
// which is a standard recursive definition for parsing an expression.
// The operators in java have the following precedences:
//    lowest  (13)  = *= /= %= += -= <<= >>= >>>= &= ^= |=
//            (12)  ?:
//            (11)  ||
//            (10)  &&
//            ( 9)  |
//            ( 8)  ^
//            ( 7)  &
//            ( 6)  == !=
//            ( 5)  < <= > >=
//            ( 4)  << >> >>>
//            ( 3)  +(binary) -(binary)
//            ( 2)  * / %
//            ( 1)  ++ -- +(unary) -(unary)  ~  !  (type)
//                  []   () (method call)  . (dot -- identifier qualification)
//                  new   ()  (explicit parenthesis)
//
// the last two are not usually on a precedence chart; I put them in
// to point out that new has a higher precedence than '.', so you
// can validly use
//     new Frame().show()
//
// Note that the above precedence levels map to the rules below...
// Once you have a precedence chart, writing the appropriate rules as below
//   is usually very straightforward
//
// NOTE(review): a bare `<>` in the rules below is kept exactly as found; it
// looks like a `<< ... >>` action whose text is missing from this copy of the
// grammar -- TODO confirm against the original file.


// the mother of all expressions
expression returns [Tree tree]
<< tree = *;>>
    :   tree=assignmentExpression
    ;


// This is a list of expressions.
expressionList returns [Tree tree]
<>
    :   expr=expression << tree =+ *expr;>>
        (COMMA expr=expression << tree =+ *expr;>> )*
    ;


// assignment expression (level 13)
// The right-hand side recurses into assignmentExpression itself, so a chain
// such as a = b = c groups to the right.
assignmentExpression returns [Tree tree]
{Tree assOp=null,ass;tree=null;}
    :   tree=conditionalExpression
        (   (   ASSIGN <>
            |   PLUS_ASSIGN <>
            |   MINUS_ASSIGN <>
            |   STAR_ASSIGN <>
            |   DIV_ASSIGN <>
            |   MOD_ASSIGN <>
            |   SR_ASSIGN <>
            |   BSR_ASSIGN <>
            |   SL_ASSIGN <>
            |   BAND_ASSIGN <>
            |   BXOR_ASSIGN <>
            |   BOR_ASSIGN <>
            )
            ass=assignmentExpression
            <>
        )?
    ;


// conditional test (level 12)
conditionalExpression returns [Tree tree]
{tree = null; Tree cond1,cond2,tmp;}
    :   tree=logicalOrExpression
        ( QUESTION cond1=conditionalExpression COLON cond2=conditionalExpression
          <>
        )?
    ;


// logical or (||)  (level 11)
logicalOrExpression returns [Tree tree]
{Tree tmp,op=null,logicalAnd; tree=null;}
    :   tree=logicalAndExpression
        ((LOR <> ) logicalAnd=logicalAndExpression
         <>
        )*
    ;


// logical and (&&)  (level 10)
logicalAndExpression returns [Tree tree]
{Tree tmp,op=null,inclusiveOr; tree=null;}
    :   tree=inclusiveOrExpression
        ((LAND <> ) inclusiveOr=inclusiveOrExpression
         <>
        )*
    ;


// bitwise or non-short-circuiting or (|)  (level 9)
inclusiveOrExpression returns [Tree tree]
{Tree tmp,op=null,exclusiveOr; tree=null;}
    :   tree=exclusiveOrExpression
        ((BOR <> ) exclusiveOr=exclusiveOrExpression
         <>
        )*
    ;


// exclusive or (^)  (level 8)
exclusiveOrExpression returns [Tree tree]
{Tree tmp,op=null,and; tree=null; }
    :   tree=andExpression
        ((BXOR <> ) and=andExpression
         <>
        )*
    ;


// bitwise or non-short-circuiting and (&)  (level 7)
andExpression returns [Tree tree]
{Tree tmp,op=null,equality; tree=null; }
    :   tree=equalityExpression
        ((BAND <> ) equality=equalityExpression
         <>
        )*
    ;


// equality/inequality (==/!=) (level 6)
equalityExpression returns [Tree tree]
{Tree tmp,op=null,relational; tree=null; }
    :   tree=relationalExpression
        (( NOT_EQUAL <> | EQUAL <> ) relational=relationalExpression
         <>
        )*
    ;


// boolean relational expressions (level 5)
relationalExpression returns [Tree tree]
{Tree tmp,op=null,shift; tree=null; }
    :   tree=shiftExpression
        ((  LT <>
         |  GT <>
         |  LE <>
         |  GE <>
         )
         shift=shiftExpression
         <>
        )*
    ;


// bit shift expressions (level 4)
shiftExpression returns [Tree tree]
{Tree tmp,op=null,add; tree=null; }
    :   tree=additiveExpression
        (( SL <> | SR <> | BSR <> ) add=additiveExpression
         <>
        )*
    ;


// binary addition/subtraction (level 3)
additiveExpression returns [Tree tree]
{Tree tmp,op=null,mult; tree=null; }
    :   tree=multiplicativeExpression
        (( PLUS <> | MINUS <> ) mult=multiplicativeExpression
         <>
        )*
    ;


// multiplication/division/modulo (level 2)
multiplicativeExpression returns [Tree tree]
{Tree tmp,op=null,cast; tree=null; }
    :   tree=castExpression
        ((STAR <> | DIV <> | MOD <> )
         cast=castExpression
         <>
        )*
    ;


// cast/unary (level 1)
// NOTE(review): the precedence chart lists unary '+', but there is no PLUS
// alternative below, so an expression such as "+x" is not accepted by this
// rule -- confirm whether that is intended.
castExpression returns [Tree tree]
{Tree typ,cast; tree=null; Tree tmp,id; }
    // if it _looks_ like a cast, it _is_ a cast
    :   ( LPAREN typeSpec RPAREN castExpression )=>
        LPAREN typ=typeSpec RPAREN cast=castExpression
        <>

    // otherwise it's a unary expression
    |   INC tmp=castExpression <>
    |   DEC tmp=castExpression <>
    |   MINUS tmp=castExpression <>
    |   LNOT tmp=castExpression <>
    |   BNOT tmp=castExpression <>
    |   tree=postfixExpression
        ( "instanceof" id=typeSpec <> )?
        // instanceof should not allow just primitives (x instanceof int)
        //   need a semantic check if we're compiling...
    ;


// qualified names, array expressions, method invocation, post inc/dec
// NOTE(review): eL is assigned only when an index expression or an argument
// list is actually present; whether it is reset between successive suffixes
// depends on the emptied `<>` actions and cannot be told from here.
postfixExpression returns [Tree tree]
{tree = null; Tree eL=null,tmp; }
    :   tree=primaryExpression // start with a primary

        (   // qualified id (id.id.id.id...) -- build the name
            DOT (   id:IDENT <>
                |   "this" <>
                |   "class" <>
                )
            // the above line needs a semantic check to make sure "class"
            //   is the _last_ qualifier.

            // an array indexing operation
        |   LBRACK eL=expression RBRACK
            <>

            // method invocation
            // The next line is not strictly proper; it allows x(3)(4) or
            //   x[2](4) which are not valid in Java.  If this grammar were used
            //   to validate a Java program a semantic check would be needed, or
            //   this rule would get really ugly...
        // (continuation of postfixExpression: the method-invocation suffix)
        // The Java action after RPAREN picks the node kind from the operator
        // name of the tree built so far: "this" -> thisInvocation(args),
        // "super" -> superInvocation(args), anything else -> call(tree, args).
        // NOTE(review): a bare `<>` here and below is kept exactly as found; it
        // looks like a `<< ... >>` action whose text is missing from this copy
        // of the grammar -- TODO confirm against the original file.
        |   LPAREN <>
                ( eL=expressionList
                | /*nothing*/
                )
            RPAREN
            {
             String str=tree.operator().name();
             if ("this".equals(str)) {
                tmp=fm.operator("thisInvocation").tree();
                tmp.changeSon(eL,1);
                tree=tmp;
             }else if ("super".equals(str)){
                tmp=fm.operator("superInvocation").tree();
                tmp.changeSon(eL,1);
                tree=tmp;
             }else{
                tmp=fm.operator("call").tree();
                tmp.changeSon(tree,1);
                tmp.changeSon(eL,2);
                tree=tmp;
             }
            }
        )*

        // possibly add on a post-increment or post-decrement
        (   INC <>
        |   DEC <>
        |   // nothing
        )
    ;


// the basic element of an expression
primaryExpression returns [Tree tree]
{tree = null;}
    :   id1:IDENT <>
    |   tree=builtInType DOT "class" <>
    |   tree=newExpression
    |   tree=constant
    |   "super" <>
    |   "true" <>
    |   "false" <>
    |   "this" <>
    |   "null" <>
    |   LPAREN tree=expression RPAREN
    ;


// object instantiation.
// Two shapes follow "new" type: a constructor call (optionally with an
// anonymous class body), or one or more bracket groups with an optional
// array initializer.  In the bracket loop an empty "[]" adds a "dim" node to
// dms and a "[expr]" adds the expression to dme.
newExpression returns [Tree tree]
<< tree = *; tmp == *; typ == *;
   args== expressions[]; aI == arrayInitializer[];
   cb == fieldDeclarations[]; dms==dims[]; dme==dims[];
>>
    :   "new" tree=type
        (   LPAREN
                ( args=expressionList
                | /*nothing*/
                )
            RPAREN
            // java 1.1
            (classBlock[cb])?
            <>

            //java 1.1
            // Note: This will allow bad constructs like
            //    new int[4][][3] {exp,exp}.
            //    There needs to be a semantic check here...
            //    to make sure:
            //    a) [ expr ] and [ ] are not mixed
            //    b) [ expr ] and an init are not used together

        |   (
                // CONFLICT:
                // newExpression is a primaryExpression which can be
                // followed by an array index reference.  This is ok,
                // as the generated code will stay in this loop as
                // long as it sees an LBRACK (proper behavior)
                LBRACK
                    {tmp=null;}
                    (tmp=expression)?
                    {if (tmp==null)
                        dms.adopt(fm.operator("dim").tree(),-1);
                     else
                        dme.adopt(tmp,-1);
                    }
                RBRACK
            )+
            (aI=arrayInitializer)?
            // No empty "[]" seen -> newArray(type, sized dims); the
            // initializer aI is not attached in that case.  Otherwise ->
            // newDim(type, empty dims, sized dims, initializer).
            {if (dms.length()==0) {
                tmp=fm.operator("newArray").tree();
                tmp.changeSon(tree,1);
                tmp.changeSon(dme,2);
                tree=tmp;
             }else{
                tmp=fm.operator("newDim").tree();
                tmp.changeSon(tree,1);
                tmp.changeSon(dms,2);
                tmp.changeSon(dme,3);
                tmp.changeSon(aI,4);
                tree=tmp;
             }
            }
        )
    ;


// Literal constants.
// NOTE(review): NUM_LINT, NUM_HEXA and NUM_OCTAL are referenced here, but no
// lexer rule visible in this file defines them or assigns them via _ttype
// (NUM_INT only switches to DOT or NUM_FLOAT) -- confirm where these token
// types are produced.
constant returns [Tree tree]
{tree = null;}
    :   id0:NUM_INT <>
    |   id1:NUM_LINT <>
    |   id2:CHAR_LITERAL <>
    |   id3:STRING_LITERAL <>
    |   id4:NUM_FLOAT <>
    |   id5:NUM_HEXA <>
    |   id6:NUM_OCTAL <>
    ;


//----------------------------------------------------------------------------
// The Java scanner
//----------------------------------------------------------------------------
class JavaLexer extends Lexer;

options {
    tokenVocabulary=Java;  // call the vocabulary "Java"
    testLiterals=false;    // don't automatically test for literals
    k=4;                   // four characters of lookahead
}


// OPERATORS
QUESTION        :   '?'     ;
LPAREN          :   '('     ;
RPAREN          :   ')'     ;
LBRACK          :   '['     ;
RBRACK          :   ']'     ;
LCURLY          :   '{'     ;
RCURLY          :   '}'     ;
COLON           :   ':'     ;
COMMA           :   ','     ;
// DOT has no rule of its own: the NUM_INT rule matches '.' and sets the
// token type to DOT.
//DOT           :   '.'     ;
ASSIGN          :   '='     ;
EQUAL           :   "=="    ;
LNOT            :   '!'     ;
BNOT            :   '~'     ;
NOT_EQUAL       :   "!="    ;
DIV             :   '/'     ;
DIV_ASSIGN      :   "/="    ;
PLUS            :   '+'     ;
PLUS_ASSIGN     :   "+="    ;
INC             :   "++"    ;
MINUS           :   '-'     ;
MINUS_ASSIGN    :   "-="    ;
DEC             :   "--"    ;
STAR            :   '*'     ;
STAR_ASSIGN     :   "*="    ;
MOD             :   '%'     ;
MOD_ASSIGN      :   "%="    ;
SR              :   ">>"    ;
SR_ASSIGN       :   ">>="   ;
BSR             :   ">>>"   ;
BSR_ASSIGN      :   ">>>="  ;
GE              :   ">="    ;
GT              :   ">"     ;
SL              :   "<<"    ;
SL_ASSIGN       :   "<<="   ;
LE              :   "<="    ;
LT              :   '<'     ;
BXOR            :   '^'     ;
BXOR_ASSIGN     :   "^="    ;
BOR             :   '|'     ;
BOR_ASSIGN      :   "|="    ;
LOR             :   "||"    ;
BAND            :   '&'     ;
BAND_ASSIGN     :   "&="    ;
LAND            :   "&&"    ;
SEMI            :   ';'     ;


// Whitespace -- ignored
WS  :   (   ' '
        |   '\t'
        |   '\f'
        // handle newlines
        |   (   "\r\n"  // Evil DOS
            |   '\r'    // Macintosh
            |   '\n'    // Unix (the right way)
            )
            { newline(); }
        )
        { _ttype = Token.SKIP; }
    ;

// Single-line comments
SL_COMMENT
    :   "//"
        (~('\n'|'\r'))* ('\n'|'\r'('\n')?)
        {$setType(Token.SKIP); newline();}
    ;

// multiple-line comments
// The predicate lets a '*' be consumed inside the comment only when it is not
// followed by '/', so the closing "*/" ends the loop.
ML_COMMENT
    :   "/*"
        (   { LA(2)!='/' }?
            // (continuation of ML_COMMENT: body alternatives; every line
            //  break form calls newline())
            '*'
        |   '\r' '\n'       {newline();}
        |   '\r'            {newline();}
        |   '\n'            {newline();}
        |   ~('*'|'\n'|'\r')
        )*
        "*/"
        {$setType(Token.SKIP);}
    ;


// character literals
CHAR_LITERAL
    :   '\'' ( ESC | ~'\'' ) '\''
    ;

// string literals
STRING_LITERAL
    :   '"' (ESC|~('"'|'\\'))* '"'
    ;


// escape sequence -- note that this is protected; it can only be called
//   from another lexer rule -- it will not ever directly return a token to
//   the parser
// There are various ambiguities hushed in this rule.  The optional
// '0'...'9' digit matches should be matched here rather than letting
// them go back to STRING_LITERAL to be matched.  ANTLR does the
// right thing by matching immediately; hence, it's ok to shut off
// the FOLLOW ambig warnings.
// NOTE(review): the digits after the leading octal digit are matched with
// '0'..'9', which also admits 8 and 9 -- confirm whether '0'..'7' was meant.
protected
ESC
    :   '\\'
        (   'n'
        |   'r'
        |   't'
        |   'b'
        |   'f'
        |   '"'
        |   '\''
        |   '\\'
        |   ('u')+ HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
        |   ('0'..'3')
            (
                options {
                    warnWhenFollowAmbig = false;
                }
            :   ('0'..'9')
                (
                    options {
                        warnWhenFollowAmbig = false;
                    }
                :   '0'..'9'
                )?
            )?
        |   ('4'..'7')
            (
                options {
                    warnWhenFollowAmbig = false;
                }
            :   ('0'..'9')
            )?
        )
    ;


// hexadecimal digit (again, note it's protected!)
protected
HEX_DIGIT
    :   ('0'..'9'|'A'..'F'|'a'..'f')
    ;


// a dummy rule to force vocabulary to be all characters (except special
//   ones that ANTLR uses internally (0 to 2)
protected
VOCAB
    :   '\3'..'\377'
    ;


// an identifier.  Note that testLiterals is set to true!  This means
// that after we match the rule, we look in the literals table to see
// if it's a literal or really an identifier
IDENT
    options {testLiterals=true;}
    :   ('a'..'z'|'A'..'Z'|'_'|'$') ('a'..'z'|'A'..'Z'|'_'|'0'..'9'|'$')*
    ;


// a numeric literal
// This rule also produces DOT: a lone '.' sets the token type to DOT, and
// '.' followed by digits becomes NUM_FLOAT.
// NOTE(review): isDecimal is set to true on the leading '0' and is not
// cleared when a hex or octal body follows, so the float-suffix branch below
// is still enabled after e.g. "0x1F" -- confirm this is intended.
NUM_INT
    {boolean isDecimal=false;}
    :   '.' {_ttype = DOT;}
            (('0'..'9')+ (EXPONENT)? (FLOAT_SUFFIX)? { _ttype = NUM_FLOAT; })?
    |   (   '0' {isDecimal = true;} // special case for just '0'
            (   ('x'|'X')
                (                                           // hex
                    // the 'e'|'E' and float suffix stuff look
                    // like hex digits, hence the (...)+ doesn't
                    // know when to stop: ambig.  ANTLR resolves
                    // it correctly by matching immediately.  It
                    // is therefore ok to hush warning.
                    options {
                        warnWhenFollowAmbig=false;
                    }
                :   HEX_DIGIT
                )+
            |   ('0'..'7')+                                 // octal
            )?
        |   ('1'..'9') ('0'..'9')*  {isDecimal=true;}       // non-zero decimal
        )
        (   ('l'|'L')

        // only check to see if it's a float if looks like decimal so far
        |   {isDecimal}?
            (   '.' ('0'..'9')* (EXPONENT)? (FLOAT_SUFFIX)?
            |   EXPONENT (FLOAT_SUFFIX)?
            |   FLOAT_SUFFIX
            )
            { _ttype = NUM_FLOAT; }
        )?
    ;


// a couple protected methods to assist in matching floating point numbers
protected
EXPONENT
    :   ('e'|'E') ('+'|'-')? ('0'..'9')+
    ;


protected
FLOAT_SUFFIX
    :   'f'|'F'|'d'|'D'
    ;