/* * Java 1.0.2 Grammar for ANTLR parser generator. * * Developed by MageLang Institute (www.MageLang.com) * Authors: * Terence Parr (parrt@magelang.com) * John Mitchell of Non, Inc. (john@non.net) * Jim Coker (jcoker@magelang.com) * * The grammar looks best at tabs = 4. * * SOFTWARE RIGHTS * * This file is a Java language grammar and is free software. We do not * restrict its use or distribution, but you may NOT claim ownership or * authorship of this grammar or support code. An individual or company * may otherwise do whatever they wish with the grammar distributed * herewith including the incorporation of the grammar or the output * generated by ANTLR into commercial software. You may redistribute in * source or binary form without payment of royalties to us as long as * this header remains in all source distributions. * * We encourage users to develop parsers/tools using this grammar. * In return, we ask that credit is given to us for developing this * grammar. By "credit", we mean that if you incorporate our grammar or * the generated code into one of your programs (commercial product, * research project, or otherwise) that you acknowledge this fact in the * documentation, research report, etc.... In addition, you should say nice * things about us at every opportunity. * * As long as these guidelines are kept, we expect to continue enhancing * this grammar. Feel free to send us enhancements, fixes, bug reports, * suggestions, or general words of encouragement at parrt@magelang.com. * * DISCLAIMER: We make no guarantees that this grammar works, makes sense, * or can be used to do anything useful. * * HISTORY: * * 1.00 * Initial release * * 1.10 * Modified grammar to use rule names that are closer to Arthur * Van Hoff's rule names in his JDK compiler. * Fixed the grammar so that it accepts all files in hotjava.src * except for a few that look like errors in java source. 
*
 * 1.20
 *		Various modifications to match grammar in _The Java Language Specification_
 *		Note that the language spec uses "declaration" instead of the term
 *		"definition".  In the old days, a declaration ala C++ was "class A;"
 *		and the definition was "class A {...};".  We use definition in this
 *		grammar.
 *
 * KNOWN PROBLEMS:
 *
 * It doesn't handle some escape sequences.
 *
 * It doesn't handle inner classes in the 1.1 language spec.
 */

#header <<
#include "AToken.h"
#include "ATokPtr.h"
typedef ANTLRCommonToken ANTLRToken;
>>

<<
#include "PBlackBox.h"
#include "DLGLexer.h"

/* Driver: parse a Java source file and write one tag line per class,
 * interface, method, field and local variable.
 *
 * Usage:  prog [-trace] [-in javafile] [-out tagfile]
 *
 * The options are positional and all optional.  Input defaults to stdin
 * and output to stdout.  Every argv access is bounds-checked against argc
 * so that a trailing "-trace", "-in" or "-out" can never hand a NULL
 * pointer to strcmp() or fopen().
 */
int main(int argc, char *argv[])
{
	FILE *in=stdin, *out=stdout;
	int trace = 0;
	int i = 1;

	if ( i<argc && strcmp("-trace", argv[i])==0 ) {
		trace = 1;
		i++;
	}
	if ( i<argc && strcmp("-in", argv[i])==0 ) {
		if ( i+1>=argc ) {
			fprintf(stderr, "missing file name after -in\n");
			exit(-1);
		}
		in = fopen(argv[i+1], "r");
		if ( in == NULL ) {
			fprintf(stderr, "cannot open java file %s\n", argv[i+1]);
			exit(-1);
		}
		i += 2;
	}
	if ( i<argc && strcmp("-out", argv[i])==0 ) {
		if ( i+1>=argc ) {
			fprintf(stderr, "missing file name after -out\n");
			exit(-1);
		}
		out = fopen(argv[i+1], "w");
		if ( out == NULL ) {
			fprintf(stderr, "cannot open tag file %s\n", argv[i+1]);
			exit(-1);
		}
	}

	ParserBlackBox<DLGLexer, JavaParser, ANTLRToken> p(in);
	if ( trace ) {
		p.parser()->traceOn();
	}
	p.parser()->compilationUnit(out);
	return 0;
}
>>

/* Inside a C-style comment: skip everything up to the closing delimiter,
 * still counting newlines.
 */
#lexclass COMMENTS
#token "\*/"		<< mode(START); skip(); >>
#token "\n"			<< skip(); newline(); >>
#token "\*"			<< skip(); >>
#token "~[\*\n]+"	<< skip(); >>

/* Inside a string literal: translate the escapes handled here and keep
 * accumulating text until the closing quote yields STRINGVAL.
 */
#lexclass STRINGS
#token STRINGVAL "\""	<< mode (START); >>
#token "\\n"			<< replchar('\n'); more(); >>
#token "\\r"			<< replchar('\r'); more(); >>
#token "\\t"			<< replchar('\t'); more(); >>
#token "\\\n"			<< replstr(""); more(); >>
#token "\\\\"			<< replchar('\\'); more(); >>
#token "\\\""			<< replchar('"'); more(); >>
#token "~[\"\\]+"		<< more(); >>

#lexclass START
#token "/\*"			<< mode(COMMENTS); skip(); >>
#token "[\t\ ]+"		<< skip(); >>
#token "\n"				<< newline(); skip(); >>
#token "// ~[\n]* \n"	<< newline(); skip(); >>
#token "\""				<< mode(STRINGS); more(); >>
#token CHARVAL "'(~[\\]|\\~[]|\\u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])'"

class JavaParser {

<<
/* Parser Members */
protected:
	int traceIndentLevel;		/* current nesting depth of the rule trace */
	int doTracing;				/* nonzero once traceOn() has been called */
	void tracein(char *r);
	void traceout(char *r);
public:
	/* Reset the parser state.  Every member read by the grammar actions
	 * gets a defined value here, so that a "static { ... }" initializer
	 * (which reaches compoundStatement without passing through
	 * methodHead) never reads an indeterminate block counter.
	 */
	void init()
	{
		ANTLRParser::init();
		traceIndentLevel = 0;
		doTracing = 0;
		currentClassOrInterface = NULL;
		currentMethod = NULL;
		currentBlockNumber = 0;
		numBlocks = 0;
		out = stdout;			/* replaced by compilationUnit's argument */
	}
	void traceOn() { doTracing=1; }

	char *currentClassOrInterface;	/* name of the class/interface being parsed */
	ANTLRTokenPtr currentMethod;	/* NULL while outside of any method */
	int currentBlockNumber;			/* number of the enclosing compound statement */
	int numBlocks;					/* last block number handed out in this method */
	FILE *out;						/* where the tag lines are written */

	/* Syntax errors are deliberately silent. */
	void syn(_ANTLRTokenPtr tok, ANTLRChar *egroup, SetWordType *eset,
			 ANTLRTokenType etok, int k)
	{
		// print nothing out as we don't have anyplace to display it.!!!!
	}
>>

/* F I L E S  /  P A C K A G E S */

compilationUnit[FILE *output]	/* pass in the output stream */
	:	<<out = output;>>
		{ package }
		( import )*
		( typeDefinition )*
		"@"		// Eof
	;

package
	:	"package" qualifiedName ";"
	;

import
	:	"import" qualifiedNameStar ";"
	;

typeDefinition
	:	(modifier)*
		(	classDefinition
		|	interfaceDefinition
		)
	|	";"
	;

/* T Y P E S  /  D E C L S */

referenceType
	:	qualifiedName ("\[" "\]")*
	;

typeSpec
	:	type ("\[" "\]")*
	;

type:	qualifiedName
	|	builtInType
	;

builtInType
	:	"void"
	|	"boolean"
	|	"byte"
	|	"char"
	|	"short"
	|	"int"
	|	"float"
	|	"long"
	|	"double"
	;

qualifiedName
	:	IDENT ("." IDENT)*
	;

qualifiedNameStar
	:	qualifiedName { "." "\*" }
	;

modifier
	:	"private"
	|	"public"
	|	"protected"
	|	"static"
	|	"transient"
	|	"final"
	|	"abstract"
	|	"native"
	|	"threadsafe"
	|	"synchronized"
	|	"const"
	;

/* C L A S S E S */

classDefinition
	:	"class" id:IDENT extends implements
		<<
		currentClassOrInterface = id->getText();
		fprintf(out,"class %s %d\n", id->getText(), id->getLine());
		>>
		classBlock
	;

interfaceDefinition
	:	"interface" id:IDENT interfaceExtends implements
		<<
		currentClassOrInterface = id->getText();
		fprintf(out,"interface %s %d\n", id->getText(), id->getLine());
		>>
		classBlock
	;

classBlock
	:	"\{" ( field )* "\}"
	;

extends
	:	"extends" qualifiedName
	|
	;

interfaceExtends
	:	"extends" qualifiedName ("," qualifiedName)*
	|
	;

implements
	:	"implements" qualifiedName ( "," qualifiedName )*
	|
	;

/** in the following rule, two syntactic predicates (the expressions in
 *  parens followed by '?') are used to resolve lookahead issues between
 *  constructors and method defs and between method defs and variable defs.
 *  Without the predicates, the rule would be:
 *
 *	field
 *		:	constructorDefinition
 *		|	methodDefinition
 *		|	variableDefinitions
 *		|	"static" compoundStatement
 *		|	";"
 *		;
 *
 *  We could left-factor out the modifiers and typespec, but it does
 *  not cost us much to backtrack over these few tokens and the grammar
 *  is more readable with the predicate.
 */
field
	:	( (modifier)* methodHead "\{" )?
		constructorDefinition
	|	( (modifier)* typeSpec methodHead ( "\{" | ";" ) )?
		methodDefinition
	|	variableDefinitions
	|	"static" compoundStatement	// "static { ... }" initializer
	|	";"
	;

localVariableDefinitions
	:	typeSpec variableDeclarator ( "," variableDeclarator )*
	;

variableDefinitions
	:	(modifier)* typeSpec variableDeclarator ( "," variableDeclarator )*
	;

/* Emits a "local" tag while inside a method and a "variable" tag otherwise. */
variableDeclarator
	:	id:IDENT ("\[" "\]")* { "=" initializer }
		<<
		if ( currentMethod!=NULL ) {
			fprintf(out,"local %s %s %s %d %d\n",
					currentClassOrInterface,
					currentMethod->getText(),
					id->getText(),
					currentBlockNumber,
					id->getLine());
		}
		else {
			fprintf(out,"variable %s %s %d\n",
					currentClassOrInterface,
					id->getText(),
					id->getLine());
		}
		>>
	;

initializer
	:	assignmentExpression
	|	arrayInitializer
	;

arrayInitializer
	:	"\{" { initializer ( "," initializer )* {","} } "\}"
	;

/* M E T H O D S */

/* Returns the method-name token; also makes it the current method and
 * restarts block numbering so the method body becomes block 0.
 */
methodHead > [ANTLRTokenPtr id]
	:	i:IDENT
		<<$id=i; currentMethod=$id; numBlocks = -1;>>
		"\(" {parameterDefinitionList} "\)" ("\[" "\]")*
		{throwsClause}
	;

throwsClause
	:	"throws" qualifiedName ("," qualifiedName)*
	;

methodDefinition
	:	<<ANTLRTokenPtr id;>>
		(modifier)* typeSpec methodHead>[id]
		<<
		fprintf(out,"method %s %s %d\n",
				currentClassOrInterface, id->getText(), id->getLine());
		>>
		( compoundStatement | ";" )
		<< currentMethod=NULL; >>
	;

constructorDefinition
	:	<<ANTLRTokenPtr id;>>
		(modifier)* methodHead > [id]
		<<
		fprintf(out,"method %s %s %d\n",
				currentClassOrInterface, id->getText(), id->getLine());
		>>
		compoundStatement
		<< currentMethod=NULL; >>
	;

parameterDefinitionList
	:	parameterDefinition ( "," parameterDefinition )*
	;

parameterDefinition
	:	typeSpec IDENT ("\[" "\]")*
	;

/* S T A T E M E N T S */

/* Each compound statement receives the next block number; the enclosing
 * block's number is restored once the statements have been matched.
 */
compoundStatement
	:	<<
		numBlocks++;
		int saveBlock = currentBlockNumber;
		currentBlockNumber = numBlocks;
		>>
		"\{" (statement)*
		<< currentBlockNumber = saveBlock; >>
		"\}"
	;

statement
	:	IDENT ":" statement
	|	compoundStatement

		/* distinguishing between a local variable definition and
		 * an expression requires k>2 lookahead.  Rather than increase
		 * the lookahead of the overall parser, we use backtracking to
		 * ensure we match local variables.  If a local variable declaration
		 * is not found, an expression (the next alternative) is attempted.
		 * Consider that after having seen "t[" you don't know if it's
		 * an assignment to an array "t[3]=4;" or a variable def "t[] b;"
		 */
	|	(localVariableDefinitions ";")?
	|	expression ";"
	|	"if" "\(" expression "\)" statement
		/* the {"else" statement} optional clause is a language ambiguity
		 * that results in a parser nondeterminism.  The parser's default
		 * response of simply matching the "else" if it sees it, resolves
		 * the problem.  We use a #pragma to tell the parser that its
		 * approximate lookahead is sufficient to handle the problem--
		 * the desired side effect is that ANTLR doesn't warn us about
		 * this ambiguity with the #pragma in place.
		 */
#pragma approx
		{ "else" statement }

		/* As with locals versus expressions at the statement level,
		 * loop variables must be distinguished from expressions.
		 */
	|	"for" "\("
			(	(localVariableDefinitions ";")?
			|	expressionList ";"
			|	";"
			)
			{expression} ";"
			{expressionList}
		"\)" statement
	|	"while" "\(" expression "\)" statement
	|	"do" statement "while" "\(" expression "\)" ";"
	|	"break" {IDENT} ";"
	|	"continue" {IDENT} ";"
	|	"return" {expression} ";"
	|	"switch" "\(" expression "\)"
		"\{"
			(	"case" expression ":" (statement)*
			|	"default" ":" (statement)*
			)*
		"\}"
	|	tryBlock
	|	"throw" expression ";"
	|	"goto" IDENT ";"
	|	"synchronized" "\(" expression "\)" compoundStatement
	|	";"
	;

/* "catch" and "finally" clauses cause ambiguity that is resolved
 * correctly by ANTLR; this is similar to the dangling-else ambiguity.
 * Again, the #pragma is used to turn off a warning message from ANTLR
 * during grammar analysis.  See the statement rule.
 */
tryBlock
	:	"try" compoundStatement
#pragma approx
		( handler )*
#pragma approx
		{ "finally" compoundStatement }
	;

handler
	:	"catch" "\(" parameterDefinition "\)" compoundStatement
	;

/* E X P R E S S I O N S */

expressionList
	:	assignmentExpression ("," assignmentExpression)*
	;

expression
	:	assignmentExpression
	;

/* right-to-left for assignment op -> use tail recursion */
assignmentExpression
	:	conditionalExpression
		{ assignmentOp assignmentExpression }
	;

assignmentOp
	:	"="
	|	"\+="
	|	"\-="
	|	"\*="
	|	"/="
	|	"\%="
	|	"\>\>="
	|	"\>\>\>="
	|	"\<\<="
	|	"&="
	|	"^="
	|	"\|="
	;

conditionalExpression
	:	logicalOrExpression
		{ "?" conditionalExpression ":" conditionalExpression }
	;

logicalOrExpression
	:	logicalAndExpression ("\|\|" logicalAndExpression)*
	;

logicalAndExpression
	:	inclusiveOrExpression ("&&" inclusiveOrExpression)*
	;

inclusiveOrExpression
	:	exclusiveOrExpression ("\|" exclusiveOrExpression)*
	;

exclusiveOrExpression
	:	andExpression ("^" andExpression)*
	;

andExpression
	:	equalityExpression ("&" equalityExpression)*
	;

equalityExpression
	:	relationalExpression (("!=" | "==") relationalExpression)*
	;

relationalExpression
	:	shiftExpression
		(	(	"<"
			|	">"
			|	"<="
			|	">="
			)
			shiftExpression
		)*
	;

shiftExpression
	:	additiveExpression (("\<\<" | "\>\>" | "\>\>\>") additiveExpression)*
	;

additiveExpression
	:	multiplicativeExpression (("\+" | "\-") multiplicativeExpression)*
	;

multiplicativeExpression
	:	castExpression (("\*" | "/" | "\%" ) castExpression)*
	;

/*
 * This is the way castExpression should look if I had a symbol table:
 *
 *	castExpression
 *		:	unaryExpression
 *		|	<<isTypeName(LT(2)->getText())>>? "\(" typeSpec "\)" castExpression
 *		;
 *
 * I use a syntactic pred (...)? here to just check the lookahead arbitrarily
 * ahead; slower, but it works.
 */
castExpression
	:	( "\(" typeSpec "\)" castExpression )?
	|	unaryExpression
	;

unaryExpression
	:	"\+\+" castExpression
	|	"\-\-" castExpression
	|	"\-" castExpression
	|	"\~" castExpression
	|	"!" castExpression
	|	postfixExpression { "instanceof" referenceType }
	;

/* ambiguity warning turned off with the pragma.  Ambiguity is
 * new T[n] with "new T" returning from newExpression or
 * with "new T[n]" returning from newExpression.  The [..] stuff
 * could also be matched by the postfixExpression.
 */
newArray
	:
#pragma approx
		( "\[" expression "\]" )+ ( "\[" "\]" )*
	;

postfixExpression
	:	primaryExpression
		(	"\[" expression "\]"
		|	"\(" { expressionList } "\)"
		|	"." primaryExpression
		|	"\+\+"
		|	"\-\-"
		)*
	;

/*
 * Valid new expressions:
 *		new Class(...)
 *		new type[n][m][]...
 *		new Package.Class(...)
 *
 * NOTE: This binding differs from C++.
 */
newExpression
	:	"new" type
		(	"\(" { expressionList } "\)"
		|	newArray
		)
	;

primaryExpression
	:	IDENT
	|	newExpression
	|	constant
	|	"super"
	|	"this"
	|	"true"
	|	"false"
	|	"null"
	|	STRINGVAL
	|	"\(" expression "\)"
	;

constant
	:	OCTALINT
	|	DECIMALINT
	|	HEXADECIMALINT
	|	CHARVAL
	|	FLOATONE
	|	FLOATTWO
	;

}

#token OCTALINT			"0[0-7]*{[uUlL]}"
#token DECIMALINT		"[1-9][0-9]*{[uUlL]}"
#token HEXADECIMALINT	"(0x|0X)[0-9a-fA-F]+{[uUlL]}"
#token FLOATONE			"([0-9]+.[0-9]* | [0-9]*.[0-9]+) {[eE]{[\-\+]}[0-9]+} {[fFlLdD]}"
#token FLOATTWO			"[0-9]+ [eE]{[\-\+]}[0-9]+ {[fFlLdD]}"
#token IDENT			"[a-zA-Z_][a-zA-Z0-9_]*" <<;>>

<<
/* Print an indented "enter" line for rule r showing the next two tokens
 * of lookahead; does nothing unless tracing has been switched on.
 */
void JavaParser::
tracein(char *r)
{
	if ( !doTracing ) return;
	for (int i=1; i<=traceIndentLevel; i++) fprintf(stderr, " ");
	traceIndentLevel++;
	fprintf(stderr, "enter %s('%s %s')%s line %d\n",
			r,
			LT(1)->getText(),
			LT(2)->getText(),
			guessing?" [guessing]":"",
			LT(1)->getLine());
}

/* Counterpart of tracein: drop one indent level and print an "exit" line. */
void JavaParser::
traceout(char *r)
{
	if ( !doTracing ) return;
	traceIndentLevel--;
	for (int i=1; i<=traceIndentLevel; i++) fprintf(stderr, " ");
	fprintf(stderr, "exit %s('%s %s')%s line %d\n",
			r,
			LT(1)->getText(),
			LT(2)->getText(),
			guessing?" [guessing]":"",
			LT(1)->getLine());
}
>>