/*
 * PUBLIC DOMAIN ANSI C RECOGNIZER
 *
 * Terence Parr, Parr Research Corporation
 * with Randy McRee, Tandem Corporation
 * Released as public-domain by Tandem Corporation
 * Originally taken from Tory Eneboe (tory@cs.montana.edu) who typed
 * in and partially "ANTLR-ized" the K&R 2nd edition grammar.
 *
 * January 1995
 */

#header <<
#include "CDictionary.h"
>>

/* These token names are unnecessary in that the regular expressions
 * could easily have been placed into the grammar rules themselves.
 * However, if SORCERER is to be used to do any sort of translation,
 * labels must be assigned for each token type.  Hence, I've left
 * the token label definitions here.  TJP.
 */

/* Brackets and punctuation */
#token LCURLYBRACE              "\{"
#token RCURLYBRACE              "\}"
#token LSQUAREBRACKET           "\["
#token RSQUAREBRACKET           "\]"
#token LPARENTHESIS             "\("
#token RPARENTHESIS             "\)"
#token COLON                    ":"
#token SEMICOLON                ";"
#token COMMA                    ","
#token QUESTIONMARK             "?"
#token ELLIPSIS                 "..."

/* Assignment operators */
#token ASSIGNEQUAL              "="
#token TIMESEQUAL               "\*="
#token DIVIDEEQUAL              "/="
#token MODEQUAL                 "\%="
#token PLUSEQUAL                "\+="
#token MINUSEQUAL               "\-="
#token SHIFTLEFTEQUAL           "\<\<="
#token SHIFTRIGHTEQUAL          "\>\>="
#token BITWISEANDEQUAL          "&="
#token BITWISEXOREQUAL          "^="
/* "|=": this pattern must differ from NOTEQUAL ("!=") below, otherwise
 * the two token types share one expression and "|=" is never recognized.
 */
#token BITWISEOREQUAL           "\|="

/* Logical, bitwise, equality and relational operators */
#token OR                       "\|\|"
#token AND                      "&&"
#token BITWISEOR                "\|"
#token BITWISEXOR               "^"
#token AMPERSAND                "&"
#token EQUAL                    "=="
#token NOTEQUAL                 "!="
#token LESSTHAN                 "<"
#token GREATERTHAN              "\>"
#token LESSTHANOREQUALTO        "\<="
#token GREATERTHANOREQUALTO     "\>="

/* Shift, arithmetic and unary operators */
#token SHIFTLEFT                "\<\<"
#token SHIFTRIGHT               "\>\>"
#token PLUS                     "\+"
#token MINUS                    "\-"
#token STAR                     "\*"
#token DIVIDE                   "/"
#token MOD                      "\%"
#token PLUSPLUS                 "\+\+"
#token MINUSMINUS               "\-\-"
#token ONESCOMPLEMENT           "\~"
#token NOT                      "!"
#token DOT                      "."
#token POINTERTO                "\-\>"

/* Keywords.  These are listed ahead of IDENTIFIER, whose pattern also
 * matches every keyword spelling.
 */
#token AUTO                     "auto"
#token BREAK                    "break"
#token CASE                     "case"
#token CHAR                     "char"
#token CONST                    "const"
#token CONTINUE                 "continue"
#token DEFAULT                  "default"
#token DO                       "do"
#token DOUBLE                   "double"
#token ELSE                     "else"
#token ENUM                     "enum"
#token EXTERN                   "extern"
#token FLOAT                    "float"
#token FOR                      "for"
#token GOTO                     "goto"
#token IF                       "if"
#token INT                      "int"
#token LONG                     "long"
#token REGISTER                 "register"
#token RETURN                   "return"
#token SHORT                    "short"
#token SIGNED                   "signed"
#token SIZEOF                   "sizeof"
#token STATIC                   "static"
#token STRUCT                   "struct"
#token SWITCH                   "switch"
#token TYPEDEF                  "typedef"
#token UNION                    "union"
#token UNSIGNED                 "unsigned"
#token VOID                     "void"
#token VOLATILE                 "volatile"
#token WHILE                    "while"

/* Integer constants */
#token OCTALINT                 "0[0-7]*{[uUlL]}"
#token DECIMALINT               "[1-9][0-9]*{[uUlL]}"
#token HEXADECIMALINT           "(0x|0X)[0-9a-fA-F]+{[uUlL]}"

/* An opening quote switches to a dedicated lexical class; more() keeps
 * accumulating text so the literal is returned as one token when that
 * class sees the closing quote.
 */
#token "'"                      << mode (CHARACTERS); more (); >>
#token "\""                     << mode (STRINGS); more (); >>

/* Floating constants */
#token FLOATONE                 "([0-9]+.[0-9]* | [0-9]*.[0-9]+) {[eE]{[\-\+]}[0-9]+} {[fFlL]}"
#token FLOATTWO                 "[0-9]+ [eE]{[\-\+]}[0-9]+ {[fFlL]}"

#token IDENTIFIER               "[a-zA-Z_][a-zA-Z0-9_]*"

/* Comments, white space and preprocessor residue are skipped */
#token "/\*"                    << mode (COMMENT); skip (); >>
#token "[\t\ ]+"                << skip (); >>
#token "[\n\r]"                 << newline(); skip(); >>
#token "#pragma ~[\n]*"         << newline(); skip(); >>

// line number and file stuff from preprocessor
#token "#[\ \t]* [0-9]+ {[\ \t]* \"~[\"]+\" [\ \t]* [0-9]*} \n"
                                << _line = atoi(begexpr()+1); skip(); >>

#token Eof                      "@"

/* ---- Inside a comment: everything is skipped, newlines are counted ---- */
#lexclass COMMENT
#token "[\n\r]"                 << skip(); newline(); >>
#token "\*/"                    << mode (START); skip (); >>
/* A lone star is skipped one character at a time.  The former pattern
 * "\*~[/]" consumed two characters, so in a comment ending with two stars
 * and a slash it ate both stars and the comment was never closed; it could
 * also swallow a newline following a star without calling newline().
 */
#token "\*"                     << skip (); >>
#token "~[\*\n\r]+"             << skip (); >>

/* ---- Inside a string literal: escapes are replaced by their character ---- */
#lexclass STRINGS
#token STRING "\""              << mode (START); >>
#token "\\n"                    << replchar ((char) 0x0A); more (); >>
#token "\\t"                    << replchar ((char) 0x09); more (); >>
#token "\\v"                    << replchar ((char) 0x0B); more (); >>
#token "\\b"                    << replchar ((char) 0x08); more (); >>
#token "\\r"                    << replchar ((char) 0x0D); more (); >>
#token "\\f"                    << replchar ((char) 0x0C); more (); >>
#token "\\a"                    << replchar ((char) 0x07); more (); >>
#token "\\\\"                   << replchar ((char) 0x5C); more (); >>
#token "\\?"                    << replchar ((char) 0x3F); more (); >>
#token "\\'"                    << replchar ((char) 0x27); more (); >>
#token "\\\""                   << replchar ((char) 0x22); more (); >>
#token "\\0[0-7]*"              << replchar ((char) strtol (begexpr(), NULL, 8)); more (); >>
#token "\\[1-9][0-9]*"          << replchar ((char) strtol (begexpr(), NULL, 10)); more (); >>
#token "\\(0x|0X)[0-9a-fA-F]+"  << replchar ((char) strtol (begexpr(), NULL, 16)); more (); >>
#token "[\n\r]"                 << newline(); more (); >>
#token "~[\"\n\r\\]+"           << more (); >>

/* ---- Inside a character literal: after one character (or escape) the
 * lexer moves to class DONE, which only accepts the closing quote.
 */
#lexclass CHARACTERS
#token CHARACTER "'"            << mode (START); >>
#token "\\n"                    << replchar ((char) 0x0A); more (); mode (DONE); >>
#token "\\t"                    << replchar ((char) 0x09); more (); mode (DONE); >>
#token "\\v"                    << replchar ((char) 0x0B); more (); mode (DONE); >>
#token "\\b"                    << replchar ((char) 0x08); more (); mode (DONE); >>
#token "\\r"                    << replchar ((char) 0x0D); more (); mode (DONE); >>
#token "\\f"                    << replchar ((char) 0x0C); more (); mode (DONE); >>
#token "\\a"                    << replchar ((char) 0x07); more (); mode (DONE); >>
#token "\\\\"                   << replchar ((char) 0x5C); more (); mode (DONE); >>
#token "\\?"                    << replchar ((char) 0x3F); more (); mode (DONE); >>
#token "\\'"                    << replchar ((char) 0x27); more (); mode (DONE); >>
#token "\\\""                   << replchar ((char) 0x22); more (); mode (DONE); >>
#token "\\0[0-7]*"              << replchar ((char) strtol (begexpr(), NULL, 8)); more (); mode (DONE); >>
#token "\\[1-9][0-9]*"          << replchar ((char) strtol (begexpr(), NULL, 10)); more (); mode (DONE); >>
#token "\\(0x|0X)[0-9a-fA-F]+"  << replchar ((char) strtol (begexpr(), NULL, 16)); more (); mode (DONE); >>
#token "[\n\r]"                 << newline(); more (); >>
#token "~['\n\r\\]"             << more (); mode (DONE); >>

#lexclass DONE
#token CHARACTER "'"            << mode (START); >>

#lexclass START

<<
typedef ANTLRCommonToken ANTLRToken;
>>

/* NOTE(review): several actions below survive only as "<>" or as a
 * truncated fragment such as "<getText());>>" or "<getType()==...>>?".
 * They look like double-angle-bracket actions whose text was lost when this
 * file was extracted (presumably calls to the semantic hooks declared in
 * the class below).  They are preserved exactly as found; restore them
 * from the upstream grammar before running ANTLR on this file.
 */
class CParser {
<<
public:
    /* Type specifiers; each enumerator is a distinct bit. */
    enum TypeSpecifier {
        tsINVALID=0,
        tsVOID=0x1,
        tsCHAR=0x2,
        tsSHORT=0x4,
        tsINT=0x8,
        tsLONG=0x10,
        tsFLOAT=0x20,
        tsDOUBLE=0x40,
        tsSIGNED=0x80,
        tsUNSIGNED=0x100,
        tsTYPEID=0x200,
        tsSTRUCT=0x400,
        tsENUM=0x800,
        tsUNION=0x1000
    };
    enum TypeQualifier {
        tqINVALID=0, tqCONST=1, tqVOLATILE
    };
    enum StorageClass {
        scINVALID=0, scAUTO=1, scREGISTER, scSTATIC, scEXTERN, scTYPEDEF
    };

protected:
    // Symbol table management stuff
    CDictionary *symbols;
    StorageClass _sc;
    TypeQualifier _tq;
    TypeSpecifier _ts;
    unsigned char functionDefinition;

    int traceIndentLevel, doTracing;
    void tracein(char *r);
    void traceout(char *r);

public:
    /* Initializes the base parser, allocates the symbol dictionary and
     * leaves tracing switched off.
     */
    void init()
    {
        ANTLRParser::init();
        symbols = new CDictionary(101, 200);
        traceIndentLevel = 0;
        doTracing = 0;
    }
    void traceOn() { doTracing=1; }

protected:
    // Semantic interface; subclass and redefine these functions
    // so you don't have to mess with the grammar itself.
    virtual int isTypeName(char *s);
    virtual void enterNewLocalScope();
    virtual void exitLocalScope();
    virtual void enterExternalScope();
    virtual void exitExternalScope();

    // Declaration stuff
    virtual void beginDeclaration();
    virtual void endDeclaration();
    virtual void beginFunctionDefinition();
    virtual void endFunctionDefinition();
    virtual void beginParameterDeclaration();
    virtual void beginFieldDeclaration();
    virtual void declarationSpecifier(StorageClass, TypeQualifier, TypeSpecifier);
    virtual void beginAggrDefinition(TypeSpecifier, char *);
    virtual void endAggrDefinition();

    // Declarator stuff
    virtual void declaratorPointerTo();
    virtual void declaratorID(char *);
    virtual void declaratorArray();
    virtual void declaratorParameterList();
    virtual void declaratorEndParameterList();

    virtual void panic(char *);
>>

/* ------------------------- External definitions ------------------------- */

translation_unit
    :   <>
        (external_declaration)+ Eof
        <>
    ;

/* The syntactic predicate looks ahead for "specifiers declarator {" to
 * tell a function definition from an ordinary declaration.
 */
external_declaration
    :   (declaration_specifiers declarator LCURLYBRACE)? function_definition
    |   declaration
    ;

function_definition
    :   <<;>>   // don't want next action as an init-action due to (...)? above
        <>
        declaration_specifiers declarator compound_statement
        <>
    ;

/* ------------------------------ Declarations ----------------------------- */

declaration
    :   <>
        declaration_specifiers { init_declarator_list } SEMICOLON
        <>
    ;

declaration_specifiers
    :   <<
        TypeSpecifier ts = tsINVALID, ts2;
        TypeQualifier tq = tqINVALID;
        StorageClass sc = scINVALID;
        >>
        (   storage_class_specifier>[sc]
        |   type_specifier>[ts2] <>
        |   type_qualifier>[tq]
        )+
        <>
    ;

storage_class_specifier > [CParser::StorageClass sc]
    :   AUTO        <<$sc = scAUTO;>>
    |   REGISTER    <<$sc = scREGISTER;>>
    |   STATIC      <<$sc = scSTATIC;>>
    |   EXTERN      <<$sc = scEXTERN;>>
    |   TYPEDEF     <<$sc = scTYPEDEF;>>
    ;

type_specifier > [CParser::TypeSpecifier ts]
    :   VOID        <<$ts = tsVOID;>>
    |   CHAR        <<$ts = tsCHAR;>>
    |   SHORT       <<$ts = tsSHORT;>>
    |   INT         <<$ts = tsINT;>>
    |   LONG        <<$ts = tsLONG;>>
    |   FLOAT       <<$ts = tsFLOAT;>>
    |   DOUBLE      <<$ts = tsDOUBLE;>>
    |   SIGNED      <<$ts = tsSIGNED;>>
    |   UNSIGNED    <<$ts = tsUNSIGNED;>>
    |   struct_or_union_specifier>[$ts]
    |   enum_specifier  <<$ts = tsENUM;>>
    |   typeID          <<$ts = tsTYPEID;>>
    ;

/* NOTE(review): the predicate below has lost its beginning; compare the
 * intact predicate in rule cast_expression.
 */
typeID
    :   <getType()==IDENTIFIER ? isTypeName(LT(1)->getText()) : 1>>?
        IDENTIFIER
    ;

type_qualifier > [CParser::TypeQualifier tq]
    :   CONST       <<$tq = tqCONST;>>
    |   VOLATILE    <<$tq = tqVOLATILE;>>
    ;

struct_or_union_specifier>[CParser::TypeSpecifier ts]
    :   (   STRUCT  <<$ts=tsSTRUCT;>>
        |   UNION   <<$ts=tsUNION;>>
        )
        (   IDENTIFIER
        |   id:IDENTIFIER LCURLYBRACE
            <getText());>>
            (struct_declaration)+
            <>
            RCURLYBRACE
        |   LCURLYBRACE
            <>
            (struct_declaration)+
            <>
            RCURLYBRACE
        )
    ;

init_declarator_list
    :   init_declarator (COMMA init_declarator)*
    ;

init_declarator
    :   declarator { ASSIGNEQUAL initializer }
    ;

struct_declaration
    :   <<;>>
        <>
        specifier_qualifier_list struct_declarator_list SEMICOLON
    ;

specifier_qualifier_list
    :   <>
        (   type_specifier>[ts2] <>
        |   type_qualifier>[tq]
        )+
        <>
    ;

struct_declarator_list
    :   struct_declarator (COMMA struct_declarator)*
    ;

struct_declarator
    :   declarator
    ;

enum_specifier
    :   ENUM
        (   IDENTIFIER { LCURLYBRACE enumerator_list RCURLYBRACE }
        |   LCURLYBRACE enumerator_list RCURLYBRACE
        )
    ;

enumerator_list
    :   enumerator (COMMA enumerator)*
    ;

enumerator
    :   IDENTIFIER { ASSIGNEQUAL constant_expression }
    ;

/* ------------------------------- Declarators ----------------------------- */

declarator
    :   STAR declarator <>
    |   direct_declarator
    ;

direct_declarator
    :   id:IDENTIFIER
        <getText());>>
        (declarator_suffix)*
    |   LPARENTHESIS declarator RPARENTHESIS (declarator_suffix)*
    ;

declarator_suffix
    :   LSQUAREBRACKET { constant_expression } RSQUAREBRACKET
        <>
    |   LPARENTHESIS
        <>
        { parameter_list }
        RPARENTHESIS
        <>
    ;

parameter_list
    :   parameter_declaration_list { "," "..." }
    ;

parameter_declaration
    :   <<;>>
        <>
        declaration_specifiers
        (   (declarator)?           // if arg name given
        |   abstract_declarator     // if arg name not given
        )
    ;

initializer
    :   assignment_expression
    |   LCURLYBRACE initializer (COMMA initializer)* RCURLYBRACE
    ;

type_name
    :   specifier_qualifier_list abstract_declarator
    ;

/* This rule looks a bit weird because (...) can happen in two
 * places within the declaration such as "void (*)()" (ptr to
 * function returning nothing).  However, the () of a function
 * can only occur after having seen a (abstract_declarator)
 * and not after a [..] or simple '*'.  These are the only two
 * valid () func-groups:
 *      int (*)();      // ptr to func
 *      int (*[])();    // array of ptr to func
 */
abstract_declarator
    :   STAR abstract_declarator
        <>
    |   LPARENTHESIS abstract_declarator RPARENTHESIS
        (abstract_declarator_suffix)+
    |   (   LSQUAREBRACKET { constant_expression } RSQUAREBRACKET
            <>
        )+
    |
    ;

abstract_declarator_suffix
    :   LSQUAREBRACKET { constant_expression } RSQUAREBRACKET
        <>
    |   LPARENTHESIS
        <>
        { parameter_declaration_list }
        RPARENTHESIS
        <>
    ;

parameter_declaration_list
    :   parameter_declaration ( COMMA parameter_declaration )*
    ;

/* -------------------------------- Statements ----------------------------- */

statement_list
    :   (statement)+
    ;

/* The dummy action in alt2 is to prevent hoisting of predicates from
 * expression.  A valid predicate is hoisted to test for the "(TYPENAME"
 * lookahead case.  However, it prevents "(var) = 3;" statements.  I
 * could put a predicate for the opposite case that would be hoisted,
 * but the combination of the two would not be handled correctly due
 * to the lookahead context test I put into the predicates.  For now,
 * we allow any "(ID" to pass into expression.
 */
statement
    :   labeled_statement
    |   <<;>> expression SEMICOLON
    |   compound_statement
    |   selection_statement
    |   iteration_statement
    |   jump_statement
    |   SEMICOLON
    ;

labeled_statement
    :   IDENTIFIER COLON statement
    |   CASE constant_expression COLON statement
    |   DEFAULT COLON statement
    ;

compound_statement
    :   LCURLYBRACE
        <>
        (declaration)*
        { statement_list }
        RCURLYBRACE
        <>
    ;

/* NOTE: cannot remove ELSE ambiguity warning, but it parses correctly */
selection_statement
    :   IF LPARENTHESIS expression RPARENTHESIS statement
        { ELSE statement }
    |   SWITCH LPARENTHESIS expression RPARENTHESIS statement
    ;

iteration_statement
    :   WHILE LPARENTHESIS expression RPARENTHESIS statement
    |   DO statement WHILE LPARENTHESIS expression RPARENTHESIS SEMICOLON
    |   FOR LPARENTHESIS
            { expression } SEMICOLON
            { expression } SEMICOLON
            { expression }
        RPARENTHESIS statement
    ;

jump_statement
    :   GOTO IDENTIFIER SEMICOLON
    |   CONTINUE SEMICOLON
    |   BREAK SEMICOLON
    |   RETURN { expression } SEMICOLON
    ;

/* -------------------------------- Expressions ---------------------------- */

expression
    :   assignment_expression (COMMA assignment_expression)*
    ;

/* right-to-left for assignment op */
assignment_expression
    :   conditional_expression
        {   (   ASSIGNEQUAL
            |   TIMESEQUAL
            |   DIVIDEEQUAL
            |   MODEQUAL
            |   PLUSEQUAL
            |   MINUSEQUAL
            |   SHIFTLEFTEQUAL
            |   SHIFTRIGHTEQUAL
            |   BITWISEANDEQUAL
            |   BITWISEXOREQUAL
            |   BITWISEOREQUAL
            )
            assignment_expression
        }
    ;

/* Per the C grammar the middle operand is a full expression and the last
 * operand is itself a conditional_expression, so nested conditionals such
 * as "a ? b : c ? d : e" group to the right.
 */
conditional_expression
    :   logical_or_expression
        { QUESTIONMARK expression COLON conditional_expression }
    ;

constant_expression
    :   conditional_expression
    ;

logical_or_expression
    :   logical_and_expression (OR logical_and_expression)*
    ;

logical_and_expression
    :   inclusive_or_expression (AND inclusive_or_expression)*
    ;

inclusive_or_expression
    :   exclusive_or_expression (BITWISEOR exclusive_or_expression)*
    ;

exclusive_or_expression
    :   and_expression (BITWISEXOR and_expression)*
    ;

and_expression
    :   equality_expression (AMPERSAND equality_expression)*
    ;

equality_expression
    :   relational_expression ((NOTEQUAL | EQUAL) relational_expression)*
    ;

relational_expression
    :   shift_expression
        (   (   LESSTHAN
            |   GREATERTHAN
            |   LESSTHANOREQUALTO
            |   GREATERTHANOREQUALTO
            )
            shift_expression
        )*
    ;

shift_expression
    :   additive_expression ((SHIFTLEFT | SHIFTRIGHT) additive_expression)*
    ;

/* See comment for multiplicative_expression regarding #pragma */
additive_expression
    :   multiplicative_expression
#pragma approx
        ((PLUS | MINUS) multiplicative_expression)*
    ;

/* ANTLR has trouble dealing with the analysis of the confusing unary/binary
 * operators such as STAR, AMPERSAND, PLUS, etc...  With the #pragma
 * we simply tell ANTLR to use the "quick-to-analyze" approximate lookahead
 * as full LL(k) lookahead will not resolve the ambiguity anyway.  Might
 * as well not bother.  This has the side-benefit that ANTLR doesn't go
 * off to lunch here (take infinite time to read grammar).
 */
multiplicative_expression
    :   cast_expression
#pragma approx
        ((STAR | DIVIDE | MOD) cast_expression)*
    ;

/* The string "( IDENTIFIER" can be either the start of a cast or
 * the start of a unary_expression.  However, the IDENTIFIER must
 * be a type name for it to be a cast.  Since ANTLR can only hoist
 * semantic predicates that are visible without consuming a token,
 * the semantic predicate in rule typeID is not hoisted--hence, the
 * rule is reported to be ambiguous.  I am manually putting in the
 * correctly hoisted predicate.
 */
cast_expression
    :   <<(LT(1)->getType()==LPARENTHESIS && LT(2)->getType()==IDENTIFIER) ?
            isTypeName(LT(2)->getText()) : 1>>?
        LPARENTHESIS type_name RPARENTHESIS cast_expression
    |   unary_expression
    ;

unary_expression
    :   postfix_expression
    |   PLUSPLUS unary_expression
    |   MINUSMINUS unary_expression
    |   unary_operator cast_expression
    |   SIZEOF
        (   /* see comment for rule cast_expression for info on predicate */
            <<(LT(1)->getType()==LPARENTHESIS && LT(2)->getType()==IDENTIFIER) ?
                isTypeName(LT(2)->getText()) : 1>>?
            LPARENTHESIS type_name RPARENTHESIS
        |   unary_expression
        )
    ;

/* PLUS is included so that unary plus ("+x") is accepted alongside the
 * other C unary operators.
 */
unary_operator
    :   AMPERSAND
    |   STAR
    |   PLUS
    |   MINUS
    |   ONESCOMPLEMENT
    |   NOT
    ;

postfix_expression
    :   primary_expression
#pragma approx
        (   LSQUAREBRACKET expression RSQUAREBRACKET
        |   LPARENTHESIS { argument_expression_list } RPARENTHESIS
        |   DOT IDENTIFIER
        |   POINTERTO IDENTIFIER
        |   PLUSPLUS
        |   MINUSMINUS
        )*
    ;

primary_expression
    :   IDENTIFIER
    |   constant
    |   STRING
    |   LPARENTHESIS expression RPARENTHESIS
    ;

argument_expression_list
    :   assignment_expression (COMMA assignment_expression)*
    ;

constant
    :   OCTALINT
    |   DECIMALINT
    |   HEXADECIMALINT
    |   CHARACTER
    |   FLOATONE
    |   FLOATTWO
    ;

}