tweak to use proper lexer in GUI. changes formatted output. was always using java lexer. add clojure grammar as test for antlr grammars.

parrt · parrt · commit d4c362739c21 · 2016-04-03T10:46:33.000-07:00
diff --git a/java/grammars/org/antlr/codebuff/ANTLRv4Lexer.g4 b/java/grammars/org/antlr/codebuff/ANTLRv4Lexer.g4
@@ -75,8 +75,17 @@ tokens {
 
 	@Override
 	public Token emit() {
-		if (_type == TOKEN_REF || _type == RULE_REF ) {
-            _currentRuleType = _type;
+		if (_type == ID) {
+			String firstChar = _input.getText(Interval.of(_tokenStartCharIndex, _tokenStartCharIndex));
+			if (Character.isUpperCase(firstChar.charAt(0))) {
+				_type = TOKEN_REF;
+			} else {
+				_type = RULE_REF;
+			}
+
+			if (_currentRuleType == Token.INVALID_TYPE) { // if outside of rule def
+				_currentRuleType = _type;                 // set to inside lexer or parser rule
+			}
 		}
 		else if (_type == SEMI) {                  // exit rule def
 			_currentRuleType = Token.INVALID_TYPE;
@@ -156,11 +165,7 @@ NOT          : '~'                    ;
 RBRACE       : '}'                    ;
 
 /** Allow unicode rule/token names */
-//ID	:	NameStartChar NameChar*;
-
-// ##################### to allow testing ANTLR grammars in intellij preview
-RULE_REF  : [a-z][a-zA-Z_0-9]* ;
-TOKEN_REF : [A-Z][a-zA-Z_0-9]* ;
+ID	:	NameStartChar NameChar*;
 
 fragment
 NameChar
@@ -327,4 +332,3 @@ mode LexerCharSet;
 	UNTERMINATED_CHAR_SET
 		:	EOF							-> popMode
 		;
-
diff --git a/java/grammars/org/antlr/codebuff/Clojure.g4 b/java/grammars/org/antlr/codebuff/Clojure.g4
@@ -0,0 +1,261 @@
+/* Reworked for grammar specificity by Reid Mckenzie. Did a bunch of
+   work so that rather than reading "a bunch of crap in parens" some
+   syntactic information is preserved and recovered. Dec. 14 2014.
+
+   Converted to ANTLR 4 by Terence Parr. Unsure of provenance. I see
+   it committed by matthias.koester for clojure-eclipse project on
+   Oct 5, 2009:
+
+   https://code.google.com/p/clojure-eclipse/
+
+   Seems to me Laurent Petit had a version of this. I also see
+   Jingguo Yao submitting a link to a now-dead github project on
+   Jan 1, 2011.
+
+   https://github.com/laurentpetit/ccw/tree/master/clojure-antlr-grammar
+
+   Regardless, there are some issues perhaps related to "sugar";
+   I've tried to fix them.
+
+   This parses https://github.com/weavejester/compojure project.
+
+   I also note this is hardly a grammar; more like "match a bunch of
+   crap in parens" but I guess that is LISP for you ;)
+ */
+
+grammar Clojure;
+
+// Entry rule: a source file is zero or more top-level forms. The explicit EOF
+// makes the parser account for the whole input, so trailing text that is not a
+// form is reported as a syntax error instead of being silently left unparsed.
+file: form * EOF;
+
+// Any single form: an atom-like literal, one of the bracketed collections
+// list/vector/map, or a reader-macro construct.
+form: literal
+    | list
+    | vector
+    | map
+    | reader_macro
+    ;
+
+// Zero or more forms; the body of list, vector and set.
+forms: form* ;
+
+// '(' forms ')'
+list: '(' forms ')' ;
+
+// '[' forms ']'
+vector: '[' forms ']' ;
+
+// '{' ... '}' whose contents must come as pairs of forms.
+map: '{' (form form)* '}' ;
+
+// '#{' forms '}'. Not an alternative of form itself; it is listed under
+// reader_macro.
+set: '#{' forms '}' ;
+
+// Choice over the prefix-character and '#'-dispatch constructs of this
+// grammar; each alternative is a parser rule defined elsewhere in the file.
+reader_macro
+    : lambda
+    | meta_data
+    | regex
+    | var_quote
+    | host_expr
+    | set
+    | tag
+    | discard
+    | dispatch
+    | deref
+    | quote
+    | backtick
+    | unquote
+    | unquote_splicing
+    | gensym
+    ;
+
+// TJP added '&' (gather a variable number of arguments)
+// NOTE(review): none of the rules in this group mentions '&'; the only '&' in
+// the grammar is inside the PARAM_NAME lexer rule, so the remark above
+// presumably refers to that rule -- confirm.
+
+// Single quote followed by one form.
+quote
+    : '\'' form
+    ;
+
+// Backtick followed by one form.
+backtick
+    : '`' form
+    ;
+
+// '~' followed by one form.
+unquote
+    : '~' form
+    ;
+
+// '~@' (one token) followed by one form.
+unquote_splicing
+    : '~@' form
+    ;
+
+// '^' followed by two forms.
+tag
+    : '^' form form
+    ;
+
+// '@' followed by one form.
+deref
+    : '@' form
+    ;
+
+// A SYMBOL token immediately followed in the token stream by a '#' token.
+gensym
+    : SYMBOL '#'
+    ;
+
+// '#(' form* ')'
+lambda
+    : '#(' form* ')'
+    ;
+
+// '#^' followed by either a map and a form, or a single form.
+meta_data
+    : '#^' (map form | form) 
+    ;
+
+// '#'' followed by a symbol (not an arbitrary form).
+var_quote
+    : '#\'' symbol
+    ;
+
+// '#+' followed by two forms.
+host_expr
+    : '#+' form form
+    ;
+
+// '#_' followed by one form.
+discard
+    : '#_' form
+    ;
+
+// A bare '#' followed by a symbol and then a form.
+dispatch
+    : '#' symbol form
+    ;
+
+// A bare '#' followed by a string. Starts with the same '#' token as dispatch;
+// the two are told apart by what follows (string vs. symbol).
+regex
+    : '#' string
+    ;
+
+// Atom-like forms. BOOLEAN is referenced directly as a token; every other
+// alternative goes through a parser rule of its own.
+literal
+    : string
+    | number
+    | character
+    | nil
+    | BOOLEAN
+    | keyword
+    | symbol
+    | param_name
+    ;
+
+// Parser-rule wrappers around a single token each.
+string: STRING;
+hex: HEX;
+bin: BIN;
+bign: BIGN;
+// Any numeric token. FLOAT and LONG are referenced directly; HEX, BIN and BIGN
+// go through the wrapper rules above.
+number
+    : FLOAT
+    | hex
+    | bin
+    | bign
+    | LONG
+    ;
+
+// A character literal in one of its three token forms.
+character
+    : named_char
+    | u_hex_quad
+    | any_char
+    ;
+named_char: CHAR_NAMED ;
+any_char: CHAR_ANY ;
+u_hex_quad: CHAR_U ;
+
+nil: NIL;
+
+// A keyword is one ':' token (simple_keyword) or two ':' tokens
+// (macro_keyword) followed by a symbol.
+keyword: macro_keyword | simple_keyword;
+simple_keyword: ':' symbol;
+macro_keyword: ':' ':' symbol;
+
+// A symbol is either an NS_SYMBOL token or a plain SYMBOL token.
+symbol: ns_symbol | simple_sym;
+simple_sym: SYMBOL;
+ns_symbol: NS_SYMBOL;
+
+param_name: PARAM_NAME;
+
+// Lexers
+//--------------------------------------------------------------------
+
+// Double-quoted string. Ordinary characters are anything except '"' and
+// backslash; a backslash always consumes the character that follows it. That
+// keeps \" from ending the string and treats \\ as a complete escape, so a
+// string whose last character is an escaped backslash ("a\\") ends at its own
+// closing quote instead of running on to the next '"' in the input.
+STRING : '"' ( ~["\\] | '\\' . )* '"' ;
+
+// FIXME: Doesn't deal with arbitrary read radixes, BigNums
+// Floating-point token: optionally signed digits followed by a FLOAT_TAIL
+// (which is mandatory, so plain integers do not match the first alternative),
+// or the optionally signed words 'Infinity' / 'NaN'.
+FLOAT
+    : '-'? [0-9]+ FLOAT_TAIL
+    | '-'? 'Infinity'
+    | '-'? 'NaN'
+    ;
+
+// What follows the integer digits of a FLOAT: a decimal part with an optional
+// exponent, or an exponent on its own.
+fragment
+FLOAT_TAIL
+    : FLOAT_DECIMAL FLOAT_EXP?
+    | FLOAT_EXP
+    ;
+
+// '.' followed by at least one digit.
+fragment
+FLOAT_DECIMAL
+    : '.' [0-9]+
+    ;
+
+// Exponent: e/E, an optional '-', at least one digit (no '+' alternative).
+fragment
+FLOAT_EXP
+    : [eE] '-'? [0-9]+
+    ;
+// A single hexadecimal digit, either case.
+fragment
+HEXD: [0-9a-fA-F] ;
+// '0x'/'0X' plus one or more hex digits; no sign is accepted.
+HEX: '0' [xX] HEXD+ ;
+// '0b'/'0B' plus one or more binary digits; no sign is accepted.
+BIN: '0' [bB] [10]+ ;
+// Optionally signed decimal integer with an optional l/L suffix.
+LONG: '-'? [0-9]+[lL]?;
+// Optionally signed decimal integer with a mandatory n/N suffix.
+BIGN: '-'? [0-9]+[nN];
+
+// Unicode character literal: backslash, 'u', then exactly four hex digits.
+// Every digit uses HEXD so that a leading A-C/a-c (e.g. \uABCD) is accepted;
+// a narrower class for the first digit would leave such literals to be split
+// into a CHAR_ANY token (\u) followed by other tokens.
+CHAR_U
+    : '\\' 'u' HEXD HEXD HEXD HEXD ;
+// Backslash followed by one of six spelled-out character names.
+CHAR_NAMED
+    : '\\' ( 'newline'
+           | 'return'
+           | 'space'
+           | 'tab'
+           | 'formfeed'
+           | 'backspace' ) ;
+// Backslash followed by any single character.
+CHAR_ANY
+    : '\\' . ;
+
+// Declared before SYMBOL, as is BOOLEAN below.
+// NOTE(review): NAME can presumably match the same text as NIL/BOOLEAN, in
+// which case this ordering decides the token type -- confirm against ANTLR's
+// lexer tie-breaking rules.
+NIL : 'nil';
+
+BOOLEAN : 'true' | 'false' ;
+
+// A plain symbol token: a lone '.', a lone '/', or a NAME.
+SYMBOL
+    : '.'
+    | '/'
+    | NAME
+    ;
+
+// Namespace-qualified symbol: NAME, '/', then anything SYMBOL accepts, all in
+// one token.
+NS_SYMBOL
+    : NAME '/' SYMBOL
+    ;
+
+// '%' alone, '%' plus a positive decimal number without a leading zero, or '%&'.
+PARAM_NAME: '%' ( [1-9] [0-9]* | '&' )? ;
+
+// Fragments
+//--------------------------------------------------------------------
+
+// A name: one head character, any number of rest characters, then optional
+// ':'-introduced segments that each need at least one rest character.
+fragment
+NAME: SYMBOL_HEAD SYMBOL_REST* (':' SYMBOL_REST+)* ;
+
+// First character of a name: anything except a decimal digit, the prefix and
+// bracket characters listed below, and the whitespace set.
+// ';' is excluded as well: if it could start a name, a comment with no blank
+// in it (";;;;", ";TODO") would match SYMBOL with the same length as the
+// comment token, and SYMBOL, being declared earlier than TRASH, would win.
+fragment
+SYMBOL_HEAD
+    : ~('0' .. '9'
+        | '^' | '`' | '\'' | '"' | '#' | '~' | '@' | ':' | '/' | '%' | '(' | ')' | '[' | ']' | '{' | '}' | ';' // FIXME: could be one group
+        | [ \n\r\t\,] // FIXME: could be WS
+        )
+    ;
+
+// Any later character of a name: a head character, a decimal digit, or '.'.
+fragment
+SYMBOL_REST
+    : SYMBOL_HEAD | [0-9] | '.'
+    ;
+
+// Discard
+//--------------------------------------------------------------------
+
+// One whitespace character: space, newline, carriage return, tab, or the
+// backslash-escaped comma written at the end of the set.
+// NOTE(review): how '\,' is interpreted may depend on the ANTLR version --
+// confirm before rewriting it as a bare ','.
+fragment
+WS : [ \n\r\t\,] ;
+
+// Line comment: ';' and everything up to, but not including, the line end.
+fragment
+COMMENT: ';' ~[\r\n]* ;
+
+// WS and COMMENT are fragments, so this is the only token emitted for them;
+// it is sent to the HIDDEN channel.
+TRASH
+    : ( WS | COMMENT ) -> channel(HIDDEN)
+    ;
diff --git a/java/src/org/antlr/codebuff/Tool.java b/java/src/org/antlr/codebuff/Tool.java
@@ -57,7 +57,7 @@ public static void main(String[] args)
 			Pair<String,List<TokenPositionAnalysis>> results = format(corpus, testDoc, ANTLRv4Lexer.class, ANTLRv4Parser.class, "grammarSpec", tabSize);
 			output = results.a;
 			List<TokenPositionAnalysis> analysisPerToken = results.b;
-			GUIController controller = new GUIController(analysisPerToken, testDoc, output, JavaLexer.class);
+			GUIController controller = new GUIController(analysisPerToken, testDoc, output, ANTLRv4Lexer.class);
 			controller.show();
 		}
 		System.out.println(output);

Original file line number	Diff line number	Diff line change
`@@ -57,7 +57,7 @@ public static void main(String[] args)`
`57`	`57`	`Pair<String,List<TokenPositionAnalysis>> results = format(corpus, testDoc, ANTLRv4Lexer.class, ANTLRv4Parser.class, "grammarSpec", tabSize);`
`58`	`58`	`output = results.a;`
`59`	`59`	`List<TokenPositionAnalysis> analysisPerToken = results.b;`
`60`		`- GUIController controller = new GUIController(analysisPerToken, testDoc, output, JavaLexer.class);`
	`60`	`+ GUIController controller = new GUIController(analysisPerToken, testDoc, output, ANTLRv4Lexer.class);`
`61`	`61`	`controller.show();`
`62`	`62`	`}`
`63`	`63`	`System.out.println(output);`