Skip to content

Commit d4c3627

Browse files
committed
tweak to use proper lexer in GUI. changes formatted output. was always using java lexer. add clojure grammar as test for antlr grammars.
1 parent 75417dc commit d4c3627

File tree

3 files changed

+274
-9
lines changed

3 files changed

+274
-9
lines changed

java/grammars/org/antlr/codebuff/ANTLRv4Lexer.g4

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,17 @@ tokens {
7575

7676
@Override
7777
public Token emit() {
78-
if (_type == TOKEN_REF || _type == RULE_REF ) {
79-
_currentRuleType = _type;
78+
if (_type == ID) {
79+
String firstChar = _input.getText(Interval.of(_tokenStartCharIndex, _tokenStartCharIndex));
80+
if (Character.isUpperCase(firstChar.charAt(0))) {
81+
_type = TOKEN_REF;
82+
} else {
83+
_type = RULE_REF;
84+
}
85+
86+
if (_currentRuleType == Token.INVALID_TYPE) { // if outside of rule def
87+
_currentRuleType = _type; // set to inside lexer or parser rule
88+
}
8089
}
8190
else if (_type == SEMI) { // exit rule def
8291
_currentRuleType = Token.INVALID_TYPE;
@@ -156,11 +165,7 @@ NOT : '~' ;
156165
RBRACE : '}' ;
157166

158167
/** Allow unicode rule/token names */
159-
//ID : NameStartChar NameChar*;
160-
161-
// ##################### to allow testing ANTLR grammars in intellij preview
162-
RULE_REF : [a-z][a-zA-Z_0-9]* ;
163-
TOKEN_REF : [A-Z][a-zA-Z_0-9]* ;
168+
ID : NameStartChar NameChar*;
164169

165170
fragment
166171
NameChar
@@ -327,4 +332,3 @@ mode LexerCharSet;
327332
UNTERMINATED_CHAR_SET
328333
: EOF -> popMode
329334
;
330-
Lines changed: 261 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,261 @@
1+
/* Reworked for grammar specificity by Reid Mckenzie. Did a bunch of
2+
work so that rather than reading "a bunch of crap in parens" some
3+
syntactic information is preserved and recovered. Dec. 14 2014.
4+
5+
Converted to ANTLR 4 by Terence Parr. Unsure of provenance. I see
6+
it committed by matthias.koester for clojure-eclipse project on
7+
Oct 5, 2009:
8+
9+
https://code.google.com/p/clojure-eclipse/
10+
11+
Seems to me Laurent Petit had a version of this. I also see
12+
Jingguo Yao submitting a link to a now-dead github project on
13+
Jan 1, 2011.
14+
15+
https://github.com/laurentpetit/ccw/tree/master/clojure-antlr-grammar
16+
17+
Regardless, there are some issues perhaps related to "sugar";
18+
I've tried to fix them.
19+
20+
This parses https://github.com/weavejester/compojure project.
21+
22+
I also note this is hardly a grammar; more like "match a bunch of
23+
crap in parens" but I guess that is LISP for you ;)
24+
*/
25+
26+
grammar Clojure;
27+
28+
file: form *;
29+
30+
form: literal
31+
| list
32+
| vector
33+
| map
34+
| reader_macro
35+
;
36+
37+
forms: form* ;
38+
39+
list: '(' forms ')' ;
40+
41+
vector: '[' forms ']' ;
42+
43+
map: '{' (form form)* '}' ;
44+
45+
set: '#{' forms '}' ;
46+
47+
reader_macro
48+
: lambda
49+
| meta_data
50+
| regex
51+
| var_quote
52+
| host_expr
53+
| set
54+
| tag
55+
| discard
56+
| dispatch
57+
| deref
58+
| quote
59+
| backtick
60+
| unquote
61+
| unquote_splicing
62+
| gensym
63+
;
64+
65+
// TJP added '&' (gather a variable number of arguments)
66+
quote
67+
: '\'' form
68+
;
69+
70+
backtick
71+
: '`' form
72+
;
73+
74+
unquote
75+
: '~' form
76+
;
77+
78+
unquote_splicing
79+
: '~@' form
80+
;
81+
82+
tag
83+
: '^' form form
84+
;
85+
86+
deref
87+
: '@' form
88+
;
89+
90+
gensym
91+
: SYMBOL '#'
92+
;
93+
94+
lambda
95+
: '#(' form* ')'
96+
;
97+
98+
meta_data
99+
: '#^' (map form | form)
100+
;
101+
102+
var_quote
103+
: '#\'' symbol
104+
;
105+
106+
host_expr
107+
: '#+' form form
108+
;
109+
110+
discard
111+
: '#_' form
112+
;
113+
114+
dispatch
115+
: '#' symbol form
116+
;
117+
118+
regex
119+
: '#' string
120+
;
121+
122+
literal
123+
: string
124+
| number
125+
| character
126+
| nil
127+
| BOOLEAN
128+
| keyword
129+
| symbol
130+
| param_name
131+
;
132+
133+
string: STRING;
134+
hex: HEX;
135+
bin: BIN;
136+
bign: BIGN;
137+
number
138+
: FLOAT
139+
| hex
140+
| bin
141+
| bign
142+
| LONG
143+
;
144+
145+
character
146+
: named_char
147+
| u_hex_quad
148+
| any_char
149+
;
150+
named_char: CHAR_NAMED ;
151+
any_char: CHAR_ANY ;
152+
u_hex_quad: CHAR_U ;
153+
154+
nil: NIL;
155+
156+
keyword: macro_keyword | simple_keyword;
157+
simple_keyword: ':' symbol;
158+
macro_keyword: ':' ':' symbol;
159+
160+
symbol: ns_symbol | simple_sym;
161+
simple_sym: SYMBOL;
162+
ns_symbol: NS_SYMBOL;
163+
164+
param_name: PARAM_NAME;
165+
166+
// Lexers
167+
//--------------------------------------------------------------------
168+
169+
STRING : '"' ( ~'"' | '\\' '"' )* '"' ;
170+
171+
// FIXME: Doesn't deal with arbitrary read radixes, BigNums
172+
FLOAT
173+
: '-'? [0-9]+ FLOAT_TAIL
174+
| '-'? 'Infinity'
175+
| '-'? 'NaN'
176+
;
177+
178+
fragment
179+
FLOAT_TAIL
180+
: FLOAT_DECIMAL FLOAT_EXP
181+
| FLOAT_DECIMAL
182+
| FLOAT_EXP
183+
;
184+
185+
fragment
186+
FLOAT_DECIMAL
187+
: '.' [0-9]+
188+
;
189+
190+
fragment
191+
FLOAT_EXP
192+
: [eE] '-'? [0-9]+
193+
;
194+
fragment
195+
HEXD: [0-9a-fA-F] ;
196+
HEX: '0' [xX] HEXD+ ;
197+
BIN: '0' [bB] [10]+ ;
198+
LONG: '-'? [0-9]+[lL]?;
199+
BIGN: '-'? [0-9]+[nN];
200+
201+
CHAR_U
202+
: '\\' 'u'[0-9D-Fd-f] HEXD HEXD HEXD ;
203+
CHAR_NAMED
204+
: '\\' ( 'newline'
205+
| 'return'
206+
| 'space'
207+
| 'tab'
208+
| 'formfeed'
209+
| 'backspace' ) ;
210+
CHAR_ANY
211+
: '\\' . ;
212+
213+
NIL : 'nil';
214+
215+
BOOLEAN : 'true' | 'false' ;
216+
217+
SYMBOL
218+
: '.'
219+
| '/'
220+
| NAME
221+
;
222+
223+
NS_SYMBOL
224+
: NAME '/' SYMBOL
225+
;
226+
227+
PARAM_NAME: '%' ((('1'..'9')('0'..'9')*)|'&')? ;
228+
229+
// Fragments
230+
//--------------------------------------------------------------------
231+
232+
fragment
233+
NAME: SYMBOL_HEAD SYMBOL_REST* (':' SYMBOL_REST+)* ;
234+
235+
fragment
236+
SYMBOL_HEAD
237+
: ~('0' .. '9'
238+
| '^' | '`' | '\'' | '"' | '#' | '~' | '@' | ':' | '/' | '%' | '(' | ')' | '[' | ']' | '{' | '}' // FIXME: could be one group
239+
| [ \n\r\t\,] // FIXME: could be WS
240+
)
241+
;
242+
243+
fragment
244+
SYMBOL_REST
245+
: SYMBOL_HEAD
246+
| '0'..'9'
247+
| '.'
248+
;
249+
250+
// Discard
251+
//--------------------------------------------------------------------
252+
253+
fragment
254+
WS : [ \n\r\t\,] ;
255+
256+
fragment
257+
COMMENT: ';' ~[\r\n]* ;
258+
259+
TRASH
260+
: ( WS | COMMENT ) -> channel(HIDDEN)
261+
;

java/src/org/antlr/codebuff/Tool.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ public static void main(String[] args)
5757
Pair<String,List<TokenPositionAnalysis>> results = format(corpus, testDoc, ANTLRv4Lexer.class, ANTLRv4Parser.class, "grammarSpec", tabSize);
5858
output = results.a;
5959
List<TokenPositionAnalysis> analysisPerToken = results.b;
60-
GUIController controller = new GUIController(analysisPerToken, testDoc, output, JavaLexer.class);
60+
GUIController controller = new GUIController(analysisPerToken, testDoc, output, ANTLRv4Lexer.class);
6161
controller.show();
6262
}
6363
System.out.println(output);

0 commit comments

Comments
 (0)