Skip to content

Commit 15dfe10

Browse files
committed
add start of rev2 dir
1 parent 1c564db commit 15dfe10

4 files changed

Lines changed: 1883 additions & 0 deletions

File tree

Lines changed: 339 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,339 @@
1+
/*
2+
* [The "BSD license"]
3+
* Copyright (c) 2014 Terence Parr
4+
* Copyright (c) 2014 Sam Harwell
5+
* All rights reserved.
6+
*
7+
* Redistribution and use in source and binary forms, with or without
8+
* modification, are permitted provided that the following conditions
9+
* are met:
10+
*
11+
* 1. Redistributions of source code must retain the above copyright
12+
* notice, this list of conditions and the following disclaimer.
13+
* 2. Redistributions in binary form must reproduce the above copyright
14+
* notice, this list of conditions and the following disclaimer in the
15+
* documentation and/or other materials provided with the distribution.
16+
* 3. The name of the author may not be used to endorse or promote products
17+
* derived from this software without specific prior written permission.
18+
*
19+
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20+
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21+
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22+
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23+
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24+
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25+
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26+
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28+
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29+
*/
30+
31+
/** A grammar for ANTLR v4 tokens */
32+
lexer grammar ANTLRv4Lexer;
33+
34+
tokens {
35+
TOKEN_REF,
36+
RULE_REF,
37+
LEXER_CHAR_SET
38+
}
39+
40+
@members {
41+
/** Track whether we are inside of a rule and whether it is a lexical or parser rule.
42+
* _currentRuleType==Token.INVALID_TYPE means that we are outside of a rule.
43+
* At the first sign of a rule name reference and _currentRuleType==invalid,
44+
* we can assume that we are starting a parser rule. Similarly, seeing
45+
* a token reference when not already in rule means starting a token
46+
* rule. The terminating ';' of a rule, flips this back to invalid type.
47+
*
48+
* This is not perfect logic but works. For example, "grammar T;" means
49+
* that we start and stop a lexical rule for the "T;". Dangerous but works.
50+
*
51+
* The whole point of this state information is to distinguish
52+
* between [..arg actions..] and [charsets]. Char sets can only occur in
53+
* lexical rules and arg actions cannot occur.
54+
*/
55+
private int _currentRuleType = Token.INVALID_TYPE;
56+
57+
public int getCurrentRuleType() {
58+
return _currentRuleType;
59+
}
60+
61+
public void setCurrentRuleType(int ruleType) {
62+
this._currentRuleType = ruleType;
63+
}
64+
65+
protected void handleBeginArgAction() {
66+
if (inLexerRule()) {
67+
pushMode(LexerCharSet);
68+
more();
69+
}
70+
else {
71+
pushMode(ArgAction);
72+
more();
73+
}
74+
}
75+
76+
@Override
77+
public Token emit() {
78+
if (_type == /*ID*/ 999999) { // ################### just testing
79+
String firstChar = _input.getText(Interval.of(_tokenStartCharIndex, _tokenStartCharIndex));
80+
if (Character.isUpperCase(firstChar.charAt(0))) {
81+
_type = TOKEN_REF;
82+
} else {
83+
_type = RULE_REF;
84+
}
85+
86+
if (_currentRuleType == Token.INVALID_TYPE) { // if outside of rule def
87+
_currentRuleType = _type; // set to inside lexer or parser rule
88+
}
89+
}
90+
else if (_type == SEMI) { // exit rule def
91+
_currentRuleType = Token.INVALID_TYPE;
92+
}
93+
94+
return super.emit();
95+
}
96+
97+
private boolean inLexerRule() {
98+
return _currentRuleType == TOKEN_REF;
99+
}
100+
private boolean inParserRule() { // not used, but added for clarity
101+
return _currentRuleType == RULE_REF;
102+
}
103+
}
104+
105+
DOC_COMMENT
106+
: '/**' .*? ('*/' | EOF)
107+
;
108+
109+
BLOCK_COMMENT
110+
: '/*' .*? ('*/' | EOF) -> channel(HIDDEN)
111+
;
112+
113+
LINE_COMMENT
114+
: '//' ~[\r\n]* -> channel(HIDDEN)
115+
;
116+
117+
BEGIN_ARG_ACTION
118+
: '[' {handleBeginArgAction();}
119+
;
120+
121+
// OPTIONS and TOKENS must also consume the opening brace that captures
122+
// their option block, as this is the easiest way to parse it separate
123+
// to an ACTION block, despite it using the same {} delimiters.
124+
//
125+
OPTIONS : 'options' [ \t\f\n\r]* '{' ;
126+
TOKENS : 'tokens' [ \t\f\n\r]* '{' ;
127+
CHANNELS : 'channels' [ \t\f\n\r]* '{' ;
128+
129+
IMPORT : 'import' ;
130+
FRAGMENT : 'fragment' ;
131+
LEXER : 'lexer' ;
132+
PARSER : 'parser' ;
133+
GRAMMAR : 'grammar' ;
134+
PROTECTED : 'protected' ;
135+
PUBLIC : 'public' ;
136+
PRIVATE : 'private' ;
137+
RETURNS : 'returns' ;
138+
LOCALS : 'locals' ;
139+
THROWS : 'throws' ;
140+
CATCH : 'catch' ;
141+
FINALLY : 'finally' ;
142+
MODE : 'mode' ;
143+
144+
COLON : ':' ;
145+
COLONCOLON : '::' ;
146+
COMMA : ',' ;
147+
SEMI : ';' ;
148+
LPAREN : '(' ;
149+
RPAREN : ')' ;
150+
RARROW : '->' ;
151+
LT : '<' ;
152+
GT : '>' ;
153+
ASSIGN : '=' ;
154+
QUESTION : '?' ;
155+
STAR : '*' ;
156+
PLUS : '+' ;
157+
PLUS_ASSIGN : '+=' ;
158+
OR : '|' ;
159+
DOLLAR : '$' ;
160+
DOT : '.' ;
161+
RANGE : '..' ;
162+
AT : '@' ;
163+
POUND : '#' ;
164+
NOT : '~' ;
165+
RBRACE : '}' ;
166+
167+
/** Allow unicode rule/token names */
168+
//ID : NameStartChar NameChar*;
169+
170+
// ##################### to allow testing ANTLR grammars in intellij preview
171+
RULE_REF : [a-z][a-zA-Z_0-9]* ;
172+
TOKEN_REF : [A-Z][a-zA-Z_0-9]* ;
173+
174+
fragment
175+
NameChar
176+
: NameStartChar
177+
| '0'..'9'
178+
| '_'
179+
| '\u00B7'
180+
| '\u0300'..'\u036F'
181+
| '\u203F'..'\u2040'
182+
;
183+
184+
fragment
185+
NameStartChar
186+
: 'A'..'Z'
187+
| 'a'..'z'
188+
| '\u00C0'..'\u00D6'
189+
| '\u00D8'..'\u00F6'
190+
| '\u00F8'..'\u02FF'
191+
| '\u0370'..'\u037D'
192+
| '\u037F'..'\u1FFF'
193+
| '\u200C'..'\u200D'
194+
| '\u2070'..'\u218F'
195+
| '\u2C00'..'\u2FEF'
196+
| '\u3001'..'\uD7FF'
197+
| '\uF900'..'\uFDCF'
198+
| '\uFDF0'..'\uFFFD'
199+
; // ignores | ['\u10000-'\uEFFFF] ;
200+
201+
INT : [0-9]+
202+
;
203+
204+
// ANTLR makes no distinction between a single character literal and a
205+
// multi-character string. All literals are single quote delimited and
206+
// may contain unicode escape sequences of the form \uxxxx, where x
207+
// is a valid hexadecimal number (as per Java basically).
208+
STRING_LITERAL
209+
: '\'' (ESC_SEQ | ~['\r\n\\])* '\''
210+
;
211+
212+
UNTERMINATED_STRING_LITERAL
213+
: '\'' (ESC_SEQ | ~['\r\n\\])*
214+
;
215+
216+
// Any kind of escaped character that we can embed within ANTLR
217+
// literal strings.
218+
fragment
219+
ESC_SEQ
220+
: '\\'
221+
( // The standard escaped character set such as tab, newline, etc.
222+
[btnfr"'\\]
223+
| // A Java style Unicode escape sequence
224+
UNICODE_ESC
225+
| // Invalid escape
226+
.
227+
| // Invalid escape at end of file
228+
EOF
229+
)
230+
;
231+
232+
fragment
233+
UNICODE_ESC
234+
: 'u' (HEX_DIGIT (HEX_DIGIT (HEX_DIGIT HEX_DIGIT?)?)?)?
235+
;
236+
237+
fragment
238+
HEX_DIGIT : [0-9a-fA-F] ;
239+
240+
WS : [ \t\r\n\f]+ -> channel(HIDDEN) ;
241+
242+
// Many language targets use {} as block delimiters and so we
243+
// must recursively match {} delimited blocks to balance the
244+
// braces. Additionally, we must make some assumptions about
245+
// literal string representation in the target language. We assume
246+
// that they are delimited by ' or " and so consume these
247+
// in their own alts so as not to inadvertently match {}.
248+
249+
ACTION
250+
: '{'
251+
( ACTION
252+
| ACTION_ESCAPE
253+
| ACTION_STRING_LITERAL
254+
| ACTION_CHAR_LITERAL
255+
| '/*' .*? '*/' // ('*/' | EOF)
256+
| '//' ~[\r\n]*
257+
| .
258+
)*?
259+
('}'|EOF)
260+
;
261+
262+
fragment
263+
ACTION_ESCAPE
264+
: '\\' .
265+
;
266+
267+
fragment
268+
ACTION_STRING_LITERAL
269+
: '"' (ACTION_ESCAPE | ~["\\])* '"'
270+
;
271+
272+
fragment
273+
ACTION_CHAR_LITERAL
274+
: '\'' (ACTION_ESCAPE | ~['\\])* '\''
275+
;
276+
277+
// -----------------
278+
// Illegal Character
279+
//
280+
// This is an illegal character trap which is always the last rule in the
281+
// lexer specification. It matches a single character of any value and being
282+
// the last rule in the file will match when no other rule knows what to do
283+
// about the character. It is reported as an error but is not passed on to the
284+
// parser. This means that the parser gets to deal with the grammar file anyway
285+
// but we will not try to analyse or code generate from a file with lexical
286+
// errors.
287+
//
288+
ERRCHAR
289+
: . -> channel(HIDDEN)
290+
;
291+
292+
mode ArgAction; // E.g., [int x, List<String> a[]]
293+
294+
NESTED_ARG_ACTION
295+
: '[' -> more, pushMode(ArgAction)
296+
;
297+
298+
ARG_ACTION_ESCAPE
299+
: '\\' . -> more
300+
;
301+
302+
ARG_ACTION_STRING_LITERAL
303+
: ('"' ('\\' . | ~["\\])* '"')-> more
304+
;
305+
306+
// A target-language character literal inside an arg action, e.g. 'x' or '\n'.
// It is delimited by single quotes and consumed as one unit so that a ']'
// between the quotes does not terminate the arg action. The shape mirrors
// the ACTION_CHAR_LITERAL fragment used for {...} actions.
ARG_ACTION_CHAR_LITERAL
    : ('\'' ('\\' . | ~['\\])* '\'') -> more
    ;
309+
310+
ARG_ACTION
311+
: ']' -> popMode
312+
;
313+
314+
UNTERMINATED_ARG_ACTION // added this to return non-EOF token type here. EOF did something weird
315+
: EOF -> popMode
316+
;
317+
318+
ARG_ACTION_CHAR // must be last
319+
: . -> more
320+
;
321+
322+
323+
mode LexerCharSet;
324+
325+
LEXER_CHAR_SET_BODY
326+
: ( ~[\]\\]
327+
| '\\' .
328+
)
329+
-> more
330+
;
331+
332+
LEXER_CHAR_SET
333+
: ']' -> popMode
334+
;
335+
336+
UNTERMINATED_CHAR_SET
337+
: EOF -> popMode
338+
;
339+

0 commit comments

Comments
 (0)