Skip to content

Commit 4e6aad3

Browse files
committed
Merge pull request #16 from antlr/alter-features
Major improvement; combined align/indent. randomized exemplars. drop k to 11 max
2 parents 25ef229 + 9f121c9 commit 4e6aad3

13 files changed

Lines changed: 423 additions & 3017 deletions

File tree

java/grammars/org/antlr/codebuff/Java.g4

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1016,9 +1016,9 @@ WS : [ \t\r\n\u000C]+ -> channel(HIDDEN) // CodeBuff needs to see all whitespa
10161016
;
10171017

10181018
COMMENT
1019-
: '/*' .*? '*/' -> skip
1019+
: '/*' .*? '*/' -> channel(HIDDEN)
10201020
;
10211021

10221022
LINE_COMMENT
1023-
: '//' ~[\r\n]* -> skip
1023+
: '//' ~[\r\n]* -> channel(HIDDEN)
10241024
;

java/src/org/antlr/codebuff/CollectFeatures.java

Lines changed: 204 additions & 78 deletions
Large diffs are not rendered by default.

java/src/org/antlr/codebuff/Corpus.java

Lines changed: 46 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,19 @@
66
import java.util.HashMap;
77
import java.util.List;
88
import java.util.Map;
9+
import java.util.Random;
910

1011
public class Corpus {
11-
public static final int NUM_DEPENDENT_VARS = 4;
12+
public static final int NUM_DEPENDENT_VARS = 3;
1213
public static final int INDEX_FEATURE_NEWLINES = 0;
1314
public static final int INDEX_FEATURE_WS = 1;
14-
public static final int INDEX_FEATURE_INDENT = 2;
15-
public static final int INDEX_FEATURE_ALIGN_WITH_PREVIOUS = 3;
15+
public static final int INDEX_FEATURE_ALIGN_WITH_PREVIOUS = 2;
1616

1717
List<InputDocument> documents; // an entry for each X
1818
List<int[]> X;
1919
List<Integer> injectNewlines;
20+
List<Integer> align; // steps to common ancestor whose first token is alignment anchor
2021
List<Integer> injectWS;
21-
List<Integer> indent;
22-
List<Integer> alignWithPrevious; // steps to common ancestor whose first token is alignment anchor
2322

2423
/** an index to narrow down the number of vectors we compute distance() on each classification.
2524
* The key is (previous token's rule index, current token's rule index). It yields
@@ -30,16 +29,53 @@ public class Corpus {
3029
public Corpus(List<InputDocument> documents,
3130
List<int[]> X,
3231
List<Integer> injectNewlines,
33-
List<Integer> injectWS,
34-
List<Integer> indent,
35-
List<Integer> alignWithPrevious)
32+
List<Integer> align,
33+
List<Integer> injectWS)
3634
{
3735
this.documents = documents;
3836
this.X = X;
3937
this.injectNewlines = injectNewlines;
4038
this.injectWS = injectWS;
41-
this.indent = indent;
42-
this.alignWithPrevious = alignWithPrevious;
39+
this.align = align;
40+
}
41+
42+
/** Feature vectors in X are lumped together as they are read in each
43+
* document. In kNN, this tends to find features from the same document
44+
* rather than from across the corpus since we grab k neighbors.
45+
* For k=11, we might only see exemplars from a single corpus document.
46+
* If all exemplars fit in k, this wouldn't be an issue.
47+
*
48+
* Fisher-Yates / Knuth shuffling
49+
* "To shuffle an array a of n elements (indices 0..n-1)":
50+
* https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle
51+
*/
52+
public void randomShuffleInPlace() {
53+
Random r = new Random();
54+
// for i from n−1 downto 1 do
55+
int n = X.size();
56+
for (int i=n-1; i>=1; i--) {
57+
// j ← random integer such that 0 ≤ j ≤ i
58+
int j = r.nextInt(i+1);
59+
// exchange a[j] and a[i]
60+
// Swap X
61+
int[] tmp = X.get(i);
62+
X.set(i, X.get(j));
63+
X.set(j, tmp);
64+
// And now swap all prediction lists
65+
Integer tmpI = injectNewlines.get(i);
66+
injectNewlines.set(i, injectNewlines.get(j));
67+
injectNewlines.set(j, tmpI);
68+
tmpI = align.get(i);
69+
align.set(i, align.get(j));
70+
align.set(j, tmpI);
71+
tmpI = injectWS.get(i);
72+
injectWS.set(i, injectWS.get(j));
73+
injectWS.set(j, tmpI);
74+
// Finally, swap documents
75+
InputDocument tmpD = documents.get(i);
76+
documents.set(i, documents.get(j));
77+
documents.set(j, tmpD);
78+
}
4379
}
4480

4581
public void buildTokenContextIndex() {

java/src/org/antlr/codebuff/FeatureType.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
public enum FeatureType {
44
TOKEN(12), RULE(14), INT(7), BOOL(5), COL(7),
5-
INFO_FILE(4), INFO_LINE(4), INFO_CHARPOS(4),
5+
INFO_FILE(15), INFO_LINE(4), INFO_CHARPOS(4),
66
UNUSED(0);
77
public int displayWidth;
88

java/src/org/antlr/codebuff/Formatter.java

Lines changed: 151 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,35 @@
44
import org.antlr.v4.runtime.CommonTokenStream;
55
import org.antlr.v4.runtime.ParserRuleContext;
66
import org.antlr.v4.runtime.Token;
7+
import org.antlr.v4.runtime.WritableToken;
78
import org.antlr.v4.runtime.misc.Interval;
89
import org.antlr.v4.runtime.tree.TerminalNode;
910

1011
import java.util.List;
1112
import java.util.Map;
1213
import java.util.Vector;
1314

15+
import static org.antlr.codebuff.CollectFeatures.CAT_ALIGN_WITH_ANCESTORS_PARENT_FIRST_TOKEN;
16+
import static org.antlr.codebuff.CollectFeatures.CAT_ALIGN_WITH_ANCESTOR_FIRST_TOKEN;
17+
import static org.antlr.codebuff.CollectFeatures.CAT_ALIGN_WITH_LIST_FIRST_ELEMENT;
18+
import static org.antlr.codebuff.CollectFeatures.CAT_ALIGN_WITH_PAIR;
19+
import static org.antlr.codebuff.CollectFeatures.CAT_INDENT;
20+
import static org.antlr.codebuff.CollectFeatures.CAT_NO_ALIGNMENT;
21+
import static org.antlr.codebuff.CollectFeatures.FEATURES_ALIGN;
22+
import static org.antlr.codebuff.CollectFeatures.FEATURES_INJECT_NL;
23+
import static org.antlr.codebuff.CollectFeatures.FEATURES_INJECT_WS;
24+
import static org.antlr.codebuff.CollectFeatures.INDEX_FIRST_ON_LINE;
25+
import static org.antlr.codebuff.CollectFeatures.INDEX_PREV_END_COLUMN;
26+
import static org.antlr.codebuff.CollectFeatures.MAX_CONTEXT_DIFF_THRESHOLD;
27+
import static org.antlr.codebuff.CollectFeatures.earliestAncestorEndingWithToken;
28+
import static org.antlr.codebuff.CollectFeatures.getListSiblings;
29+
import static org.antlr.codebuff.CollectFeatures.getMatchingLeftSymbol;
30+
import static org.antlr.codebuff.CollectFeatures.getNodeFeatures;
31+
import static org.antlr.codebuff.CollectFeatures.getRealTokens;
32+
import static org.antlr.codebuff.CollectFeatures.getTokensOnPreviousLine;
33+
import static org.antlr.codebuff.CollectFeatures.indexTree;
34+
import static org.antlr.codebuff.CollectFeatures.isAlignedWithFirstSiblingOfList;
35+
1436
public class Formatter {
1537
protected final Corpus corpus;
1638
protected StringBuilder output = new StringBuilder();
@@ -26,13 +48,11 @@ public class Formatter {
2648

2749
protected CodekNNClassifier newlineClassifier;
2850
protected CodekNNClassifier wsClassifier;
29-
protected CodekNNClassifier indentClassifier;
3051
protected CodekNNClassifier alignClassifier;
3152
protected int k;
3253

3354
protected int line = 1;
3455
protected int charPosInLine = 0;
35-
protected int currentIndent = 0;
3656

3757
protected int tabSize;
3858

@@ -47,11 +67,11 @@ public Formatter(Corpus corpus, InputDocument doc, int tabSize) {
4767
this.tokens = doc.tokens;
4868
this.originalTokens = Tool.copy(tokens);
4969
Tool.wipeLineAndPositionInfo(tokens);
50-
newlineClassifier = new CodekNNClassifier(corpus, CollectFeatures.FEATURES_INJECT_NL);
51-
wsClassifier = new CodekNNClassifier(corpus, CollectFeatures.FEATURES_INJECT_WS);
52-
indentClassifier = new CodekNNClassifier(corpus, CollectFeatures.FEATURES_INDENT);
53-
alignClassifier = new CodekNNClassifier(corpus, CollectFeatures.FEATURES_ALIGN);
54-
k = (int)Math.sqrt(corpus.X.size());
70+
newlineClassifier = new CodekNNClassifier(corpus, FEATURES_INJECT_NL);
71+
wsClassifier = new CodekNNClassifier(corpus, FEATURES_INJECT_WS);
72+
alignClassifier = new CodekNNClassifier(corpus, FEATURES_ALIGN);
73+
// k = (int)Math.sqrt(corpus.X.size());
74+
k = 11;
5575
this.tabSize = tabSize;
5676
}
5777

@@ -66,15 +86,23 @@ public List<TokenPositionAnalysis> getAnalysisPerToken() {
6686

6787
public String format() {
6888
if ( tokenToNodeMap == null ) {
69-
tokenToNodeMap = CollectFeatures.indexTree(root);
89+
tokenToNodeMap = indexTree(root);
7090
}
7191

7292
tokens.seek(0);
73-
Token secondToken = tokens.LT(2);
93+
WritableToken firstToken = (WritableToken)tokens.LT(1);
94+
WritableToken secondToken = (WritableToken)tokens.LT(2);
95+
// all tokens are wiped of line/col info so set them for first 2
96+
firstToken.setLine(1);
97+
firstToken.setCharPositionInLine(0);
98+
secondToken.setLine(1);
99+
secondToken.setCharPositionInLine(firstToken.getText().length());
100+
74101
String prefix = tokens.getText(Interval.of(0, secondToken.getTokenIndex()));
75102
output.append(prefix);
76103

77-
realTokens = CollectFeatures.getRealTokens(tokens);
104+
105+
realTokens = getRealTokens(tokens);
78106
for (int i = 2; i<realTokens.size(); i++) { // can't process first 2 tokens
79107
int tokenIndexInStream = realTokens.get(i).getTokenIndex();
80108
processToken(i, tokenIndexInStream);
@@ -86,18 +114,23 @@ public void processToken(int indexIntoRealTokens, int tokenIndexInStream) {
86114
CommonToken curToken = (CommonToken)tokens.get(tokenIndexInStream);
87115
String tokText = curToken.getText();
88116

89-
int[] features = CollectFeatures.getNodeFeatures(tokenToNodeMap, doc, tokenIndexInStream, line, tabSize);
117+
int[] features = getNodeFeatures(tokenToNodeMap, doc, tokenIndexInStream, line, tabSize);
90118
// must set "prev end column" value as token stream doesn't have it;
91119
// we're tracking it as we emit tokens
92-
features[CollectFeatures.INDEX_PREV_END_COLUMN] = charPosInLine;
120+
features[INDEX_PREV_END_COLUMN] = charPosInLine;
93121

94-
int injectNewline = newlineClassifier.classify(k, features, corpus.injectNewlines, CollectFeatures.MAX_CONTEXT_DIFF_THRESHOLD);
95-
int alignWithPrevious = alignClassifier.classify(k, features, corpus.alignWithPrevious, CollectFeatures.MAX_CONTEXT_DIFF_THRESHOLD);
96-
int indent = indentClassifier.classify(k, features, corpus.indent, CollectFeatures.MAX_CONTEXT_DIFF_THRESHOLD);
97-
int ws = wsClassifier.classify(k, features, corpus.injectWS, CollectFeatures.MAX_CONTEXT_DIFF_THRESHOLD);
122+
int injectNewline = newlineClassifier.classify(k, features, corpus.injectNewlines, MAX_CONTEXT_DIFF_THRESHOLD);
123+
124+
// getNodeFeatures() also doesn't know what line curToken is on. If \n, we need to find exemplars that start a line
125+
features[INDEX_FIRST_ON_LINE] = injectNewline; // use \n prediction to match exemplars for alignment
126+
127+
int align = alignClassifier.classify(k, features, corpus.align, MAX_CONTEXT_DIFF_THRESHOLD);
128+
int indent = 0;
129+
//indentClassifier.classify(k, features, corpus.indent, CollectFeatures.MAX_CONTEXT_DIFF_THRESHOLD);
130+
int ws = wsClassifier.classify(k, features, corpus.injectWS, MAX_CONTEXT_DIFF_THRESHOLD);
98131

99132
TokenPositionAnalysis tokenPositionAnalysis =
100-
getTokenAnalysis(features, indexIntoRealTokens, tokenIndexInStream, injectNewline, alignWithPrevious, indent, ws);
133+
getTokenAnalysis(features, indexIntoRealTokens, tokenIndexInStream, injectNewline, align, indent, ws);
101134
analysis.setSize(tokenIndexInStream+1);
102135
analysis.set(tokenIndexInStream, tokenPositionAnalysis);
103136

@@ -108,36 +141,106 @@ public void processToken(int indexIntoRealTokens, int tokenIndexInStream) {
108141
if ( injectNewline>0 ) {
109142
output.append(Tool.newlines(injectNewline));
110143
line++;
111-
TerminalNode node = tokenToNodeMap.get(tokens.get(tokenIndexInStream));
112-
ParserRuleContext parent = (ParserRuleContext)node.getParent();
113-
int myIndex = 0;
114-
ParserRuleContext earliestAncestor = CollectFeatures.earliestAncestorStartingAtToken(parent, curToken);
115-
if ( earliestAncestor!=null ) {
116-
ParserRuleContext commonAncestor = earliestAncestor.getParent();
117-
List<ParserRuleContext> siblings = commonAncestor.getRuleContexts(earliestAncestor.getClass());
118-
myIndex = siblings.indexOf(earliestAncestor);
119-
}
120-
if ( myIndex>0 && alignWithPrevious>0 ) { // align with first sibling's start token
121-
ParserRuleContext commonAncestor = earliestAncestor.getParent();
122-
List<ParserRuleContext> siblings = commonAncestor.getRuleContexts(earliestAncestor.getClass());
123-
ParserRuleContext firstSibling = siblings.get(0);
124-
Token firstSiblingStartToken = firstSibling.getStart();
125-
// align but don't update currentIndent
126-
charPosInLine = firstSiblingStartToken.getCharPositionInLine();
127-
output.append(Tool.spaces(charPosInLine));
144+
charPosInLine = 0;
145+
146+
List<Token> tokensOnPreviousLine = getTokensOnPreviousLine(tokens, tokenIndexInStream, line);
147+
Token firstTokenOnPrevLine = null;
148+
if ( tokensOnPreviousLine.size()>0 ) {
149+
firstTokenOnPrevLine = tokensOnPreviousLine.get(0);
128150
}
129-
else {
130-
currentIndent += indent;
131-
if ( currentIndent<0 ) currentIndent = 0; // don't allow bad indents to accumulate
132-
charPosInLine = currentIndent;
133-
output.append(Tool.spaces(currentIndent));
151+
152+
TerminalNode node = tokenToNodeMap.get(curToken);
153+
ParserRuleContext parent = (ParserRuleContext)node.getParent();
154+
ParserRuleContext earliestRightAncestor = earliestAncestorEndingWithToken(parent, curToken);
155+
156+
switch ( align ) {
157+
case CAT_INDENT :
158+
if ( firstTokenOnPrevLine!=null ) { // if not on first line, we can indent
159+
int indentedCol = firstTokenOnPrevLine.getCharPositionInLine() + 4;
160+
charPosInLine = indentedCol;
161+
output.append(Tool.spaces(indentedCol));
162+
}
163+
break;
164+
case CAT_ALIGN_WITH_ANCESTOR_FIRST_TOKEN :
165+
if ( earliestRightAncestor!=null ) {
166+
Token earliestRightAncestorStart = earliestRightAncestor.getStart();
167+
int linedUpCol = earliestRightAncestorStart.getCharPositionInLine();
168+
charPosInLine = linedUpCol;
169+
output.append(Tool.spaces(linedUpCol));
170+
}
171+
break;
172+
case CAT_ALIGN_WITH_ANCESTORS_PARENT_FIRST_TOKEN :
173+
if ( earliestRightAncestor!=null ) {
174+
ParserRuleContext earliestAncestorParent = earliestRightAncestor.getParent();
175+
if ( earliestAncestorParent!=null ) {
176+
Token earliestAncestorParentStart = earliestAncestorParent.getStart();
177+
int linedUpCol = earliestAncestorParentStart.getCharPositionInLine();
178+
charPosInLine = linedUpCol;
179+
output.append(Tool.spaces(linedUpCol));
180+
}
181+
}
182+
break;
183+
case CAT_ALIGN_WITH_LIST_FIRST_ELEMENT :
184+
List<ParserRuleContext> listSiblings = getListSiblings(tokenToNodeMap, curToken);
185+
if ( listSiblings!=null ) {
186+
ParserRuleContext firstSibling = listSiblings.get(0);
187+
int linedUpCol = firstSibling.getStart().getCharPositionInLine();
188+
charPosInLine = linedUpCol;
189+
output.append(Tool.spaces(linedUpCol));
190+
}
191+
break;
192+
case CAT_ALIGN_WITH_PAIR :
193+
TerminalNode matchingLeftSymbol = getMatchingLeftSymbol(doc, node);
194+
int linedUpCol = matchingLeftSymbol.getSymbol().getCharPositionInLine();
195+
charPosInLine = linedUpCol;
196+
output.append(Tool.spaces(linedUpCol));
197+
break;
198+
case CAT_NO_ALIGNMENT :
199+
break;
134200
}
201+
// if ( currentIndent<0 ) currentIndent = 0; // don't allow bad indents to accumulate
202+
// charPosInLine = currentIndent;
203+
// output.append(Tool.spaces(currentIndent));
135204
}
136205
else {
137206
// inject whitespace instead of \n?
138207
output.append(Tool.spaces(ws));
139208
charPosInLine += ws;
140209
}
210+
211+
// if ( injectNewline>0 ) {
212+
// output.append(Tool.newlines(injectNewline));
213+
// line++;
214+
// TerminalNode node = tokenToNodeMap.get(tokens.get(tokenIndexInStream));
215+
// ParserRuleContext parent = (ParserRuleContext)node.getParent();
216+
// int myIndex = 0;
217+
// ParserRuleContext earliestAncestor = CollectFeatures.earliestAncestorStartingWithToken(parent, curToken);
218+
// if ( earliestAncestor!=null ) {
219+
// ParserRuleContext commonAncestor = earliestAncestor.getParent();
220+
// List<ParserRuleContext> siblings = commonAncestor.getRuleContexts(earliestAncestor.getClass());
221+
// myIndex = siblings.indexOf(earliestAncestor);
222+
// }
223+
// if ( false ) { //if ( myIndex>0 && align>0 ) { // align with first sibling's start token
224+
// ParserRuleContext commonAncestor = earliestAncestor.getParent();
225+
// List<ParserRuleContext> siblings = commonAncestor.getRuleContexts(earliestAncestor.getClass());
226+
// ParserRuleContext firstSibling = siblings.get(0);
227+
// Token firstSiblingStartToken = firstSibling.getStart();
228+
// // align but don't update currentIndent
229+
// charPosInLine = firstSiblingStartToken.getCharPositionInLine();
230+
// output.append(Tool.spaces(charPosInLine));
231+
// }
232+
// else {
233+
// currentIndent += indent;
234+
// if ( currentIndent<0 ) currentIndent = 0; // don't allow bad indents to accumulate
235+
// charPosInLine = currentIndent;
236+
// output.append(Tool.spaces(currentIndent));
237+
// }
238+
// }
239+
// else {
240+
// // inject whitespace instead of \n?
241+
// output.append(Tool.spaces(ws));
242+
// charPosInLine += ws;
243+
// }
141244
// update Token object with position information now that we are about
142245
// to emit it.
143246
curToken.setLine(line);
@@ -171,8 +274,8 @@ public TokenPositionAnalysis getTokenAnalysis(int[] features, int indexIntoRealT
171274
boolean prevIsWS = prevToken.getType()==JavaLexer.WS;
172275
int actualNL = Tool.count(prevToken.getText(), '\n');
173276
int actualWS = Tool.count(prevToken.getText(), ' ');
174-
int actualIndent = originalCurToken.getCharPositionInLine()-currentIndent;
175-
boolean actualAlign = CollectFeatures.isAlignedWithFirstSibling(tokenToNodeMap, tokens, curToken);
277+
int actualIndent = originalCurToken.getCharPositionInLine()-0;// currentIndent;
278+
boolean actualAlign = isAlignedWithFirstSiblingOfList(tokenToNodeMap, tokens, curToken);
176279
String newlinePredictionString = String.format("### line %d: predicted %d \\n actual %s",
177280
originalCurToken.getLine(), injectNewline, prevIsWS ? actualNL : "none");
178281
String alignPredictionString = String.format("### line %d: predicted %s actual %s",
@@ -189,18 +292,15 @@ public TokenPositionAnalysis getTokenAnalysis(int[] features, int indexIntoRealT
189292

190293

191294
String newlineAnalysis = newlinePredictionString+"\n"+
192-
newlineClassifier.getPredictionAnalysis(k, features, corpus.injectNewlines,
193-
CollectFeatures.MAX_CONTEXT_DIFF_THRESHOLD);
295+
newlineClassifier.getPredictionAnalysis(doc, k, features, corpus.injectNewlines,
296+
MAX_CONTEXT_DIFF_THRESHOLD);
194297
String alignAnalysis =alignPredictionString+"\n"+
195-
alignClassifier.getPredictionAnalysis(k, features, corpus.alignWithPrevious,
196-
CollectFeatures.MAX_CONTEXT_DIFF_THRESHOLD);
197-
String indentAnalysis =indentPredictionString+"\n"+
198-
indentClassifier.getPredictionAnalysis(k, features, corpus.indent,
199-
CollectFeatures.MAX_CONTEXT_DIFF_THRESHOLD);
298+
alignClassifier.getPredictionAnalysis(doc, k, features, corpus.align,
299+
MAX_CONTEXT_DIFF_THRESHOLD);
200300
String wsAnalysis =wsPredictionString+"\n"+
201-
wsClassifier.getPredictionAnalysis(k, features, corpus.injectWS,
202-
CollectFeatures.MAX_CONTEXT_DIFF_THRESHOLD);
203-
return new TokenPositionAnalysis(newlineAnalysis, alignAnalysis, indentAnalysis, wsAnalysis);
301+
wsClassifier.getPredictionAnalysis(doc, k, features, corpus.injectWS,
302+
MAX_CONTEXT_DIFF_THRESHOLD);
303+
return new TokenPositionAnalysis(newlineAnalysis, alignAnalysis, wsAnalysis);
204304
}
205305

206306
/** Do not join two words like "finaldouble" or numbers like "3double",

0 commit comments

Comments
 (0)