Skip to content

Commit 8036ba1

Browse files
committed
Merge pull request #20 from antlr/combine-newline-ws
combine ws and nl into single prediction.
2 parents 80955d2 + 9c70e8d commit 8036ba1

File tree

10 files changed

+76
-134
lines changed

10 files changed

+76
-134
lines changed

java/src/org/antlr/codebuff/CollectFeatures.java

Lines changed: 39 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,11 @@ public class CollectFeatures {
3232
public static final int PAIR_ON_SAME_LINE = 0;
3333
public static final int PAIR_ON_DIFF_LINE = 1;
3434

35+
// Categories for newline, whitespace. Encoded as CAT_INJECT_NL|(n<<8) or CAT_INJECT_WS|(n<<8); see nlcat()/wscat()
36+
public static final int CAT_NO_WS = 0;
37+
public static final int CAT_INJECT_NL = 100;
38+
public static final int CAT_INJECT_WS = 200;
39+
3540
// Categories for alignment/indentation
3641
public static final int CAT_NO_ALIGNMENT = 0;
3742

@@ -89,7 +94,7 @@ public class CollectFeatures {
8994

9095
public static final int NUM_FEATURES = 23;
9196

92-
public static FeatureMetaData[] FEATURES_INJECT_NL = {
97+
public static FeatureMetaData[] FEATURES_INJECT_WS = { // inject ws or nl
9398
new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-2)"}, 1),
9499
new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-1)"}, 2),
95100
new FeatureMetaData(FeatureType.RULE, new String[] {"LT(-1)", "rule"}, 2),
@@ -141,32 +146,6 @@ public class CollectFeatures {
141146
new FeatureMetaData(FeatureType.INFO_CHARPOS, new String[] {"char", "pos"}, 0)
142147
};
143148

144-
public static FeatureMetaData[] FEATURES_INJECT_WS = {
145-
new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-2)"}, 1),
146-
new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-1)"}, 2),
147-
new FeatureMetaData(FeatureType.RULE, new String[] {"LT(-1)", "rule"}, 2),
148-
FeatureMetaData.UNUSED,
149-
new FeatureMetaData(FeatureType.RULE, new String[] {"LT(-1)", "right ancestor"}, 3),
150-
new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(1)"}, 3),
151-
FeatureMetaData.UNUSED,
152-
new FeatureMetaData(FeatureType.BOOL, new String[]{"Strt", "line"}, 3),
153-
new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "rule"}, 2),
154-
new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "right ancestor"}, 3),
155-
new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "left ancestor"}, 3),
156-
new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^5"}, 1),
157-
new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^4"}, 1),
158-
new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^3"}, 1),
159-
new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent^3 wid"}, 1),
160-
new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^2"}, 1),
161-
new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent^2 wid"}, 1),
162-
new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent"}, 1),
163-
new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent wid"}, 1),
164-
new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(2)"}, 1),
165-
new FeatureMetaData(FeatureType.INFO_FILE, new String[] {"", "file"}, 0),
166-
new FeatureMetaData(FeatureType.INFO_LINE, new String[] {"", "line"}, 0),
167-
new FeatureMetaData(FeatureType.INFO_CHARPOS, new String[] {"char", "pos"}, 0)
168-
};
169-
170149
public static FeatureMetaData[] FEATURES_ALL = {
171150
new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-2)"}, 1),
172151
new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-1)"}, 2),
@@ -201,9 +180,7 @@ public class CollectFeatures {
201180
protected ParserRuleContext root;
202181
protected CommonTokenStream tokens; // track stream so we can examine previous tokens
203182
protected List<int[]> features = new ArrayList<>();
204-
protected List<Integer> injectNewlines = new ArrayList<>();
205-
protected List<Integer> injectWS = new ArrayList<>();
206-
protected List<Integer> indent = new ArrayList<>();
183+
protected List<Integer> injectWhitespace = new ArrayList<>();
207184
protected List<Integer> align = new ArrayList<>();
208185

209186
protected int currentIndent = 0;
@@ -246,7 +223,20 @@ public void computeFeatureVectorForToken(int i) {
246223

247224
int precedingNL = getPrecedingNL(tokens, i); // how many lines to inject
248225

249-
this.injectNewlines.add(precedingNL);
226+
int ws = 0;
227+
if ( precedingNL==0 ) {
228+
ws = curToken.getCharPositionInLine() -
229+
(prevToken.getCharPositionInLine()+prevToken.getText().length());
230+
}
231+
232+
int injectNL_WS = CAT_NO_WS;
233+
if ( precedingNL>0 ) {
234+
injectNL_WS = nlcat(precedingNL);
235+
}
236+
else if ( ws>0 ) {
237+
injectNL_WS = wscat(ws);
238+
}
239+
this.injectWhitespace.add(injectNL_WS);
250240

251241
int columnDelta = 0;
252242
if ( precedingNL>0 ) { // && aligned!=1 ) {
@@ -259,14 +249,6 @@ public void computeFeatureVectorForToken(int i) {
259249
aligned = getAlignmentCategory(node, curToken, columnDelta);
260250
}
261251

262-
int ws = 0;
263-
if ( precedingNL==0 ) {
264-
ws = curToken.getCharPositionInLine() -
265-
(prevToken.getCharPositionInLine()+prevToken.getText().length());
266-
}
267-
268-
this.injectWS.add(ws); // likely negative if precedingNL
269-
270252
this.align.add(aligned);
271253

272254
this.features.add(features);
@@ -631,12 +613,8 @@ public List<int[]> getFeatures() {
631613
return features;
632614
}
633615

634-
public List<Integer> getInjectNewlines() {
635-
return injectNewlines;
636-
}
637-
638-
public List<Integer> getInjectWS() {
639-
return injectWS;
616+
public List<Integer> getInjectWhitespace() {
617+
return injectWhitespace;
640618
}
641619

642620
public List<Integer> getAlign() {
@@ -843,4 +821,20 @@ public static int[] unaligncat(int v) {
843821
int child = (v>>16)&0xFFFF;
844822
return new int[] { deltaFromLeftAncestor, child };
845823
}
824+
825+
public static int wscat(int n) {
826+
return CAT_INJECT_WS | (n<<8);
827+
}
828+
829+
public static int nlcat(int n) {
830+
return CAT_INJECT_NL | (n<<8);
831+
}
832+
833+
public static int unwscat(int v) {
834+
return v >> 8 & 0xFFFF;
835+
}
836+
837+
public static int unnlcat(int v) {
838+
return v >> 8 & 0xFFFF;
839+
}
846840
}

java/src/org/antlr/codebuff/Corpus.java

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,8 @@ public class Corpus {
1818

1919
List<InputDocument> documents; // an entry for each X
2020
List<int[]> X;
21-
List<Integer> injectNewlines;
21+
List<Integer> injectWhitespace;
2222
List<Integer> align; // steps to common ancestor whose first token is alignment anchor
23-
List<Integer> injectWS;
2423

2524
/** an index to narrow down the number of vectors we compute distance() on each classification.
2625
* The key is (previous token's rule index, current token's rule index). It yields
@@ -30,14 +29,12 @@ public class Corpus {
3029

3130
public Corpus(List<InputDocument> documents,
3231
List<int[]> X,
33-
List<Integer> injectNewlines,
34-
List<Integer> align,
35-
List<Integer> injectWS)
32+
List<Integer> injectWhitespace,
33+
List<Integer> align)
3634
{
3735
this.documents = documents;
3836
this.X = X;
39-
this.injectNewlines = injectNewlines;
40-
this.injectWS = injectWS;
37+
this.injectWhitespace = injectWhitespace;
4138
this.align = align;
4239
}
4340

@@ -65,15 +62,12 @@ public void randomShuffleInPlace() {
6562
X.set(i, X.get(j));
6663
X.set(j, tmp);
6764
// And now swap all prediction lists
68-
Integer tmpI = injectNewlines.get(i);
69-
injectNewlines.set(i, injectNewlines.get(j));
70-
injectNewlines.set(j, tmpI);
65+
Integer tmpI = injectWhitespace.get(i);
66+
injectWhitespace.set(i, injectWhitespace.get(j));
67+
injectWhitespace.set(j, tmpI);
7168
tmpI = align.get(i);
7269
align.set(i, align.get(j));
7370
align.set(j, tmpI);
74-
tmpI = injectWS.get(i);
75-
injectWS.set(i, injectWS.get(j));
76-
injectWS.set(j, tmpI);
7771
// Finally, swap documents
7872
InputDocument tmpD = documents.get(i);
7973
documents.set(i, documents.get(j));

java/src/org/antlr/codebuff/Formatter.java

Lines changed: 21 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,9 @@
1717
import static org.antlr.codebuff.CollectFeatures.CAT_ALIGN_WITH_ANCESTOR_CHILD;
1818
import static org.antlr.codebuff.CollectFeatures.CAT_INDENT;
1919
import static org.antlr.codebuff.CollectFeatures.CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN;
20+
import static org.antlr.codebuff.CollectFeatures.CAT_INJECT_NL;
21+
import static org.antlr.codebuff.CollectFeatures.CAT_INJECT_WS;
2022
import static org.antlr.codebuff.CollectFeatures.FEATURES_ALIGN;
21-
import static org.antlr.codebuff.CollectFeatures.FEATURES_INJECT_NL;
2223
import static org.antlr.codebuff.CollectFeatures.FEATURES_INJECT_WS;
2324
import static org.antlr.codebuff.CollectFeatures.INDEX_FIRST_ON_LINE;
2425
import static org.antlr.codebuff.CollectFeatures.INDEX_PREV_END_COLUMN;
@@ -44,7 +45,7 @@ public class Formatter {
4445

4546
protected Vector<TokenPositionAnalysis> analysis = new Vector<>();
4647

47-
protected CodekNNClassifier newlineClassifier;
48+
protected CodekNNClassifier nlwsClassifier;
4849
protected CodekNNClassifier wsClassifier;
4950
protected CodekNNClassifier alignClassifier;
5051
protected int k;
@@ -65,8 +66,7 @@ public Formatter(Corpus corpus, InputDocument doc, int tabSize) {
6566
this.tokens = doc.tokens;
6667
this.originalTokens = Tool.copy(tokens);
6768
Tool.wipeLineAndPositionInfo(tokens);
68-
newlineClassifier = new CodekNNClassifier(corpus, FEATURES_INJECT_NL);
69-
wsClassifier = new CodekNNClassifier(corpus, FEATURES_INJECT_WS);
69+
nlwsClassifier = new CodekNNClassifier(corpus, FEATURES_INJECT_WS);
7070
alignClassifier = new CodekNNClassifier(corpus, FEATURES_ALIGN);
7171
// k = (int)Math.sqrt(corpus.X.size());
7272
// k = 7;
@@ -120,26 +120,32 @@ public void processToken(int indexIntoRealTokens, int tokenIndexInStream) {
120120
// we're tracking it as we emit tokens
121121
features[INDEX_PREV_END_COLUMN] = charPosInLine;
122122

123-
int injectNewline = newlineClassifier.classify(k, features, corpus.injectNewlines, MAX_CONTEXT_DIFF_THRESHOLD);
123+
int injectNL_WS = nlwsClassifier.classify(k, features, corpus.injectWhitespace, MAX_CONTEXT_DIFF_THRESHOLD);
124+
int newlines = 0;
125+
int ws = 0;
126+
if ( (injectNL_WS&0xFF)==CAT_INJECT_NL ) {
127+
newlines = CollectFeatures.unnlcat(injectNL_WS);
128+
}
129+
else if ( (injectNL_WS&0xFF)==CAT_INJECT_WS ) {
130+
ws = CollectFeatures.unwscat(injectNL_WS);
131+
}
124132

125133
// getNodeFeatures() also doesn't know what line curToken is on. If \n, we need to find exemplars that start a line
126-
features[INDEX_FIRST_ON_LINE] = injectNewline; // use \n prediction to match exemplars for alignment
134+
features[INDEX_FIRST_ON_LINE] = newlines; // use \n prediction to match exemplars for alignment
127135

128136
int align = alignClassifier.classify(k, features, corpus.align, MAX_CONTEXT_DIFF_THRESHOLD);
129137

130-
int ws = wsClassifier.classify(k, features, corpus.injectWS, MAX_CONTEXT_DIFF_THRESHOLD);
131-
132138
TokenPositionAnalysis tokenPositionAnalysis =
133-
getTokenAnalysis(features, indexIntoRealTokens, tokenIndexInStream, injectNewline, align, ws);
139+
getTokenAnalysis(features, indexIntoRealTokens, tokenIndexInStream, newlines, align, ws);
134140
analysis.setSize(tokenIndexInStream+1);
135141
analysis.set(tokenIndexInStream, tokenPositionAnalysis);
136142

137143
if ( ws==0 && cannotJoin(realTokens.get(indexIntoRealTokens-1), curToken) ) { // failsafe!
138144
ws = 1;
139145
}
140146

141-
if ( injectNewline>0 ) {
142-
output.append(Tool.newlines(injectNewline));
147+
if ( newlines>0 ) {
148+
output.append(Tool.newlines(newlines));
143149
line++;
144150
charPosInLine = 0;
145151

@@ -153,7 +159,7 @@ public void processToken(int indexIntoRealTokens, int tokenIndexInStream) {
153159
ParserRuleContext parent = (ParserRuleContext)node.getParent();
154160

155161
if ( align==CAT_INDENT ) {
156-
if ( firstTokenOnPrevLine!=null ) { // if not on first line, we can indent indent
162+
if ( firstTokenOnPrevLine!=null ) { // if not on first line, we can indent
157163
int indentedCol = firstTokenOnPrevLine.getCharPositionInLine()+INDENT_LEVEL;
158164
charPosInLine = indentedCol;
159165
output.append(Tool.spaces(indentedCol));
@@ -287,23 +293,14 @@ public TokenPositionAnalysis getTokenAnalysis(int[] features, int indexIntoRealT
287293
originalCurToken.getLine(),
288294
alignWithPrevious==1?"align":"unaligned",
289295
"?");
290-
String wsPredictionString = String.format("### line %d: predicted %d ' ' actual %s",
291-
originalCurToken.getLine(), ws, prevIsWS ? actualWS : "none");
292-
if ( failsafeTriggered ) {
293-
wsPredictionString += " (failsafe triggered)";
294-
}
295-
296296

297297
String newlineAnalysis = newlinePredictionString+"\n"+
298-
newlineClassifier.getPredictionAnalysis(doc, k, features, corpus.injectNewlines,
299-
MAX_CONTEXT_DIFF_THRESHOLD);
298+
nlwsClassifier.getPredictionAnalysis(doc, k, features, corpus.injectWhitespace,
299+
MAX_CONTEXT_DIFF_THRESHOLD);
300300
String alignAnalysis =alignPredictionString+"\n"+
301301
alignClassifier.getPredictionAnalysis(doc, k, features, corpus.align,
302302
MAX_CONTEXT_DIFF_THRESHOLD);
303-
String wsAnalysis =wsPredictionString+"\n"+
304-
wsClassifier.getPredictionAnalysis(doc, k, features, corpus.injectWS,
305-
MAX_CONTEXT_DIFF_THRESHOLD);
306-
return new TokenPositionAnalysis(newlineAnalysis, alignAnalysis, wsAnalysis);
303+
return new TokenPositionAnalysis(newlineAnalysis, alignAnalysis, "n/a");
307304
}
308305

309306
/** Do not join two words like "finaldouble" or numbers like "3double",

java/src/org/antlr/codebuff/InputDocument.java

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,8 @@ public class InputDocument {
1616
public Parser parser;
1717
public CommonTokenStream tokens;
1818
public List<int[]> featureVectors;
19-
public List<Integer> injectNewlines;
20-
public List<Integer> injectWS;
21-
public List<Integer> alignWithPrevious;
19+
public List<Integer> injectWhitespace;
20+
public List<Integer> align;
2221
public int allWhiteSpaceCount = 0;
2322
public int incorrectWhiteSpaceCount = 0;
2423
public int misclassifiedNewLineCount = 0;

java/src/org/antlr/codebuff/Optimizer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ public static void main(String[] args) throws Exception {
167167
List<String> allFiles = Tool.getFilenames(new File(testFileDir), ".*\\.java");
168168
ArrayList<InputDocument> documents = (ArrayList<InputDocument>) Tool.load(allFiles, JavaLexer.class, tabSize);
169169

170-
Tester t = new Tester(CollectFeatures.FEATURES_INJECT_NL, corpus, documents, tabSize);
170+
Tester t = new Tester(CollectFeatures.FEATURES_INJECT_WS, corpus, documents, tabSize);
171171
// sorry, had to comment this out
172172
// multiRoundMinimize(Tester::test, LEARNING_RATE, h, PRECISION, CollectFeatures.FEATURES_INJECT_WS, 5);
173173
}

java/src/org/antlr/codebuff/Tool.java

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,6 @@ public static Corpus processSampleDocs(List<InputDocument> docs,
158158
List<InputDocument> documents = new ArrayList<>();
159159
List<int[]> featureVectors = new ArrayList<>();
160160
List<Integer> injectNewlines = new ArrayList<>();
161-
List<Integer> injectWS = new ArrayList<>();
162161
List<Integer> alignWithPrevious = new ArrayList<>();
163162
for (InputDocument doc : docs) {
164163
if ( showFileNames ) System.out.println(doc);
@@ -167,14 +166,13 @@ public static Corpus processSampleDocs(List<InputDocument> docs,
167166
for (int i=0; i<doc.featureVectors.size(); i++) {
168167
documents.add(doc);
169168
int[] featureVec = doc.featureVectors.get(i);
170-
injectNewlines.add(doc.injectNewlines.get(i));
171-
injectWS.add(doc.injectWS.get(i));
172-
alignWithPrevious.add(doc.alignWithPrevious.get(i));
169+
injectNewlines.add(doc.injectWhitespace.get(i));
170+
alignWithPrevious.add(doc.align.get(i));
173171
featureVectors.add(featureVec);
174172
}
175173
}
176174
System.out.printf("%d feature vectors\n", featureVectors.size());
177-
return new Corpus(documents, featureVectors, injectNewlines, alignWithPrevious, injectWS);
175+
return new Corpus(documents, featureVectors, injectNewlines, alignWithPrevious);
178176
}
179177

180178
/** Parse document, save feature vectors to the doc but return it also */
@@ -183,9 +181,8 @@ public static void process(InputDocument doc, int tabSize, Map<String, List<Pair
183181
collector.computeFeatureVectors();
184182

185183
doc.featureVectors = collector.getFeatures();
186-
doc.injectNewlines = collector.getInjectNewlines();
187-
doc.injectWS = collector.getInjectWS();
188-
doc.alignWithPrevious = collector.getAlign();
184+
doc.injectWhitespace = collector.getInjectWhitespace();
185+
doc.align = collector.getAlign();
189186
}
190187

191188
public static CommonTokenStream tokenize(String doc, Class<? extends Lexer> lexerClass)

java/src/org/antlr/codebuff/gui/BuffScope.form

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -144,28 +144,6 @@
144144
</scrollpane>
145145
</children>
146146
</grid>
147-
<grid id="883ac" binding="injectWSTab" layout-manager="BorderLayout" hgap="0" vgap="0">
148-
<constraints>
149-
<tabbedpane title="Inject whitespace"/>
150-
</constraints>
151-
<properties/>
152-
<border type="none"/>
153-
<children>
154-
<scrollpane id="fb4f9">
155-
<constraints border-constraint="Center"/>
156-
<properties/>
157-
<border type="none"/>
158-
<children>
159-
<component id="93e0e" class="javax.swing.JTextArea" binding="injectWSConsole">
160-
<constraints/>
161-
<properties>
162-
<editable value="false"/>
163-
</properties>
164-
</component>
165-
</children>
166-
</scrollpane>
167-
</children>
168-
</grid>
169147
</children>
170148
</tabbedpane>
171149
</children>

0 commit comments

Comments
 (0)