Merge pull request #20 from antlr/combine-newline-ws

parrt · parrt · commit 8036ba14a4a2 · 2016-04-05T17:43:54.000Z
combine ws and nl into single prediction.
diff --git a/java/src/org/antlr/codebuff/CollectFeatures.java b/java/src/org/antlr/codebuff/CollectFeatures.java
@@ -32,6 +32,11 @@ public class CollectFeatures {
 	public static final int PAIR_ON_SAME_LINE = 0;
 	public static final int PAIR_ON_DIFF_LINE = 1;
 
+	// Categories for newline, whitespace: CAT_INJECT_NL|(n<<8) or CAT_INJECT_WS|(n<<8); low 8 bits hold the category, n is the count
+	public static final int CAT_NO_WS = 0;
+	public static final int CAT_INJECT_NL = 100;
+	public static final int CAT_INJECT_WS = 200;
+
 	// Categories for alignment/indentation
 	public static final int CAT_NO_ALIGNMENT = 0;
 
@@ -89,7 +94,7 @@ public class CollectFeatures {
 
 	public static final int NUM_FEATURES            = 23;
 
-	public static FeatureMetaData[] FEATURES_INJECT_NL = {
+	public static FeatureMetaData[] FEATURES_INJECT_WS = { // inject ws or nl
 		new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-2)"}, 1),
 		new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-1)"}, 2),
 		new FeatureMetaData(FeatureType.RULE,  new String[] {"LT(-1)", "rule"}, 2),
@@ -141,32 +146,6 @@ public class CollectFeatures {
 		new FeatureMetaData(FeatureType.INFO_CHARPOS, new String[] {"char", "pos"}, 0)
 	};
 
-	public static FeatureMetaData[] FEATURES_INJECT_WS = {
-		new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-2)"}, 1),
-		new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-1)"}, 2),
-		new FeatureMetaData(FeatureType.RULE,  new String[] {"LT(-1)", "rule"}, 2),
-		FeatureMetaData.UNUSED,
-		new FeatureMetaData(FeatureType.RULE,  new String[] {"LT(-1)", "right ancestor"}, 3),
-		new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(1)"}, 3),
-		FeatureMetaData.UNUSED,
-		new FeatureMetaData(FeatureType.BOOL,   new String[]{"Strt", "line"}, 3),
-		new FeatureMetaData(FeatureType.RULE,  new String[] {"LT(1)", "rule"}, 2),
-		new FeatureMetaData(FeatureType.RULE,  new String[] {"LT(1)", "right ancestor"}, 3),
-		new FeatureMetaData(FeatureType.RULE,  new String[] {"LT(1)", "left ancestor"}, 3),
-		new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^5"}, 1),
-		new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^4"}, 1),
-		new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^3"}, 1),
-		new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent^3 wid"}, 1),
-		new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^2"}, 1),
-		new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent^2 wid"}, 1),
-		new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent"}, 1),
-		new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent wid"}, 1),
-		new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(2)"}, 1),
-		new FeatureMetaData(FeatureType.INFO_FILE,    new String[] {"", "file"}, 0),
-		new FeatureMetaData(FeatureType.INFO_LINE,    new String[] {"", "line"}, 0),
-		new FeatureMetaData(FeatureType.INFO_CHARPOS, new String[] {"char", "pos"}, 0)
-	};
-
 	public static FeatureMetaData[] FEATURES_ALL = {
 		new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-2)"}, 1),
 		new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-1)"}, 2),
@@ -201,9 +180,7 @@ public class CollectFeatures {
 	protected ParserRuleContext root;
 	protected CommonTokenStream tokens; // track stream so we can examine previous tokens
 	protected List<int[]> features = new ArrayList<>();
-	protected List<Integer> injectNewlines = new ArrayList<>();
-	protected List<Integer> injectWS = new ArrayList<>();
-	protected List<Integer> indent = new ArrayList<>();
+	protected List<Integer> injectWhitespace = new ArrayList<>();
 	protected List<Integer> align = new ArrayList<>();
 
 	protected int currentIndent = 0;
@@ -246,7 +223,20 @@ public void computeFeatureVectorForToken(int i) {
 
 		int precedingNL = getPrecedingNL(tokens, i); // how many lines to inject
 
-		this.injectNewlines.add(precedingNL);
+		int ws = 0;
+		if ( precedingNL==0 ) {
+			ws = curToken.getCharPositionInLine() -
+				(prevToken.getCharPositionInLine()+prevToken.getText().length());
+		}
+
+		int injectNL_WS = CAT_NO_WS;
+		if ( precedingNL>0 ) {
+			injectNL_WS = nlcat(precedingNL);
+		}
+		else if ( ws>0 ) {
+			injectNL_WS = wscat(ws);
+		}
+		this.injectWhitespace.add(injectNL_WS);
 
 		int columnDelta = 0;
 		if ( precedingNL>0 ) { // && aligned!=1 ) {
@@ -259,14 +249,6 @@ public void computeFeatureVectorForToken(int i) {
 			aligned = getAlignmentCategory(node, curToken, columnDelta);
 		}
 
-		int ws = 0;
-		if ( precedingNL==0 ) {
-			ws = curToken.getCharPositionInLine() -
-				(prevToken.getCharPositionInLine()+prevToken.getText().length());
-		}
-
-		this.injectWS.add(ws); // likely negative if precedingNL
-
 		this.align.add(aligned);
 
 		this.features.add(features);
@@ -631,12 +613,8 @@ public List<int[]> getFeatures() {
 		return features;
 	}
 
-	public List<Integer> getInjectNewlines() {
-		return injectNewlines;
-	}
-
-	public List<Integer> getInjectWS() {
-		return injectWS;
+	public List<Integer> getInjectWhitespace() {
+		return injectWhitespace;
 	}
 
 	public List<Integer> getAlign() {
@@ -843,4 +821,20 @@ public static int[] unaligncat(int v) {
 		int child = (v>>16)&0xFFFF;
 		return new int[] { deltaFromLeftAncestor, child };
 	}
+
+	public static int wscat(int n) {
+		return CAT_INJECT_WS | (n<<8);
+	}
+
+	public static int nlcat(int n) {
+		return CAT_INJECT_NL | (n<<8);
+	}
+
+	public static int unwscat(int v) {
+		return v >> 8 & 0xFFFF;
+	}
+
+	public static int unnlcat(int v) {
+		return v >> 8 & 0xFFFF;
+	}
 }
diff --git a/java/src/org/antlr/codebuff/Corpus.java b/java/src/org/antlr/codebuff/Corpus.java
@@ -18,9 +18,8 @@ public class Corpus {
 
 	List<InputDocument> documents; // an entry for each X
 	List<int[]> X;
-	List<Integer> injectNewlines;
+	List<Integer> injectWhitespace;
 	List<Integer> align; // steps to common ancestor whose first token is alignment anchor
-	List<Integer> injectWS;
 
 	/** an index to narrow down the number of vectors we compute distance() on each classification.
 	 *  The key is (previous token's rule index, current token's rule index). It yields
@@ -30,14 +29,12 @@ public class Corpus {
 
 	public Corpus(List<InputDocument> documents,
 				  List<int[]> X,
-				  List<Integer> injectNewlines,
-				  List<Integer> align,
-				  List<Integer> injectWS)
+				  List<Integer> injectWhitespace,
+				  List<Integer> align)
 	{
 		this.documents = documents;
 		this.X = X;
-		this.injectNewlines = injectNewlines;
-		this.injectWS = injectWS;
+		this.injectWhitespace = injectWhitespace;
 		this.align = align;
 	}
 
@@ -65,15 +62,12 @@ public void randomShuffleInPlace() {
 			X.set(i, X.get(j));
 			X.set(j, tmp);
 			// And now swap all prediction lists
-			Integer tmpI = injectNewlines.get(i);
-			injectNewlines.set(i, injectNewlines.get(j));
-			injectNewlines.set(j, tmpI);
+			Integer tmpI = injectWhitespace.get(i);
+			injectWhitespace.set(i, injectWhitespace.get(j));
+			injectWhitespace.set(j, tmpI);
 			tmpI = align.get(i);
 			align.set(i, align.get(j));
 			align.set(j, tmpI);
-			tmpI = injectWS.get(i);
-			injectWS.set(i, injectWS.get(j));
-			injectWS.set(j, tmpI);
 			// Finally, swap documents
 			InputDocument tmpD = documents.get(i);
 			documents.set(i, documents.get(j));
diff --git a/java/src/org/antlr/codebuff/Formatter.java b/java/src/org/antlr/codebuff/Formatter.java
@@ -17,8 +17,9 @@
 import static org.antlr.codebuff.CollectFeatures.CAT_ALIGN_WITH_ANCESTOR_CHILD;
 import static org.antlr.codebuff.CollectFeatures.CAT_INDENT;
 import static org.antlr.codebuff.CollectFeatures.CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN;
+import static org.antlr.codebuff.CollectFeatures.CAT_INJECT_NL;
+import static org.antlr.codebuff.CollectFeatures.CAT_INJECT_WS;
 import static org.antlr.codebuff.CollectFeatures.FEATURES_ALIGN;
-import static org.antlr.codebuff.CollectFeatures.FEATURES_INJECT_NL;
 import static org.antlr.codebuff.CollectFeatures.FEATURES_INJECT_WS;
 import static org.antlr.codebuff.CollectFeatures.INDEX_FIRST_ON_LINE;
 import static org.antlr.codebuff.CollectFeatures.INDEX_PREV_END_COLUMN;
@@ -44,7 +45,7 @@ public class Formatter {
 
 	protected Vector<TokenPositionAnalysis> analysis = new Vector<>();
 
-	protected CodekNNClassifier newlineClassifier;
+	protected CodekNNClassifier nlwsClassifier;
 	protected CodekNNClassifier wsClassifier;
 	protected CodekNNClassifier alignClassifier;
 	protected int k;
@@ -65,8 +66,7 @@ public Formatter(Corpus corpus, InputDocument doc, int tabSize) {
 		this.tokens = doc.tokens;
 		this.originalTokens = Tool.copy(tokens);
 		Tool.wipeLineAndPositionInfo(tokens);
-		newlineClassifier = new CodekNNClassifier(corpus, FEATURES_INJECT_NL);
-		wsClassifier = new CodekNNClassifier(corpus, FEATURES_INJECT_WS);
+		nlwsClassifier = new CodekNNClassifier(corpus, FEATURES_INJECT_WS);
 		alignClassifier = new CodekNNClassifier(corpus, FEATURES_ALIGN);
 //		k = (int)Math.sqrt(corpus.X.size());
 //		k = 7;
@@ -120,26 +120,32 @@ public void processToken(int indexIntoRealTokens, int tokenIndexInStream) {
 		// we're tracking it as we emit tokens
 		features[INDEX_PREV_END_COLUMN] = charPosInLine;
 
-		int injectNewline = newlineClassifier.classify(k, features, corpus.injectNewlines, MAX_CONTEXT_DIFF_THRESHOLD);
+		int injectNL_WS = nlwsClassifier.classify(k, features, corpus.injectWhitespace, MAX_CONTEXT_DIFF_THRESHOLD);
+		int newlines = 0;
+		int ws = 0;
+		if ( (injectNL_WS&0xFF)==CAT_INJECT_NL ) {
+			newlines = CollectFeatures.unnlcat(injectNL_WS);
+		}
+		else if ( (injectNL_WS&0xFF)==CAT_INJECT_WS ) {
+			ws = CollectFeatures.unwscat(injectNL_WS);
+		}
 
 		// getNodeFeatures() also doesn't know what line curToken is on. If \n, we need to find exemplars that start a line
-		features[INDEX_FIRST_ON_LINE] = injectNewline; // use \n prediction to match exemplars for alignment
+		features[INDEX_FIRST_ON_LINE] = newlines; // use \n prediction to match exemplars for alignment
 
 		int align = alignClassifier.classify(k, features, corpus.align, MAX_CONTEXT_DIFF_THRESHOLD);
 
-		int ws = wsClassifier.classify(k, features, corpus.injectWS, MAX_CONTEXT_DIFF_THRESHOLD);
-
 		TokenPositionAnalysis tokenPositionAnalysis =
-			getTokenAnalysis(features, indexIntoRealTokens, tokenIndexInStream, injectNewline, align, ws);
+			getTokenAnalysis(features, indexIntoRealTokens, tokenIndexInStream, newlines, align, ws);
 		analysis.setSize(tokenIndexInStream+1);
 		analysis.set(tokenIndexInStream, tokenPositionAnalysis);
 
 		if ( ws==0 && cannotJoin(realTokens.get(indexIntoRealTokens-1), curToken) ) { // failsafe!
 			ws = 1;
 		}
 
-		if ( injectNewline>0 ) {
-			output.append(Tool.newlines(injectNewline));
+		if ( newlines>0 ) {
+			output.append(Tool.newlines(newlines));
 			line++;
 			charPosInLine = 0;
 
@@ -153,7 +159,7 @@ public void processToken(int indexIntoRealTokens, int tokenIndexInStream) {
 			ParserRuleContext parent = (ParserRuleContext)node.getParent();
 
 			if ( align==CAT_INDENT ) {
-				if ( firstTokenOnPrevLine!=null ) { // if not on first line, we can indent indent
+				if ( firstTokenOnPrevLine!=null ) { // if not on first line, we can indent
 					int indentedCol = firstTokenOnPrevLine.getCharPositionInLine()+INDENT_LEVEL;
 					charPosInLine = indentedCol;
 					output.append(Tool.spaces(indentedCol));
@@ -287,23 +293,14 @@ public TokenPositionAnalysis getTokenAnalysis(int[] features, int indexIntoRealT
 		                                             originalCurToken.getLine(),
 		                                             alignWithPrevious==1?"align":"unaligned",
 		                                             "?");
-		String wsPredictionString = String.format("### line %d: predicted %d ' ' actual %s",
-		                                          originalCurToken.getLine(), ws, prevIsWS ? actualWS : "none");
-		if ( failsafeTriggered ) {
-			wsPredictionString += " (failsafe triggered)";
-		}
-
 
 		String newlineAnalysis = newlinePredictionString+"\n"+
-			newlineClassifier.getPredictionAnalysis(doc, k, features, corpus.injectNewlines,
-			                                        MAX_CONTEXT_DIFF_THRESHOLD);
+			nlwsClassifier.getPredictionAnalysis(doc, k, features, corpus.injectWhitespace,
+			                                     MAX_CONTEXT_DIFF_THRESHOLD);
 		String alignAnalysis =alignPredictionString+"\n"+
 			alignClassifier.getPredictionAnalysis(doc, k, features, corpus.align,
 			                                      MAX_CONTEXT_DIFF_THRESHOLD);
-		String wsAnalysis =wsPredictionString+"\n"+
-			wsClassifier.getPredictionAnalysis(doc, k, features, corpus.injectWS,
-			                                   MAX_CONTEXT_DIFF_THRESHOLD);
-		return new TokenPositionAnalysis(newlineAnalysis, alignAnalysis, wsAnalysis);
+		return new TokenPositionAnalysis(newlineAnalysis, alignAnalysis, "n/a");
 	}
 
 	/** Do not join two words like "finaldouble" or numbers like "3double",
diff --git a/java/src/org/antlr/codebuff/InputDocument.java b/java/src/org/antlr/codebuff/InputDocument.java
@@ -16,9 +16,8 @@ public class InputDocument {
 	public Parser parser;
 	public CommonTokenStream tokens;
 	public List<int[]> featureVectors;
-	public List<Integer> injectNewlines;
-	public List<Integer> injectWS;
-	public List<Integer> alignWithPrevious;
+	public List<Integer> injectWhitespace;
+	public List<Integer> align;
 	public int allWhiteSpaceCount = 0;
 	public int incorrectWhiteSpaceCount = 0;
 	public int misclassifiedNewLineCount = 0;
diff --git a/java/src/org/antlr/codebuff/Optimizer.java b/java/src/org/antlr/codebuff/Optimizer.java
@@ -167,7 +167,7 @@ public static void main(String[] args) throws Exception {
 		List<String> allFiles = Tool.getFilenames(new File(testFileDir), ".*\\.java");
 		ArrayList<InputDocument> documents = (ArrayList<InputDocument>) Tool.load(allFiles, JavaLexer.class, tabSize);
 
-		Tester t = new Tester(CollectFeatures.FEATURES_INJECT_NL, corpus, documents, tabSize);
+		Tester t = new Tester(CollectFeatures.FEATURES_INJECT_WS, corpus, documents, tabSize);
 		// sorry, had to comment this out
 //		multiRoundMinimize(Tester::test, LEARNING_RATE, h, PRECISION, CollectFeatures.FEATURES_INJECT_NL, 5);
 	}
diff --git a/java/src/org/antlr/codebuff/Tool.java b/java/src/org/antlr/codebuff/Tool.java
@@ -158,7 +158,6 @@ public static Corpus processSampleDocs(List<InputDocument> docs,
 		List<InputDocument> documents = new ArrayList<>();
 		List<int[]> featureVectors = new ArrayList<>();
 		List<Integer> injectNewlines = new ArrayList<>();
-		List<Integer> injectWS = new ArrayList<>();
 		List<Integer> alignWithPrevious = new ArrayList<>();
 		for (InputDocument doc : docs) {
 			if ( showFileNames ) System.out.println(doc);
@@ -167,14 +166,13 @@ public static Corpus processSampleDocs(List<InputDocument> docs,
 			for (int i=0; i<doc.featureVectors.size(); i++) {
 				documents.add(doc);
 				int[] featureVec = doc.featureVectors.get(i);
-				injectNewlines.add(doc.injectNewlines.get(i));
-				injectWS.add(doc.injectWS.get(i));
-				alignWithPrevious.add(doc.alignWithPrevious.get(i));
+				injectNewlines.add(doc.injectWhitespace.get(i));
+				alignWithPrevious.add(doc.align.get(i));
 				featureVectors.add(featureVec);
 			}
 		}
 		System.out.printf("%d feature vectors\n", featureVectors.size());
-		return new Corpus(documents, featureVectors, injectNewlines, alignWithPrevious, injectWS);
+		return new Corpus(documents, featureVectors, injectNewlines, alignWithPrevious);
 	}
 
 	/** Parse document, save feature vectors to the doc but return it also */
@@ -183,9 +181,8 @@ public static void process(InputDocument doc, int tabSize, Map<String, List<Pair
 		collector.computeFeatureVectors();
 
 		doc.featureVectors = collector.getFeatures();
-		doc.injectNewlines = collector.getInjectNewlines();
-		doc.injectWS = collector.getInjectWS();
-		doc.alignWithPrevious = collector.getAlign();
+		doc.injectWhitespace = collector.getInjectWhitespace();
+		doc.align = collector.getAlign();
 	}
 
 	public static CommonTokenStream tokenize(String doc, Class<? extends Lexer> lexerClass)
diff --git a/java/src/org/antlr/codebuff/gui/BuffScope.form b/java/src/org/antlr/codebuff/gui/BuffScope.form
@@ -144,28 +144,6 @@
                   </scrollpane>
                 </children>
               </grid>
-              <grid id="883ac" binding="injectWSTab" layout-manager="BorderLayout" hgap="0" vgap="0">
-                <constraints>
-                  <tabbedpane title="Inject whitespace"/>
-                </constraints>
-                <properties/>
-                <border type="none"/>
-                <children>
-                  <scrollpane id="fb4f9">
-                    <constraints border-constraint="Center"/>
-                    <properties/>
-                    <border type="none"/>
-                    <children>
-                      <component id="93e0e" class="javax.swing.JTextArea" binding="injectWSConsole">
-                        <constraints/>
-                        <properties>
-                          <editable value="false"/>
-                        </properties>
-                      </component>
-                    </children>
-                  </scrollpane>
-                </children>
-              </grid>
             </children>
           </tabbedpane>
         </children>
diff --git a/java/src/org/antlr/codebuff/gui/BuffScope.java b/java/src/org/antlr/codebuff/gui/BuffScope.java
diff --git a/java/src/org/antlr/codebuff/gui/GUIController.java b/java/src/org/antlr/codebuff/gui/GUIController.java
diff --git a/java/src/org/antlr/codebuff/kNNClassifier.java b/java/src/org/antlr/codebuff/kNNClassifier.java

Original file line number	Diff line number	Diff line change
`@@ -167,7 +167,7 @@ public static void main(String[] args) throws Exception {`
`167`	`167`	`List<String> allFiles = Tool.getFilenames(new File(testFileDir), ".*\\.java");`
`168`	`168`	`ArrayList<InputDocument> documents = (ArrayList<InputDocument>) Tool.load(allFiles, JavaLexer.class, tabSize);`
`169`	`169`
`170`		`- Tester t = new Tester(CollectFeatures.FEATURES_INJECT_NL, corpus, documents, tabSize);`
	`170`	`+ Tester t = new Tester(CollectFeatures.FEATURES_INJECT_WS, corpus, documents, tabSize);`
`171`	`171`	`// sorry, had to comment this out`
`172`	`172`	`// multiRoundMinimize(Tester::test, LEARNING_RATE, h, PRECISION, CollectFeatures.FEATURES_INJECT_NL, 5);`
`173`	`173`	`}`