antlr
diff --git a/‎java/grammars/org/antlr/codebuff/Java.g4‎
Lines changed: 2 additions & 2 deletions b/‎java/grammars/org/antlr/codebuff/Java.g4‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎java/src/org/antlr/codebuff/CollectFeatures.java‎
Lines changed: 204 additions & 78 deletions b/‎java/src/org/antlr/codebuff/CollectFeatures.java‎
Lines changed: 204 additions & 78 deletions
diff --git a/‎java/src/org/antlr/codebuff/Corpus.java‎
Lines changed: 46 additions & 10 deletions b/‎java/src/org/antlr/codebuff/Corpus.java‎
Lines changed: 46 additions & 10 deletions
diff --git a/‎java/src/org/antlr/codebuff/FeatureType.java‎
Lines changed: 1 addition & 1 deletion b/‎java/src/org/antlr/codebuff/FeatureType.java‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎java/src/org/antlr/codebuff/Formatter.java‎
Lines changed: 151 additions & 51 deletions b/‎java/src/org/antlr/codebuff/Formatter.java‎
Lines changed: 151 additions & 51 deletions
@@ -1016,9 +1016,9 @@ WS  :  [ \t\r\n\u000C]+ -> channel(HIDDEN) // CodeBuff needs to see all whitespa
     ;
 
 COMMENT
-    :   '/*' .*? '*/' -> skip
+    :   '/*' .*? '*/' -> channel(HIDDEN)
     ;
 
 LINE_COMMENT
-    :   '//' ~[\r\n]* -> skip
+    :   '//' ~[\r\n]* -> channel(HIDDEN)
     ;
@@ -6,20 +6,19 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Random;
 
 public class Corpus {
-	public static final int NUM_DEPENDENT_VARS = 4;
+	public static final int NUM_DEPENDENT_VARS = 3;
 	public static final int INDEX_FEATURE_NEWLINES = 0;
 	public static final int INDEX_FEATURE_WS = 1;
-	public static final int INDEX_FEATURE_INDENT = 2;
-	public static final int INDEX_FEATURE_ALIGN_WITH_PREVIOUS = 3;
+	public static final int INDEX_FEATURE_ALIGN_WITH_PREVIOUS = 2;
 
 	List<InputDocument> documents; // an entry for each X
 	List<int[]> X;
 	List<Integer> injectNewlines;
+	List<Integer> align; // steps to common ancestor whose first token is alignment anchor
 	List<Integer> injectWS;
-	List<Integer> indent;
-	List<Integer> alignWithPrevious; // steps to common ancestor whose first token is alignment anchor
 
 	/** an index to narrow down the number of vectors we compute distance() on each classification.
 	 *  The key is (previous token's rule index, current token's rule index). It yields
@@ -30,16 +29,53 @@ public class Corpus {
 	public Corpus(List<InputDocument> documents,
 				  List<int[]> X,
 				  List<Integer> injectNewlines,
-				  List<Integer> injectWS,
-				  List<Integer> indent,
-				  List<Integer> alignWithPrevious)
+				  List<Integer> align,
+				  List<Integer> injectWS)
 	{
 		this.documents = documents;
 		this.X = X;
 		this.injectNewlines = injectNewlines;
 		this.injectWS = injectWS;
-		this.indent = indent;
-		this.alignWithPrevious = alignWithPrevious;
+		this.align = align;
+	}
+
+	/** Feature vectors in X are lumped together as they are read in each
+	 *  document. In kNN, this tends to find features from the same document
+	 *  rather than from across the corpus since we grab k neighbors.
+	 *  For k=11, we might only see exemplars from a single corpus document.
+	 *  If all exemplars fit in k, this wouldn't be an issue.
+	 *
+	 *  Fisher-Yates / Knuth shuffling
+	 *  "To shuffle an array a of n elements (indices 0..n-1)":
+	 *  https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle
+	 */
+	public void randomShuffleInPlace() {
+		Random r = new Random();
+		// for i from n−1 downto 1 do
+		int n = X.size();
+		for (int i=n-1; i>=1; i--) {
+			// j ← random integer such that 0 ≤ j ≤ i
+			int j = r.nextInt(i+1);
+			// exchange a[j] and a[i]
+			// Swap X
+			int[] tmp = X.get(i);
+			X.set(i, X.get(j));
+			X.set(j, tmp);
+			// And now swap all prediction lists
+			Integer tmpI = injectNewlines.get(i);
+			injectNewlines.set(i, injectNewlines.get(j));
+			injectNewlines.set(j, tmpI);
+			tmpI = align.get(i);
+			align.set(i, align.get(j));
+			align.set(j, tmpI);
+			tmpI = injectWS.get(i);
+			injectWS.set(i, injectWS.get(j));
+			injectWS.set(j, tmpI);
+			// Finally, swap documents
+			InputDocument tmpD = documents.get(i);
+			documents.set(i, documents.get(j));
+			documents.set(j, tmpD);
+		}
 	}
 
 	public void buildTokenContextIndex() {
 
@@ -2,7 +2,7 @@
 
 public enum FeatureType {
 	TOKEN(12), RULE(14), INT(7), BOOL(5), COL(7),
-	INFO_FILE(4), INFO_LINE(4), INFO_CHARPOS(4),
+	INFO_FILE(15), INFO_LINE(4), INFO_CHARPOS(4),
 	UNUSED(0);
 	public int displayWidth;
 
 
@@ -4,13 +4,35 @@
 import org.antlr.v4.runtime.CommonTokenStream;
 import org.antlr.v4.runtime.ParserRuleContext;
 import org.antlr.v4.runtime.Token;
+import org.antlr.v4.runtime.WritableToken;
 import org.antlr.v4.runtime.misc.Interval;
 import org.antlr.v4.runtime.tree.TerminalNode;
 
 import java.util.List;
 import java.util.Map;
 import java.util.Vector;
 
+import static org.antlr.codebuff.CollectFeatures.CAT_ALIGN_WITH_ANCESTORS_PARENT_FIRST_TOKEN;
+import static org.antlr.codebuff.CollectFeatures.CAT_ALIGN_WITH_ANCESTOR_FIRST_TOKEN;
+import static org.antlr.codebuff.CollectFeatures.CAT_ALIGN_WITH_LIST_FIRST_ELEMENT;
+import static org.antlr.codebuff.CollectFeatures.CAT_ALIGN_WITH_PAIR;
+import static org.antlr.codebuff.CollectFeatures.CAT_INDENT;
+import static org.antlr.codebuff.CollectFeatures.CAT_NO_ALIGNMENT;
+import static org.antlr.codebuff.CollectFeatures.FEATURES_ALIGN;
+import static org.antlr.codebuff.CollectFeatures.FEATURES_INJECT_NL;
+import static org.antlr.codebuff.CollectFeatures.FEATURES_INJECT_WS;
+import static org.antlr.codebuff.CollectFeatures.INDEX_FIRST_ON_LINE;
+import static org.antlr.codebuff.CollectFeatures.INDEX_PREV_END_COLUMN;
+import static org.antlr.codebuff.CollectFeatures.MAX_CONTEXT_DIFF_THRESHOLD;
+import static org.antlr.codebuff.CollectFeatures.earliestAncestorEndingWithToken;
+import static org.antlr.codebuff.CollectFeatures.getListSiblings;
+import static org.antlr.codebuff.CollectFeatures.getMatchingLeftSymbol;
+import static org.antlr.codebuff.CollectFeatures.getNodeFeatures;
+import static org.antlr.codebuff.CollectFeatures.getRealTokens;
+import static org.antlr.codebuff.CollectFeatures.getTokensOnPreviousLine;
+import static org.antlr.codebuff.CollectFeatures.indexTree;
+import static org.antlr.codebuff.CollectFeatures.isAlignedWithFirstSiblingOfList;
+
 public class Formatter {
 	protected final Corpus corpus;
 	protected StringBuilder output = new StringBuilder();
@@ -26,13 +48,11 @@ public class Formatter {
 
 	protected CodekNNClassifier newlineClassifier;
 	protected CodekNNClassifier wsClassifier;
-	protected CodekNNClassifier indentClassifier;
 	protected CodekNNClassifier alignClassifier;
 	protected int k;
 
 	protected int line = 1;
 	protected int charPosInLine = 0;
-	protected int currentIndent = 0;
 
 	protected int tabSize;
 
@@ -47,11 +67,11 @@ public Formatter(Corpus corpus, InputDocument doc, int tabSize) {
 		this.tokens = doc.tokens;
 		this.originalTokens = Tool.copy(tokens);
 		Tool.wipeLineAndPositionInfo(tokens);
-		newlineClassifier = new CodekNNClassifier(corpus, CollectFeatures.FEATURES_INJECT_NL);
-		wsClassifier = new CodekNNClassifier(corpus, CollectFeatures.FEATURES_INJECT_WS);
-		indentClassifier = new CodekNNClassifier(corpus, CollectFeatures.FEATURES_INDENT);
-		alignClassifier = new CodekNNClassifier(corpus, CollectFeatures.FEATURES_ALIGN);
-		k = (int)Math.sqrt(corpus.X.size());
+		newlineClassifier = new CodekNNClassifier(corpus, FEATURES_INJECT_NL);
+		wsClassifier = new CodekNNClassifier(corpus, FEATURES_INJECT_WS);
+		alignClassifier = new CodekNNClassifier(corpus, FEATURES_ALIGN);
+//		k = (int)Math.sqrt(corpus.X.size());
+		k = 11;
 		this.tabSize = tabSize;
 	}
 
@@ -66,15 +86,23 @@ public List<TokenPositionAnalysis> getAnalysisPerToken() {
 
 	public String format() {
 		if ( tokenToNodeMap == null ) {
-			tokenToNodeMap = CollectFeatures.indexTree(root);
+			tokenToNodeMap = indexTree(root);
 		}
 
 		tokens.seek(0);
-		Token secondToken = tokens.LT(2);
+		WritableToken firstToken = (WritableToken)tokens.LT(1);
+		WritableToken secondToken = (WritableToken)tokens.LT(2);
+		// all tokens are wiped of line/col info so set them for first 2
+		firstToken.setLine(1);
+		firstToken.setCharPositionInLine(0);
+		secondToken.setLine(1);
+		secondToken.setCharPositionInLine(firstToken.getText().length());
+
 		String prefix = tokens.getText(Interval.of(0, secondToken.getTokenIndex()));
 		output.append(prefix);
 
-		realTokens = CollectFeatures.getRealTokens(tokens);
+
+		realTokens = getRealTokens(tokens);
 		for (int i = 2; i<realTokens.size(); i++) { // can't process first 2 tokens
 			int tokenIndexInStream = realTokens.get(i).getTokenIndex();
 			processToken(i, tokenIndexInStream);
@@ -86,18 +114,23 @@ public void processToken(int indexIntoRealTokens, int tokenIndexInStream) {
 		CommonToken curToken = (CommonToken)tokens.get(tokenIndexInStream);
 		String tokText = curToken.getText();
 
-		int[] features = CollectFeatures.getNodeFeatures(tokenToNodeMap, doc, tokenIndexInStream, line, tabSize);
+		int[] features = getNodeFeatures(tokenToNodeMap, doc, tokenIndexInStream, line, tabSize);
 		// must set "prev end column" value as token stream doesn't have it;
 		// we're tracking it as we emit tokens
-		features[CollectFeatures.INDEX_PREV_END_COLUMN] = charPosInLine;
+		features[INDEX_PREV_END_COLUMN] = charPosInLine;
 
-		int injectNewline = newlineClassifier.classify(k, features, corpus.injectNewlines, CollectFeatures.MAX_CONTEXT_DIFF_THRESHOLD);
-		int alignWithPrevious = alignClassifier.classify(k, features, corpus.alignWithPrevious, CollectFeatures.MAX_CONTEXT_DIFF_THRESHOLD);
-		int indent = indentClassifier.classify(k, features, corpus.indent, CollectFeatures.MAX_CONTEXT_DIFF_THRESHOLD);
-		int ws = wsClassifier.classify(k, features, corpus.injectWS, CollectFeatures.MAX_CONTEXT_DIFF_THRESHOLD);
+		int injectNewline = newlineClassifier.classify(k, features, corpus.injectNewlines, MAX_CONTEXT_DIFF_THRESHOLD);
+
+		// getNodeFeatures() also doesn't know what line curToken is on. If \n, we need to find exemplars that start a line
+		features[INDEX_FIRST_ON_LINE] = injectNewline; // use \n prediction to match exemplars for alignment
+
+		int align = alignClassifier.classify(k, features, corpus.align, MAX_CONTEXT_DIFF_THRESHOLD);
+		int indent = 0;
+		//indentClassifier.classify(k, features, corpus.indent, CollectFeatures.MAX_CONTEXT_DIFF_THRESHOLD);
+		int ws = wsClassifier.classify(k, features, corpus.injectWS, MAX_CONTEXT_DIFF_THRESHOLD);
 
 		TokenPositionAnalysis tokenPositionAnalysis =
-			getTokenAnalysis(features, indexIntoRealTokens, tokenIndexInStream, injectNewline, alignWithPrevious, indent, ws);
+			getTokenAnalysis(features, indexIntoRealTokens, tokenIndexInStream, injectNewline, align, indent, ws);
 		analysis.setSize(tokenIndexInStream+1);
 		analysis.set(tokenIndexInStream, tokenPositionAnalysis);
 
@@ -108,36 +141,106 @@ public void processToken(int indexIntoRealTokens, int tokenIndexInStream) {
 		if ( injectNewline>0 ) {
 			output.append(Tool.newlines(injectNewline));
 			line++;
-			TerminalNode node = tokenToNodeMap.get(tokens.get(tokenIndexInStream));
-			ParserRuleContext parent = (ParserRuleContext)node.getParent();
-			int myIndex = 0;
-			ParserRuleContext earliestAncestor = CollectFeatures.earliestAncestorStartingAtToken(parent, curToken);
-			if ( earliestAncestor!=null ) {
-				ParserRuleContext commonAncestor = earliestAncestor.getParent();
-				List<ParserRuleContext> siblings = commonAncestor.getRuleContexts(earliestAncestor.getClass());
-				myIndex = siblings.indexOf(earliestAncestor);
-			}
-			if ( myIndex>0 && alignWithPrevious>0 ) { // align with first sibling's start token
-				ParserRuleContext commonAncestor = earliestAncestor.getParent();
-				List<ParserRuleContext> siblings = commonAncestor.getRuleContexts(earliestAncestor.getClass());
-				ParserRuleContext firstSibling = siblings.get(0);
-				Token firstSiblingStartToken = firstSibling.getStart();
-				// align but don't update currentIndent
-				charPosInLine = firstSiblingStartToken.getCharPositionInLine();
-				output.append(Tool.spaces(charPosInLine));
+			charPosInLine = 0;
+
+			List<Token> tokensOnPreviousLine = getTokensOnPreviousLine(tokens, tokenIndexInStream, line);
+			Token firstTokenOnPrevLine = null;
+			if ( tokensOnPreviousLine.size()>0 ) {
+				firstTokenOnPrevLine = tokensOnPreviousLine.get(0);
 			}
-			else {
-				currentIndent += indent;
-				if ( currentIndent<0 ) currentIndent = 0; // don't allow bad indents to accumulate
-				charPosInLine = currentIndent;
-				output.append(Tool.spaces(currentIndent));
+
+			TerminalNode node = tokenToNodeMap.get(curToken);
+			ParserRuleContext parent = (ParserRuleContext)node.getParent();
+			ParserRuleContext earliestRightAncestor = earliestAncestorEndingWithToken(parent, curToken);
+
+			switch ( align ) {
+				case CAT_INDENT :
+					if ( firstTokenOnPrevLine!=null ) { // if not on first line, we can indent indent
+						int indentedCol = firstTokenOnPrevLine.getCharPositionInLine() + 4;
+						charPosInLine = indentedCol;
+						output.append(Tool.spaces(indentedCol));
+					}
+					break;
+				case CAT_ALIGN_WITH_ANCESTOR_FIRST_TOKEN :
+					if ( earliestRightAncestor!=null ) {
+						Token earliestRightAncestorStart = earliestRightAncestor.getStart();
+						int linedUpCol = earliestRightAncestorStart.getCharPositionInLine();
+						charPosInLine = linedUpCol;
+						output.append(Tool.spaces(linedUpCol));
+					}
+					break;
+				case CAT_ALIGN_WITH_ANCESTORS_PARENT_FIRST_TOKEN :
+					if ( earliestRightAncestor!=null ) {
+						ParserRuleContext earliestAncestorParent = earliestRightAncestor.getParent();
+						if ( earliestAncestorParent!=null ) {
+							Token earliestAncestorParentStart = earliestAncestorParent.getStart();
+							int linedUpCol = earliestAncestorParentStart.getCharPositionInLine();
+							charPosInLine = linedUpCol;
+							output.append(Tool.spaces(linedUpCol));
+						}
+					}
+					break;
+				case CAT_ALIGN_WITH_LIST_FIRST_ELEMENT :
+					List<ParserRuleContext> listSiblings = getListSiblings(tokenToNodeMap, curToken);
+					if ( listSiblings!=null ) {
+						ParserRuleContext firstSibling = listSiblings.get(0);
+						int linedUpCol = firstSibling.getStart().getCharPositionInLine();
+						charPosInLine = linedUpCol;
+						output.append(Tool.spaces(linedUpCol));
+					}
+					break;
+				case CAT_ALIGN_WITH_PAIR :
+					TerminalNode matchingLeftSymbol = getMatchingLeftSymbol(doc, node);
+					int linedUpCol = matchingLeftSymbol.getSymbol().getCharPositionInLine();
+					charPosInLine = linedUpCol;
+					output.append(Tool.spaces(linedUpCol));
+					break;
+				case CAT_NO_ALIGNMENT :
+					break;
 			}
+//			if ( currentIndent<0 ) currentIndent = 0; // don't allow bad indents to accumulate
+//			charPosInLine = currentIndent;
+//			output.append(Tool.spaces(currentIndent));
 		}
 		else {
 			// inject whitespace instead of \n?
 			output.append(Tool.spaces(ws));
 			charPosInLine += ws;
 		}
+
+//		if ( injectNewline>0 ) {
+//			output.append(Tool.newlines(injectNewline));
+//			line++;
+//			TerminalNode node = tokenToNodeMap.get(tokens.get(tokenIndexInStream));
+//			ParserRuleContext parent = (ParserRuleContext)node.getParent();
+//			int myIndex = 0;
+//			ParserRuleContext earliestAncestor = CollectFeatures.earliestAncestorStartingWithToken(parent, curToken);
+//			if ( earliestAncestor!=null ) {
+//				ParserRuleContext commonAncestor = earliestAncestor.getParent();
+//				List<ParserRuleContext> siblings = commonAncestor.getRuleContexts(earliestAncestor.getClass());
+//				myIndex = siblings.indexOf(earliestAncestor);
+//			}
+//			if ( false ) { //if ( myIndex>0 && align>0 ) { // align with first sibling's start token
+//				ParserRuleContext commonAncestor = earliestAncestor.getParent();
+//				List<ParserRuleContext> siblings = commonAncestor.getRuleContexts(earliestAncestor.getClass());
+//				ParserRuleContext firstSibling = siblings.get(0);
+//				Token firstSiblingStartToken = firstSibling.getStart();
+//				// align but don't update currentIndent
+//				charPosInLine = firstSiblingStartToken.getCharPositionInLine();
+//				output.append(Tool.spaces(charPosInLine));
+//			}
+//			else {
+//				currentIndent += indent;
+//				if ( currentIndent<0 ) currentIndent = 0; // don't allow bad indents to accumulate
+//				charPosInLine = currentIndent;
+//				output.append(Tool.spaces(currentIndent));
+//			}
+//		}
+//		else {
+//			// inject whitespace instead of \n?
+//			output.append(Tool.spaces(ws));
+//			charPosInLine += ws;
+//		}
 		// update Token object with position information now that we are about
 		// to emit it.
 		curToken.setLine(line);
@@ -171,8 +274,8 @@ public TokenPositionAnalysis getTokenAnalysis(int[] features, int indexIntoRealT
 		boolean prevIsWS = prevToken.getType()==JavaLexer.WS;
 		int actualNL = Tool.count(prevToken.getText(), '\n');
 		int actualWS = Tool.count(prevToken.getText(), ' ');
-		int actualIndent = originalCurToken.getCharPositionInLine()-currentIndent;
-		boolean actualAlign = CollectFeatures.isAlignedWithFirstSibling(tokenToNodeMap, tokens, curToken);
+		int actualIndent = originalCurToken.getCharPositionInLine()-0;// currentIndent;
+		boolean actualAlign = isAlignedWithFirstSiblingOfList(tokenToNodeMap, tokens, curToken);
 		String newlinePredictionString = String.format("### line %d: predicted %d \\n actual %s",
 		                                               originalCurToken.getLine(), injectNewline, prevIsWS ? actualNL : "none");
 		String alignPredictionString = String.format("### line %d: predicted %s actual %s",
@@ -189,18 +292,15 @@ public TokenPositionAnalysis getTokenAnalysis(int[] features, int indexIntoRealT
 
 
 		String newlineAnalysis = newlinePredictionString+"\n"+
-			newlineClassifier.getPredictionAnalysis(k, features, corpus.injectNewlines,
-			                                        CollectFeatures.MAX_CONTEXT_DIFF_THRESHOLD);
+			newlineClassifier.getPredictionAnalysis(doc, k, features, corpus.injectNewlines,
+			                                        MAX_CONTEXT_DIFF_THRESHOLD);
 		String alignAnalysis =alignPredictionString+"\n"+
-			alignClassifier.getPredictionAnalysis(k, features, corpus.alignWithPrevious,
-			                                      CollectFeatures.MAX_CONTEXT_DIFF_THRESHOLD);
-		String indentAnalysis =indentPredictionString+"\n"+
-			indentClassifier.getPredictionAnalysis(k, features, corpus.indent,
-			                                       CollectFeatures.MAX_CONTEXT_DIFF_THRESHOLD);
+			alignClassifier.getPredictionAnalysis(doc, k, features, corpus.align,
+			                                      MAX_CONTEXT_DIFF_THRESHOLD);
 		String wsAnalysis =wsPredictionString+"\n"+
-			wsClassifier.getPredictionAnalysis(k, features, corpus.injectWS,
-			                                   CollectFeatures.MAX_CONTEXT_DIFF_THRESHOLD);
-		return new TokenPositionAnalysis(newlineAnalysis, alignAnalysis, indentAnalysis, wsAnalysis);
+			wsClassifier.getPredictionAnalysis(doc, k, features, corpus.injectWS,
+			                                   MAX_CONTEXT_DIFF_THRESHOLD);
+		return new TokenPositionAnalysis(newlineAnalysis, alignAnalysis, wsAnalysis);
 	}
 
 	/** Do not join two words like "finaldouble" or numbers like "3double",
Original file line number	Diff line number	Diff line change
`@@ -1016,9 +1016,9 @@ WS : [ \t\r\n\u000C]+ -> channel(HIDDEN) // CodeBuff needs to see all whitespa`
`1016`	`1016`	`;`
`1017`	`1017`
`1018`	`1018`	`COMMENT`
`1019`		`- : '/*' .*? '*/' -> skip`
	`1019`	`+ : '/*' .*? '*/' -> channel(HIDDEN)`
`1020`	`1020`	`;`
`1021`	`1021`
`1022`	`1022`	`LINE_COMMENT`
`1023`		`- : '//' ~[\r\n]* -> skip`
	`1023`	`+ : '//' ~[\r\n]* -> channel(HIDDEN)`
`1024`	`1024`	`;`