Feature vectors in X are lumped together by document as they are read in, so I now shuffle them randomly. kNN grabs its k neighbors from the shuffled list rather than all from the same file.

parrt · parrt · commit c65e0b221aa1 · 2016-03-13T13:52:39.000-07:00
diff --git a/java/src/org/antlr/codebuff/Corpus.java b/java/src/org/antlr/codebuff/Corpus.java
@@ -6,6 +6,7 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Random;
 
 public class Corpus {
 	public static final int NUM_DEPENDENT_VARS = 4;
@@ -17,9 +18,8 @@ public class Corpus {
 	List<InputDocument> documents; // an entry for each X
 	List<int[]> X;
 	List<Integer> injectNewlines;
-	List<Integer> injectWS;
-	List<Integer> indent;
 	List<Integer> align; // steps to common ancestor whose first token is alignment anchor
+	List<Integer> injectWS;
 
 	/** an index to narrow down the number of vectors we compute distance() on each classification.
 	 *  The key is (previous token's rule index, current token's rule index). It yields
@@ -30,18 +30,55 @@ public class Corpus {
 	public Corpus(List<InputDocument> documents,
 				  List<int[]> X,
 				  List<Integer> injectNewlines,
-				  List<Integer> injectWS,
-				  List<Integer> indent,
-				  List<Integer> align)
+				  List<Integer> align,
+				  List<Integer> injectWS)
 	{
 		this.documents = documents;
 		this.X = X;
 		this.injectNewlines = injectNewlines;
 		this.injectWS = injectWS;
-		this.indent = indent;
 		this.align = align;
 	}
 
+	/** Shuffle the corpus in place, applying one random permutation to X,
+	 *  injectNewlines, align, injectWS, and documents alike so that index i
+	 *  keeps naming the same exemplar in every list.
+	 *
+	 *  Feature vectors are appended to X one document at a time, so kNN tends
+	 *  to draw all k neighbors (say, k=11) from a single corpus document rather
+	 *  than from across the corpus; if all exemplars fit in k, no issue.
+	 *
+	 *  Fisher-Yates / Knuth shuffle: https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle
+	 */
+	public void randomShuffleInPlace() {
+		Random r = new Random(); // unseeded, so the resulting order is not reproducible
+		// Walk i from n-1 down to 1; index 0 needs no swap of its own.
+		int n = X.size();
+		for (int i=n-1; i>=1; i--) {
+			// Pick j with 0 <= j <= i (the bound passed to nextInt is exclusive).
+			int j = r.nextInt(i+1);
+			// Exchange entries i and j in every parallel list.
+			// Feature vectors first
+			int[] tmp = X.get(i);
+			X.set(i, X.get(j));
+			X.set(j, tmp);
+			// Then the three prediction lists, reusing a single temporary
+			Integer tmpI = injectNewlines.get(i);
+			injectNewlines.set(i, injectNewlines.get(j));
+			injectNewlines.set(j, tmpI);
+			tmpI = align.get(i);
+			align.set(i, align.get(j));
+			align.set(j, tmpI);
+			tmpI = injectWS.get(i);
+			injectWS.set(i, injectWS.get(j));
+			injectWS.set(j, tmpI);
+			// Finally the documents; NOTE(review): assumes one documents entry per X entry -- confirm
+			InputDocument tmpD = documents.get(i);
+			documents.set(i, documents.get(j));
+			documents.set(j, tmpD);
+		}
+	}
+
 	public void buildTokenContextIndex() {
 		curAndPrevTokenRuleIndexToVectorsMap = new HashMap<>();
 		for (int i=0; i<X.size(); i++) {
diff --git a/java/src/org/antlr/codebuff/InputDocument.java b/java/src/org/antlr/codebuff/InputDocument.java
@@ -18,7 +18,6 @@ public class InputDocument {
 	public List<int[]> featureVectors;
 	public List<Integer> injectNewlines;
 	public List<Integer> injectWS;
-	public List<Integer> indent;
 	public List<Integer> alignWithPrevious;
 	public int allWhiteSpaceCount = 0;
 	public int incorrectWhiteSpaceCount = 0;
diff --git a/java/src/org/antlr/codebuff/Tool.java b/java/src/org/antlr/codebuff/Tool.java
@@ -114,6 +114,7 @@ public static Corpus train(String rootDir,
 		}
 
 		Corpus corpus = processSampleDocs(documents, lexerClass, parserClass, tabSize, ruleToPairsBag);
+		corpus.randomShuffleInPlace();
 		corpus.buildTokenContextIndex();
 		return corpus;
 	}
@@ -144,7 +145,6 @@ public static Corpus processSampleDocs(List<InputDocument> docs,
 		List<int[]> featureVectors = new ArrayList<>();
 		List<Integer> injectNewlines = new ArrayList<>();
 		List<Integer> injectWS = new ArrayList<>();
-		List<Integer> indent = new ArrayList<>();
 		List<Integer> alignWithPrevious = new ArrayList<>();
 		for (InputDocument doc : docs) {
 			if ( showFileNames ) System.out.println(doc);
@@ -155,13 +155,12 @@ public static Corpus processSampleDocs(List<InputDocument> docs,
 				int[] featureVec = doc.featureVectors.get(i);
 				injectNewlines.add(doc.injectNewlines.get(i));
 				injectWS.add(doc.injectWS.get(i));
-				indent.add(doc.indent.get(i));
 				alignWithPrevious.add(doc.alignWithPrevious.get(i));
 				featureVectors.add(featureVec);
 			}
 		}
 		System.out.printf("%d feature vectors\n", featureVectors.size());
-		return new Corpus(documents, featureVectors, injectNewlines, injectWS, indent, alignWithPrevious);
+		return new Corpus(documents, featureVectors, injectNewlines, alignWithPrevious, injectWS);
 	}
 
 	/** Parse document, save feature vectors to the doc but return it also */
@@ -174,7 +173,6 @@ public static void process(InputDocument doc, int tabSize, Map<String, List<Pair
 		doc.featureVectors = collector.getFeatures();
 		doc.injectNewlines = collector.getInjectNewlines();
 		doc.injectWS = collector.getInjectWS();
-		doc.indent = collector.getIndent();
 		doc.alignWithPrevious = collector.getAlign();
 	}
 
diff --git a/java/src/org/antlr/codebuff/kNNClassifier.java b/java/src/org/antlr/codebuff/kNNClassifier.java
@@ -40,9 +40,6 @@ public int[] classify(int k, int[] unknown, double distanceThreshold) {
 		votesBag = getVotesBag(kNN, k, unknown, corpus.injectWS);
 		categories[Corpus.INDEX_FEATURE_WS] = getCategoryWithMostVotes(votesBag);
 
-		votesBag = getVotesBag(kNN, k, unknown, corpus.indent);
-		categories[Corpus.INDEX_FEATURE_INDENT] = getCategoryWithMostVotes(votesBag);
-
 		votesBag = getVotesBag(kNN, k, unknown, corpus.align);
 		categories[Corpus.INDEX_FEATURE_ALIGN_WITH_PREVIOUS] = getCategoryWithMostVotes(votesBag);
 

Original file line number	Diff line number	Diff line change
`@@ -114,6 +114,7 @@ public static Corpus train(String rootDir,`
`114`	`114`	`}`
`115`	`115`
`116`	`116`	`Corpus corpus = processSampleDocs(documents, lexerClass, parserClass, tabSize, ruleToPairsBag);`
	`117`	`+ corpus.randomShuffleInPlace();`
`117`	`118`	`corpus.buildTokenContextIndex();`
`118`	`119`	`return corpus;`
`119`	`120`	`}`
`@@ -144,7 +145,6 @@ public static Corpus processSampleDocs(List<InputDocument> docs,`
`144`	`145`	`List<int[]> featureVectors = new ArrayList<>();`
`145`	`146`	`List<Integer> injectNewlines = new ArrayList<>();`
`146`	`147`	`List<Integer> injectWS = new ArrayList<>();`
`147`		`- List<Integer> indent = new ArrayList<>();`
`148`	`148`	`List<Integer> alignWithPrevious = new ArrayList<>();`
`149`	`149`	`for (InputDocument doc : docs) {`
`150`	`150`	`if ( showFileNames ) System.out.println(doc);`
`@@ -155,13 +155,12 @@ public static Corpus processSampleDocs(List<InputDocument> docs,`
`155`	`155`	`int[] featureVec = doc.featureVectors.get(i);`
`156`	`156`	`injectNewlines.add(doc.injectNewlines.get(i));`
`157`	`157`	`injectWS.add(doc.injectWS.get(i));`
`158`		`- indent.add(doc.indent.get(i));`
`159`	`158`	`alignWithPrevious.add(doc.alignWithPrevious.get(i));`
`160`	`159`	`featureVectors.add(featureVec);`
`161`	`160`	`}`
`162`	`161`	`}`
`163`	`162`	`System.out.printf("%d feature vectors\n", featureVectors.size());`
`164`		`- return new Corpus(documents, featureVectors, injectNewlines, injectWS, indent, alignWithPrevious);`
	`163`	`+ return new Corpus(documents, featureVectors, injectNewlines, alignWithPrevious, injectWS);`
`165`	`164`	`}`
`166`	`165`
`167`	`166`	`/** Parse document, save feature vectors to the doc but return it also */`
`@@ -174,7 +173,6 @@ public static void process(InputDocument doc, int tabSize, Map<String, List<Pair`
`174`	`173`	`doc.featureVectors = collector.getFeatures();`
`175`	`174`	`doc.injectNewlines = collector.getInjectNewlines();`
`176`	`175`	`doc.injectWS = collector.getInjectWS();`
`177`		`- doc.indent = collector.getIndent();`
`178`	`176`	`doc.alignWithPrevious = collector.getAlign();`
`179`	`177`	`}`
`180`	`178`