66import java .util .HashMap ;
77import java .util .List ;
88import java .util .Map ;
9+ import java .util .Random ;
910
1011public class Corpus {
1112 public static final int NUM_DEPENDENT_VARS = 4 ;
@@ -17,9 +18,8 @@ public class Corpus {
1718 List <InputDocument > documents ; // an entry for each X
1819 List <int []> X ;
1920 List <Integer > injectNewlines ;
20- List <Integer > injectWS ;
21- List <Integer > indent ;
2221 List <Integer > align ; // steps to common ancestor whose first token is alignment anchor
22+ List <Integer > injectWS ;
2323
2424 /** an index to narrow down the number of vectors we compute distance() on each classification.
2525 * The key is (previous token's rule index, current token's rule index). It yields
@@ -30,18 +30,55 @@ public class Corpus {
/**
 * Builds a corpus from index-aligned lists: entry i of every list describes
 * the same exemplar. The lists are stored by reference, not copied, so later
 * changes made through this object (or by the caller) are visible to both.
 *
 * In this revision the {@code indent} parameter and field assignment are
 * removed, and {@code align} now precedes {@code injectWS} in the signature.
 *
 * @param documents      the document each feature vector came from (an entry for each X)
 * @param X              the feature vectors
 * @param injectNewlines per-vector value for the injectNewlines dependent variable
 * @param align          per-vector alignment value (steps to the common ancestor
 *                       whose first token is the alignment anchor)
 * @param injectWS       per-vector value for the injectWS dependent variable
 */
3030 public Corpus (List <InputDocument > documents ,
3131 List <int []> X ,
3232 List <Integer > injectNewlines ,
33- List <Integer > injectWS ,
34- List <Integer > indent ,
35- List <Integer > align )
33+ List <Integer > align ,
34+ List <Integer > injectWS )
3635 {
3736 this .documents = documents ;
3837 this .X = X ;
3938 this .injectNewlines = injectNewlines ;
4039 this .injectWS = injectWS ;
41- this .indent = indent ;
4240 this .align = align ;
4341 }
4442
43+ /** Feature vectors in X are lumped together as they are read in each
44+ * document. In kNN, this tends to find features from the same document
45+ * rather than from across the corpus since we grab k neighbors.
46+ * For k=11, we might only see exemplars from a single corpus document.
47+ * If all exemplars fit in k, this wouldn't be an issue.
48+ *
49+ * Fisher-Yates / Knuth shuffling
50+ * "To shuffle an array a of n elements (indices 0..n-1)":
51+ * https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle
52+ */
53+ public void randomShuffleInPlace () {
54+ Random r = new Random ();
55+ // for i from n−1 downto 1 do
56+ int n = X .size ();
57+ for (int i =n -1 ; i >=1 ; i --) {
58+ // j ← random integer such that 0 ≤ j ≤ i
59+ int j = r .nextInt (i +1 );
60+ // exchange a[j] and a[i]
61+ // Swap X
62+ int [] tmp = X .get (i );
63+ X .set (i , X .get (j ));
64+ X .set (j , tmp );
65+ // And now swap all prediction lists
66+ Integer tmpI = injectNewlines .get (i );
67+ injectNewlines .set (i , injectNewlines .get (j ));
68+ injectNewlines .set (j , tmpI );
69+ tmpI = align .get (i );
70+ align .set (i , align .get (j ));
71+ align .set (j , tmpI );
72+ tmpI = injectWS .get (i );
73+ injectWS .set (i , injectWS .get (j ));
74+ injectWS .set (j , tmpI );
75+ // Finally, swap documents
76+ InputDocument tmpD = documents .get (i );
77+ documents .set (i , documents .get (j ));
78+ documents .set (j , tmpD );
79+ }
80+ }
81+
4582 public void buildTokenContextIndex () {
4683 curAndPrevTokenRuleIndexToVectorsMap = new HashMap <>();
4784 for (int i =0 ; i <X .size (); i ++) {
0 commit comments