Skip to content

Commit c65e0b2

Browse files
committed
Feature vectors in X are lumped together by document as they are read in, so I shuffled them. kNN now grabs its k neighbors from the shuffled list rather than all from the same file.
1 parent 33b5675 commit c65e0b2

4 files changed

Lines changed: 45 additions & 14 deletions

File tree

java/src/org/antlr/codebuff/Corpus.java

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import java.util.HashMap;
77
import java.util.List;
88
import java.util.Map;
9+
import java.util.Random;
910

1011
public class Corpus {
1112
public static final int NUM_DEPENDENT_VARS = 4;
@@ -17,9 +18,8 @@ public class Corpus {
1718
List<InputDocument> documents; // an entry for each X
1819
List<int[]> X;
1920
List<Integer> injectNewlines;
20-
List<Integer> injectWS;
21-
List<Integer> indent;
2221
List<Integer> align; // steps to common ancestor whose first token is alignment anchor
22+
List<Integer> injectWS;
2323

2424
/** an index to narrow down the number of vectors we compute distance() on each classification.
2525
* The key is (previous token's rule index, current token's rule index). It yields
@@ -30,18 +30,55 @@ public class Corpus {
3030
public Corpus(List<InputDocument> documents,
3131
List<int[]> X,
3232
List<Integer> injectNewlines,
33-
List<Integer> injectWS,
34-
List<Integer> indent,
35-
List<Integer> align)
33+
List<Integer> align,
34+
List<Integer> injectWS)
3635
{
3736
this.documents = documents;
3837
this.X = X;
3938
this.injectNewlines = injectNewlines;
4039
this.injectWS = injectWS;
41-
this.indent = indent;
4240
this.align = align;
4341
}
4442

43+
/** Feature vectors in X are lumped together as they are read in each
44+
* document. In kNN, this tends to find features from the same document
45+
* rather than from across the corpus since we grab k neighbors.
46+
* For k=11, we might only see exemplars from a single corpus document.
47+
* If all exemplars fit in k, this wouldn't be an issue.
48+
*
49+
* Fisher-Yates / Knuth shuffling
50+
* "To shuffle an array a of n elements (indices 0..n-1)":
51+
* https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle
52+
*/
53+
public void randomShuffleInPlace() {
54+
Random r = new Random();
55+
// Fisher-Yates: for i from n-1 down to 1 (r is unseeded, so the resulting order differs from run to run)
56+
int n = X.size();
57+
for (int i=n-1; i>=1; i--) {
58+
// j ← random integer such that 0 ≤ j ≤ i (nextInt's bound is exclusive, hence i+1)
59+
int j = r.nextInt(i+1);
60+
// exchange a[j] and a[i] in every parallel list so entries at the same index stay paired
61+
// Swap the feature vectors in X
62+
int[] tmp = X.get(i);
63+
X.set(i, X.get(j));
64+
X.set(j, tmp);
65+
// Apply the same i/j swap to each prediction list (injectNewlines, align, injectWS)
66+
Integer tmpI = injectNewlines.get(i);
67+
injectNewlines.set(i, injectNewlines.get(j));
68+
injectNewlines.set(j, tmpI);
69+
tmpI = align.get(i);
70+
align.set(i, align.get(j));
71+
align.set(j, tmpI);
72+
tmpI = injectWS.get(i);
73+
injectWS.set(i, injectWS.get(j));
74+
injectWS.set(j, tmpI);
75+
// Finally, apply the same i/j swap to documents
76+
InputDocument tmpD = documents.get(i);
77+
documents.set(i, documents.get(j));
78+
documents.set(j, tmpD);
79+
}
80+
}
81+
4582
public void buildTokenContextIndex() {
4683
curAndPrevTokenRuleIndexToVectorsMap = new HashMap<>();
4784
for (int i=0; i<X.size(); i++) {

java/src/org/antlr/codebuff/InputDocument.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ public class InputDocument {
1818
public List<int[]> featureVectors;
1919
public List<Integer> injectNewlines;
2020
public List<Integer> injectWS;
21-
public List<Integer> indent;
2221
public List<Integer> alignWithPrevious;
2322
public int allWhiteSpaceCount = 0;
2423
public int incorrectWhiteSpaceCount = 0;

java/src/org/antlr/codebuff/Tool.java

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ public static Corpus train(String rootDir,
114114
}
115115

116116
Corpus corpus = processSampleDocs(documents, lexerClass, parserClass, tabSize, ruleToPairsBag);
117+
corpus.randomShuffleInPlace();
117118
corpus.buildTokenContextIndex();
118119
return corpus;
119120
}
@@ -144,7 +145,6 @@ public static Corpus processSampleDocs(List<InputDocument> docs,
144145
List<int[]> featureVectors = new ArrayList<>();
145146
List<Integer> injectNewlines = new ArrayList<>();
146147
List<Integer> injectWS = new ArrayList<>();
147-
List<Integer> indent = new ArrayList<>();
148148
List<Integer> alignWithPrevious = new ArrayList<>();
149149
for (InputDocument doc : docs) {
150150
if ( showFileNames ) System.out.println(doc);
@@ -155,13 +155,12 @@ public static Corpus processSampleDocs(List<InputDocument> docs,
155155
int[] featureVec = doc.featureVectors.get(i);
156156
injectNewlines.add(doc.injectNewlines.get(i));
157157
injectWS.add(doc.injectWS.get(i));
158-
indent.add(doc.indent.get(i));
159158
alignWithPrevious.add(doc.alignWithPrevious.get(i));
160159
featureVectors.add(featureVec);
161160
}
162161
}
163162
System.out.printf("%d feature vectors\n", featureVectors.size());
164-
return new Corpus(documents, featureVectors, injectNewlines, injectWS, indent, alignWithPrevious);
163+
return new Corpus(documents, featureVectors, injectNewlines, alignWithPrevious, injectWS);
165164
}
166165

167166
/** Parse document and save its feature vectors and prediction lists on the doc; nothing is returned. */
@@ -174,7 +173,6 @@ public static void process(InputDocument doc, int tabSize, Map<String, List<Pair
174173
doc.featureVectors = collector.getFeatures();
175174
doc.injectNewlines = collector.getInjectNewlines();
176175
doc.injectWS = collector.getInjectWS();
177-
doc.indent = collector.getIndent();
178176
doc.alignWithPrevious = collector.getAlign();
179177
}
180178

java/src/org/antlr/codebuff/kNNClassifier.java

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,6 @@ public int[] classify(int k, int[] unknown, double distanceThreshold) {
4040
votesBag = getVotesBag(kNN, k, unknown, corpus.injectWS);
4141
categories[Corpus.INDEX_FEATURE_WS] = getCategoryWithMostVotes(votesBag);
4242

43-
votesBag = getVotesBag(kNN, k, unknown, corpus.indent);
44-
categories[Corpus.INDEX_FEATURE_INDENT] = getCategoryWithMostVotes(votesBag);
45-
4643
votesBag = getVotesBag(kNN, k, unknown, corpus.align);
4744
categories[Corpus.INDEX_FEATURE_ALIGN_WITH_PREVIOUS] = getCategoryWithMostVotes(votesBag);
4845

0 commit comments

Comments
 (0)