Skip to content

Commit e792501

Browse files
committed
Improved graphs, fixed a bug in grammar invariance, and added consistent markers and colors for each language across the different graphs.
1 parent 4fc80b5 commit e792501

6 files changed

Lines changed: 153 additions & 72 deletions

File tree

2.87 KB
Binary file not shown.

python/src/subset_validator.py

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,32 @@
11
#
22
# AUTO-GENERATED FILE. DO NOT EDIT
3-
# CodeBuff 1.4.19 'Fri Jun 17 15:30:29 PDT 2016'
3+
# CodeBuff 1.4.19 'Sat Jun 18 12:07:15 PDT 2016'
44
#
5-
import numpy as np
65
import matplotlib.pyplot as plt
76

87
fig = plt.figure()
98
ax = plt.subplot(111)
10-
N = 3
9+
N = 30
1110
sizes = range(1,N+1)
12-
sqlite = [0.39566395,0.19639066,0.1870229]
13-
ax.plot(range(1,len(sqlite)+1), sqlite, label="sqlite", marker='o')
14-
antlr = [0.23529412,0.11906425,0.22540188]
15-
ax.plot(range(1,len(antlr)+1), antlr, label="antlr", marker='o')
16-
java_st = [0.1372315,0.07272727,0.06632213]
17-
ax.plot(range(1,len(java_st)+1), java_st, label="java_st", marker='o')
18-
java8_st = [0.2593828,0.06481481,0.0754717]
19-
ax.plot(range(1,len(java8_st)+1), java8_st, label="java8_st", marker='o')
20-
tsql = [0.23404256,0.24701196,0.13541667]
21-
ax.plot(range(1,len(tsql)+1), tsql, label="tsql", marker='o')
11+
sqlite = [0.25919002,0.1875,0.16905189,0.15116279,0.1570248,0.12646048,0.13111547,0.14213198,0.11797753,0.13519813,0.12033195,0.12209302,0.1298077,0.116099775,0.120200336,0.116959065,0.11691542,0.1106383,0.1205074,0.11556982,0.11490683,0.11111111,0.1119403,0.11612903,0.11111111,0.100877196,0.11347197,0.10823909,0.10766046,0.10980392]
12+
ax.plot(range(1,len(sqlite)+1), sqlite, label="sqlite", marker='o', color='y')
13+
antlr = [0.29315588,0.2076523,0.19942749,0.1779661,0.1779661,0.1779661,0.1641627,0.15467626,0.16949153,0.16236162,0.16949153,0.16718563]
14+
ax.plot(range(1,len(antlr)+1), antlr, label="antlr", marker='.', color='k')
15+
java_st = [0.165,0.08979592,0.09818482,0.07055961,0.069646284,0.06962025,0.06918239,0.0601626,0.06870229,0.06104901,0.06422764,0.06185567,0.054545455,0.056818184,0.05,0.05437352,0.06285073,0.055555556,0.058097314,0.059227467,0.058066282,0.05665236,0.054545455,0.053435113,0.047503047,0.043848965,0.05869446,0.046296295,0.05263158,0.053701613]
16+
ax.plot(range(1,len(java_st)+1), java_st, label="java_st", marker='s', color='g')
17+
java8_guava = [0.12016129,0.057759088,0.040557668,0.04054054,0.035587188,0.032967035,0.02566296,0.027541311,0.021957913,0.028037382,0.026829269,0.022167487,0.02031302,0.01875,0.023882898,0.02457956,0.021420518,0.020100502,0.016393442,0.024475524,0.015923567,0.022108844,0.019537276,0.019374724,0.021276595,0.020304568,0.016194332,0.018181818,0.017008504,0.0192604]
18+
ax.plot(range(1,len(java8_guava)+1), java8_guava, label="java8_guava", marker='d', color='c')
19+
java8_st = [0.14335664,0.09755333,0.081318684,0.083333336,0.07458564,0.067961164,0.06419753,0.07519641,0.049242426,0.067299396,0.056999687,0.0480456,0.04918033,0.06356968,0.054347824,0.06235566,0.061068702,0.051855896,0.05154639,0.060240965,0.051502146,0.059344552,0.056962024,0.054545455,0.061269145,0.05882353,0.046296295,0.0529595,0.062893085,0.056603774]
20+
ax.plot(range(1,len(java8_st)+1), java8_st, label="java8_st", marker='+', color='b')
21+
tsql = [0.23381294,0.18333334,0.1442786,0.14285715,0.13286713,0.14772727,0.13819095,0.13022113,0.11764706,0.110091746,0.11764706,0.11717496,0.11111111,0.10666667,0.10638298,0.10410959,0.10616438,0.11081081,0.110946745,0.09756097,0.1,0.101620026,0.1007371,0.102564104,0.09548313,0.10062241,0.09885932,0.09885932,0.1025788,0.097201765]
22+
ax.plot(range(1,len(tsql)+1), tsql, label="tsql", marker='', color='r')
23+
java_guava = [0.10330579,0.071428575,0.04477612,0.05532863,0.042372882,0.046865217,0.036753446,0.034443818,0.029850746,0.03514377,0.030805686,0.032786883,0.029288704,0.028169014,0.03409091,0.027760051,0.026697177,0.025316456,0.033333335,0.02254283,0.027272727,0.024691358,0.024647888,0.02567394,0.024539877,0.025817556,0.023809524,0.023255814,0.02587519,0.031927712]
24+
ax.plot(range(1,len(java_guava)+1), java_guava, label="java_guava", marker='>', color='m')
2225

2326
ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
24-
ax.set_xlabel("Number n of training files in sample subset corpus", fontsize=14)
25-
ax.set_ylabel("Median Error rate for 5 trials", fontsize=14)
26-
ax.set_title("Effect of Corpus size on Median Leave-one-out Validation Error Rate")
27+
ax.set_xlabel("Number of training files in sample corpus subset", fontsize=14)
28+
ax.set_ylabel("Median Error rate for 50 trials", fontsize=14)
29+
#ax.set_title("Effect of Corpus size on Median Leave-one-out Validation Error Rate")
2730
plt.legend()
2831
plt.tight_layout()
2932
fig.savefig('images/subset_validator.pdf', format='pdf')

src/org/antlr/codebuff/validation/GrammarInvariance.java

Lines changed: 96 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -8,69 +8,118 @@
88
import java.util.List;
99

1010
import static org.antlr.codebuff.Tool.JAVA8_DESCR;
11+
import static org.antlr.codebuff.Tool.JAVA8_GUAVA_DESCR;
1112
import static org.antlr.codebuff.Tool.JAVA_DESCR;
13+
import static org.antlr.codebuff.Tool.JAVA_GUAVA_DESCR;
1214
import static org.antlr.codebuff.Tool.SQLITE_CLEAN_DESCR;
1315
import static org.antlr.codebuff.Tool.TSQL_CLEAN_DESCR;
1416
import static org.antlr.codebuff.Tool.normalizedLevenshteinDistance;
1517

1618
public class GrammarInvariance {
1719
public static void main(String[] args) throws Exception {
18-
// SQL
19-
LeaveOneOutValidator sqliteValidator = new LeaveOneOutValidator(SQLITE_CLEAN_DESCR.corpusDir, SQLITE_CLEAN_DESCR);
20-
LeaveOneOutValidator tsqlValidator = new LeaveOneOutValidator(TSQL_CLEAN_DESCR.corpusDir, TSQL_CLEAN_DESCR);
21-
Triple<List<Formatter>, List<Float>, List<Float>> sqliteResults = sqliteValidator.validateDocuments(false, null);
22-
Triple<List<Formatter>, List<Float>, List<Float>> tsqlResults = tsqlValidator.validateDocuments(false, null);
23-
List<Formatter> sqliteFormatters = sqliteResults.a;
24-
List<Formatter> tsqlFormatters = tsqlResults.a;
20+
// Force single-threaded validation so each validator's results come back in order; the loops below compare the two formatter lists pairwise by index.
21+
LeaveOneOutValidator.FORCE_SINGLE_THREADED = true;
22+
float sql_median;
23+
float java_st_median;
24+
float java_guava_median;
25+
{
26+
// SQL
27+
LeaveOneOutValidator sqliteValidator =
28+
new LeaveOneOutValidator(SQLITE_CLEAN_DESCR.corpusDir, SQLITE_CLEAN_DESCR);
29+
LeaveOneOutValidator tsqlValidator =
30+
new LeaveOneOutValidator(TSQL_CLEAN_DESCR.corpusDir, TSQL_CLEAN_DESCR);
31+
Triple<List<Formatter>, List<Float>, List<Float>> sqliteResults =
32+
sqliteValidator.validateDocuments(false, null);
33+
Triple<List<Formatter>, List<Float>, List<Float>> tsqlResults =
34+
tsqlValidator.validateDocuments(false, null);
35+
List<Formatter> sqliteFormatters = sqliteResults.a;
36+
List<Formatter> tsqlFormatters = tsqlResults.a;
37+
38+
List<Float> distances = new ArrayList<>();
39+
for (int i = 0; i<sqliteFormatters.size(); i++) {
40+
Formatter sqlite = sqliteFormatters.get(i);
41+
Formatter tsql = tsqlFormatters.get(i);
42+
float editDistance = normalizedLevenshteinDistance(sqlite.getOutput(), tsql.getOutput());
43+
distances.add(editDistance);
44+
// System.out.println(sqlite.testDoc.fileName+" edit distance "+editDistance);
45+
}
2546

26-
List<Float> distances = new ArrayList<>();
27-
for (int i = 0; i<sqliteFormatters.size(); i++) {
28-
Formatter sqlite = sqliteFormatters.get(i);
29-
Formatter tsql = tsqlFormatters.get(i);
30-
float editDistance = normalizedLevenshteinDistance(sqlite.getOutput(), tsql.getOutput());
31-
distances.add(editDistance);
32-
System.out.println(sqlite.testDoc.fileName+" edit distance "+editDistance);
47+
{
48+
Collections.sort(distances);
49+
int n = distances.size();
50+
float min = distances.get(0);
51+
float quart = distances.get((int)(0.27*n));
52+
float median = distances.get(n/2);
53+
float quart3 = distances.get((int)(0.75*n));
54+
float max = distances.get(distances.size()-1);
55+
String display = "("+min+","+median+","+max+")";
56+
sql_median = median;
57+
}
3358
}
3459

3560
{
36-
Collections.sort(distances);
37-
int n = distances.size();
38-
float min = distances.get(0);
39-
float quart = distances.get((int)(0.27*n));
40-
float median = distances.get(n/2);
41-
float quart3 = distances.get((int)(0.75*n));
42-
float max = distances.get(distances.size()-1);
43-
String display = "("+min+","+median+","+max+")";
44-
System.out.println("SQLite vs TSQL edit distance info (min,median,max)="+display);
45-
}
61+
// JAVA
62+
List<Float> distances = new ArrayList<>();
63+
LeaveOneOutValidator javaValidator = new LeaveOneOutValidator(JAVA_DESCR.corpusDir, JAVA_DESCR);
64+
LeaveOneOutValidator java8Validator = new LeaveOneOutValidator(JAVA8_DESCR.corpusDir, JAVA8_DESCR);
65+
Triple<List<Formatter>, List<Float>, List<Float>> javaResults = javaValidator.validateDocuments(false, null);
66+
Triple<List<Formatter>, List<Float>, List<Float>> java8Results = java8Validator.validateDocuments(false, null);
67+
List<Formatter> javaFormatters = javaResults.a;
68+
List<Formatter> java8Formatters = java8Results.a;
4669

47-
// JAVA
48-
distances.clear();
49-
LeaveOneOutValidator javaValidator = new LeaveOneOutValidator(JAVA_DESCR.corpusDir, JAVA_DESCR);
50-
LeaveOneOutValidator java8Validator = new LeaveOneOutValidator(JAVA8_DESCR.corpusDir, JAVA8_DESCR);
51-
Triple<List<Formatter>, List<Float>, List<Float>> javaResults = javaValidator.validateDocuments(false, null);
52-
Triple<List<Formatter>, List<Float>, List<Float>> java8Results = java8Validator.validateDocuments(false, null);
53-
List<Formatter> javaFormatters = javaResults.a;
54-
List<Formatter> java8Formatters = java8Results.a;
70+
for (int i = 0; i<javaFormatters.size(); i++) {
71+
Formatter java = javaFormatters.get(i);
72+
Formatter java8 = java8Formatters.get(i);
73+
float editDistance = normalizedLevenshteinDistance(java.getOutput(), java8.getOutput());
74+
distances.add(editDistance);
75+
// System.out.println(java.testDoc.fileName+" edit distance "+editDistance);
76+
}
5577

56-
for (int i = 0; i<javaFormatters.size(); i++) {
57-
Formatter java = javaFormatters.get(i);
58-
Formatter java8 = java8Formatters.get(i);
59-
float editDistance = normalizedLevenshteinDistance(java.getOutput(), java8.getOutput());
60-
distances.add(editDistance);
61-
System.out.println(java.testDoc.fileName+" edit distance "+editDistance);
78+
{
79+
Collections.sort(distances);
80+
int n = distances.size();
81+
float min = distances.get(0);
82+
float quart = distances.get((int) (0.27*n));
83+
float median = distances.get(n/2);
84+
float quart3 = distances.get((int) (0.75*n));
85+
float max = distances.get(distances.size()-1);
86+
String display = "("+min+","+median+","+max+")";
87+
java_st_median = median;
88+
}
6289
}
6390

6491
{
65-
Collections.sort(distances);
66-
int n = distances.size();
67-
float min = distances.get(0);
68-
float quart = distances.get((int) (0.27*n));
69-
float median = distances.get(n/2);
70-
float quart3 = distances.get((int) (0.75*n));
71-
float max = distances.get(distances.size()-1);
72-
String display = "("+min+","+median+","+max+")";
73-
System.out.println("Java vs Java8 edit distance info (min,median,max)="+display);
92+
// JAVA GUAVA
93+
List<Float> distances = new ArrayList<>();
94+
LeaveOneOutValidator java_guavaValidator = new LeaveOneOutValidator(JAVA_GUAVA_DESCR.corpusDir, JAVA_GUAVA_DESCR);
95+
LeaveOneOutValidator java8_guavaValidator = new LeaveOneOutValidator(JAVA8_GUAVA_DESCR.corpusDir, JAVA8_GUAVA_DESCR);
96+
Triple<List<Formatter>, List<Float>, List<Float>> java_guavaResults = java_guavaValidator.validateDocuments(false, null);
97+
Triple<List<Formatter>, List<Float>, List<Float>> java8_guavaResults = java8_guavaValidator.validateDocuments(false, null);
98+
List<Formatter> java_guavaFormatters = java_guavaResults.a;
99+
List<Formatter> java8_guavaFormatters = java8_guavaResults.a;
100+
101+
for (int i = 0; i<java_guavaFormatters.size(); i++) {
102+
Formatter java_guava = java_guavaFormatters.get(i);
103+
Formatter java8_guava = java8_guavaFormatters.get(i);
104+
float editDistance = normalizedLevenshteinDistance(java_guava.getOutput(), java8_guava.getOutput());
105+
distances.add(editDistance);
106+
// System.out.println(java_guava.testDoc.fileName+" edit distance "+editDistance);
107+
}
108+
109+
{
110+
Collections.sort(distances);
111+
int n = distances.size();
112+
float min = distances.get(0);
113+
float quart = distances.get((int) (0.27*n));
114+
float median = distances.get(n/2);
115+
float quart3 = distances.get((int) (0.75*n));
116+
float max = distances.get(distances.size()-1);
117+
String display = "("+min+","+median+","+max+")";
118+
java_guava_median = median;
119+
}
74120
}
121+
System.out.println("clean SQLite vs TSQL edit distance info median="+sql_median);
122+
System.out.println("Java vs Java8 edit distance info median="+java_st_median);
123+
System.out.println("Java vs Java8 guava edit distance info median="+java_guava_median);
75124
}
76125
}

src/org/antlr/codebuff/validation/LeaveOneOutValidator.java

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,27 @@ public class LeaveOneOutValidator {
3737
public static final int DOCLIST_RANDOM_SEED = 951413; // need randomness but use same seed to get reproducibility
3838
final Random random = new Random();
3939

40-
public static final boolean FORCE_SINGLE_THREADED = false;
40+
public static boolean FORCE_SINGLE_THREADED = false;
41+
42+
public static final Map<String,String> nameToGraphMarker = new HashMap<String,String>() {{
43+
put("antlr", ".");
44+
put("java_st", "s");
45+
put("java8_st", "+");
46+
put("java_guava", ">");
47+
put("java8_guava", "d");
48+
put("sqlite", "o");
49+
put("tsql", "p"); // key must be "tsql" (same key as nameToGraphColor) so marker lookups by language name succeed
50+
}};
51+
52+
public static final Map<String,String> nameToGraphColor = new HashMap<String,String>() {{
53+
put("antlr", "k");
54+
put("java_st", "g");
55+
put("java8_st", "b");
56+
put("java_guava", "m");
57+
put("java8_guava", "c");
58+
put("sqlite", "y");
59+
put("tsql", "r");
60+
}};
4161

4262
public String rootDir;
4363
public LangDescriptor language;

src/org/antlr/codebuff/validation/SubsetValidator.java

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@
2424

2525
import static org.antlr.codebuff.Tool.ANTLR4_DESCR;
2626
import static org.antlr.codebuff.Tool.JAVA8_DESCR;
27+
import static org.antlr.codebuff.Tool.JAVA8_GUAVA_DESCR;
2728
import static org.antlr.codebuff.Tool.JAVA_DESCR;
29+
import static org.antlr.codebuff.Tool.JAVA_GUAVA_DESCR;
2830
import static org.antlr.codebuff.Tool.SQLITE_CLEAN_DESCR;
2931
import static org.antlr.codebuff.Tool.TSQL_CLEAN_DESCR;
3032
import static org.antlr.codebuff.Tool.getFilenames;
@@ -61,13 +63,13 @@ public static void main(String[] args) throws Exception {
6163
ANTLR4_DESCR,
6264
JAVA_DESCR,
6365
JAVA8_DESCR,
64-
// SQLITE_NOISY_DESCR,
66+
JAVA_GUAVA_DESCR,
67+
JAVA8_GUAVA_DESCR,
6568
SQLITE_CLEAN_DESCR,
66-
// TSQL_NOISY_DESCR,
6769
TSQL_CLEAN_DESCR,
6870
};
6971

70-
int maxNumFiles = 20;
72+
int maxNumFiles = 30;
7173
int trials = 50;
7274
Map<String,float[]> results = new HashMap<>();
7375
for (LangDescriptor language : languages) {
@@ -87,10 +89,10 @@ public static void main(String[] args) throws Exception {
8789
"sizes = range(1,N+1)\n" +
8890
"<results:{r |\n" +
8991
"<r> = [<rest(results.(r)); separator={,}>]\n"+
90-
"ax.plot(range(1,len(<r>)+1), <r>, label=\"<r>\", marker='o')\n" +
92+
"ax.plot(range(1,len(<r>)+1), <r>, label=\"<r>\", marker='<markers.(r)>', color='<colors.(r)>')\n" +
9193
"}>\n" +
9294
"ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)\n" +
93-
"ax.set_xlabel(\"Number n of training files in sample subset corpus\", fontsize=14)\n"+
95+
"ax.set_xlabel(\"Number of training files in sample corpus subset\", fontsize=14)\n"+
9496
"ax.set_ylabel(\"Median Error rate for <trials> trials\", fontsize=14)\n" +
9597
"ax.set_title(\"Effect of Corpus size on Median Leave-one-out Validation Error Rate\")\n"+
9698
"plt.legend()\n" +
@@ -99,6 +101,8 @@ public static void main(String[] args) throws Exception {
99101
"plt.show()\n";
100102
ST pythonST = new ST(python);
101103
pythonST.add("results", results);
104+
pythonST.add("markers", LeaveOneOutValidator.nameToGraphMarker);
105+
pythonST.add("colors", LeaveOneOutValidator.nameToGraphColor);
102106
pythonST.add("version", version);
103107
pythonST.add("date", new Date());
104108
pythonST.add("trials", trials);

src/org/antlr/codebuff/validation/TestK.java

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,9 @@
2222

2323
import static org.antlr.codebuff.Tool.ANTLR4_DESCR;
2424
import static org.antlr.codebuff.Tool.JAVA8_DESCR;
25+
import static org.antlr.codebuff.Tool.JAVA8_GUAVA_DESCR;
2526
import static org.antlr.codebuff.Tool.JAVA_DESCR;
26-
import static org.antlr.codebuff.Tool.QUORUM_DESCR;
27+
import static org.antlr.codebuff.Tool.JAVA_GUAVA_DESCR;
2728
import static org.antlr.codebuff.Tool.SQLITE_CLEAN_DESCR;
2829
import static org.antlr.codebuff.Tool.TSQL_CLEAN_DESCR;
2930
import static org.antlr.codebuff.Tool.getFilenames;
@@ -40,9 +41,10 @@ public TestK(String rootDir, LangDescriptor language, int k) {
4041

4142
public static void main(String[] args) throws Exception {
4243
LangDescriptor[] languages = new LangDescriptor[] {
43-
QUORUM_DESCR,
4444
JAVA_DESCR,
4545
JAVA8_DESCR,
46+
JAVA_GUAVA_DESCR,
47+
JAVA8_GUAVA_DESCR,
4648
ANTLR4_DESCR,
4749
SQLITE_CLEAN_DESCR,
4850
TSQL_CLEAN_DESCR,
@@ -105,7 +107,10 @@ public static void writePython(LangDescriptor[] languages, List<Integer> ks, Flo
105107
LangDescriptor language = languages[i];
106108
List<Float> filteredMedians = BuffUtils.filter(Arrays.asList(medians[i]), m -> m!=null);
107109
data.append(language.name+'='+filteredMedians+'\n');
108-
plot.append(String.format("ax.plot(ks, %s, label=\"%s\", marker='o')\n", language.name, language.name));
110+
plot.append(String.format("ax.plot(ks, %s, label=\"%s\", marker='%s', color='%s')\n",
111+
language.name, language.name,
112+
nameToGraphMarker.get(language.name),
113+
nameToGraphColor.get(language.name)));
109114
}
110115

111116
String python =

0 commit comments

Comments
 (0)