Skip to content

Commit 7e493ee

Browse files
authored
On-disk index cache for the Grid benchmark harness (#612)
* Initial implementation of index cache for Bench / Grid.
* Initial implementation of index cache for Bench / Grid.
* Uncommented datasets that were not working prior to PR #613.
* Improved exception handling. Added alpha to key signature. Cached indexes are deleted lazily. Index cache marked Experimental.
* Integrated index caching into AutoBenchYAML and runAllAndCollectResults. useSavedIndexIfExists is No by default. Added refineFinalGraph to construction parameters.
1 parent e263cc8 commit 7e493ee

14 files changed

Lines changed: 661 additions & 147 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@ local/
66

77
### Bench caches
88
pq_cache/
9+
index_cache/
910

1011
### JVM crashes
1112
hs_err_pid*

jvector-base/src/main/java/io/github/jbellis/jvector/quantization/ProductQuantization.java

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -733,11 +733,12 @@ public long ramBytesUsed() {
733733
@Override
734734
public String toString() {
735735
if (anisotropicThreshold == UNWEIGHTED) {
736-
return String.format("ProductQuantization(M=%d, clusters=%d)", M, clusterCount);
736+
return String.format("ProductQuantization(M=%d, clusters=%d, centered=%s)", M, clusterCount, globalCentroid != null);
737737
}
738-
return String.format("ProductQuantization(M=%d, clusters=%d, T=%.3f, eta=%.1f)",
738+
return String.format("ProductQuantization(M=%d, clusters=%d, centered=%s, anisotropicT=%.3f, eta=%.1f)",
739739
M,
740740
clusterCount,
741+
globalCentroid != null,
741742
anisotropicThreshold,
742743
KMeansPlusPlusClusterer.computeParallelCostMultiplier(anisotropicThreshold, originalDimension));
743744
}

jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -153,11 +153,13 @@ public static void main(String[] args) throws IOException {
153153
}
154154
logger.info("Using configuration: {}", config);
155155

156-
List<BenchResult> datasetResults = Grid.runAllAndCollectResults(ds,
156+
List<BenchResult> datasetResults = Grid.runAllAndCollectResults(ds,
157+
config.construction.useSavedIndexIfExists,
157158
config.construction.outDegree,
158159
config.construction.efConstruction,
159160
config.construction.neighborOverflow,
160161
config.construction.addHierarchy,
162+
config.construction.refineFinalGraph,
161163
config.construction.getFeatureSets(),
162164
config.construction.getCompressorParameters(),
163165
config.search.getCompressorParameters(),

jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -25,10 +25,7 @@
2525
import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
2626

2727
import java.io.IOException;
28-
import java.util.Arrays;
29-
import java.util.EnumSet;
30-
import java.util.List;
31-
import java.util.Map;
28+
import java.util.*;
3229
import java.util.function.Function;
3330
import java.util.regex.Pattern;
3431
import java.util.stream.Collectors;
@@ -42,6 +39,10 @@ public class Bench {
4239
public static void main(String[] args) throws IOException {
4340
System.out.println("Heap space available is " + Runtime.getRuntime().maxMemory());
4441

42+
// When enabled, caches built indices for reuse in future runs.
43+
// Useful for large indexes and repeated testing.
44+
boolean enableIndexCache = false;
45+
4546
var mGrid = List.of(32); // List.of(16, 24, 32, 48, 64, 96, 128);
4647
var efConstructionGrid = List.of(100); // List.of(60, 80, 100, 120, 160, 200, 400, 600, 800);
4748
var topKGrid = Map.of(
@@ -81,10 +82,10 @@ public static void main(String[] args) throws IOException {
8182
// compile regex and do substring matching using find
8283
var pattern = Pattern.compile(regex);
8384

84-
execute(pattern, buildCompression, featureSets, searchCompression, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, topKGrid, usePruningGrid);
85+
execute(pattern, enableIndexCache, buildCompression, featureSets, searchCompression, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, topKGrid, usePruningGrid);
8586
}
8687

87-
private static void execute(Pattern pattern, List<Function<DataSet, CompressorParameters>> buildCompression, List<EnumSet<FeatureId>> featureSets, List<Function<DataSet, CompressorParameters>> compressionGrid, List<Integer> mGrid, List<Integer> efConstructionGrid, List<Float> neighborOverflowGrid, List<Boolean> addHierarchyGrid, List<Boolean> refineFinalGraphGrid, Map<Integer, List<Double>> topKGrid, List<Boolean> usePruningGrid) throws IOException {
88+
private static void execute(Pattern pattern, boolean enableIndexCache, List<Function<DataSet, CompressorParameters>> buildCompression, List<EnumSet<FeatureId>> featureSets, List<Function<DataSet, CompressorParameters>> compressionGrid, List<Integer> mGrid, List<Integer> efConstructionGrid, List<Float> neighborOverflowGrid, List<Boolean> addHierarchyGrid, List<Boolean> refineFinalGraphGrid, Map<Integer, List<Double>> topKGrid, List<Boolean> usePruningGrid) throws IOException {
8889
var datasetCollection = DatasetCollection.load();
8990
var datasetNames = datasetCollection.getAll().stream().filter(dn -> pattern.matcher(dn).find()).collect(Collectors.toList());
9091
System.out.println("Executing the following datasets: " + datasetNames);
@@ -93,7 +94,7 @@ private static void execute(Pattern pattern, List<Function<DataSet, CompressorPa
9394
DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow(
9495
() -> new RuntimeException("Dataset " + datasetName + " not found")
9596
);
96-
Grid.runAll(ds, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, featureSets, buildCompression, compressionGrid, topKGrid, usePruningGrid);
97+
Grid.runAll(ds, enableIndexCache, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, featureSets, buildCompression, compressionGrid, topKGrid, usePruningGrid, null);
9798
}
9899
}
99100
}

jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -94,7 +94,7 @@ public static void main(String[] args) throws IOException {
9494
() -> new RuntimeException("Could not load dataset:" + datasetName)
9595
);
9696

97-
Grid.runAll(ds, config.construction.outDegree, config.construction.efConstruction,
97+
Grid.runAll(ds, config.construction.useSavedIndexIfExists, config.construction.outDegree, config.construction.efConstruction,
9898
config.construction.neighborOverflow, config.construction.addHierarchy, config.construction.refineFinalGraph,
9999
config.construction.getFeatureSets(), config.construction.getCompressorParameters(),
100100
config.search.getCompressorParameters(), config.search.topKOverquery, config.search.useSearchPruning, config.search.benchmarks);

0 commit comments

Comments
 (0)