` - Add profiler
36 |
37 |
38 | 2. Focus on specific benchmarks
39 |
40 | For example, the commands below build the project and then run only `IndexConstructionWithRandomSetBenchmark`:
41 | ```shell
42 | mvn clean install -DskipTests=true
43 | BENCHMARK_NAME="IndexConstructionWithRandomSetBenchmark"
44 | java --enable-native-access=ALL-UNNAMED \
45 | --add-modules=jdk.incubator.vector \
46 | -XX:+HeapDumpOnOutOfMemoryError \
47 | -Xmx20G -Djvector.experimental.enable_native_vectorization=true \
48 | -jar benchmarks-jmh/target/benchmarks-jmh-4.0.0-beta.3-SNAPSHOT.jar $BENCHMARK_NAME
49 | ```
50 |
51 | If you want to rerun a specific benchmark without testing the entire grid of scenarios defined in the benchmark,
52 | you can pin individual parameters with `-p`. For example, to set M and beamWidth:
53 | ```shell
54 | java -jar benchmarks-jmh/target/benchmarks-jmh-4.0.0-beta.3-SNAPSHOT.jar IndexConstructionWithStaticSetBenchmark -p M=32 -p beamWidth=100
55 | ```
56 |
57 |
58 |
59 |
--------------------------------------------------------------------------------
/benchmarks-jmh/scripts/test_node_setup.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | ###### Script for test node setup ######
3 | #
4 | # Provisions a fresh Debian/Ubuntu x64 node: installs JDK 22, Maven and Git,
5 | # clones and builds jvector, then runs the JMH benchmark jar.
6 | # Requires sudo and network access. Not idempotent: re-running in the same
7 | # directory fails at `git clone` because ./jvector already exists.
8 |
9 | # Stop at the first failing step so later steps never run against a
10 | # half-installed toolchain or from the wrong directory.
11 | set -euo pipefail
12 |
13 | readonly JDK_VERSION="22.0.2"
14 | readonly JDK_ARCHIVE="openjdk-${JDK_VERSION}_linux-x64_bin.tar.gz"
15 | readonly JDK_URL="https://download.java.net/java/GA/jdk${JDK_VERSION}/c9ecb94cd31b495da20a27d4581645e8/9/GPL/${JDK_ARCHIVE}"
16 | readonly JDK_HOME="/usr/lib/jvm/jdk-${JDK_VERSION}"
17 | # The jar is built by the benchmarks-jmh module, so it lives under that module's
18 | # target directory (path is relative to the jvector checkout root).
19 | readonly BENCHMARK_JAR="benchmarks-jmh/target/benchmarks-jmh-4.0.0-beta.3-SNAPSHOT.jar"
20 |
21 | sudo apt-get update
22 |
23 | # Download JDK 22
24 | wget "${JDK_URL}"
25 |
26 | # Extract JDK 22
27 | tar -xzf "${JDK_ARCHIVE}"
28 |
29 | sudo mkdir -p /usr/lib/jvm
30 | sudo mv "jdk-${JDK_VERSION}" "${JDK_HOME}"
31 |
32 | ########################################
33 | # Setup Alternatives
34 | ########################################
35 | sudo update-alternatives --install "/usr/bin/java" "java" "${JDK_HOME}/bin/java" 1
36 | sudo update-alternatives --install "/usr/bin/javac" "javac" "${JDK_HOME}/bin/javac" 1
37 |
38 | # Priority 1 can be outranked by a distro JDK (e.g. one pulled in alongside
39 | # Maven), so also pin this shell to JDK 22 for the build and the benchmark run.
40 | export JAVA_HOME="${JDK_HOME}"
41 | export PATH="${JAVA_HOME}/bin:${PATH}"
42 |
43 | ########################################
44 | # Verification
45 | ########################################
46 |
47 | echo
48 | echo "Installation complete. Current default Java version:"
49 | java -version
50 |
51 | # Install Maven
52 | sudo apt-get install maven -y
53 |
54 | # Install Git
55 | sudo apt-get install git -y
56 |
57 | # clone jvector
58 | git clone https://github.com/datastax/jvector.git
59 |
60 | # Build jvector
61 | cd jvector
62 | mvn clean install -DskipTests=true
63 |
64 | # Run benchmarks
65 | java --enable-native-access=ALL-UNNAMED \
66 | --add-modules=jdk.incubator.vector \
67 | -XX:+HeapDumpOnOutOfMemoryError \
68 | -Xmx14G -Djvector.experimental.enable_native_vectorization=true \
69 | -jar "${BENCHMARK_JAR}"
70 |
71 |
--------------------------------------------------------------------------------
/benchmarks-jmh/src/main/resources/log4j2.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/jvector-base/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
6 | 4.0.0
7 |
8 | io.github.jbellis
9 | jvector-parent
10 | ${revision}
11 |
12 | jvector-base
13 | Base
14 |
15 |
16 |
17 |
18 | org.apache.rat
19 | apache-rat-plugin
20 |
21 | ${project.parent.basedir}/rat-excludes.txt
22 |
23 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/annotations/Experimental.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package io.github.jbellis.jvector.annotations;
17 |
18 | /**
19 | * Indicates that an API is experimental and may change or be removed in future releases
20 | * with no prior notice. No {@code @Target} or {@code @Retention} is declared, so the Java defaults apply.
21 | */
22 | public @interface Experimental {
23 | }
24 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/annotations/VisibleForTesting.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.annotations;
18 |
19 | /**
20 | * Methods or classes marked VisibleForTesting are intended for internal use only
21 | * and may change without warning, regardless of their visibility. Marker annotation: it declares no elements.
22 | */
23 | public @interface VisibleForTesting {
24 | }
25 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/disk/ByteBufferReader.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.disk;
18 |
19 | import java.io.IOException;
20 | import java.nio.ByteBuffer;
21 |
22 | /**
23 | * {@link RandomAccessReader} that reads from a {@link ByteBuffer}; every read starts at the buffer's current position and advances it.
24 | */
25 | public class ByteBufferReader implements RandomAccessReader {
26 | protected final ByteBuffer bb; // backing buffer; its position doubles as this reader's cursor
27 |
28 | public ByteBufferReader(ByteBuffer sourceBB) { // keeps the given reference as-is (no copy, no duplicate)
29 | bb = sourceBB;
30 | }
31 |
32 | @Override
33 | public void seek(long offset) { // absolute reposition; Math.toIntExact throws ArithmeticException if offset exceeds int range
34 | bb.position(Math.toIntExact(offset));
35 | }
36 |
37 | @Override
38 | public long getPosition() { // current buffer position, widened to long
39 | return bb.position();
40 | }
41 |
42 | @Override
43 | public void readFully(float[] buffer) { // fills the whole array, one relative getFloat() per element
44 | for (int i = 0; i < buffer.length; i++) {
45 | buffer[i] = bb.getFloat();
46 | }
47 | }
48 |
49 | @Override
50 | public void readFully(byte[] b) { // bulk relative get of b.length bytes
51 | bb.get(b);
52 | }
53 |
54 | @Override
55 | public void readFully(ByteBuffer buffer) { // copies buffer.remaining() bytes from the current position into buffer
56 | // slice bb from the current position and cap the slice at buffer.remaining() bytes
57 | var slice = bb.slice();
58 | var remaining = buffer.remaining();
59 | slice.limit(remaining);
60 | buffer.put(slice);
61 | bb.position(bb.position() + remaining); // the slice has its own position, so advance bb explicitly
62 | }
63 |
64 | @Override
65 | public void readFully(long[] vector) { // fills the whole array, one relative getLong() per element
66 | for (int i = 0; i < vector.length; i++) {
67 | vector[i] = bb.getLong();
68 | }
69 | }
70 |
71 | @Override
72 | public int readInt() { // reads 4 bytes in the buffer's current byte order
73 | return bb.getInt();
74 | }
75 |
76 | @Override
77 | public long readLong() { // reads 8 bytes in the buffer's current byte order
78 | return bb.getLong();
79 | }
80 |
81 | @Override
82 | public float readFloat() { // reads 4 bytes in the buffer's current byte order
83 | return bb.getFloat();
84 | }
85 |
86 | @Override
87 | public void read(int[] ints, int offset, int count) { // writes count ints into ints[offset .. offset + count - 1]
88 | for (int i = 0; i < count; i++) {
89 | ints[offset + i] = bb.getInt();
90 | }
91 | }
92 |
93 | @Override
94 | public void read(float[] floats, int offset, int count) { // writes count floats into floats[offset .. offset + count - 1]
95 | for (int i = 0; i < count; i++) {
96 | floats[offset + i] = bb.getFloat();
97 | }
98 | }
99 |
100 | @Override
101 | public void close() { // no-op: this class does nothing to release the buffer
102 | }
103 | }
104 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/disk/RandomAccessReader.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.disk;
18 |
19 | import java.io.IOException;
20 | import java.nio.ByteBuffer;
21 |
22 | /**
23 | * This is a subset of DataInput, plus seek and readFully methods, which allows implementations
24 | * to use more efficient options like FloatBuffer for bulk reads.
25 | *
26 | * JVector includes production-ready implementations; the recommended way to use these is via
27 | * `ReaderSupplierFactory.open`. For custom implementations, e.g. reading from network storage,
28 | * you should also implement a corresponding `ReaderSupplier`.
29 | *
30 | * The general usage pattern is expected to be "seek to a position, then read sequentially from there."
31 | * Thus, RandomAccessReader implementations are expected to be stateful and NOT threadsafe; JVector
32 | * uses the ReaderSupplier API to create a RandomAccessReader per thread, as needed.
33 | */
34 | public interface RandomAccessReader extends AutoCloseable {
35 | void seek(long offset) throws IOException; // moves the read cursor to the given offset
36 |
37 | long getPosition() throws IOException; // current read cursor
38 |
39 | int readInt() throws IOException;
40 |
41 | float readFloat() throws IOException;
42 |
43 | long readLong() throws IOException;
44 |
45 | void readFully(byte[] bytes) throws IOException;
46 |
47 | void readFully(ByteBuffer buffer) throws IOException;
48 |
49 | default void readFully(float[] floats) throws IOException { // convenience: delegates to read(floats, 0, floats.length)
50 | read(floats, 0, floats.length);
51 | }
52 |
53 | void readFully(long[] vector) throws IOException;
54 |
55 | void read(int[] ints, int offset, int count) throws IOException; // offset/count index into the destination array
56 |
57 | void read(float[] floats, int offset, int count) throws IOException; // offset/count index into the destination array
58 |
59 | void close() throws IOException; // narrows AutoCloseable.close() from Exception to IOException
60 | }
61 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/disk/RandomAccessWriter.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.disk;
18 |
19 | import java.io.Closeable;
20 | import java.io.DataOutput;
21 | import java.io.IOException;
22 |
23 | /**
24 | * A DataOutput that adds methods for random access writes: seek/position, an explicit flush, and a range checksum.
25 | */
26 | public interface RandomAccessWriter extends DataOutput, Closeable {
27 | void seek(long position) throws IOException; // moves the write cursor to the given position
28 |
29 | long position() throws IOException; // current write cursor
30 |
31 | void flush() throws IOException;
32 |
33 | long checksum(long startOffset, long endOffset) throws IOException; // NOTE(review): presumably covers the bytes between the two offsets; algorithm and bound inclusivity are not visible here - confirm against implementations
34 | }
35 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/disk/ReaderSupplier.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.disk;
18 |
19 | import java.io.IOException;
20 |
21 | /**
22 | * A supplier of RandomAccessReaders. Closing the supplier is a no-op unless an implementation overrides {@link #close()}.
23 | */
24 | public interface ReaderSupplier extends AutoCloseable {
25 | /**
26 | * @return a new reader. It is up to the caller to re-use these readers or close them,
27 | * the ReaderSupplier is not responsible for caching them.
28 | */
29 | RandomAccessReader get() throws IOException;
30 |
31 | default void close() throws IOException { // default does nothing; narrows AutoCloseable.close() to IOException
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/disk/ReaderSupplierFactory.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package io.github.jbellis.jvector.disk;
17 |
18 | import java.io.IOException;
19 | import java.lang.reflect.Constructor;
20 | import java.nio.file.Files;
21 | import java.nio.file.Path;
22 | import java.util.logging.Level;
23 | import java.util.logging.Logger;
24 |
25 | public class ReaderSupplierFactory { // picks a ReaderSupplier implementation: MemorySegmentReader, then MMapReader, then SimpleMappedReader
26 | private static final Logger LOG = Logger.getLogger(ReaderSupplierFactory.class.getName());
27 | private static final String MEMORY_SEGMENT_READER_CLASSNAME = "io.github.jbellis.jvector.disk.MemorySegmentReader$Supplier"; // resolved reflectively, so it may be absent at runtime
28 | private static final String MMAP_READER_CLASSNAME = "io.github.jbellis.jvector.example.util.MMapReader$Supplier"; // resolved reflectively, so it may be absent at runtime
29 |
30 | public static ReaderSupplier open(Path path) throws IOException { // returns the first supplier that can be constructed for path
31 | try {
32 | // prefer MemorySegmentReader (available under JDK 20+)
33 | var supplierClass = Class.forName(MEMORY_SEGMENT_READER_CLASSNAME);
34 | Constructor> ctor = supplierClass.getConstructor(Path.class);
35 | return (ReaderSupplier) ctor.newInstance(path);
36 | } catch (Exception e) { // any failure (class missing, constructor threw, ...) is logged and triggers the next fallback
37 | LOG.log(Level.WARNING, "MemorySegmentReaderSupplier not available, falling back to MMapReaderSupplier. Reason: {0}: {1}",
38 | new Object[]{e.getClass().getName(), e.getMessage()});
39 | }
40 |
41 | try {
42 | // fall back to MMapReader (requires a 3rd party linux-only native mmap library that is only included
43 | // in the build with jvector-example; this allows Bench to not embarrass us on older JDKs)
44 | var supplierClass = Class.forName(MMAP_READER_CLASSNAME);
45 | Constructor> ctor = supplierClass.getConstructor(Path.class);
46 | return (ReaderSupplier) ctor.newInstance(path);
47 | } catch (Exception e) { // the last fallback is built inside this handler; the stack trace is only logged at FINE
48 | LOG.log(Level.WARNING, "MMapReaderSupplier not available, falling back to SimpleMappedReaderSupplier. More details available at level FINE.");
49 | LOG.log(Level.FINE, "MMapReaderSupplier instantiation exception:", e);
50 | if (Files.size(path) > Integer.MAX_VALUE) { // fail early: the remaining fallback cannot handle files this large
51 | throw new RuntimeException("File sizes greater than 2GB are not supported on older Windows JDKs");
52 | }
53 |
54 | // finally, fall back to SimpleMappedReader (available everywhere, but doesn't support files > 2GB)
55 | return new SimpleMappedReader.Supplier(path);
56 | }
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/disk/SimpleMappedReader.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.disk;
18 |
19 | import sun.misc.Unsafe;
20 |
21 | import java.io.IOException;
22 | import java.io.RandomAccessFile;
23 | import java.lang.reflect.Field;
24 | import java.nio.MappedByteBuffer;
25 | import java.nio.channels.FileChannel;
26 | import java.nio.file.Path;
27 | import java.util.logging.Logger;
28 |
29 | /**
30 | * Simple sample implementation of RandomAccessReader, backed by a read-only memory mapping of the whole file.
31 | * It provides a bare minimum to run against disk in reasonable time.
32 | * Does not handle files above 2 GB. The nested {@link Supplier} owns the mapping; readers are duplicates of it.
33 | */
34 | public class SimpleMappedReader extends ByteBufferReader {
35 | private static final Logger LOG = Logger.getLogger(SimpleMappedReader.class.getName());
36 |
37 | private static Unsafe getUnsafe() { // reflectively fetches the Unsafe singleton; returns null (after a warning) on any failure
38 | try {
39 | Field f = Unsafe.class.getDeclaredField("theUnsafe");
40 | f.setAccessible(true);
41 | return (Unsafe) f.get(null);
42 | } catch (Exception e) {
43 | LOG.warning("MappedRandomAccessReader can't acquire needed Unsafe access"); // NOTE(review): the message names MappedRandomAccessReader, not this class - looks like a former name
44 | return null;
45 | }
46 | }
47 |
48 |
49 | SimpleMappedReader(MappedByteBuffer mbb) { // package-private: instances are created via Supplier.get()
50 | super(mbb);
51 | }
52 |
53 | @Override
54 | public void close() {
55 | // Individual readers don't close anything; the mapping is released by Supplier.close()
56 | }
57 |
58 | public static class Supplier implements ReaderSupplier {
59 | private final MappedByteBuffer buffer; // the single mapping shared by every reader this supplier hands out
60 | private static final Unsafe unsafe = getUnsafe(); // null when Unsafe is unavailable; close() then does nothing
61 |
62 | public Supplier(Path path) throws IOException { // maps the whole file read-only; throws RuntimeException for files above Integer.MAX_VALUE bytes
63 | try (var raf = new RandomAccessFile(path.toString(), "r")) {
64 | if (raf.length() > Integer.MAX_VALUE) {
65 | throw new RuntimeException("SimpleMappedReader doesn't support files above 2GB");
66 | }
67 | this.buffer = raf.getChannel().map(FileChannel.MapMode.READ_ONLY, 0, raf.length()); // the mapping stays valid after raf is closed
68 | this.buffer.load(); // best-effort request to bring the mapped content into physical memory
69 | }
70 | }
71 |
72 | @Override
73 | public SimpleMappedReader get() { // duplicate() shares the content but gives each reader its own position
74 | return new SimpleMappedReader((MappedByteBuffer) buffer.duplicate());
75 | }
76 |
77 | @Override
78 | public void close() { // releases the mapping via Unsafe; readers from get() share it and must not be used afterwards
79 | if (unsafe != null) {
80 | try {
81 | unsafe.invokeCleaner(buffer);
82 | } catch (IllegalArgumentException e) {
83 | // empty catch, this was a duplicated/indirect buffer or
84 | // otherwise not cleanable
85 | }
86 | }
87 | }
88 | }
89 | }
90 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/exceptions/ThreadInterruptedException.java:
--------------------------------------------------------------------------------
1 | /*
2 | * All changes to the original code are Copyright DataStax, Inc.
3 | *
4 | * Please see the included license file for details.
5 | */
6 |
7 | /*
8 | * Original license:
9 | * Licensed to the Apache Software Foundation (ASF) under one or more
10 | * contributor license agreements. See the NOTICE file distributed with
11 | * this work for additional information regarding copyright ownership.
12 | * The ASF licenses this file to You under the Apache License, Version 2.0
13 | * (the "License"); you may not use this file except in compliance with
14 | * the License. You may obtain a copy of the License at
15 | *
16 | * http://www.apache.org/licenses/LICENSE-2.0
17 | *
18 | * Unless required by applicable law or agreed to in writing, software
19 | * distributed under the License is distributed on an "AS IS" BASIS,
20 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21 | * See the License for the specific language governing permissions and
22 | * limitations under the License.
23 | */
24 |
25 | package io.github.jbellis.jvector.exceptions;
26 |
27 | public final class ThreadInterruptedException extends RuntimeException { // unchecked wrapper for InterruptedException
28 | public ThreadInterruptedException(InterruptedException ie) { // keeps ie as the cause; does not touch the thread's interrupt flag
29 | super(ie);
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/graph/ListRandomAccessVectorValues.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.graph;
18 |
19 | import io.github.jbellis.jvector.vector.types.VectorFloat;
20 |
21 | import java.util.List;
22 |
23 | /**
24 | * A List-backed implementation of the {@link RandomAccessVectorValues} interface; ordinals are list indexes.
25 | *
26 | * It is acceptable to provide this class to a GraphBuilder, and then continue
27 | * to add vectors to the backing List as you add to the graph.
28 | *
29 | * This will be as threadsafe as the provided List.
30 | */
31 | public class ListRandomAccessVectorValues implements RandomAccessVectorValues {
32 | private final List> vectors;
33 | private final int dimension;
34 |
35 | /**
36 | * Construct a new instance of {@link ListRandomAccessVectorValues}. The list is stored by reference, not copied.
37 | *
38 | * @param vectors a (potentially mutable) list of float vectors.
39 | * @param dimension the dimension of the vectors; stored as given and not checked against the list contents.
40 | */
41 | public ListRandomAccessVectorValues(List> vectors, int dimension) {
42 | this.vectors = vectors;
43 | this.dimension = dimension;
44 | }
45 |
46 | @Override
47 | public int size() { // live size of the backing list, so it grows as vectors are appended
48 | return vectors.size();
49 | }
50 |
51 | @Override
52 | public int dimension() {
53 | return dimension;
54 | }
55 |
56 | @Override
57 | public VectorFloat> getVector(int targetOrd) { // returns the stored element itself, not a copy
58 | return vectors.get(targetOrd);
59 | }
60 |
61 | @Override
62 | public boolean isValueShared() { // always false for this implementation
63 | return false;
64 | }
65 |
66 | @Override
67 | public ListRandomAccessVectorValues copy() { // returns this same instance rather than a new object
68 | return this;
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/graph/MapRandomAccessVectorValues.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.graph;
18 |
19 | import io.github.jbellis.jvector.vector.types.VectorFloat;
20 |
21 | import java.util.Map;
22 |
23 | /**
24 | * RandomAccessValues backed by a Map. This can be more useful than `ListRandomAccessVectorValues`
25 | * for handling concurrent inserts.
26 | *
27 | * It is acceptable to provide this class to a GraphBuilder, and then continue
28 | * to add vectors to the backing Map as you add to the graph.
29 | *
30 | * This will be as threadsafe as the provided Map.
31 | */
32 | public class MapRandomAccessVectorValues implements RandomAccessVectorValues {
33 | private final Map> map;
34 | private final int dimension;
35 |
36 | public MapRandomAccessVectorValues(Map> map, int dimension) { // stores the map by reference; dimension is not checked against its contents
37 | this.map = map;
38 | this.dimension = dimension;
39 | }
40 |
41 | @Override
42 | public int size() { // number of entries currently in the map, not the highest node id
43 | return map.size();
44 | }
45 |
46 | @Override
47 | public int dimension() {
48 | return dimension;
49 | }
50 |
51 | @Override
52 | public VectorFloat> getVector(int nodeId) { // plain Map.get lookup, so an absent nodeId yields whatever the map returns (typically null)
53 | return map.get(nodeId);
54 | }
55 |
56 | @Override
57 | public boolean isValueShared() { // always false for this implementation
58 | return false;
59 | }
60 |
61 | @Override
62 | public RandomAccessVectorValues copy() { // returns this same instance rather than a new object
63 | return this;
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/graph/NodesIterator.java:
--------------------------------------------------------------------------------
1 | /*
2 | * All changes to the original code are Copyright DataStax, Inc.
3 | *
4 | * Please see the included license file for details.
5 | */
6 |
7 | /*
8 | * Original license:
9 | * Licensed to the Apache Software Foundation (ASF) under one or more
10 | * contributor license agreements. See the NOTICE file distributed with
11 | * this work for additional information regarding copyright ownership.
12 | * The ASF licenses this file to You under the Apache License, Version 2.0
13 | * (the "License"); you may not use this file except in compliance with
14 | * the License. You may obtain a copy of the License at
15 | *
16 | * http://www.apache.org/licenses/LICENSE-2.0
17 | *
18 | * Unless required by applicable law or agreed to in writing, software
19 | * distributed under the License is distributed on an "AS IS" BASIS,
20 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21 | * See the License for the specific language governing permissions and
22 | * limitations under the License.
23 | */
24 |
25 | package io.github.jbellis.jvector.graph;
26 |
27 | import java.util.NoSuchElementException;
28 | import java.util.PrimitiveIterator;
29 |
30 | /**
31 | * Iterator over graph nodes that includes the size -- the total
32 | * number of nodes to be iterated over. The nodes are NOT guaranteed to be presented in any
33 | * particular order.
34 | */
35 | public interface NodesIterator extends PrimitiveIterator.OfInt {
36 | /**
37 | * The number of elements in this iterator. The implementations in this file return a fixed value that does not shrink as elements are consumed.
38 | */
39 | int size();
40 |
41 | static NodesIterator fromPrimitiveIterator(PrimitiveIterator.OfInt iterator, int size) { // adapter: size is reported as given and never checked against what iterator yields
42 | return new NodesIterator() {
43 | @Override
44 | public int size() {
45 | return size;
46 | }
47 |
48 | @Override
49 | public int nextInt() {
50 | return iterator.nextInt();
51 | }
52 |
53 | @Override
54 | public boolean hasNext() {
55 | return iterator.hasNext();
56 | }
57 | };
58 | }
59 |
60 | class ArrayNodesIterator implements NodesIterator { // iterates the first size entries of an int array, in array order
61 | private final int[] nodes;
62 | private int cur = 0; // index of the next element to return
63 | private final int size;
64 |
65 | /** Constructor for iterator based on integer array representing nodes; only the first {@code size} entries are iterated and the array is not copied */
66 | public ArrayNodesIterator(int[] nodes, int size) {
67 | assert nodes != null;
68 | assert size <= nodes.length;
69 | this.size = size;
70 | this.nodes = nodes;
71 | }
72 |
73 | @Override
74 | public int size() {
75 | return size;
76 | }
77 |
78 | public ArrayNodesIterator(int[] nodes) { // iterates the entire array
79 | this(nodes, nodes.length);
80 | }
81 |
82 | @Override
83 | public int nextInt() {
84 | if (!hasNext()) {
85 | throw new NoSuchElementException();
86 | }
87 | if (nodes == null) { // only reachable with assertions disabled (the constructor asserts non-null); then the index itself is returned
88 | return cur++;
89 | } else {
90 | return nodes[cur++];
91 | }
92 | }
93 |
94 | @Override
95 | public boolean hasNext() {
96 | return cur < size;
97 | }
98 | }
99 | }
100 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/graph/NodesUnsorted.java:
--------------------------------------------------------------------------------
1 | /*
2 | * All changes to the original code are Copyright DataStax, Inc.
3 | *
4 | * Please see the included license file for details.
5 | */
6 |
7 | /*
8 | * Original license:
9 | * Licensed to the Apache Software Foundation (ASF) under one or more
10 | * contributor license agreements. See the NOTICE file distributed with
11 | * this work for additional information regarding copyright ownership.
12 | * The ASF licenses this file to You under the Apache License, Version 2.0
13 | * (the "License"); you may not use this file except in compliance with
14 | * the License. You may obtain a copy of the License at
15 | *
16 | * http://www.apache.org/licenses/LICENSE-2.0
17 | *
18 | * Unless required by applicable law or agreed to in writing, software
19 | * distributed under the License is distributed on an "AS IS" BASIS,
20 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21 | * See the License for the specific language governing permissions and
22 | * limitations under the License.
23 | */
24 |
25 | package io.github.jbellis.jvector.graph;
26 |
27 | import io.github.jbellis.jvector.graph.NodeQueue.NodeConsumer;
28 | import io.github.jbellis.jvector.util.ArrayUtil;
29 |
30 | /**
31 | * NodesUnsorted contains scored node ids in insertion order, held in two parallel arrays (node[i] pairs with score[i]).
32 | */
33 | public class NodesUnsorted {
34 | protected int size; // number of valid entries; slots at index >= size hold stale or default values
35 | float[] score;
36 | int[] node;
37 |
38 | public NodesUnsorted(int initialSize) { // allocates both arrays with initialSize slots
39 | node = new int[initialSize];
40 | score = new float[initialSize];
41 | }
42 |
43 | /**
44 | * Append a node and its score after the existing entries, growing the backing arrays first when they are full.
45 | * No ordering between scores is checked or enforced; entries simply stay in insertion order.
46 | */
47 | public void add(int newNode, float newScore) {
48 | if (size == node.length) {
49 | growArrays();
50 | }
51 | node[size] = newNode;
52 | score[size] = newScore;
53 | ++size;
54 | }
55 |
56 | protected final void growArrays() { // grows node first, then sizes score from node.length; presumably this keeps both arrays the same length - confirm against ArrayUtil
57 | node = ArrayUtil.grow(node);
58 | score = ArrayUtil.growExact(score, node.length);
59 | }
60 |
61 | public int size() {
62 | return size;
63 | }
64 |
65 | public void clear() { // resets the count only; the arrays keep their capacity and old contents
66 | size = 0;
67 | }
68 |
69 | public void foreach(NodeConsumer consumer) { // calls consumer.accept(node, score) for each entry, in insertion order
70 | for (int i = 0; i < size; i++) {
71 | consumer.accept(node[i], score[i]);
72 | }
73 | }
74 |
75 | @Override
76 | public String toString() {
77 | return "NodesUnsorted[" + size + "]";
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/graph/disk/Header.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.graph.disk;
18 |
19 | import io.github.jbellis.jvector.disk.RandomAccessReader;
20 | import io.github.jbellis.jvector.disk.RandomAccessWriter;
21 | import io.github.jbellis.jvector.graph.disk.feature.Feature;
22 | import io.github.jbellis.jvector.graph.disk.feature.FeatureId;
23 |
24 | import java.io.DataOutput;
25 | import java.io.IOException;
26 | import java.util.EnumMap;
27 | import java.util.EnumSet;
28 |
29 | /**
30 | * Header information for an on-disk graph index, reflecting the common header and feature-specific headers.
31 | */
32 | class Header {
33 | final CommonHeader common;
34 | final EnumMap<FeatureId, ? extends Feature> features; // per-feature header objects, keyed by feature id
35 |
36 | Header(CommonHeader common, EnumMap<FeatureId, ? extends Feature> features) {
37 | this.common = common;
38 | this.features = features;
39 | }
40 |
41 | void write(RandomAccessWriter out) throws IOException { // emits: common header, feature bitset (version >= 3 only), then each feature header
42 | common.write(out);
43 |
44 | if (common.version >= 3) { // EnumSet.copyOf throws IllegalArgumentException for an empty non-EnumSet collection, so the no-feature case is handled explicitly
45 | out.writeInt(FeatureId.serialize(features.isEmpty() ? EnumSet.noneOf(FeatureId.class) : EnumSet.copyOf(features.keySet())));
46 | }
47 |
48 | // we restrict pre-version-3 writers to INLINE_VECTORS features, so we don't need additional version-handling here
49 | for (Feature writer : features.values()) { // EnumMap iterates in ordinal order, the same order load() reads feature headers in
50 | writer.writeHeader(out);
51 | }
52 | }
53 |
54 | public int size() { // byte count built from the same three parts that write() emits
55 | int size = common.size();
56 |
57 | if (common.version >= 3) {
58 | size += Integer.BYTES; // the feature bitset written by write()
59 | }
60 |
61 | size += features.values().stream().mapToInt(Feature::headerSize).sum();
62 |
63 | return size;
64 | }
65 |
66 | static Header load(RandomAccessReader reader, long offset) throws IOException { // seeks reader to `offset`, then reads what write() produced
67 | reader.seek(offset);
68 |
69 | EnumSet<FeatureId> featureIds;
70 | EnumMap<FeatureId, Feature> features = new EnumMap<>(FeatureId.class);
71 | CommonHeader common = CommonHeader.load(reader);
72 | if (common.version >= 3) {
73 | featureIds = FeatureId.deserialize(reader.readInt());
74 | } else { // no bitset is stored before version 3; only INLINE_VECTORS is assumed
75 | featureIds = EnumSet.of(FeatureId.INLINE_VECTORS);
76 | }
77 |
78 | for (FeatureId featureId : featureIds) { // each feature's loader consumes its own header bytes from reader
79 | features.put(featureId, featureId.load(common, reader));
80 | }
81 |
82 | return new Header(common, features);
83 | }
84 | }
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/graph/disk/feature/Feature.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.graph.disk.feature;
18 |
19 | import java.io.DataOutput;
20 | import java.io.IOException;
21 | import java.util.EnumMap;
22 | import java.util.function.IntFunction;
23 |
24 | /**
25 | * A feature of an on-disk graph index. Information to use a feature is stored in the header on-disk.
26 | */
27 | public interface Feature {
28 | FeatureId id(); // the enum constant identifying this feature
29 |
30 | int headerSize(); // bytes this feature contributes to the on-disk header (summed by Header.size())
31 |
32 | int featureSize(); // NOTE(review): presumably the per-node payload size in bytes — confirm against the index writer
33 |
34 | void writeHeader(DataOutput out) throws IOException; // writes this feature's header fields; may write nothing
35 |
36 | default void writeInline(DataOutput out, State state) throws IOException {
37 | // default no-op
38 | }
39 |
40 | // Feature implementations should implement a State as well for use with writeInline/writeSeparately
41 | interface State {
42 | }
43 |
44 | static EnumMap<FeatureId, IntFunction<State>> singleStateFactory(FeatureId id, IntFunction<State> stateFactory) { // convenience: a map holding exactly one (id -> factory) entry
45 | EnumMap<FeatureId, IntFunction<State>> map = new EnumMap<>(FeatureId.class);
46 | map.put(id, stateFactory);
47 | return map;
48 | }
49 |
50 | static EnumMap<FeatureId, State> singleState(FeatureId id, State state) { // convenience: a map holding exactly one (id -> state) entry
51 | EnumMap<FeatureId, State> map = new EnumMap<>(FeatureId.class);
52 | map.put(id, state);
53 | return map;
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/graph/disk/feature/FeatureId.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.graph.disk.feature;
18 |
19 | import io.github.jbellis.jvector.disk.RandomAccessReader;
20 | import io.github.jbellis.jvector.graph.disk.CommonHeader;
21 |
22 | import java.util.Collections;
23 | import java.util.EnumSet;
24 | import java.util.Set;
25 | import java.util.function.BiFunction;
26 |
27 | /**
28 | * An enum representing the features that can be stored in an on-disk graph index.
29 | * The order of this Enum SHOULD NOT be changed, as it affects serialization structure of graphs.
30 | * New features should be added to the end.
31 | * These are typically mapped to a Feature.
32 | */
33 | public enum FeatureId {
34 | INLINE_VECTORS(InlineVectors::load),
35 | FUSED_ADC(FusedADC::load),
36 | NVQ_VECTORS(NVQ::load),
37 | SEPARATED_VECTORS(SeparatedVectors::load),
38 | SEPARATED_NVQ(SeparatedNVQ::load);
39 |
40 | public static final Set<FeatureId> ALL = Collections.unmodifiableSet(EnumSet.allOf(FeatureId.class));
41 |
42 | private final BiFunction<CommonHeader, RandomAccessReader, Feature> loader; // reads this feature's header and builds the Feature
43 |
44 | FeatureId(BiFunction<CommonHeader, RandomAccessReader, Feature> loader) {
45 | this.loader = loader;
46 | }
47 |
48 | public Feature load(CommonHeader header, RandomAccessReader reader) { // delegates to the loader bound to this constant
49 | return loader.apply(header, reader);
50 | }
51 |
52 | public static EnumSet<FeatureId> deserialize(int bitflags) { // bit n set <=> constant with ordinal n present; bits beyond the known constants are ignored
53 | EnumSet<FeatureId> set = EnumSet.noneOf(FeatureId.class);
54 | for (FeatureId id : values()) { // values() clones its array on every call, so call it once rather than twice per iteration
55 | if ((bitflags & (1 << id.ordinal())) != 0)
56 | set.add(id);
57 | }
58 | return set;
59 | }
60 |
61 | public static int serialize(EnumSet<FeatureId> flags) { // inverse of deserialize: sets bit ordinal() for each member
62 | int i = 0;
63 | for (FeatureId flag : flags)
64 | i |= 1 << flag.ordinal();
65 | return i;
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/graph/disk/feature/FeatureSource.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.graph.disk.feature;
18 |
19 | import io.github.jbellis.jvector.disk.RandomAccessReader;
20 |
21 | import java.io.Closeable;
22 | import java.io.IOException;
23 |
24 | public interface FeatureSource extends Closeable { // Closeable, so callers are expected to close() it when done
25 | RandomAccessReader featureReaderForNode(int node, FeatureId featureId) throws IOException; // NOTE(review): presumably returns a reader positioned at `node`'s data for `featureId` — confirm against implementations
26 | }
27 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/graph/disk/feature/InlineVectors.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.graph.disk.feature;
18 |
19 | import io.github.jbellis.jvector.disk.RandomAccessReader;
20 | import io.github.jbellis.jvector.graph.disk.CommonHeader;
21 | import io.github.jbellis.jvector.vector.VectorizationProvider;
22 | import io.github.jbellis.jvector.vector.types.VectorFloat;
23 | import io.github.jbellis.jvector.vector.types.VectorTypeSupport;
24 |
25 | import java.io.DataOutput;
26 | import java.io.IOException;
27 |
28 | /**
29 | * Implements the storage of full-resolution vectors inline into an OnDiskGraphIndex. These can be used for exact scoring.
30 | */
31 | public class InlineVectors implements Feature {
32 | private static final VectorTypeSupport vectorTypeSupport = VectorizationProvider.getInstance().getVectorTypeSupport();
33 | private final int dimension; // number of float components per vector
34 |
35 | public InlineVectors(int dimension) {
36 | this.dimension = dimension;
37 | }
38 |
39 | @Override
40 | public FeatureId id() {
41 | return FeatureId.INLINE_VECTORS;
42 | }
43 |
44 | @Override
45 | public int headerSize() { // nothing is written by writeHeader, so the header contribution is zero
46 | return 0;
47 | }
48 |
49 | @Override public int featureSize() { // one float per dimension
50 | return dimension * Float.BYTES;
51 | }
52 |
53 | public int dimension() {
54 | return dimension;
55 | }
56 |
57 | static InlineVectors load(CommonHeader header, RandomAccessReader reader) { // reads nothing from reader; the dimension comes from the common header
58 | return new InlineVectors(header.dimension);
59 | }
60 |
61 | @Override
62 | public void writeHeader(DataOutput out) {
63 | // common header contains dimension, which is sufficient
64 | }
65 |
66 | @Override
67 | public void writeInline(DataOutput out, Feature.State state) throws IOException { // state must be an InlineVectors.State; unlike SeparatedVectors.writeSeparately, a null vector is not special-cased here
68 | vectorTypeSupport.writeFloatVector(out, ((InlineVectors.State) state).vector);
69 | }
70 |
71 | public static class State implements Feature.State { // carries the vector to be written for one node
72 | public final VectorFloat<?> vector;
73 |
74 | public State(VectorFloat<?> vector) {
75 | this.vector = vector;
76 | }
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/graph/disk/feature/SeparatedFeature.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.graph.disk.feature;
18 |
19 | import java.io.DataOutput;
20 | import java.io.IOException;
21 |
22 | public interface SeparatedFeature extends Feature { // a Feature that additionally tracks an offset and writes data via writeSeparately
23 | void setOffset(long offset); // NOTE(review): presumably the file position of this feature's separately written data — confirm against the writer
24 | long getOffset(); // SeparatedVectors persists this value in its header
25 |
26 | void writeSeparately(DataOutput out, State state) throws IOException; // writes the data described by `state` to `out`
27 | }
28 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/graph/disk/feature/SeparatedVectors.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.graph.disk.feature;
18 |
19 | import io.github.jbellis.jvector.disk.RandomAccessReader;
20 | import io.github.jbellis.jvector.graph.disk.CommonHeader;
21 | import io.github.jbellis.jvector.vector.VectorizationProvider;
22 | import io.github.jbellis.jvector.vector.types.VectorTypeSupport;
23 |
24 | import java.io.DataOutput;
25 | import java.io.IOException;
26 | import java.io.UncheckedIOException;
27 |
28 | public class SeparatedVectors implements SeparatedFeature {
29 | private static final VectorTypeSupport vectorTypeSupport = VectorizationProvider.getInstance().getVectorTypeSupport();
30 | private final int dimension; // number of float components per vector
31 | private long offset; // written to and read back from the header; mutable via setOffset
32 |
33 | public SeparatedVectors(int dimension, long offset) {
34 | this.dimension = dimension;
35 | this.offset = offset;
36 | }
37 |
38 | @Override
39 | public void setOffset(long offset) {
40 | this.offset = offset;
41 | }
42 |
43 | @Override
44 | public long getOffset() {
45 | return offset;
46 | }
47 |
48 | @Override
49 | public FeatureId id() {
50 | return FeatureId.SEPARATED_VECTORS;
51 | }
52 |
53 | @Override
54 | public int headerSize() { // the header holds exactly the one long written by writeHeader
55 | return Long.BYTES;
56 | }
57 |
58 | @Override
59 | public int featureSize() { // one float per dimension
60 | return dimension * Float.BYTES;
61 | }
62 |
63 | @Override
64 | public void writeHeader(DataOutput out) throws IOException {
65 | out.writeLong(offset);
66 | }
67 |
68 | @Override
69 | public void writeSeparately(DataOutput out, State state_) throws IOException { // state_ must be an InlineVectors.State
70 | var state = (InlineVectors.State) state_;
71 | if (state.vector != null) {
72 | vectorTypeSupport.writeFloatVector(out, state.vector);
73 | } else {
74 | // Write zeros for missing vector
75 | for (int j = 0; j < dimension; j++) {
76 | out.writeFloat(0.0f);
77 | }
78 | }
79 | }
80 |
81 | // No State class is declared here: writeSeparately expects an InlineVectors.State
82 |
83 | static SeparatedVectors load(CommonHeader header, RandomAccessReader reader) { // reads the long written by writeHeader; IOException is rethrown unchecked
84 | try {
85 | long offset = reader.readLong();
86 | return new SeparatedVectors(header.dimension, offset);
87 | } catch (IOException e) {
88 | throw new UncheckedIOException(e);
89 | }
90 | }
91 |
92 | public int dimension() {
93 | return dimension;
94 | }
95 | }
96 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/graph/diversity/DiversityProvider.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.graph.diversity;
18 |
19 | import io.github.jbellis.jvector.graph.NodeArray;
20 | import io.github.jbellis.jvector.graph.similarity.BuildScoreProvider;
21 | import io.github.jbellis.jvector.graph.similarity.ScoreFunction;
22 | import io.github.jbellis.jvector.util.BitSet;
23 | import io.github.jbellis.jvector.util.DocIdSetIterator;
24 |
25 | import static java.lang.Math.min;
26 |
27 | public interface DiversityProvider {
28 | /**
29 | * Update `selected` with the diverse members of `neighbors`; `neighbors` itself is not modified.
30 | * @return the fraction of short edges (neighbors within alpha=1.0)
31 | */
32 | double retainDiverse(NodeArray neighbors, int maxDegree, int diverseBefore, BitSet selected); // NOTE(review): the meaning of maxDegree and diverseBefore is set by implementations not visible here — confirm before relying on it
33 | }
34 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/graph/similarity/CachingVectorValues.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.graph.similarity;
18 |
19 | import io.github.jbellis.jvector.graph.RandomAccessVectorValues;
20 | import io.github.jbellis.jvector.quantization.PQVectors;
21 | import io.github.jbellis.jvector.vector.types.VectorFloat;
22 | import org.agrona.collections.Int2ObjectHashMap;
23 |
24 | /**
25 | * This is NOT a general "make vectors go faster" class. It is used specifically by diversity computations
26 | * when loading vectors from disk, because in that specific scenario the same vector is usually loaded multiple times
27 | * as different neighbors are scored.
28 | */
29 | class CachingVectorValues implements RandomAccessVectorValues {
30 | private final PQVectors cv; // used only to report size()
31 | private final int dimension;
32 | private final Int2ObjectHashMap<VectorFloat<?>> cache; // node id -> vector already fetched from ravv; supplied by the caller
33 | private final RandomAccessVectorValues ravv; // underlying source consulted on a cache miss
34 |
35 | public CachingVectorValues(PQVectors cv, int dimension, Int2ObjectHashMap<VectorFloat<?>> cache, RandomAccessVectorValues ravv) {
36 | this.cv = cv;
37 | this.dimension = dimension;
38 | this.cache = cache;
39 | this.ravv = ravv;
40 | }
41 |
42 | @Override
43 | public int size() {
44 | return cv.count();
45 | }
46 |
47 | @Override
48 | public int dimension() {
49 | return dimension;
50 | }
51 |
52 | @Override
53 | public boolean isValueShared() { // cached vectors are copied when the source shares instances, so returned values are never reused buffers
54 | return false;
55 | }
56 |
57 | @Override
58 | public RandomAccessVectorValues copy() { // returns this same instance, so "copies" share one cache and one ravv
59 | return this;
60 | }
61 |
62 | @Override
63 | public void getVectorInto(int nodeId, VectorFloat<?> result, int offset) {
64 | // getVectorInto is only called by reranking, not diversity code
65 | throw new UnsupportedOperationException();
66 | }
67 |
68 | @Override
69 | public VectorFloat<?> getVector(int nodeId) { // returns the cached vector, loading it from ravv on first request
70 | return cache.computeIfAbsent(nodeId, (int n) -> {
71 | var v = ravv.getVector(n);
72 | return ravv.isValueShared() ? v.copy() : v; // copy when ravv may reuse the returned instance, so the cached value is not overwritten later
73 | });
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/graph/similarity/DefaultSearchScoreProvider.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.graph.similarity;
18 |
19 | import io.github.jbellis.jvector.graph.RandomAccessVectorValues;
20 | import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
21 | import io.github.jbellis.jvector.vector.types.VectorFloat;
22 |
23 | /** Encapsulates comparing node distances to a specific vector for GraphSearcher. */
24 | public final class DefaultSearchScoreProvider implements SearchScoreProvider {
25 | private final ScoreFunction scoreFunction; // never null (asserted in the constructor)
26 | private final ScoreFunction.ExactScoreFunction reranker; // may be null
27 |
28 | /**
29 | * Creates a provider that scores with `scoreFunction` only; no reranking is performed.
30 | *
31 | * @param scoreFunction the primary, fast scoring function
32 | */
33 | public DefaultSearchScoreProvider(ScoreFunction scoreFunction) {
34 | this(scoreFunction, null);
35 | }
36 |
37 | /**
38 | * @param scoreFunction the primary, fast scoring function
39 | * @param reranker optional reranking function
40 | * Generally, reranker will be null iff scoreFunction is an ExactScoreFunction. However,
41 | * it is allowed, and sometimes useful, to only perform approximate scoring without reranking.
42 | *
43 | * Most often it will be convenient to get the reranker either using `RandomAccessVectorValues.rerankerFor`
44 | * or `ScoringView.rerankerFor`.
45 | */
46 | public DefaultSearchScoreProvider(ScoreFunction scoreFunction, ScoreFunction.ExactScoreFunction reranker) {
47 | assert scoreFunction != null;
48 | this.scoreFunction = scoreFunction;
49 | this.reranker = reranker;
50 | }
51 |
52 | public ScoreFunction scoreFunction() {
53 | return scoreFunction;
54 | }
55 |
56 | public ScoreFunction.ExactScoreFunction reranker() { // null when no reranker was supplied
57 | return reranker;
58 | }
59 |
60 | public ScoreFunction.ExactScoreFunction exactScoreFunction() { // the primary function if it is exact, otherwise the reranker (which may be null)
61 | return scoreFunction.isExact()
62 | ? (ScoreFunction.ExactScoreFunction) scoreFunction
63 | : reranker;
64 | }
65 |
66 | /**
67 | * A SearchScoreProvider for a single-pass search based on exact similarity.
68 | * Generally only suitable when your RandomAccessVectorValues is entirely in-memory,
69 | * e.g. during construction.
70 | */
71 | public static DefaultSearchScoreProvider exact(VectorFloat<?> v, VectorSimilarityFunction vsf, RandomAccessVectorValues ravv) {
72 | // don't use ESF.reranker, we need thread safety here
73 | var sf = new ScoreFunction.ExactScoreFunction() {
74 | @Override
75 | public float similarityTo(int node2) { // compares the query vector v against node2's vector fetched from ravv
76 | return vsf.compare(v, ravv.getVector(node2));
77 | }
78 | };
79 | return new DefaultSearchScoreProvider(sf);
80 | }
81 | }
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/graph/similarity/ScoreFunction.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.graph.similarity;
18 |
19 | import io.github.jbellis.jvector.vector.VectorizationProvider;
20 | import io.github.jbellis.jvector.vector.types.VectorFloat;
21 | import io.github.jbellis.jvector.vector.types.VectorTypeSupport;
22 |
23 | /**
24 | * Provides an API for encapsulating similarity to another node or vector. Used both for
25 | * building the graph (as part of NodeSimilarity) or for searching it (used standalone,
26 | * with a reference to the query vector).
27 | *
28 | * ExactScoreFunction and ApproximateScoreFunction are provided for convenience so they
29 | * can be defined as a simple lambda.
30 | */
31 | public interface ScoreFunction {
32 | VectorTypeSupport vts = VectorizationProvider.getInstance().getVectorTypeSupport(); // interface field, hence implicitly public static final
33 |
34 | /**
35 | * @return true if the ScoreFunction returns exact, full-resolution scores
36 | */
37 | boolean isExact();
38 |
39 | /**
40 | * @return the similarity to one other node
41 | */
42 | float similarityTo(int node2);
43 |
44 | /**
45 | * @return the similarity to all of the nodes that `node2` has an edge towards.
46 | * Used when expanding the neighbors of a search candidate.
47 | */
48 | default VectorFloat<?> edgeLoadingSimilarityTo(int node2) { // throws unless overridden; check supportsEdgeLoadingSimilarity() first
49 | throw new UnsupportedOperationException("bulk similarity not supported");
50 | }
51 |
52 | /**
53 | * @return true if `edgeLoadingSimilarityTo` is supported
54 | */
55 | default boolean supportsEdgeLoadingSimilarity() {
56 | return false;
57 | }
58 |
59 | interface ExactScoreFunction extends ScoreFunction { // fixes isExact() to true, leaving similarityTo as the only abstract method
60 | default boolean isExact() {
61 | return true;
62 | }
63 | }
64 |
65 | interface ApproximateScoreFunction extends ScoreFunction { // fixes isExact() to false, leaving similarityTo as the only abstract method
66 | default boolean isExact() {
67 | return false;
68 | }
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/graph/similarity/SearchScoreProvider.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.graph.similarity;
18 |
19 | /** Encapsulates comparing node distances to a specific vector for GraphSearcher. */
20 | public interface SearchScoreProvider {
21 |
22 | ScoreFunction scoreFunction(); // the primary scoring function
23 |
24 | ScoreFunction.ExactScoreFunction reranker(); // null in DefaultSearchScoreProvider when no reranker was supplied
25 |
26 | ScoreFunction.ExactScoreFunction exactScoreFunction(); // DefaultSearchScoreProvider returns the primary function if exact, else the reranker
27 | }
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/quantization/CompressedVectors.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.quantization;
18 |
19 | import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex;
20 | import io.github.jbellis.jvector.graph.similarity.ScoreFunction;
21 | import io.github.jbellis.jvector.util.Accountable;
22 | import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
23 | import io.github.jbellis.jvector.vector.types.VectorFloat;
24 |
25 | import java.io.DataOutput;
26 | import java.io.IOException;
27 |
28 | public interface CompressedVectors extends Accountable {
29 | /**
30 | * Write the compressed vectors to the given DataOutput
31 | * @param out the DataOutput to write to
32 | * @param version the serialization version. versions 2 and 3 are supported
33 | */
34 | void write(DataOutput out, int version) throws IOException;
35 |
36 | /**
37 | * Write the compressed vectors to the given DataOutput at the current serialization version
38 | */
39 | default void write(DataOutput out) throws IOException { // "current" means OnDiskGraphIndex.CURRENT_VERSION
40 | write(out, OnDiskGraphIndex.CURRENT_VERSION);
41 | }
42 |
43 | /** @return the original size of each vector, in bytes, before compression */
44 | int getOriginalSize();
45 |
46 | /** @return the compressed size of each vector, in bytes */
47 | int getCompressedSize();
48 |
49 | /** @return the compressor used by this instance */
50 | VectorCompressor<?> getCompressor();
51 |
52 | /** precomputes partial scores for the given query with every centroid; suitable for most searches */
53 | ScoreFunction.ApproximateScoreFunction precomputedScoreFunctionFor(VectorFloat<?> q, VectorSimilarityFunction similarityFunction);
54 |
55 | /** no precomputation; suitable when just a handful of score computations are performed */
56 | ScoreFunction.ApproximateScoreFunction scoreFunctionFor(VectorFloat<?> q, VectorSimilarityFunction similarityFunction);
57 |
58 | @Deprecated // simply delegates to precomputedScoreFunctionFor; call that method directly
59 | default ScoreFunction.ApproximateScoreFunction approximateScoreFunctionFor(VectorFloat<?> q, VectorSimilarityFunction similarityFunction) {
60 | return precomputedScoreFunctionFor(q, similarityFunction);
61 | }
62 |
63 | /** the number of vectors */
64 | int count();
65 | }
66 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/quantization/ImmutableBQVectors.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.quantization;
18 |
19 | public class ImmutableBQVectors extends BQVectors {
20 | public ImmutableBQVectors(BinaryQuantization bq, long[][] compressedVectors) { // stores the given array reference as-is; no defensive copy is made
21 | super(bq);
22 | this.compressedVectors = compressedVectors;
23 | }
24 |
25 | @Override
26 | public int count() { // one outer-array entry per vector
27 | return compressedVectors.length;
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/quantization/ImmutablePQVectors.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.quantization;
18 |
19 | import io.github.jbellis.jvector.vector.types.ByteSequence;
20 |
21 | /**
22 |  * A {@link PQVectors} built from already-compressed chunks. The vector count is fixed
23 |  * at construction and this class declares no method for changing the chunks afterwards.
24 |  */
25 | public class ImmutablePQVectors extends PQVectors {
26 |     private final int vectorCount;
27 |
28 |     /**
29 |      * Construct an immutable PQVectors instance with the given ProductQuantization and compressed data chunks.
30 |      * @param pq the ProductQuantization to use
31 |      * @param compressedDataChunks the compressed data chunks (stored by reference, not copied)
32 |      * @param vectorCount the number of vectors
33 |      * @param vectorsPerChunk the number of vectors per chunk
34 |      */
35 |     public ImmutablePQVectors(ProductQuantization pq, ByteSequence<?>[] compressedDataChunks, int vectorCount, int vectorsPerChunk) {
36 |         super(pq);
37 |         this.compressedDataChunks = compressedDataChunks;
38 |         this.vectorCount = vectorCount;
39 |         this.vectorsPerChunk = vectorsPerChunk;
40 |     }
41 |
42 |     /** @return the length of the chunk array, so every chunk passed to the constructor is treated as valid */
43 |     @Override
44 |     protected int validChunkCount() {
45 |         return compressedDataChunks.length;
46 |     }
47 |
48 |     /** @return the vector count given to the constructor */
49 |     @Override
50 |     public int count() {
51 |         return vectorCount;
52 |     }
53 | }
54 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/quantization/MutableBQVectors.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.quantization;
18 |
19 | import io.github.jbellis.jvector.vector.types.VectorFloat;
20 |
21 | /**
22 |  * A {@link BQVectors} that is filled in ordinal by ordinal; the backing array grows on demand.
23 |  */
24 | @SuppressWarnings("unused")
25 | public class MutableBQVectors extends BQVectors implements MutableCompressedVectors<VectorFloat<?>> {
26 |     private static final int INITIAL_CAPACITY = 1024;
27 |     private static final float GROWTH_FACTOR = 1.5f;
28 |
29 |     /** One past the highest ordinal written so far (0 when nothing has been written). */
30 |     protected int vectorCount;
31 |
32 |     /**
33 |      * Construct a mutable BQVectors instance with the given BinaryQuantization.
34 |      * The vectors storage will grow dynamically as needed.
35 |      * @param bq the BinaryQuantization to use
36 |      */
37 |     public MutableBQVectors(BinaryQuantization bq) {
38 |         super(bq);
39 |         this.compressedVectors = new long[INITIAL_CAPACITY][];
40 |         this.vectorCount = 0;
41 |     }
42 |
43 |     /**
44 |      * Make sure {@code compressedVectors[ordinal]} is addressable. When the array is too small it is
45 |      * replaced by a copy whose length is the larger of {@code ordinal + 1} and the old length
46 |      * multiplied by {@link #GROWTH_FACTOR}.
47 |      */
48 |     private void ensureCapacity(int ordinal) {
49 |         if (ordinal >= compressedVectors.length) {
50 |             int newCapacity = Math.max(ordinal + 1, (int)(compressedVectors.length * GROWTH_FACTOR));
51 |             long[][] newVectors = new long[newCapacity][];
52 |             System.arraycopy(compressedVectors, 0, newVectors, 0, compressedVectors.length);
53 |             compressedVectors = newVectors;
54 |         }
55 |     }
56 |
57 |     /** Encode {@code vector} with {@code bq} and store the result at {@code ordinal}, growing storage if needed. */
58 |     @Override
59 |     public void encodeAndSet(int ordinal, VectorFloat<?> vector) {
60 |         ensureCapacity(ordinal);
61 |         compressedVectors[ordinal] = bq.encode(vector);
62 |         vectorCount = Math.max(vectorCount, ordinal + 1);
63 |     }
64 |
65 |     /** Store a freshly allocated all-zero array of {@code bq.compressedVectorSize()} longs at {@code ordinal}. */
66 |     @Override
67 |     public void setZero(int ordinal) {
68 |         ensureCapacity(ordinal);
69 |         compressedVectors[ordinal] = new long[bq.compressedVectorSize()];
70 |         vectorCount = Math.max(vectorCount, ordinal + 1);
71 |     }
72 |
73 |     /** @return one past the highest ordinal written; ordinals skipped by the caller are still counted */
74 |     @Override
75 |     public int count() {
76 |         return vectorCount;
77 |     }
78 | }
79 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/quantization/MutableCompressedVectors.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.quantization;
18 |
19 | /**
20 |  * A {@link CompressedVectors} that can be populated one ordinal at a time.
21 |  *
22 |  * @param <T> the type of the uncompressed vector accepted by {@link #encodeAndSet}
23 |  */
24 | public interface MutableCompressedVectors<T> extends CompressedVectors {
25 |     /**
26 |      * Encode the given vector and set it at the given ordinal. Done without unnecessary copying.
27 |      *
28 |      * It's the caller's responsibility to ensure there are no "holes" in the ordinals that are
29 |      * neither encoded nor set to zero.
30 |      *
31 |      * @param ordinal the ordinal to set
32 |      * @param vector the vector to encode and set
33 |      */
34 |     void encodeAndSet(int ordinal, T vector);
35 |
36 |     /**
37 |      * Set the vector at the given ordinal to zero.
38 |      *
39 |      * It's the caller's responsibility to ensure there are no "holes" in the ordinals that are
40 |      * neither encoded nor set to zero.
41 |      *
42 |      * @param ordinal the ordinal to set
43 |      */
44 |     void setZero(int ordinal);
45 | }
46 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/quantization/VectorCompressor.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.quantization;
18 |
19 | import io.github.jbellis.jvector.graph.ListRandomAccessVectorValues;
20 | import io.github.jbellis.jvector.graph.RandomAccessVectorValues;
21 | import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex;
22 | import io.github.jbellis.jvector.util.PhysicalCoreExecutor;
23 | import io.github.jbellis.jvector.vector.types.VectorFloat;
24 |
25 | import java.io.DataOutput;
26 | import java.io.IOException;
27 | import java.util.List;
28 | import java.util.concurrent.ForkJoinPool;
29 |
30 | /**
31 |  * Interface for vector compression.
32 |  *
33 |  * @param <T> the encoded (compressed) vector type; it will be an array type.
34 |  */
35 | public interface VectorCompressor<T> {
36 |
37 |     /** Encode all vectors using the pool returned by {@link PhysicalCoreExecutor#pool()}. */
38 |     default CompressedVectors encodeAll(RandomAccessVectorValues ravv) {
39 |         return encodeAll(ravv, PhysicalCoreExecutor.pool());
40 |     }
41 |
42 |     /**
43 |      * Encode all vectors in the RandomAccessVectorValues. If the RandomAccessVectorValues
44 |      * has a missing vector for a given ordinal, the value will be encoded as a zero vector.
45 |      * @param ravv RandomAccessVectorValues to encode
46 |      * @param simdExecutor ForkJoinPool to use for SIMD operations
47 |      * @return CompressedVectors containing the encoded vectors
48 |      */
49 |     CompressedVectors encodeAll(RandomAccessVectorValues ravv, ForkJoinPool simdExecutor);
50 |
51 |     /** Encode a single vector and return its compressed form. */
52 |     T encode(VectorFloat<?> v);
53 |
54 |     /** Encode a single vector, writing the compressed form into the caller-provided {@code dest}. */
55 |     void encodeTo(VectorFloat<?> v, T dest);
56 |
57 |     /**
58 |      * @param out DataOutput to write to
59 |      * @param version serialization version. Versions 2 and 3 are supported
60 |      */
61 |     void write(DataOutput out, int version) throws IOException;
62 |
63 |     /** Write with the current serialization version */
64 |     default void write(DataOutput out) throws IOException {
65 |         write(out, OnDiskGraphIndex.CURRENT_VERSION);
66 |     }
67 |
68 |     /**
69 |      * @param compressedVectors must match the type T for this VectorCompressor, but
70 |      * it is declared as Object because we want callers to be able to use this
71 |      * without committing to a specific type T.
72 |      */
73 |     @Deprecated
74 |     CompressedVectors createCompressedVectors(Object[] compressedVectors);
75 |
76 |     /** the size of the serialized compressor itself (NOT the size of compressed vectors) */
77 |     int compressorSize();
78 |
79 |     /** the size of a compressed vector */
80 |     int compressedVectorSize();
81 | }
82 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/util/Accountable.java:
--------------------------------------------------------------------------------
1 | /*
2 | * All changes to the original code are Copyright DataStax, Inc.
3 | *
4 | * Please see the included license file for details.
5 | */
6 |
7 | /*
8 | * Original license:
9 | * Licensed to the Apache Software Foundation (ASF) under one or more
10 | * contributor license agreements. See the NOTICE file distributed with
11 | * this work for additional information regarding copyright ownership.
12 | * The ASF licenses this file to You under the Apache License, Version 2.0
13 | * (the "License"); you may not use this file except in compliance with
14 | * the License. You may obtain a copy of the License at
15 | *
16 | * http://www.apache.org/licenses/LICENSE-2.0
17 | *
18 | * Unless required by applicable law or agreed to in writing, software
19 | * distributed under the License is distributed on an "AS IS" BASIS,
20 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21 | * See the License for the specific language governing permissions and
22 | * limitations under the License.
23 | */
24 |
25 | package io.github.jbellis.jvector.util;
26 |
27 | /**
28 |  * Implemented by objects that can report their own memory footprint.
29 |  */
30 | public interface Accountable {
31 |     /**
32 |      * NOTE(review): judging by the name, this returns the memory used by this object in bytes;
33 |      * whether the figure is exact or an estimate is up to the implementation. Confirm against implementors.
34 |      */
35 |     long ramBytesUsed();
36 | }
37 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/util/BitSet.java:
--------------------------------------------------------------------------------
1 | /*
2 | * All changes to the original code are Copyright DataStax, Inc.
3 | *
4 | * Please see the included license file for details.
5 | */
6 |
7 | /*
8 | * Original license:
9 | * Licensed to the Apache Software Foundation (ASF) under one or more
10 | * contributor license agreements. See the NOTICE file distributed with
11 | * this work for additional information regarding copyright ownership.
12 | * The ASF licenses this file to You under the Apache License, Version 2.0
13 | * (the "License"); you may not use this file except in compliance with
14 | * the License. You may obtain a copy of the License at
15 | *
16 | * http://www.apache.org/licenses/LICENSE-2.0
17 | *
18 | * Unless required by applicable law or agreed to in writing, software
19 | * distributed under the License is distributed on an "AS IS" BASIS,
20 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21 | * See the License for the specific language governing permissions and
22 | * limitations under the License.
23 | */
24 |
25 | package io.github.jbellis.jvector.util;
26 |
27 | /**
28 |  * Base implementation for a bit set.
29 |  */
30 | public abstract class BitSet implements Bits, Accountable {
31 |     /**
32 |      * Clear all the bits of the set.
33 |      *
34 |      * Depending on the implementation, this may be significantly faster than clear(0, length).
35 |      */
36 |     public void clear() {
37 |         // default implementation for compatibility
38 |         clear(0, length());
39 |     }
40 |
41 |     /** The number of bits in the set. */
42 |     public abstract int length();
43 |
44 |     /** Set the bit at {@code i}. */
45 |     public abstract void set(int i);
46 |
47 |     /** Set the bit at {@code i}, returning {@code true} if it was previously set. */
48 |     public abstract boolean getAndSet(int i);
49 |
50 |     /** Clear the bit at {@code i}. */
51 |     public abstract void clear(int i);
52 |
53 |     /**
54 |      * Clears a range of bits.
55 |      *
56 |      * @param startIndex lower index
57 |      * @param endIndex one-past the last bit to clear
58 |      */
59 |     public abstract void clear(int startIndex, int endIndex);
60 |
61 |     /** Return the number of bits that are set. NOTE: this method is likely to run in linear time */
62 |     public abstract int cardinality();
63 |
64 |     /**
65 |      * Return an approximation of the cardinality of this set. Some implementations may trade accuracy
66 |      * for speed if they have the ability to estimate the cardinality of the set without iterating
67 |      * over all the data. This class declares the method abstract, so there is no default here:
68 |      * every subclass must supply it, and returning {@link #cardinality()} is always a valid choice.
69 |      */
70 |     public abstract int approximateCardinality();
71 |
72 |     /**
73 |      * Returns the index of the last set bit before or on the index specified. -1 is returned if there
74 |      * are no more set bits.
75 |      */
76 |     public abstract int prevSetBit(int index);
77 |
78 |     /**
79 |      * Returns the index of the first set bit starting at the index specified. {@link
80 |      * DocIdSetIterator#NO_MORE_DOCS} is returned if there are no more set bits.
81 |      */
82 |     public abstract int nextSetBit(int index);
83 |
84 |     /** Renders the concrete class name, {@link #length()} and {@link #approximateCardinality()}. */
85 |     @Override
86 |     public String toString() {
87 |         return getClass().getSimpleName() + "(length=" + length() + ", cardinality=~" + approximateCardinality() + ")";
88 |     }
89 | }
90 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/util/Bits.java:
--------------------------------------------------------------------------------
1 | /*
2 | * All changes to the original code are Copyright DataStax, Inc.
3 | *
4 | * Please see the included license file for details.
5 | */
6 |
7 | /*
8 | * Original license:
9 | * Licensed to the Apache Software Foundation (ASF) under one or more
10 | * contributor license agreements. See the NOTICE file distributed with
11 | * this work for additional information regarding copyright ownership.
12 | * The ASF licenses this file to You under the Apache License, Version 2.0
13 | * (the "License"); you may not use this file except in compliance with
14 | * the License. You may obtain a copy of the License at
15 | *
16 | * http://www.apache.org/licenses/LICENSE-2.0
17 | *
18 | * Unless required by applicable law or agreed to in writing, software
19 | * distributed under the License is distributed on an "AS IS" BASIS,
20 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21 | * See the License for the specific language governing permissions and
22 | * limitations under the License.
23 | */
24 |
25 | package io.github.jbellis.jvector.util;
26 |
27 | /**
28 |  * Interface for Bitset-like structures: a read-only lookup from index to boolean.
29 |  */
30 | public interface Bits {
31 |     Bits ALL = new MatchAllBits();
32 |     Bits NONE = new MatchNoBits();
33 |
34 |     /**
35 |      * Returns the value of the bit with the specified {@code index}.
36 |      *
37 |      * @param index index, should be non-negative. The result of passing
38 |      *     negative or out of bounds values is undefined by this interface, just don't do it!
39 |      * @return {@code true} if the bit is set, {@code false} otherwise.
40 |      */
41 |     boolean get(int index);
42 |
43 |     /**
44 |      * Returns a Bits that is true when `bits` is false, and false when `bits` is true
45 |      */
46 |     static Bits inverseOf(Bits bits) {
47 |         return index -> !bits.get(index);
48 |     }
49 |
50 |     /**
51 |      * Return a Bits that is set for a given ordinal iff it is set in both `a` and `b`.
52 |      * When either argument is a match-all or match-none instance, one of the arguments is
53 |      * returned directly instead of allocating a wrapper.
54 |      */
55 |     static Bits intersectionOf(Bits a, Bits b) {
56 |         // Match-all operands are the identity of intersection: hand back the other side.
57 |         if (a instanceof MatchAllBits) {
58 |             return b;
59 |         } else if (b instanceof MatchAllBits) {
60 |             return a;
61 |         }
62 |
63 |         // A match-none operand absorbs everything: hand back that operand itself.
64 |         if (a instanceof MatchNoBits) {
65 |             return a;
66 |         } else if (b instanceof MatchNoBits) {
67 |             return b;
68 |         }
69 |
70 |         return index -> a.get(index) && b.get(index);
71 |     }
72 |
73 |     /** Bits with all bits set. */
74 |     class MatchAllBits implements Bits {
75 |         @Override
76 |         public boolean get(int index) {
77 |             return true;
78 |         }
79 |     }
80 |
81 |     /** Bits with no bits set. */
82 |     class MatchNoBits implements Bits {
83 |         @Override
84 |         public boolean get(int index) {
85 |             return false;
86 |         }
87 |     }
88 | }
89 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/util/Constants.java:
--------------------------------------------------------------------------------
1 | /*
2 | * All changes to the original code are Copyright DataStax, Inc.
3 | *
4 | * Please see the included license file for details.
5 | */
6 |
7 | /*
8 | * Original license:
9 | * Licensed to the Apache Software Foundation (ASF) under one or more
10 | * contributor license agreements. See the NOTICE file distributed with
11 | * this work for additional information regarding copyright ownership.
12 | * The ASF licenses this file to You under the Apache License, Version 2.0
13 | * (the "License"); you may not use this file except in compliance with
14 | * the License. You may obtain a copy of the License at
15 | *
16 | * http://www.apache.org/licenses/LICENSE-2.0
17 | *
18 | * Unless required by applicable law or agreed to in writing, software
19 | * distributed under the License is distributed on an "AS IS" BASIS,
20 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21 | * See the License for the specific language governing permissions and
22 | * limitations under the License.
23 | */
24 |
25 | package io.github.jbellis.jvector.util;
26 |
27 | /** Some useful constants. */
28 | public final class Constants {
29 |     private Constants() {} // can't construct
30 |
31 |     /** The value of {@code System.getProperty("os.name")}. */
32 |     public static final String OS_NAME = System.getProperty("os.name");
33 |
34 |     /** The value of {@code System.getProperty("os.arch")}. */
35 |     public static final String OS_ARCH = System.getProperty("os.arch");
36 |
37 |     /** True iff running on a 64bit JVM */
38 |     public static final boolean JRE_IS_64BIT;
39 |
40 |     static {
41 |         // Prefer the JVM's own data-model property; it stays null when absent or unreadable.
42 |         String dataModel = null;
43 |         try {
44 |             dataModel = System.getProperty("sun.arch.data.model");
45 |         } catch (SecurityException ignored) {
46 |             // not allowed to read the property: fall back to os.arch below
47 |         }
48 |
49 |         // Without a data model, guess from the architecture name.
50 |         JRE_IS_64BIT = (dataModel != null)
51 |                 ? dataModel.contains("64")
52 |                 : (OS_ARCH != null && OS_ARCH.contains("64"));
53 |     }
54 | }
55 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/util/DocIdSetIterator.java:
--------------------------------------------------------------------------------
1 | /*
2 | * All changes to the original code are Copyright DataStax, Inc.
3 | *
4 | * Please see the included license file for details.
5 | */
6 |
7 | /*
8 | * Original license:
9 | * Licensed to the Apache Software Foundation (ASF) under one or more
10 | * contributor license agreements. See the NOTICE file distributed with
11 | * this work for additional information regarding copyright ownership.
12 | * The ASF licenses this file to You under the Apache License, Version 2.0
13 | * (the "License"); you may not use this file except in compliance with
14 | * the License. You may obtain a copy of the License at
15 | *
16 | * http://www.apache.org/licenses/LICENSE-2.0
17 | *
18 | * Unless required by applicable law or agreed to in writing, software
19 | * distributed under the License is distributed on an "AS IS" BASIS,
20 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21 | * See the License for the specific language governing permissions and
22 | * limitations under the License.
23 | */
24 |
25 | package io.github.jbellis.jvector.util;
26 |
27 | /**
28 |  * Holder for the {@link #NO_MORE_DOCS} sentinel; this class declares nothing else.
29 |  */
30 | public class DocIdSetIterator {
31 |     /** Sentinel index ({@code Integer.MAX_VALUE}) that bit-set lookups return when no further set bit exists. */
32 |     public static final int NO_MORE_DOCS = Integer.MAX_VALUE;
33 | }
34 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/util/ExceptionUtils.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.util;
18 |
19 | import java.io.IOException;
20 |
21 | public class ExceptionUtils {
22 | public static void throwIoException(Throwable t) throws IOException {
23 | if (t instanceof RuntimeException) {
24 | throw (RuntimeException) t;
25 | } else if (t instanceof Error) {
26 | throw (Error) t;
27 | } else if (t instanceof IOException) {
28 | throw (IOException) t;
29 | } else {
30 | throw new RuntimeException(t);
31 | }
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/util/ExplicitThreadLocal.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.util;
18 |
19 | import java.util.concurrent.ConcurrentHashMap;
20 | import java.util.function.Function;
21 | import java.util.function.Supplier;
22 |
23 | /**
24 |  * The standard {@link ThreadLocal} appears to be designed to be used with relatively
25 |  * short-lived Threads. Specifically, it uses a ThreadLocalMap to store ThreadLocal key/value
26 |  * Entry objects, and there are no guarantees as to when Entry references are expunged unless
27 |  * you can explicitly call remove() on the ThreadLocal instance. This means that objects
28 |  * referenced by ThreadLocals will not be able to be GC'd for the lifetime of the Thread,
29 |  * effectively "leaking" these objects even if there are no other references.
30 |  *
31 |  * This makes ThreadLocal a bad fit for long-lived threads, such as those in the thread pools
32 |  * used by JVector.
33 |  *
34 |  * Because ExplicitThreadLocal doesn't hook into Thread internals, any referenced values
35 |  * can be GC'd as expected as soon as the ETL instance itself is no longer referenced.
36 |  * ExplicitThreadLocal also implements AutoCloseable to cleanup non-GC'd resources.
37 |  *
38 |  * ExplicitThreadLocal is a drop-in replacement for ThreadLocal, and is used in the same way.
39 |  *
40 |  * @param <U> the type of the per-thread value
41 |  */
42 | public abstract class ExplicitThreadLocal<U> implements AutoCloseable {
43 |     // thread id -> instance
44 |     private final ConcurrentHashMap<Long, U> map = new ConcurrentHashMap<>();
45 |
46 |     // computeIfAbsent wants a callable that takes a parameter, but if we use a lambda
47 |     // it will be a closure and we'll get a new instance for every call.  So we instantiate
48 |     // it just once here as a field instead.
49 |     private final Function<Long, U> initialSupplier = k -> initialValue();
50 |
51 |     /**
52 |      * @return the value associated with the calling thread's id, creating it through
53 |      *     {@link #initialValue()} on first access from that thread
54 |      */
55 |     public U get() {
56 |         return map.computeIfAbsent(Thread.currentThread().getId(), initialSupplier);
57 |     }
58 |
59 |     /** Produce the value for a thread that has none yet; invoked from {@link #get()}. */
60 |     protected abstract U initialValue();
61 |
62 |     /**
63 |      * Invoke the close() method on all AutoCloseable values in the map, and then clear the map.
64 |      *
65 |      * Not threadsafe. If a value's close() throws, the exception propagates immediately:
66 |      * the remaining values are not closed and the map is not cleared.
67 |      */
68 |     @Override
69 |     public void close() throws Exception {
70 |         for (U value : map.values()) {
71 |             if (value instanceof AutoCloseable) {
72 |                 ((AutoCloseable) value).close();
73 |             }
74 |         }
75 |         map.clear();
76 |     }
77 |
78 |     /**
79 |      * Create an ExplicitThreadLocal whose {@link #initialValue()} delegates to the given supplier.
80 |      *
81 |      * @param initialValue called once per thread, on that thread's first {@link #get()}
82 |      */
83 |     public static <U> ExplicitThreadLocal<U> withInitial(Supplier<U> initialValue) {
84 |         return new ExplicitThreadLocal<>() {
85 |             @Override
86 |             protected U initialValue() {
87 |                 return initialValue.get();
88 |             }
89 |         };
90 |     }
91 | }
92 |
93 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/util/GrowableBitSet.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.util;
18 |
19 | /**
20 |  * A {@link BitSet} implementation that grows as needed to accommodate set(index) calls. When it
21 |  * does so, it will grow its internal storage multiplicatively, assuming that more growth will be
22 |  * needed in the future. This is the important difference from FixedBitSet + FBS.ensureCapacity,
23 |  * which grows the minimum necessary each time.
24 |  *
25 |  * Every operation is forwarded to a wrapped {@link java.util.BitSet}.
26 |  *
27 |  * For thread-safe version see {@link ThreadSafeGrowableBitSet}.
28 |  */
29 | public class GrowableBitSet extends BitSet {
30 |
31 |     private final java.util.BitSet bitSet;
32 |
33 |     /** Wrap an existing bit set; the instance is used directly, not copied. */
34 |     public GrowableBitSet(java.util.BitSet bitSet) {
35 |         this.bitSet = bitSet;
36 |     }
37 |
38 |     /** Create an empty set whose backing storage is initially sized for {@code initialBits} bits. */
39 |     public GrowableBitSet(int initialBits) {
40 |         this.bitSet = new java.util.BitSet(initialBits);
41 |     }
42 |
43 |     @Override
44 |     public void clear(int index) {
45 |         bitSet.clear(index);
46 |     }
47 |
48 |     @Override
49 |     public void clear() {
50 |         bitSet.clear();
51 |     }
52 |
53 |     @Override
54 |     public boolean get(int index) {
55 |         return bitSet.get(index);
56 |     }
57 |
58 |     /** Reads the bit, then sets it; the two steps are separate calls, not one atomic operation. */
59 |     @Override
60 |     public boolean getAndSet(int index) {
61 |         final boolean previous = get(index);
62 |         set(index);
63 |         return previous;
64 |     }
65 |
66 |     /** @return the wrapped set's logical length (index of its highest set bit plus one) */
67 |     @Override
68 |     public int length() {
69 |         return bitSet.length();
70 |     }
71 |
72 |     @Override
73 |     public void set(int i) {
74 |         bitSet.set(i);
75 |     }
76 |
77 |     /**
78 |      * Clear the bits in {@code [startIndex, endIndex)}. A range covering the whole set takes the
79 |      * clear-everything path; an empty or inverted range is a no-op.
80 |      */
81 |     @Override
82 |     public void clear(int startIndex, int endIndex) {
83 |         final boolean coversEverything = startIndex == 0 && endIndex == bitSet.length();
84 |         if (coversEverything) {
85 |             bitSet.clear();
86 |         } else if (startIndex < endIndex) {
87 |             bitSet.clear(startIndex, endIndex);
88 |         }
89 |     }
90 |
91 |     @Override
92 |     public int cardinality() {
93 |         return bitSet.cardinality();
94 |     }
95 |
96 |     /** No estimate is attempted here: this returns the exact cardinality of the wrapped set. */
97 |     @Override
98 |     public int approximateCardinality() {
99 |         return bitSet.cardinality();
100 |     }
101 |
102 |     @Override
103 |     public int prevSetBit(int index) {
104 |         return bitSet.previousSetBit(index);
105 |     }
106 |
107 |     /** Translates the wrapped set's "not found" result (-1) into {@link DocIdSetIterator#NO_MORE_DOCS}. */
108 |     @Override
109 |     public int nextSetBit(int i) {
110 |         final int found = bitSet.nextSetBit(i);
111 |         return found == -1 ? DocIdSetIterator.NO_MORE_DOCS : found;
112 |     }
113 |
114 |     /** Not supported by this implementation; always throws. */
115 |     @Override
116 |     public long ramBytesUsed() {
117 |         throw new UnsupportedOperationException();
118 |     }
119 | }
120 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/util/GrowableLongHeap.java:
--------------------------------------------------------------------------------
1 | /*
2 | * All changes to the original code are Copyright DataStax, Inc.
3 | *
4 | * Please see the included license file for details.
5 | */
6 |
7 | /*
8 | * Original license:
9 | * Licensed to the Apache Software Foundation (ASF) under one or more
10 | * contributor license agreements. See the NOTICE file distributed with
11 | * this work for additional information regarding copyright ownership.
12 | * The ASF licenses this file to You under the Apache License, Version 2.0
13 | * (the "License"); you may not use this file except in compliance with
14 | * the License. You may obtain a copy of the License at
15 | *
16 | * http://www.apache.org/licenses/LICENSE-2.0
17 | *
18 | * Unless required by applicable law or agreed to in writing, software
19 | * distributed under the License is distributed on an "AS IS" BASIS,
20 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21 | * See the License for the specific language governing permissions and
22 | * limitations under the License.
23 | */
24 |
25 | package io.github.jbellis.jvector.util;
26 |
27 | import java.util.PrimitiveIterator;
28 |
29 | /**
30 |  * An AbstractLongHeap that can grow in size (unbounded, except for memory and array size limits).
31 |  */
32 | public class GrowableLongHeap extends AbstractLongHeap {
33 |     /**
34 |      * Create an empty heap with the configured initial size.
35 |      *
36 |      * @param initialSize the initial size of the heap
37 |      */
38 |     public GrowableLongHeap(int initialSize) {
39 |         super(initialSize);
40 |     }
41 |
42 |     /**
43 |      * Adds a value to a LongHeap in log(size) time, by delegating to the inherited add().
44 |      *
45 |      * @return true always
46 |      */
47 |     @Override
48 |     public boolean push(long element) {
49 |         add(element);
50 |         return true;
51 |     }
52 |
53 |     /**
54 |      * Adds several values by delegating to the inherited addMany().
55 |      *
56 |      * @param elements the values to add
57 |      * @param elementsSize presumably the number of values {@code elements} will yield; confirm against addMany
58 |      */
59 |     @Override
60 |     public void pushMany(PrimitiveIterator.OfLong elements, int elementsSize)
61 |     {
62 |         addMany(elements, elementsSize);
63 |     }
64 | }
65 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/util/IntMap.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.util;
18 |
19 | import io.github.jbellis.jvector.graph.NodesIterator;
20 |
21 | import java.util.stream.IntStream;
22 |
23 | public interface IntMap {
24 | /** Stores {@code value} under {@code key} only if the current value matches {@code existing}.
25 | * @param key ordinal
26 | * @return true if successful, false if the current value != `existing`
27 | */
28 | boolean compareAndPut(int key, T existing, T value);
29 |
30 | /**
31 | * @return number of items that have been added
32 | */
33 | int size();
34 |
35 | /**
36 | * @param key ordinal
37 | * @return the value of the key, or null if not set
38 | */
39 | T get(int key);
40 |
41 | /** Removes the mapping for {@code key}.
42 | * @return the former value of the key, or null if it was not set
43 | */
44 | T remove(int key);
45 |
46 | /**
47 | * @return true iff the given key is set in the map
48 | */
49 | boolean containsKey(int key);
50 |
51 | /**
52 | * Iterates keys in ascending order and calls the consumer for each non-null key-value pair.
53 | */
54 | void forEach(IntBiConsumer consumer);
55 |
56 | @FunctionalInterface
57 | interface IntBiConsumer { // callback receiving the key as a primitive int together with its value
58 | void consume(int key, T2 value);
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/util/MathUtil.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.util;
18 |
19 | public class MathUtil {
20 | // Returns a * a. Looks silly at first, but it really does make the calling code more readable.
21 | public static float square(float a) {
22 | return a * a;
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/util/NumericUtils.java:
--------------------------------------------------------------------------------
1 | /*
2 | * All changes to the original code are Copyright DataStax, Inc.
3 | *
4 | * Please see the included license file for details.
5 | */
6 |
7 | /*
8 | * Original license:
9 | * Licensed to the Apache Software Foundation (ASF) under one or more
10 | * contributor license agreements. See the NOTICE file distributed with
11 | * this work for additional information regarding copyright ownership.
12 | * The ASF licenses this file to You under the Apache License, Version 2.0
13 | * (the "License"); you may not use this file except in compliance with
14 | * the License. You may obtain a copy of the License at
15 | *
16 | * http://www.apache.org/licenses/LICENSE-2.0
17 | *
18 | * Unless required by applicable law or agreed to in writing, software
19 | * distributed under the License is distributed on an "AS IS" BASIS,
20 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21 | * See the License for the specific language governing permissions and
22 | * limitations under the License.
23 | */
24 |
25 | package io.github.jbellis.jvector.util;
26 |
27 | /**
28 | * Helper APIs to encode float values as sortable ints and vice-versa.
29 | *
30 | * <p>To also index floating point numbers, this class supplies a method to convert them to
31 | * integer values by changing their bit layout: {@link
32 | * #floatToSortableInt}. You will have no precision loss by converting floating point numbers to
33 | * integers and back (only that the integer form is not usable). Other data types like dates can
34 | * easily be converted to longs or ints (e.g. date to long: {@link java.util.Date#getTime}).
35 | */
36 | public final class NumericUtils {
37 |
38 | private NumericUtils() {} // no instance!
39 |
40 | /**
41 | * Converts a {@code float} value to a sortable signed {@code int}. The value is
42 | * converted by getting their IEEE 754 floating-point "float format" bit layout and then
43 | * some bits are swapped, to be able to compare the result as int. By this the precision is not
44 | * reduced, but the value can easily be used as an int. The sort order (including {@link Float#NaN})
45 | * is defined by {@link Float#compareTo}; {@code NaN} is greater than positive infinity.
46 | *
47 | * @see #sortableIntToFloat
48 | */
49 | public static int floatToSortableInt(float value) {
50 | return sortableFloatBits(Float.floatToIntBits(value));
51 | }
52 |
53 | /**
54 | * Converts a sortable {@code int} back to a {@code float}.
55 | *
56 | * @see #floatToSortableInt
57 | */
58 | public static float sortableIntToFloat(int encoded) {
59 | return Float.intBitsToFloat(sortableFloatBits(encoded));
60 | }
61 |
62 | /** Converts IEEE 754 representation of a float to sortable order (or back to the original); the sign bit is never changed, so applying it twice restores the input */
63 | public static int sortableFloatBits(int bits) {
64 | return bits ^ (bits >> 31) & 0x7fffffff; // '&' binds tighter than '^': flips the low 31 bits only when the sign bit is set
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/util/PhysicalCoreExecutor.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package io.github.jbellis.jvector.util;
17 |
18 | import io.github.jbellis.jvector.graph.GraphIndexBuilder;
19 | import io.github.jbellis.jvector.quantization.ProductQuantization;
20 |
21 | import java.io.Closeable;
22 | import java.util.concurrent.ForkJoinPool;
23 | import java.util.function.Supplier;
24 |
25 | /**
26 | * A fork join pool which is sized to match the number of physical cores on the machine (avoiding hyper-thread count).
27 | * <p>
28 | * This is important for heavily vectorized sections of the code since they can easily saturate memory bandwidth.
29 | * <p>
30 | * Knowing how many physical cores a machine has is left to the operator, via the {@code jvector.physical_core_count} system property (however the default of 1/2 of the available processors is today often correct).
31 | *
32 | * @see ProductQuantization
33 | * @see GraphIndexBuilder
34 | */
35 | public class PhysicalCoreExecutor implements Closeable {
36 | private static final int physicalCoreCount = Integer.getInteger("jvector.physical_core_count", Math.max(1, Runtime.getRuntime().availableProcessors()/2));
37 |
38 | public static final PhysicalCoreExecutor instance = new PhysicalCoreExecutor(physicalCoreCount);
39 |
40 | public static ForkJoinPool pool() {
41 | return instance.pool;
42 | }
43 |
44 | private final ForkJoinPool pool;
45 |
46 | private PhysicalCoreExecutor(int cores) {
47 | assert cores > 0 && cores <= Runtime.getRuntime().availableProcessors() : "Invalid core count: " + cores; // only checked when assertions are enabled
48 | this.pool = new ForkJoinPool(cores);
49 | }
50 |
51 | public void execute(Runnable run) {
52 | pool.submit(run).join(); // join() makes this call wait until the task has completed
53 | }
54 |
55 | public T submit(Supplier run) {
56 | return pool.submit(run::get).join(); // waits for the task and returns the supplier's result
57 | }
58 |
59 | public static int getPhysicalCoreCount() {
60 | return physicalCoreCount;
61 | }
62 |
63 | @Override
64 | public void close() {
65 | pool.shutdownNow();
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/util/SparseBits.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.util;
18 |
19 | import org.agrona.collections.IntHashSet;
20 |
21 | /**
22 | * Implements the membership parts of an updatable BitSet (but not prev/next bits), keeping only the set indexes in a hash set.
23 | */
24 | public class SparseBits implements Bits {
25 | private final IntHashSet set = new IntHashSet();
26 |
27 | @Override
28 | public boolean get(int index) {
29 | return set.contains(index);
30 | }
31 |
32 | public void set(int index) {
33 | set.add(index);
34 | }
35 |
36 | public void clear() {
37 | set.clear(); // unsets every bit
38 | }
39 |
40 | public int cardinality() {
41 | return set.size(); // number of indexes currently set
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/util/SparseIntMap.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.util;
18 |
19 | import io.github.jbellis.jvector.graph.NodesIterator;
20 |
21 | import java.util.concurrent.ConcurrentHashMap;
22 | import java.util.stream.IntStream;
23 |
24 | public class SparseIntMap implements IntMap { // IntMap implementation backed by a ConcurrentHashMap
25 | private final ConcurrentHashMap map;
26 |
27 | public SparseIntMap() {
28 | this.map = new ConcurrentHashMap<>();
29 | }
30 |
31 | @Override
32 | public boolean compareAndPut(int key, T existing, T value) {
33 | if (value == null) {
34 | throw new IllegalArgumentException("compareAndPut() value cannot be null -- use remove() instead");
35 | }
36 |
37 | if (existing == null) { // a null 'existing' means the caller expects the key to be absent
38 | T result = map.putIfAbsent(key, value);
39 | return result == null; // putIfAbsent returns null only when no previous mapping existed
40 | }
41 |
42 | return map.replace(key, existing, value); // replaces only if the key is currently mapped to 'existing'
43 | }
44 |
45 | @Override
46 | public int size() {
47 | return map.size();
48 | }
49 |
50 | @Override
51 | public T get(int key) {
52 | return map.get(key);
53 | }
54 |
55 | @Override
56 | public T remove(int key) {
57 | return map.remove(key);
58 | }
59 |
60 | @Override
61 | public boolean containsKey(int key) {
62 | return map.containsKey(key);
63 | }
64 |
65 | public IntStream keysStream() {
66 | return map.keySet().stream().mapToInt(key -> key); // unboxes the Integer keys into an IntStream
67 | }
68 |
69 | @Override
70 | public void forEach(IntBiConsumer consumer) {
71 | map.forEach(consumer::consume); // NOTE(review): ConcurrentHashMap does not specify an iteration order, while IntMap.forEach documents ascending keys -- confirm callers do not rely on ordering
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/vector/ArrayVectorFloat.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.vector;
18 |
19 | import io.github.jbellis.jvector.util.RamUsageEstimator;
20 | import io.github.jbellis.jvector.vector.types.VectorFloat;
21 |
22 | import java.util.Arrays;
23 |
24 | /**
25 | * VectorFloat implementation backed by an on-heap float array.
26 | */
27 | final public class ArrayVectorFloat implements VectorFloat
28 | {
29 | private final float[] data;
30 |
31 | ArrayVectorFloat(int length)
32 | {
33 | this.data = new float[length];
34 | }
35 |
36 | ArrayVectorFloat(float[] data)
37 | {
38 | this.data = data; // wraps the caller's array directly; no defensive copy is made
39 | }
40 |
41 | @Override
42 | public float[] get()
43 | {
44 | return data; // the backing array itself, not a copy
45 | }
46 |
47 | @Override
48 | public float get(int n) {
49 | return data[n];
50 | }
51 |
52 | @Override
53 | public void set(int n, float value) {
54 | data[n] = value;
55 | }
56 |
57 | @Override
58 | public void zero() {
59 | Arrays.fill(data, 0);
60 | }
61 |
62 | @Override
63 | public int length()
64 | {
65 | return data.length;
66 | }
67 |
68 | @Override
69 | public VectorFloat copy()
70 | {
71 | return new ArrayVectorFloat(Arrays.copyOf(data, data.length)); // independent copy of the backing array
72 | }
73 |
74 | @Override
75 | public void copyFrom(VectorFloat> src, int srcOffset, int destOffset, int length)
76 | {
77 | ArrayVectorFloat csrc = (ArrayVectorFloat) src; // only ArrayVectorFloat sources are supported; any other type fails with ClassCastException
78 | System.arraycopy(csrc.data, srcOffset, data, destOffset, length);
79 | }
80 |
81 | @Override
82 | public long ramBytesUsed()
83 | {
84 | int OH_BYTES = RamUsageEstimator.NUM_BYTES_OBJECT_HEADER;
85 | return OH_BYTES + RamUsageEstimator.sizeOf(data); // object header plus the estimated size of the array
86 | }
87 |
88 | @Override
89 | public String toString() {
90 | StringBuilder sb = new StringBuilder();
91 | sb.append("[");
92 | for (int i = 0; i < Math.min(data.length, 25); i++) { // print at most the first 25 elements
93 | sb.append(data[i]);
94 | if (i < data.length - 1) {
95 | sb.append(", ");
96 | }
97 | }
98 | if (data.length > 25) {
99 | sb.append("..."); // marks the output as truncated
100 | }
101 | sb.append("]");
102 | return sb.toString();
103 | }
104 |
105 | @Override
106 | public boolean equals(Object o)
107 | {
108 | if (this == o) return true;
109 | if (o == null || getClass() != o.getClass()) return false;
110 | ArrayVectorFloat that = (ArrayVectorFloat) o;
111 | return Arrays.equals(data, that.data);
112 | }
113 |
114 | @Override
115 | public int hashCode()
116 | {
117 | return this.getHashCode(); // default method declared on the VectorFloat interface
118 | }
119 | }
120 |
121 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/vector/DefaultVectorizationProvider.java:
--------------------------------------------------------------------------------
1 | /*
2 | * All changes to the original code are Copyright DataStax, Inc.
3 | *
4 | * Please see the included license file for details.
5 | */
6 |
7 | /*
8 | * Original license:
9 | * Licensed to the Apache Software Foundation (ASF) under one or more
10 | * contributor license agreements. See the NOTICE file distributed with
11 | * this work for additional information regarding copyright ownership.
12 | * The ASF licenses this file to You under the Apache License, Version 2.0
13 | * (the "License"); you may not use this file except in compliance with
14 | * the License. You may obtain a copy of the License at
15 | *
16 | * http://www.apache.org/licenses/LICENSE-2.0
17 | *
18 | * Unless required by applicable law or agreed to in writing, software
19 | * distributed under the License is distributed on an "AS IS" BASIS,
20 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21 | * See the License for the specific language governing permissions and
22 | * limitations under the License.
23 | */
24 |
25 | package io.github.jbellis.jvector.vector;
26 |
27 | import io.github.jbellis.jvector.vector.types.VectorTypeSupport;
28 |
29 | /** Default provider returning scalar implementations (a DefaultVectorUtilSupport and an ArrayVectorProvider). */
30 | final public class DefaultVectorizationProvider extends VectorizationProvider {
31 |
32 | private final VectorUtilSupport vectorUtilSupport;
33 | private final VectorTypeSupport vectorTypes;
34 |
35 |
36 | public DefaultVectorizationProvider() {
37 | vectorUtilSupport = new DefaultVectorUtilSupport();
38 | vectorTypes = new ArrayVectorProvider();
39 | }
40 |
41 | @Override
42 | public VectorUtilSupport getVectorUtilSupport() {
43 | return vectorUtilSupport; // the single instance created in the constructor
44 | }
45 |
46 | @Override
47 | public VectorTypeSupport getVectorTypeSupport() {
48 | return vectorTypes; // the single instance created in the constructor
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/vector/VectorSimilarityFunction.java:
--------------------------------------------------------------------------------
1 | /*
2 | * All changes to the original code are Copyright DataStax, Inc.
3 | *
4 | * Please see the included license file for details.
5 | */
6 |
7 | /*
8 | * Original license:
9 | * Licensed to the Apache Software Foundation (ASF) under one or more
10 | * contributor license agreements. See the NOTICE file distributed with
11 | * this work for additional information regarding copyright ownership.
12 | * The ASF licenses this file to You under the Apache License, Version 2.0
13 | * (the "License"); you may not use this file except in compliance with
14 | * the License. You may obtain a copy of the License at
15 | *
16 | * http://www.apache.org/licenses/LICENSE-2.0
17 | *
18 | * Unless required by applicable law or agreed to in writing, software
19 | * distributed under the License is distributed on an "AS IS" BASIS,
20 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21 | * See the License for the specific language governing permissions and
22 | * limitations under the License.
23 | */
24 |
25 | package io.github.jbellis.jvector.vector;
26 |
27 | import io.github.jbellis.jvector.vector.types.VectorFloat;
28 |
29 | /**
30 | * Vector similarity function; used in search to return top K most similar vectors to a target
31 | * vector. This is a label describing the method used during indexing and searching of the vectors
32 | * in order to determine the nearest neighbors.
33 | */
34 | public enum VectorSimilarityFunction {
35 |
36 | /** Euclidean distance, converted to a similarity score as 1 / (1 + squared L2 distance) */
37 | EUCLIDEAN {
38 | @Override
39 | public float compare(VectorFloat> v1, VectorFloat> v2) {
40 | return 1 / (1 + VectorUtil.squareL2Distance(v1, v2));
41 | }
42 | },
43 |
44 | /**
45 | * Dot product. NOTE: this similarity is intended as an optimized way to perform cosine
46 | * similarity. In order to use it, all vectors must be normalized, including both document and
47 | * query vectors. Using dot product with vectors that are not normalized can result in errors or
48 | * poor search results. Floating point vectors must be normalized to be of unit length, while byte
49 | * vectors should simply all have the same norm. The score returned is (1 + dot product) / 2.
50 | */
51 | DOT_PRODUCT {
52 | @Override
53 | public float compare(VectorFloat> v1, VectorFloat> v2) {
54 | return (1 + VectorUtil.dotProduct(v1, v2)) / 2;
55 | }
56 | },
57 |
58 | /**
59 | * Cosine similarity. NOTE: the preferred way to perform cosine similarity is to normalize all
60 | * vectors to unit length, and instead use {@link VectorSimilarityFunction#DOT_PRODUCT}. You
61 | * should only use this function if you need to preserve the original vectors and cannot normalize
62 | * them in advance. The similarity score is rescaled as (1 + cosine) / 2.
63 | */
64 | COSINE {
65 | @Override
66 | public float compare(VectorFloat> v1, VectorFloat> v2) {
67 | return (1 + VectorUtil.cosine(v1, v2)) / 2;
68 | }
69 | };
70 |
71 | /**
72 | * Calculates a similarity score between the two vectors with a specified function. Higher
73 | * similarity scores correspond to closer vectors.
74 | *
75 | * @param v1 a vector
76 | * @param v2 another vector, of the same dimension
77 | * @return the value of the similarity function applied to the two vectors
78 | */
79 | public abstract float compare(VectorFloat> v1, VectorFloat> v2);
80 | }
81 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/vector/types/ByteSequence.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.vector.types;
18 |
19 | import io.github.jbellis.jvector.util.Accountable;
20 | import java.util.Objects;
21 |
22 | public interface ByteSequence extends Accountable
23 | {
24 | /**
25 | * @return entire sequence backing storage
26 | */
27 | T get();
28 |
29 | int offset();
30 |
31 | int length();
32 |
33 | byte get(int i);
34 |
35 | void set(int i, byte value);
36 |
37 | /**
38 | * @param shortIndex index (as if this was a short array) inside the sequence to set the short value
39 | * @param value short value to set
40 | */
41 | void setLittleEndianShort(int shortIndex, short value);
42 |
43 | void zero();
44 |
45 | void copyFrom(ByteSequence> src, int srcOffset, int destOffset, int length);
46 |
47 | ByteSequence copy();
48 |
49 | ByteSequence slice(int offset, int length);
50 |
51 | /**
52 | * Two ByteSequences are equal if they have the same length and the same bytes at each position.
53 | * @param o the other object to compare to
54 | * @return true if the two ByteSequences are equal; false if {@code o} is not a ByteSequence
55 | */
56 | default boolean equalTo(Object o) {
57 | if (this == o) return true;
58 | if (!(o instanceof ByteSequence)) return false;
59 | ByteSequence> that = (ByteSequence>) o;
60 | if (length() != that.length()) return false;
61 | for (int i = 0; i < length(); i++) {
62 | if (get(i) != that.get(i)) return false;
63 | }
64 | return true;
65 | }
66 |
67 | /**
68 | * @return a hash code for this ByteSequence, computed from its non-zero bytes only
69 | */
70 | default int getHashCode() {
71 | int result = 1;
72 | for (int i = 0; i < length(); i++) {
73 | if (get(i) != 0) { // zero bytes do not contribute to the hash
74 | result = 31 * result + get(i);
75 | }
76 | }
77 | return result;
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/vector/types/VectorFloat.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.vector.types;
18 |
19 | import io.github.jbellis.jvector.util.Accountable;
20 |
21 | public interface VectorFloat extends Accountable
22 | {
23 | /**
24 | * @return entire vector backing storage
25 | */
26 | T get();
27 |
28 | int length();
29 |
30 | default int offset(int i) { // identity mapping unless an implementation overrides it
31 | return i;
32 | }
33 |
34 | VectorFloat copy();
35 |
36 | void copyFrom(VectorFloat> src, int srcOffset, int destOffset, int length);
37 |
38 | float get(int i);
39 |
40 | void set(int i, float value);
41 |
42 | void zero();
43 |
44 | default int getHashCode() { // hash built from the non-zero elements only
45 | int result = 1;
46 | for (int i = 0; i < length(); i++) {
47 | if (get(i) != 0) { // elements equal to zero do not contribute to the hash
48 | result = 31 * result + Float.hashCode(get(i));
49 | }
50 | }
51 | return result;
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/jvector-base/src/main/java/io/github/jbellis/jvector/vector/types/VectorTypeSupport.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.vector.types;
18 |
19 | import io.github.jbellis.jvector.disk.RandomAccessReader;
20 |
21 | import java.io.DataOutput;
22 | import java.io.IOException;
23 |
24 | public interface VectorTypeSupport {
25 | /**
26 | * Create a vector from the given data.
27 | *
28 | * @param data the data to create the vector from. Supported data types are implementation-dependent.
29 | * @return the created vector.
30 | */
31 | VectorFloat> createFloatVector(Object data);
32 |
33 | /**
34 | * Create a zero-filled vector of the given length.
35 | * @param length the length of the vector to create.
36 | * @return the created vector.
37 | */
38 | VectorFloat> createFloatVector(int length);
39 |
40 | /**
41 | * Read a vector from the given RandomAccessReader.
42 | * @param r the reader to read the vector from.
43 | * @param size the size of the vector to read.
44 | * @return the vector.
45 | * @throws IOException if an I/O error occurs while reading
46 | */
47 | VectorFloat> readFloatVector(RandomAccessReader r, int size) throws IOException;
48 |
49 | /**
50 | * Read a vector from the given RandomAccessReader and store it in the given vector at the specified offset.
51 | * @param r the reader to read the vector from.
52 | * @param size the size of the vector to read.
53 | * @param vector the vector to store the read data in.
54 | * @param offset the offset in the vector to store the read data at.
55 | * @throws IOException if an I/O error occurs while reading
56 | */
57 | void readFloatVector(RandomAccessReader r, int size, VectorFloat> vector, int offset) throws IOException;
58 |
59 | /**
60 | * Write the given vector to the given DataOutput.
61 | * @param out the output to write the vector to.
62 | * @param vector the vector to write.
63 | * @throws IOException if an I/O error occurs while writing
64 | */
65 | void writeFloatVector(DataOutput out, VectorFloat> vector) throws IOException;
66 |
67 | /**
68 | * Create a sequence from the given data.
69 | *
70 | * @param data the data to create the sequence from. Supported data types are implementation-dependent.
71 | * @return the created sequence.
72 | */
73 | ByteSequence> createByteSequence(Object data);
74 |
75 | /**
76 | * Create a zero-filled sequence of the given length.
77 | * @param length the length of the sequence to create.
78 | * @return the created sequence.
79 | */
80 | ByteSequence> createByteSequence(int length);
81 |
82 | ByteSequence> readByteSequence(RandomAccessReader r, int size) throws IOException; // presumably the byte counterpart of readFloatVector(r, size) -- confirm against implementations
83 |
84 | void readByteSequence(RandomAccessReader r, ByteSequence> sequence) throws IOException; // presumably fills the given sequence from the reader -- confirm against implementations
85 |
86 | void writeByteSequence(DataOutput out, ByteSequence> sequence) throws IOException; // presumably the byte counterpart of writeFloatVector -- confirm against implementations
87 | }
88 |
--------------------------------------------------------------------------------
/jvector-examples/README.md:
--------------------------------------------------------------------------------
1 | # JVector Examples
2 |
3 | JVector comes with the following sample programs to try:
4 |
5 | ### SiftSmall
6 | A simple benchmark for the sift dataset located in the [siftsmall](./siftsmall) directory in the project root.
7 |
8 | > `mvn compile exec:exec@sift`
9 |
10 | ### Bench
11 | Performs grid search across the `GraphIndexBuilder` parameter space to find
12 | the best tradeoffs between recall and throughput.
13 |
14 | This benchmark requires datasets from [https://github.com/erikbern/ann-benchmarks](https://github.com/erikbern/ann-benchmarks/blob/main/README.md#data-sets) to be downloaded to the
15 | `hdf5` or `fvec` directory under the project root, depending on the dataset format.
16 |
17 | You can use [`plot_output.py`](./plot_output.py) to graph the [pareto-optimal points](https://en.wikipedia.org/wiki/Pareto_efficiency) found by `Bench`.
18 |
19 | > `mvn compile exec:exec@bench`
20 |
21 | Some sample KNN datasets for testing, based on ada-002 embeddings generated from Wikipedia data, are available in ivec/fvec format at:
22 |
23 | ```
24 | aws s3 ls s3://astra-vector/wikipedia_squad/ --no-sign-request
25 | PRE 100k/
26 | PRE 1M/
27 | PRE 4M/
28 | ```
29 |
30 | Bench automatically downloads the 100k dataset to the `./fvec` directory.
31 |
32 | To run `SiftSmall`/`Bench` without the JVM vector module available, you can use the following invocations:
33 |
34 | > `mvn -Pjdk11 compile exec:exec@bench`
35 |
36 | > `mvn -Pjdk11 compile exec:exec@sift`
37 |
38 | ### IPCService
39 |
40 | A simple service for adding / querying vectors over a unix socket.
41 |
42 | Install [socat](http://www.dest-unreach.org/socat/) using Homebrew on macOS or apt/rpm on Linux.
43 |
44 | Mac:
45 | > `brew install socat`
46 |
47 | Linux:
48 | > `apt-get install socat`
49 |
50 | Start the service with:
51 | > `mvn compile exec:exec@ipcserve`
52 |
53 | Now you can interact with the service
54 | ```bash
55 | socat - unix-client:/tmp/jvector.sock
56 |
57 | CREATE 3 DOT_PRODUCT 2 20
58 | OK
59 | WRITE [0.1,0.15,0.3]
60 | OK
61 | WRITE [0.2,0.83,0.05]
62 | OK
63 | WRITE [0.5,0.5,0.5]
64 | OK
65 | OPTIMIZE
66 | OK
67 | SEARCH 20 3 [0.15,0.1,0.1]
68 | RESULT [2,1,0]
69 | ```
70 |
71 | #### Commands
72 | All commands are completed with `\n`.
73 |
74 | No spaces are allowed inside vector brackets.
75 |
76 | * `CREATE {dimensions} {similarity-function} {M} {searchDepthConstruction}`
77 | * Creates a new index for this session
78 |
79 | * `WRITE [N,N,N] ... [N,N,N]`
80 | * Add one or more vectors to the index
81 | * `OPTIMIZE`
82 | * Call when indexing is complete
83 | * `MEMORY`
84 | * Get the in memory size of index
85 | * `SEARCH {overquerySearch} {top-k} [N,N,N] ... [N,N,N]`
86 | * Search index for the top-k closest vectors (ordinals of indexed values returned per query)
87 | * `BULKLOAD {localpath}`
88 | * Bulk loads a local file in numpy format Rows x Columns
89 |
90 |
--------------------------------------------------------------------------------
/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench2D.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.example;
18 |
19 | import io.github.jbellis.jvector.example.util.*;
20 | import io.github.jbellis.jvector.example.util.CompressorParameters.PQParameters;
21 | import io.github.jbellis.jvector.graph.disk.feature.FeatureId;
22 |
23 | import java.io.IOException;
24 | import java.util.Arrays;
25 | import java.util.EnumSet;
26 | import java.util.List;
27 | import java.util.Map;
28 | import java.util.function.Function;
29 |
30 | import static io.github.jbellis.jvector.quantization.KMeansPlusPlusClusterer.UNWEIGHTED;
31 |
32 | /**
33 |  * Tests GraphIndexes against vectors from a 2D dataset that is generated at runtime by DataSetCreator.create2DGrid.
34 |  */
35 | public class Bench2D {
36 |     public static void main(String[] args) throws IOException {
37 |         System.out.println("Heap space available is " + Runtime.getRuntime().maxMemory());
38 |
39 |         var mGrid = List.of(32); // List.of(16, 24, 32, 48, 64, 96, 128);
40 |         var efConstructionGrid = List.of(100); // List.of(60, 80, 100, 120, 160, 200, 400, 600, 800);
41 |         var topKGrid = Map.of(
42 |                 10, // topK
43 |                 List.of(1.0, 2.0, 5.0, 10.0, 20.0) // oq
44 |         ); // rerankK = oq * topK
45 |         var neighborOverflowGrid = List.of(1.2f); // List.of(1.2f, 2.0f);
46 |         var addHierarchyGrid = List.of(true); // List.of(false, true);
47 |         var usePruningGrid = List.of(false); // List.of(false, true);
48 |         List<Function<DataSet, CompressorParameters>> buildCompression = Arrays.asList(__ -> CompressorParameters.NONE); // each entry maps a data set to its compressor settings
49 |         List<Function<DataSet, CompressorParameters>> searchCompression = Arrays.asList(
50 |                 __ -> CompressorParameters.NONE,
51 |                 ds -> new PQParameters(ds.getDimension(), 256, true, UNWEIGHTED)
52 |         );
53 |         List<EnumSet<FeatureId>> featureSets = Arrays.asList( // one grid entry per feature set
54 |                 EnumSet.of(FeatureId.NVQ_VECTORS),
55 |                 EnumSet.of(FeatureId.INLINE_VECTORS)
56 |         );
57 |
58 |         // 2D grid, built and calculated at runtime
59 |         var grid2d = DataSetCreator.create2DGrid(4_000_000, 10_000, 100);
60 |
61 |         Grid.runAll(grid2d, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, featureSets,
62 |                 buildCompression, searchCompression, topKGrid, usePruningGrid);
63 |     }
64 | }
65 |
--------------------------------------------------------------------------------
/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.example;
18 |
19 | import io.github.jbellis.jvector.example.util.DataSet;
20 | import io.github.jbellis.jvector.example.util.DownloadHelper;
21 | import io.github.jbellis.jvector.example.yaml.MultiConfig;
22 |
23 | import java.io.IOException;
24 |
25 | /**
26 | * Tests GraphIndexes against vectors from various datasets
27 | */
28 | public class HelloVectorWorld {
29 |     public static void main(String[] args) throws IOException {
30 |         System.out.println("Heap space available is " + Runtime.getRuntime().maxMemory());
31 |
32 |         final String datasetName = "ada002-100k";
33 |         // resolve the dataset through DownloadHelper, then load it
34 |         DataSet ds = DownloadHelper.maybeDownloadFvecs(datasetName).load();
35 |
36 |         // the parameter grid for this dataset is obtained from MultiConfig
37 |         MultiConfig config = MultiConfig.getConfig(datasetName);
38 |         var build = config.construction;
39 |         var query = config.search;
40 |         Grid.runAll(ds, build.outDegree, build.efConstruction, build.neighborOverflow, build.addHierarchy,
41 |                 build.getFeatureSets(), build.getCompressorParameters(),
42 |                 query.getCompressorParameters(), query.topKOverquery, query.useSearchPruning);
43 |     }
44 | }
45 |
--------------------------------------------------------------------------------
/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/AbstractQueryBenchmark.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.example.benchmarks;
18 |
19 | public abstract class AbstractQueryBenchmark implements QueryBenchmark {} // empty abstract base: declares no members beyond those of QueryBenchmark
20 |
--------------------------------------------------------------------------------
/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ExecutionTimeBenchmark.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.example.benchmarks;
18 |
19 | import java.util.List;
20 | import java.util.stream.IntStream;
21 | import io.github.jbellis.jvector.example.Grid.ConfiguredSystem;
22 | import io.github.jbellis.jvector.graph.SearchResult;
23 |
24 | /**
25 | * Measures average execution time over N runs through all queries in parallel.
26 | */
27 | public class ExecutionTimeBenchmark extends AbstractQueryBenchmark {
28 |     private static final String DEFAULT_FORMAT = ".1f";
29 |
30 |     private static volatile long SINK; // receives the visited counts; written but never read in this class
31 |     private String format; // format spec handed to Metric.of for the reported runtime
32 |
33 |     public static ExecutionTimeBenchmark createDefault() {
34 |         return new ExecutionTimeBenchmark(DEFAULT_FORMAT);
35 |     }
36 |
37 |     private ExecutionTimeBenchmark(String format) {
38 |         this.format = format;
39 |     }
40 |
41 |     public ExecutionTimeBenchmark setFormat(String format) {
42 |         this.format = format;
43 |         return this; // returned to allow call chaining
44 |     }
45 |
46 |     @Override
47 |     public String getBenchmarkName() {
48 |         return "ExecutionTimeBenchmark";
49 |     }
50 |
51 |     @Override
52 |     public List<Metric> runBenchmark( // returns a single "Avg Runtime (s)" metric
53 |             ConfiguredSystem cs,
54 |             int topK,
55 |             int rerankK,
56 |             boolean usePruning,
57 |             int queryRuns) { // number of timed passes over all query vectors
58 |
59 |         int totalQueries = cs.getDataSet().queryVectors.size();
60 |         double totalRuntime = 0;
61 |
62 |         for (int run = 0; run < queryRuns; run++) {
63 |             long startTime = System.nanoTime(); // kept as long: a double cannot represent every nanoTime value exactly
64 |
65 |             // execute all queries in parallel
66 |             IntStream.range(0, totalQueries)
67 |                     .parallel()
68 |                     .forEach(i -> {
69 |                         SearchResult sr = QueryExecutor.executeQuery(
70 |                                 cs, topK, rerankK, usePruning, i);
71 |                         SINK += sr.getVisitedCount(); // non-atomic update; the total is never read, so lost increments do not matter
72 |                     });
73 |
74 |             totalRuntime += System.nanoTime() - startTime;
75 |         }
76 |
77 |         double avgRuntimeSec = totalRuntime / queryRuns / 1e9; // nanoseconds -> seconds, averaged over the runs
78 |         return List.of(Metric.of("Avg Runtime (s)", format, avgRuntimeSec));
79 |     }
80 | }
--------------------------------------------------------------------------------
/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/Metric.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.example.benchmarks;
18 |
19 | import java.util.Map;
20 | import java.util.function.Function;
21 |
22 | /**
23 |  * A single named measurement, i.e. one column in the results table:
24 |  * - header: The column title
25 |  * - fmtSpec: The format specifier without the leading '%', e.g. ".3f" for floats or "s" for strings (a leading '%' is tolerated)
26 |  * - value: The measured value
27 |  */
28 | public class Metric {
29 |     private final String header;
30 |     private final String fmtSpec;
31 |     private final double value;
32 |
33 |     private Metric(String header, String fmtSpec, double value) {
34 |         this.header = header;
35 |         this.fmtSpec = fmtSpec;
36 |         this.value = value;
37 |     }
38 |
39 |     public String getHeader() { return header; }
40 |     public String getFmtSpec() { return fmtSpec; }
41 |     public double getValue() { return value; }
42 |
43 |     public static Metric of(String header, String fmtSpec, double value) { // sole way to create a Metric; the constructor is private
44 |         return new Metric(header, fmtSpec, value);
45 |     }
46 |
47 |     @Override
48 |     public String toString() { // renders "<header> = <formatted value>"
49 |         return String.format("%s = " + (fmtSpec.startsWith("%") ? fmtSpec : "%" + fmtSpec), header, value); // header is passed as an argument so a '%' inside it is not parsed as a directive
50 |     }
51 | }
52 |
53 |
--------------------------------------------------------------------------------
/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/QueryBenchmark.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.example.benchmarks;
18 |
19 | import io.github.jbellis.jvector.example.Grid.ConfiguredSystem;
20 |
21 | import java.util.List;
22 |
23 | /**
24 | * A common interface for all search benchmarks.
25 | */
26 | public interface QueryBenchmark {
27 |     String getBenchmarkName(); // name identifying this benchmark
28 |
29 |     List<Metric> runBenchmark( // runs the benchmark and returns the metrics it produced
30 |             ConfiguredSystem cs, // the configured system under test
31 |             int topK,
32 |             int rerankK,
33 |             boolean usePruning,
34 |             int queryRuns // number of runs to perform
35 |     );
36 | }
37 |
38 |
39 |
--------------------------------------------------------------------------------
/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/QueryExecutor.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.example.benchmarks;
18 |
19 | import io.github.jbellis.jvector.example.Grid.ConfiguredSystem;
20 | import io.github.jbellis.jvector.graph.SearchResult;
21 | import io.github.jbellis.jvector.util.Bits;
22 |
23 | public class QueryExecutor {
24 |     /**
25 |      * Runs one search for a single query vector of the configured system's data set.
26 |      *
27 |      * @param cs Configured system supplying the query vectors, the searcher and the score provider.
28 |      * @param topK Passed to the searcher as the number of top results.
29 |      * @param rerankK Passed to the searcher as the number of rerank candidates.
30 |      * @param usePruning Pruning flag applied to the searcher before the search starts.
31 |      * @param i Position of the query vector in the data set's query list.
32 |      * @return the SearchResult produced for query i.
33 |      */
34 |     public static SearchResult executeQuery(ConfiguredSystem cs, int topK, int rerankK, boolean usePruning, int i) {
35 |         final var query = cs.getDataSet().queryVectors.get(i);
36 |         final var graphSearcher = cs.getSearcher();
37 |         graphSearcher.usePruning(usePruning);
38 |         final var scoreProvider = cs.scoreProviderFor(query, graphSearcher.getView());
39 |         return graphSearcher.search(scoreProvider, topK, rerankK, 0.0f, 0.0f, Bits.ALL);
40 |     }
41 | }
42 |
43 |
--------------------------------------------------------------------------------
/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/QueryTester.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.example.benchmarks;
18 |
19 | import java.util.ArrayList;
20 | import java.util.LinkedHashMap;
21 | import java.util.List;
22 | import java.util.Map;
23 |
24 | import io.github.jbellis.jvector.example.Grid.ConfiguredSystem;
25 |
26 | /**
27 | * Orchestrates running a set of QueryBenchmark instances
28 | * and collects their summary results.
29 | */
30 | public class QueryTester {
31 |     private final List<QueryBenchmark> benchmarks;
32 |
33 |     /**
34 |      * @param benchmarks the benchmarks to run, in the order provided
35 |      */
36 |     public QueryTester(List<QueryBenchmark> benchmarks) {
37 |         this.benchmarks = benchmarks;
38 |     }
39 |
40 |     /**
41 |      * Run each benchmark once, in order, and return all the metrics they
42 |      * produced, concatenated into a single list.
43 |      *
44 |      * @param cs the configured system under test
45 |      * @param topK the top-K parameter for all benchmarks
46 |      * @param rerankK the rerank-K parameter
47 |      * @param usePruning whether to enable pruning
48 |      * @param queryRuns number of runs for each benchmark
49 |      */
50 |     public List<Metric> run(
51 |             ConfiguredSystem cs,
52 |             int topK,
53 |             int rerankK,
54 |             boolean usePruning,
55 |             int queryRuns) {
56 |
57 |         List<Metric> results = new ArrayList<>();
58 |
59 |         for (var benchmark : benchmarks) {
60 |             var metrics = benchmark.runBenchmark(cs, topK, rerankK, usePruning, queryRuns);
61 |             results.addAll(metrics);
62 |         }
63 |
64 |         return results;
65 |     }
66 | }
67 |
68 |
--------------------------------------------------------------------------------
/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetLoader.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.example.util;
18 |
19 | import java.io.IOException;
20 |
21 | public class DataSetLoader {
22 |     /**
23 |      * Loads the named data set: names ending in ".hdf5" take the HDF5 download-and-load path,
24 |      * every other name goes through DownloadHelper.maybeDownloadFvecs.
25 |      */
26 |     public static DataSet loadDataSet(String fileName) throws IOException {
27 |         if (!fileName.endsWith(".hdf5")) {
28 |             return DownloadHelper.maybeDownloadFvecs(fileName).load();
29 |         }
30 |         DownloadHelper.maybeDownloadHdf5(fileName);
31 |         return Hdf5Loader.load(fileName);
32 |     }
33 | }
34 |
--------------------------------------------------------------------------------
/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/MMapRandomAccessVectorValues.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.example.util;
18 |
19 | import com.indeed.util.mmap.MMapBuffer;
20 | import io.github.jbellis.jvector.graph.RandomAccessVectorValues;
21 | import io.github.jbellis.jvector.vector.VectorizationProvider;
22 | import io.github.jbellis.jvector.vector.types.VectorFloat;
23 | import io.github.jbellis.jvector.vector.types.VectorTypeSupport;
24 |
25 | import java.io.Closeable;
26 | import java.io.File;
27 | import java.io.IOError;
28 | import java.io.IOException;
29 | import java.nio.ByteOrder;
30 | import java.nio.channels.FileChannel;
31 |
32 | public class MMapRandomAccessVectorValues implements RandomAccessVectorValues, Closeable {
33 | private static final VectorTypeSupport vectorTypeSupport = VectorizationProvider.getInstance().getVectorTypeSupport();
34 | final int dimension;
35 | final int rows;
36 | final File file;
37 | final float[] valueBuffer;
38 |
39 | final MMapBuffer fileReader;
40 |
41 | public MMapRandomAccessVectorValues(File f, int dimension) {
42 | assert f != null && f.exists() && f.canRead();
43 | assert f.length() % ((long) dimension * Float.BYTES) == 0;
44 |
45 | try {
46 | this.file = f;
47 | this.fileReader = new MMapBuffer(f, FileChannel.MapMode.READ_ONLY, ByteOrder.LITTLE_ENDIAN);
48 | this.dimension = dimension;
49 | this.rows = ((int) f.length()) / dimension;
50 | this.valueBuffer = new float[dimension];
51 | } catch (IOException e) {
52 | throw new IOError(e);
53 | }
54 | }
55 |
56 | @Override
57 | public int size() {
58 | return (int) (file.length() / ((long) dimension * Float.BYTES));
59 | }
60 |
61 | @Override
62 | public int dimension() {
63 | return dimension;
64 | }
65 |
66 | @Override
67 | public VectorFloat> getVector(int targetOrd) {
68 | long offset = (long) targetOrd * dimension * Float.BYTES;
69 | int i = 0;
70 | for (long o = offset; o < offset + ((long) dimension * Float.BYTES); o += Float.BYTES, i++)
71 | valueBuffer[i] = fileReader.memory().getFloat(o);
72 |
73 | return vectorTypeSupport.createFloatVector(valueBuffer);
74 | }
75 |
76 | @Override
77 | public boolean isValueShared() {
78 | return false;
79 | }
80 |
81 | @Override
82 | public RandomAccessVectorValues copy() {
83 | return new MMapRandomAccessVectorValues(file, dimension);
84 | }
85 |
86 | @Override
87 | public void close() {
88 | try {
89 | this.fileReader.close();
90 | } catch (IOException e) {
91 | throw new IOError(e);
92 | }
93 | }
94 | }
95 |
--------------------------------------------------------------------------------
/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/SiftLoader.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.example.util;
18 |
19 | import io.github.jbellis.jvector.vector.VectorizationProvider;
20 | import io.github.jbellis.jvector.vector.types.VectorFloat;
21 | import io.github.jbellis.jvector.vector.types.VectorTypeSupport;
22 |
23 | import java.io.BufferedInputStream;
24 | import java.io.DataInputStream;
25 | import java.io.FileInputStream;
26 | import java.io.IOException;
27 | import java.io.UncheckedIOException;
28 | import java.nio.ByteBuffer;
29 | import java.nio.ByteOrder;
30 | import java.util.ArrayList;
31 | import java.util.HashSet;
32 | import java.util.List;
33 |
34 | public class SiftLoader { // readers for little-endian .fvecs / .ivecs files
35 |     private static final VectorTypeSupport vectorTypeSupport = VectorizationProvider.getInstance().getVectorTypeSupport();
36 |
37 |     public static List<VectorFloat<?>> readFvecs(String filePath) throws IOException { // one vector per record: int dimension, then that many floats
38 |         var vectors = new ArrayList<VectorFloat<?>>();
39 |         try (var dis = new DataInputStream(new BufferedInputStream(new FileInputStream(filePath)))) {
40 |             while (dis.available() > 0) {
41 |                 var dimension = Integer.reverseBytes(dis.readInt()); // readInt is big-endian, the file is little-endian
42 |                 assert dimension > 0 : dimension;
43 |                 var buffer = new byte[dimension * Float.BYTES];
44 |                 dis.readFully(buffer);
45 |                 var byteBuffer = ByteBuffer.wrap(buffer).order(ByteOrder.LITTLE_ENDIAN);
46 |
47 |                 var vector = new float[dimension];
48 |                 var floatBuffer = byteBuffer.asFloatBuffer();
49 |                 floatBuffer.get(vector);
50 |                 vectors.add(vectorTypeSupport.createFloatVector(vector));
51 |             }
52 |         }
53 |         return vectors;
54 |     }
55 |
56 |     public static List<List<Integer>> readIvecs(String filename) { // one list per record: int count, then that many ints; IOException is rethrown unchecked
57 |         var groundTruthTopK = new ArrayList<List<Integer>>();
58 |
59 |         try (var dis = new DataInputStream(new BufferedInputStream(new FileInputStream(filename)))) { // buffered like readFvecs, so readInt does not hit the file for every byte
60 |             while (dis.available() > 0) {
61 |                 var numNeighbors = Integer.reverseBytes(dis.readInt());
62 |                 var neighbors = new ArrayList<Integer>(numNeighbors);
63 |
64 |                 for (var i = 0; i < numNeighbors; i++) {
65 |                     var neighbor = Integer.reverseBytes(dis.readInt());
66 |                     neighbors.add(neighbor);
67 |                 }
68 |
69 |                 groundTruthTopK.add(neighbors);
70 |             }
71 |         } catch (IOException e) {
72 |             throw new UncheckedIOException(e);
73 |         }
74 |
75 |         return groundTruthTopK;
76 |     }
77 | }
78 |
--------------------------------------------------------------------------------
/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/UpdatableRandomAccessVectorValues.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.example.util;
18 |
19 | import io.github.jbellis.jvector.graph.RandomAccessVectorValues;
20 | import io.github.jbellis.jvector.vector.types.VectorFloat;
21 |
22 | import java.util.ArrayList;
23 | import java.util.List;
24 |
25 | public class UpdatableRandomAccessVectorValues implements RandomAccessVectorValues { // list-backed vector values that can grow through add()
26 |     private final List<VectorFloat<?>> data;
27 |     private final int dimensions;
28 |
29 |     public UpdatableRandomAccessVectorValues(int dimensions) {
30 |         this.data = new ArrayList<>(1024); // 1024 is only the initial capacity
31 |         this.dimensions = dimensions;
32 |     }
33 |
34 |     public void add(VectorFloat<?> vector) { // appended vectors get the next ordinal (their list index)
35 |         data.add(vector);
36 |     }
37 |
38 |     @Override
39 |     public int size() {
40 |         return data.size();
41 |     }
42 |
43 |     @Override
44 |     public int dimension() {
45 |         return dimensions;
46 |     }
47 |
48 |     @Override
49 |     public VectorFloat<?> getVector(int targetOrd) {
50 |         return data.get(targetOrd);
51 |     }
52 |
53 |     @Override
54 |     public boolean isValueShared() {
55 |         return false; // getVector returns the stored instance itself, not a reused scratch vector
56 |     }
57 |
58 |     @Override
59 |     public RandomAccessVectorValues copy() {
60 |         return this; // no copy is made; the same instance is handed back
61 |     }
62 | }
63 |
--------------------------------------------------------------------------------
/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/CommonParameters.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.example.yaml;
18 |
19 | import io.github.jbellis.jvector.example.util.CompressorParameters;
20 | import io.github.jbellis.jvector.example.util.DataSet;
21 |
22 | import java.util.List;
23 | import java.util.function.Function;
24 | import java.util.stream.Collectors;
25 |
26 | public class CommonParameters {
27 |     public List<Compression> compression; // compression entries; public field, assigned outside this class
28 |
29 |     public List<Function<DataSet, CompressorParameters>> getCompressorParameters() { // one factory per compression entry, in list order
30 |         return compression.stream().map(Compression::getCompressorParameters).collect(Collectors.toList());
31 |     }
32 | }
33 |
--------------------------------------------------------------------------------
/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/Compression.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.example.yaml;
18 |
19 | import io.github.jbellis.jvector.example.util.CompressorParameters;
20 | import io.github.jbellis.jvector.example.util.DataSet;
21 | import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
22 |
23 | import java.util.Map;
24 | import java.util.function.Function;
25 |
26 | public class Compression {
27 |     public String type; // compression kind: "None", "PQ" or "BQ"
28 |     public Map<String, String> parameters; // string-valued settings; only read for type "PQ"
29 |
30 |     public Function<DataSet, CompressorParameters> getCompressorParameters() { // builds a per-data-set factory; throws IllegalArgumentException on unsupported or invalid settings
31 |         switch (type) {
32 |             case "None":
33 |                 return __ -> CompressorParameters.NONE;
34 |             case "PQ":
35 |                 int k = Integer.parseInt(parameters.getOrDefault("k", "256"));
36 |                 String strCenterData = parameters.get("centerData");
37 |                 if (!(strCenterData == null || strCenterData.equals("Yes") || strCenterData.equals("No"))) {
38 |                     throw new IllegalArgumentException("centerData must be Yes or No, or not specified at all.");
39 |                 }
40 |                 float anisotropicThreshold = Float.parseFloat(parameters.getOrDefault("anisotropicThreshold", "-1"));
41 |
42 |                 return ds -> {
43 |                     boolean centerData;
44 |                     if (strCenterData == null) {
45 |                         centerData = ds.similarityFunction == VectorSimilarityFunction.EUCLIDEAN; // default when unspecified: center only for EUCLIDEAN
46 |                     } else {
47 |                         centerData = strCenterData.equals("Yes");
48 |                     }
49 |
50 |                     if (parameters.containsKey("m")) { // "m" takes precedence over "mFactor"
51 |                         int m = Integer.parseInt(parameters.get("m"));
52 |                         return new CompressorParameters.PQParameters(m, k, centerData, anisotropicThreshold);
53 |                     } else if (parameters.containsKey("mFactor")) {
54 |                         String strMFactor = parameters.get("mFactor");
55 |                         int mFactor = Integer.parseInt(strMFactor);
56 |                         return new CompressorParameters.PQParameters(ds.getDimension() / mFactor, k, centerData, anisotropicThreshold);
57 |                     } else {
58 |                         throw new IllegalArgumentException("Need to specify either 'm' or 'mFactor'");
59 |                     }
60 |                 };
61 |             case "BQ":
62 |                 return ds -> new CompressorParameters.BQParameters();
63 |             default:
64 |                 throw new IllegalArgumentException("Unsupported compression type: " + type);
65 |
66 |         }
67 |     }
68 | }
--------------------------------------------------------------------------------
/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/ConstructionParameters.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.example.yaml;
18 |
19 | import io.github.jbellis.jvector.graph.disk.feature.FeatureId;
20 |
21 | import java.util.EnumSet;
22 | import java.util.List;
23 | import java.util.stream.Collectors;
24 |
25 |
26 | public class ConstructionParameters extends CommonParameters { // public fields, assigned outside this class
27 |     public List<Integer> outDegree;
28 |     public List<Integer> efConstruction;
29 |     public List<Float> neighborOverflow;
30 |     public List<Boolean> addHierarchy;
31 |     public List<String> reranking; // each entry must be "FP" or "NVQ"
32 |     public Boolean useSavedIndexIfExists;
33 |
34 |     public List<EnumSet<FeatureId>> getFeatureSets() { // one feature set per reranking entry; throws IllegalArgumentException on any other value
35 |         return reranking.stream().map(item -> {
36 |             switch (item) {
37 |                 case "FP":
38 |                     return EnumSet.of(FeatureId.INLINE_VECTORS);
39 |                 case "NVQ":
40 |                     return EnumSet.of(FeatureId.NVQ_VECTORS);
41 |                 default:
42 |                     throw new IllegalArgumentException("Only 'FP' and 'NVQ' are supported");
43 |             }
44 |         }).collect(Collectors.toList());
45 |     }
46 | }
--------------------------------------------------------------------------------
/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/DatasetCollection.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.example.yaml;
18 |
19 | import org.yaml.snakeyaml.Yaml;
20 |
21 | import java.io.FileInputStream;
22 | import java.io.IOException;
23 | import java.io.InputStream;
24 | import java.util.ArrayList;
25 | import java.util.List;
26 | import java.util.Map;
27 |
28 | public class DatasetCollection {
29 | private static final String defaultFile = "./jvector-examples/yaml-configs/datasets.yml";
30 |
31 | public final Map> datasetNames;
32 |
33 | private DatasetCollection(Map> datasetNames) {
34 | this.datasetNames = datasetNames;
35 | }
36 |
37 | public static DatasetCollection load() throws IOException {
38 | return load(defaultFile);
39 | }
40 |
41 | public static DatasetCollection load(String file) throws IOException {
42 | InputStream inputStream = new FileInputStream(file);
43 | Yaml yaml = new Yaml();
44 | return new DatasetCollection(yaml.load(inputStream));
45 | }
46 |
47 | public List getAll() {
48 | List allDatasetNames = new ArrayList<>();
49 | for (var key : datasetNames.keySet()) {
50 | allDatasetNames.addAll(datasetNames.get(key));
51 | }
52 | return allDatasetNames;
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/MultiConfig.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.example.yaml;
18 |
19 | import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex;
20 | import org.yaml.snakeyaml.Yaml;
21 |
22 | import java.io.File;
23 | import java.io.FileInputStream;
24 | import java.io.FileNotFoundException;
25 | import java.io.InputStream;
26 |
27 | public class MultiConfig {
28 | private static final String defaultDirectory = "./jvector-examples/yaml-configs/";
29 |
30 | private int version;
31 | public String dataset;
32 | public ConstructionParameters construction;
33 | public SearchParameters search;
34 |
35 | public static MultiConfig getDefaultConfig(String datasetName) throws FileNotFoundException {
36 | File configFile = new File(defaultDirectory + datasetName + ".yml");
37 | if (!configFile.exists()) {
38 | configFile = new File(defaultDirectory + "default.yml");
39 | System.out.println("Default YAML config file: " + configFile.getAbsolutePath());
40 | }
41 | return getConfig(configFile);
42 | }
43 |
44 | public static MultiConfig getConfig(String datasetName) throws FileNotFoundException {
45 | File configFile = new File(datasetName);
46 | return getConfig(configFile);
47 | }
48 |
49 | public static MultiConfig getConfig(File configFile) throws FileNotFoundException {
50 | if (!configFile.exists()) {
51 | throw new FileNotFoundException(configFile.getAbsolutePath());
52 | }
53 | InputStream inputStream = new FileInputStream(configFile);
54 | Yaml yaml = new Yaml();
55 | return yaml.loadAs(inputStream, MultiConfig.class);
56 | }
57 |
58 | public int getVersion() {
59 | return version;
60 | }
61 |
62 | public void setVersion(int version) {
63 | if (version != OnDiskGraphIndex.CURRENT_VERSION) {
64 | throw new IllegalArgumentException("Invalid version: " + version);
65 | }
66 | this.version = version;
67 | }
68 | }
--------------------------------------------------------------------------------
/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/SearchParameters.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.example.yaml;
18 |
19 | import java.util.List;
20 | import java.util.Map;
21 |
22 | public class SearchParameters extends CommonParameters {
23 | public Map> topKOverquery;
24 | public List useSearchPruning;
25 | }
--------------------------------------------------------------------------------
/jvector-examples/yaml-configs/ada002-100k.yml:
--------------------------------------------------------------------------------
1 | version: 4
2 |
3 | dataset: ada002-100k
4 |
5 | construction:
6 | outDegree: [32]
7 | efConstruction: [100]
8 | neighborOverflow: [1.2f]
9 | addHierarchy: [Yes]
10 | compression:
11 | - type: PQ
12 | parameters:
13 | m: 192 # we can either specify the integer m or the integer mFactor. In this case, m will be set to the data dimensionality divided by mFactor
14 | # mFactor: 8
15 | # k: 256 # optional parameter. By default, k=256
16 | centerData: No
17 | anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy)
18 | reranking:
19 | - NVQ
20 | useSavedIndexIfExists: Yes
21 |
22 | search:
23 | topKOverquery:
24 | 10: [1.0, 2.0, 5.0, 10.0]
25 | 100: [1.0, 2.0]
26 | useSearchPruning: [Yes]
27 | compression:
28 | - type: PQ
29 | parameters:
30 | m: 192
31 | # k: 256 # optional parameter. By default, k=256
32 | centerData: No
33 | anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy)
--------------------------------------------------------------------------------
/jvector-examples/yaml-configs/datasets.yml:
--------------------------------------------------------------------------------
1 | neighborhood-watch-100k:
2 | - ada002-100k
3 | - cohere-english-v3-100k
4 | - openai-v3-small-100k
5 | - gecko-100k
6 | - openai-v3-large-3072-100k
7 | - openai-v3-large-1536-100k
8 | - e5-small-v2-100k
9 | - e5-base-v2-100k
10 | - e5-large-v2-100k
11 | neighborhood-watch-1M:
12 | - ada002-1M
13 | - colbert-1M
14 | ann-benchmarks:
15 | - glove-25-angular.hdf5
16 | - glove-50-angular.hdf5
17 | - lastfm-64-dot.hdf5
18 | - glove-100-angular.hdf5
19 | - glove-200-angular.hdf5
20 | - nytimes-256-angular.hdf5
21 | - sift-128-euclidean.hdf5
22 | # - deep-image-96-angular.hdf5 # large files not yet supported
23 | # - gist-960-euclidean.hdf5 # large files not yet supported
--------------------------------------------------------------------------------
/jvector-examples/yaml-configs/default.yml:
--------------------------------------------------------------------------------
1 | version: 4
2 |
3 | dataset: cohere-english-v3-100k
4 |
5 | construction:
6 | outDegree: [32]
7 | efConstruction: [100]
8 | neighborOverflow: [1.2f]
9 | addHierarchy: [Yes]
10 | compression:
11 | - type: PQ
12 | parameters:
13 | m: 192 # we can either specify the integer m or the integer mFactor. In this case, m will be set to the data dimensionality divided by mFactor
14 | # mFactor: 8
15 | # k: 256 # optional parameter. By default, k=256
16 | centerData: No
17 | anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy)
18 | reranking:
19 | - NVQ
20 | useSavedIndexIfExists: Yes
21 |
22 | search:
23 | topKOverquery:
24 | 10: [1.0, 2.0, 5.0, 10.0]
25 | 100: [1.0, 2.0]
26 | useSearchPruning: [Yes]
27 | compression:
28 | - type: PQ
29 | parameters:
30 | m: 192
31 | # k: 256 # optional parameter. By default, k=256
32 | centerData: No
33 | anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy)
--------------------------------------------------------------------------------
/jvector-multirelease/src/assembly/mrjar.xml:
--------------------------------------------------------------------------------
1 |
4 | mvjar
5 |
6 | jar
7 |
8 | false
9 |
10 |
11 | true
12 |
13 | io.github.jbellis:jvector-base
14 |
15 |
16 | true
17 | false
18 |
19 |
20 |
21 | true
22 |
23 | io.github.jbellis:jvector-twenty
24 |
25 |
26 | META-INF/versions/20
27 | true
28 | false
29 |
30 |
31 | /META-INF/**
32 |
33 |
34 |
35 |
36 |
37 | true
38 |
39 | io.github.jbellis:jvector-native
40 |
41 |
42 | META-INF/versions/22
43 | true
44 | false
45 |
46 |
47 | /META-INF/**
48 |
49 |
50 |
51 |
52 |
53 |
54 |
--------------------------------------------------------------------------------
/jvector-multirelease/src/assembly/sourcesjar.xml:
--------------------------------------------------------------------------------
1 |
4 | sources
5 |
6 | jar
7 |
8 | false
9 |
10 |
11 | true
12 |
13 | io.github.jbellis:jvector-base
14 |
15 |
16 | false
17 |
18 |
19 | ${module.artifactId}
20 | src/main/java
21 |
22 |
23 |
24 |
25 |
26 | true
27 |
28 | io.github.jbellis:jvector-twenty
29 |
30 |
31 | false
32 |
33 |
34 | ${module.artifactId}
35 | src/main/java
36 |
37 |
38 |
39 |
40 |
41 | true
42 |
43 | io.github.jbellis:jvector-native
44 |
45 |
46 | false
47 |
48 |
49 | ${module.artifactId}
50 | src/main/java
51 |
52 |
53 |
54 |
55 |
56 |
57 |
--------------------------------------------------------------------------------
/jvector-native/src/main/c/jextract_vector_simd.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# fail on error
set -e

# Copyright DataStax, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Builds ../resources/libjvector.so from the C sources next to this script and,
# when jextract is on the PATH, regenerates the Java bindings under ../java.
#
# Usage: jextract_vector_simd.sh [--auto-install-gcc]
#   --auto-install-gcc  install gcc via apt when it is missing (Ubuntu only).
#                       Setting AUTO_INSTALL_GCC=true in the environment has
#                       the same effect.
# Exit status: 0 on success (also when gcc is too old or jextract is missing,
#              both of which only produce a warning), 2 when gcc is missing
#              and cannot be installed.

# Every path below is relative to this script's directory, so run from there
# no matter where the caller invoked us from.
cd "$(dirname "${BASH_SOURCE[0]}")"

if [[ "${1:-}" == "--auto-install-gcc" ]]; then
  AUTO_INSTALL_GCC=true
  shift
fi
printf 'AUTO_INSTALL_GCC=%s\n' "${AUTO_INSTALL_GCC:-}"

mkdir -p ../resources
# compile jvector_simd_check.c as x86-64
# compile jvector_simd.c as icelake-server
# produce one shared library

# Desired minimum GCC version
readonly MIN_GCC_VERSION=11

# Check if GCC is installed
if ! command -v gcc &> /dev/null; then
  if [[ "${AUTO_INSTALL_GCC:-}" == "true" ]]; then
    # lsb_release may itself be missing; treat that as an unknown distribution
    # instead of letting 'set -e' abort with a bare "command not found".
    LSB_RELEASE=$(lsb_release --id --short 2> /dev/null || true)
    printf 'LSB_RELEASE=%s\n' "${LSB_RELEASE}"
    if [[ "${LSB_RELEASE}" == "Ubuntu" ]]; then
      sudo apt update
      sudo apt install -y gcc
    else
      printf 'distribution %s needs a gcc install command in %s\n' "${LSB_RELEASE}" "${0}" >&2
      exit 2
    fi
  else
    echo "GCC is not installed. Please install GCC 11+ to build supporting native libraries." >&2
    exit 2
  fi
fi

CURRENT_GCC_VERSION=$(gcc -dumpversion)

# Check if the current GCC version is greater than or equal to the minimum required version:
# with a version sort, the minimum sorts first exactly when current >= minimum.
if [[ "$(printf '%s\n' "$MIN_GCC_VERSION" "$CURRENT_GCC_VERSION" | sort -V | head -n1)" == "$MIN_GCC_VERSION" ]]; then
  rm -f ../resources/libjvector.so
  gcc -fPIC -O3 -march=icelake-server -c jvector_simd.c -o jvector_simd.o
  gcc -fPIC -O3 -march=x86-64 -c jvector_simd_check.c -o jvector_simd_check.o
  gcc -shared -o ../resources/libjvector.so jvector_simd_check.o jvector_simd.o
else
  # Deliberately not fatal: the script still goes on to the jextract step below.
  echo "WARNING: GCC version $CURRENT_GCC_VERSION is too old. Please upgrade to GCC $MIN_GCC_VERSION or newer." >&2
fi

# Generate Java source code
# Should only be run when c header changes
# Check if jextract is available before running
if ! command -v jextract &> /dev/null; then
  echo "WARNING: jextract could not be found, please install it if you need to update bindings." >&2
  exit 0
fi

jextract \
  --output ../java \
  -t io.github.jbellis.jvector.vector.cnative \
  -I . \
  --header-class-name NativeSimdOps \
  jvector_simd.h

# Set critical linker option with heap-based segments for all generated methods
sed -i 's/DESC)/DESC, Linker.Option.critical(true))/g' ../java/io/github/jbellis/jvector/vector/cnative/NativeSimdOps.java
77 |
--------------------------------------------------------------------------------
/jvector-native/src/main/c/jvector_simd.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
/* 'bool' in the declaration of check_compatibility() comes from here. */
#include <stdbool.h>

#ifndef VECTOR_SIMD_DOT_H
#define VECTOR_SIMD_DOT_H

// check CPU support
bool check_compatibility(void);

//F32
float dot_product_f32(int preferred_size, const float* a, int aoffset, const float* b, int boffset, int length);
float euclidean_f32(int preferred_size, const float* a, int aoffset, const float* b, int boffset, int length);
void bulk_quantized_shuffle_dot_f32_512(const unsigned char* shuffles, int codebookCount, const char* quantizedPartials, float delta, float minDistance, float* results);
void bulk_quantized_shuffle_euclidean_f32_512(const unsigned char* shuffles, int codebookCount, const char* quantizedPartials, float delta, float minDistance, float* results);
void bulk_quantized_shuffle_cosine_f32_512(const unsigned char* shuffles, int codebookCount, const char* quantizedPartialSums, float sumDelta, float minDistance, const char* quantizedPartialMagnitudes, float magnitudeDelta, float minMagnitude, float queryMagnitudeSquared, float* results);
float assemble_and_sum_f32_512(const float* data, int dataBase, const unsigned char* baseOffsets, int baseOffsetsOffset, int baseOffsetsLength);
float pq_decoded_cosine_similarity_f32_512(const unsigned char* baseOffsets, int baseOffsetsOffset, int baseOffsetsLength, int clusterCount, const float* partialSums, const float* aMagnitude, float bMagnitude);
void calculate_partial_sums_dot_f32_512(const float* codebook, int codebookBase, int size, int clusterCount, const float* query, int queryOffset, float* partialSums);
void calculate_partial_sums_euclidean_f32_512(const float* codebook, int codebookBase, int size, int clusterCount, const float* query, int queryOffset, float* partialSums);
void calculate_partial_sums_best_dot_f32_512(const float* codebook, int codebookBase, int size, int clusterCount, const float* query, int queryOffset, float* partialSums, float* partialBestDistances);
void calculate_partial_sums_best_euclidean_f32_512(const float* codebook, int codebookBase, int size, int clusterCount, const float* query, int queryOffset, float* partialSums, float* partialBestDistances);
#endif
38 |
--------------------------------------------------------------------------------
/jvector-native/src/main/c/jvector_simd_check.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #include
18 | #include "jvector_simd.h"
19 |
20 | bool check_compatibility(void) {
21 | unsigned int eax, ebx, ecx, edx;
22 | bool avx512f_supported = false, avx512cd_supported = false,
23 | avx512bw_supported = false, avx512dq_supported = false,
24 | avx512vl_supported = false;
25 |
26 | // Check for AVX-512 Foundation (AVX-512F) and other AVX-512 features:
27 | // These are indicated by various bits of EBX from leaf 7, sub-leaf 0.
28 | if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
29 | avx512f_supported = ebx & (1 << 16); // AVX-512F
30 | avx512cd_supported = ebx & (1 << 28); // AVX-512CD
31 | avx512bw_supported = ebx & (1 << 30); // AVX-512BW
32 | avx512dq_supported = ebx & (1 << 17); // AVX-512DQ
33 | avx512vl_supported = ebx & (1 << 31); // AVX-512VL
34 | }
35 |
36 | return avx512f_supported && avx512cd_supported && avx512bw_supported && avx512dq_supported && avx512vl_supported;
37 | }
38 |
--------------------------------------------------------------------------------
/jvector-native/src/main/java/io/github/jbellis/jvector/vector/NativeVectorizationProvider.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.vector;
18 |
19 | import io.github.jbellis.jvector.annotations.Experimental;
20 | import io.github.jbellis.jvector.vector.cnative.LibraryLoader;
21 | import io.github.jbellis.jvector.vector.cnative.NativeSimdOps;
22 | import io.github.jbellis.jvector.vector.types.VectorTypeSupport;
23 |
24 | /**
25 | * Experimental!
26 | * VectorizationProvider implementation that uses MemorySegment vectors and prefers native/Panama SIMD.
27 | */
28 | @Experimental
29 | public class NativeVectorizationProvider extends VectorizationProvider {
30 | private final VectorUtilSupport vectorUtilSupport;
31 | private final VectorTypeSupport vectorTypeSupport;
32 |
33 | public NativeVectorizationProvider() {
34 | var libraryLoaded = LibraryLoader.loadJvector();
35 | if (!libraryLoaded) {
36 | throw new UnsupportedOperationException("Failed to load supporting native library.");
37 | }
38 | if (!NativeSimdOps.check_compatibility()) {
39 | throw new UnsupportedOperationException("Native SIMD operations are not supported on this platform due to missing CPU support.");
40 | }
41 | this.vectorUtilSupport = new NativeVectorUtilSupport();
42 | this.vectorTypeSupport = new MemorySegmentVectorProvider();
43 | }
44 |
45 | @Override
46 | public VectorUtilSupport getVectorUtilSupport() {
47 | return vectorUtilSupport;
48 | }
49 |
50 | @Override
51 | public VectorTypeSupport getVectorTypeSupport() {
52 | return vectorTypeSupport;
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/jvector-native/src/main/java/io/github/jbellis/jvector/vector/cnative/LibraryLoader.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.vector.cnative;
18 |
19 | import java.io.File;
20 | import java.nio.file.Files;
21 |
/**
 * This class is used to load supporting native libraries. First, it tries to load the library from the system path.
 * If that fails, it tries to load the library from the classpath (using the usual copying to a tmp directory route).
 */
public class LibraryLoader {
    private LibraryLoader() {}

    /**
     * Attempts to load the "jvector" native library.
     *
     * @return true if the library was loaded, false if it could not be found or loaded;
     *         failures are reported through the return value rather than thrown
     */
    public static boolean loadJvector() {
        try {
            System.loadLibrary("jvector");
            return true;
        } catch (UnsatisfiedLinkError e) {
            // not on the library path; fall back to the copy bundled on the classpath
        }

        // reinventing the wheel instead of picking up deps, so we'll just use the classloader to load the library
        // as a resource and then copy it to a tmp directory and load it from there
        String libName = System.mapLibraryName("jvector");
        try (var in = LibraryLoader.class.getResourceAsStream("/" + libName)) {
            if (in == null) {
                // couldn't find library; checked before creating the temp file so nothing is left behind
                return false;
            }
            int extensionStart = libName.lastIndexOf('.');
            File tmpLibFile = File.createTempFile(libName.substring(0, extensionStart), libName.substring(extensionStart));
            // the extracted copy is only needed by this JVM
            tmpLibFile.deleteOnExit();
            try {
                try (var out = Files.newOutputStream(tmpLibFile.toPath())) {
                    in.transferTo(out);
                }
                System.load(tmpLibFile.getAbsolutePath());
                return true;
            } catch (Exception | UnsatisfiedLinkError e) {
                // the copy is useless if it could not be written or loaded
                tmpLibFile.delete();
                return false;
            }
        } catch (Exception | UnsatisfiedLinkError e) {
            return false;
        }
    }

}
58 |
--------------------------------------------------------------------------------
/jvector-tests/resources/version0.odgi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datastax/jvector/c0f21804b60e0c0ad3c55ef6ba9781d5108df41d/jvector-tests/resources/version0.odgi
--------------------------------------------------------------------------------
/jvector-tests/resources/version0.pq:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datastax/jvector/c0f21804b60e0c0ad3c55ef6ba9781d5108df41d/jvector-tests/resources/version0.pq
--------------------------------------------------------------------------------
/jvector-tests/src/main/assembly/test-jar-with-dependencies.xml:
--------------------------------------------------------------------------------
1 |
4 | test-jar-with-dependencies
5 |
6 | jar
7 |
8 | false
9 |
10 |
11 | /
12 | true
13 |
14 | true
15 | true
16 | test
17 |
18 |
19 |
--------------------------------------------------------------------------------
/jvector-tests/src/test/java/io/github/jbellis/jvector/LuceneTestCase.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector;
18 |
19 | import com.carrotsearch.randomizedtesting.RandomizedTest;
20 |
21 | import java.util.Random;
22 |
23 | // Not actually derived from Lucene, but provides a random() method like LuceneTestCase does
24 | // for easier porting of Lucene tests
25 | public class LuceneTestCase extends RandomizedTest {
26 | public static int RANDOM_MULTIPLIER = 2;
27 |
28 | public static Random random() {
29 | return getRandom();
30 | }
31 |
32 | public static int atLeast(Random random, int n) {
33 | return n + random.nextInt(n / 2);
34 | }
35 |
36 | public static int atLeast(int n) {
37 | return n + getRandom().nextInt(n / 2);
38 | }
39 |
40 | public static int nextInt(int from, int to) {
41 | return getRandom().nextInt(to - from) + from;
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/jvector-tests/src/test/java/io/github/jbellis/jvector/graph/MockVectorValues.java:
--------------------------------------------------------------------------------
1 | /*
2 | * All changes to the original code are Copyright DataStax, Inc.
3 | *
4 | * Please see the included license file for details.
5 | */
6 |
7 | /*
8 | * Original license:
9 | * Licensed to the Apache Software Foundation (ASF) under one or more
10 | * contributor license agreements. See the NOTICE file distributed with
11 | * this work for additional information regarding copyright ownership.
12 | * The ASF licenses this file to You under the Apache License, Version 2.0
13 | * (the "License"); you may not use this file except in compliance with
14 | * the License. You may obtain a copy of the License at
15 | *
16 | * http://www.apache.org/licenses/LICENSE-2.0
17 | *
18 | * Unless required by applicable law or agreed to in writing, software
19 | * distributed under the License is distributed on an "AS IS" BASIS,
20 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21 | * See the License for the specific language governing permissions and
22 | * limitations under the License.
23 | */
24 |
25 | package io.github.jbellis.jvector.graph;
26 |
27 | import io.github.jbellis.jvector.util.ArrayUtil;
28 | import io.github.jbellis.jvector.vector.VectorizationProvider;
29 | import io.github.jbellis.jvector.vector.types.VectorFloat;
30 | import io.github.jbellis.jvector.vector.types.VectorTypeSupport;
31 |
32 | public class MockVectorValues implements RandomAccessVectorValues {
33 | private static final VectorTypeSupport vectorTypeSupport = VectorizationProvider.getInstance().getVectorTypeSupport();
34 | private final VectorFloat> scratch;
35 | private final int dimension;
36 | private final VectorFloat>[] denseValues;
37 |
38 | public static MockVectorValues fromValues(VectorFloat>[] values) {
39 | return new MockVectorValues(values[0].length(), values);
40 | }
41 |
42 | MockVectorValues(int dimension, VectorFloat>[] denseValues) {
43 | this.dimension = dimension;
44 | this.denseValues = denseValues;
45 | this.scratch = vectorTypeSupport.createFloatVector(dimension);
46 | }
47 |
48 | @Override
49 | public MockVectorValues copy() {
50 | return new MockVectorValues(dimension, ArrayUtil.copyOfSubArray(denseValues, 0, denseValues.length));
51 | }
52 |
53 | @Override
54 | public boolean isValueShared() {
55 | return true;
56 | }
57 |
58 | @Override
59 | public VectorFloat> getVector(int targetOrd) {
60 | VectorFloat> original = denseValues[targetOrd];
61 | // present a single vector reference to callers like the disk-backed RAVV implmentations,
62 | // to catch cases where they are not making a copy
63 | scratch.copyFrom(original, 0, 0, dimension);
64 | return scratch;
65 | }
66 |
67 | @Override
68 | public int size() {
69 | return denseValues.length;
70 | }
71 |
72 | @Override
73 | public int dimension() {
74 | return dimension;
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package io.github.jbellis.jvector.microbench;
17 |
18 |
19 | import io.github.jbellis.jvector.example.util.DataSet;
20 | import io.github.jbellis.jvector.example.util.Hdf5Loader;
21 | import io.github.jbellis.jvector.graph.GraphIndexBuilder;
22 | import io.github.jbellis.jvector.graph.ListRandomAccessVectorValues;
23 | import org.openjdk.jmh.annotations.Benchmark;
24 | import org.openjdk.jmh.annotations.BenchmarkMode;
25 | import org.openjdk.jmh.annotations.Fork;
26 | import org.openjdk.jmh.annotations.Measurement;
27 | import org.openjdk.jmh.annotations.Mode;
28 | import org.openjdk.jmh.annotations.OutputTimeUnit;
29 | import org.openjdk.jmh.annotations.Scope;
30 | import org.openjdk.jmh.annotations.State;
31 | import org.openjdk.jmh.annotations.Warmup;
32 | import org.openjdk.jmh.infra.Blackhole;
33 |
34 | import java.util.concurrent.TimeUnit;
35 |
36 | @Warmup(iterations = 1, time = 5)
37 | @Measurement(iterations = 1, time = 10)
38 | @Fork(warmups = 0, value = 1, jvmArgsAppend = {"--add-modules=jdk.incubator.vector", "-XX:+UnlockDiagnosticVMOptions", "--enable-preview", "-XX:+PreserveFramePointer", "-XX:+DebugNonSafepoints"})
39 | public class GraphBuildBench {
40 |
41 | @State(Scope.Benchmark)
42 | public static class Parameters {
43 | final DataSet ds;
44 | final ListRandomAccessVectorValues ravv;
45 |
46 | public Parameters() {
47 | this.ds = Hdf5Loader.load("hdf5/glove-100-angular.hdf5");
48 | this.ravv = new ListRandomAccessVectorValues(ds.baseVectors, ds.baseVectors.get(0).length());
49 | }
50 | }
51 |
52 | @Benchmark
53 | @BenchmarkMode(Mode.Throughput)
54 | @OutputTimeUnit(TimeUnit.SECONDS)
55 | public void testGraphBuild(Blackhole bh, Parameters p) {
56 | long start = System.nanoTime();
57 | GraphIndexBuilder graphIndexBuilder = new GraphIndexBuilder(p.ravv, p.ds.similarityFunction, 8, 60, 1.2f, 1.4f, false);
58 | graphIndexBuilder.build(p.ravv);
59 | System.out.format("Build M=%d ef=%d in %.2fs%n",
60 | 32, 600, (System.nanoTime() - start) / 1_000_000_000.0);
61 | }
62 |
63 | @Benchmark
64 | @BenchmarkMode(Mode.Throughput)
65 | @OutputTimeUnit(TimeUnit.SECONDS)
66 | public void testGraphBuildWithHierarchy(Blackhole bh, Parameters p) {
67 | long start = System.nanoTime();
68 | GraphIndexBuilder graphIndexBuilder = new GraphIndexBuilder(p.ravv, p.ds.similarityFunction, 8, 60, 1.2f, 1.4f, true);
69 | graphIndexBuilder.build(p.ravv);
70 | System.out.format("Build M=%d ef=%d in %.2fs%n",
71 | 32, 600, (System.nanoTime() - start) / 1_000_000_000.0);
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/SimilarityBench.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package io.github.jbellis.jvector.microbench;
17 |
18 |
19 | import io.github.jbellis.jvector.TestUtil;
20 | import io.github.jbellis.jvector.vector.VectorUtil;
21 | import io.github.jbellis.jvector.vector.types.VectorFloat;
22 | import org.openjdk.jmh.annotations.Benchmark;
23 | import org.openjdk.jmh.annotations.BenchmarkMode;
24 | import org.openjdk.jmh.annotations.Fork;
25 | import org.openjdk.jmh.annotations.Level;
26 | import org.openjdk.jmh.annotations.Measurement;
27 | import org.openjdk.jmh.annotations.Mode;
28 | import org.openjdk.jmh.annotations.OutputTimeUnit;
29 | import org.openjdk.jmh.annotations.Param;
30 | import org.openjdk.jmh.annotations.Scope;
31 | import org.openjdk.jmh.annotations.Setup;
32 | import org.openjdk.jmh.annotations.State;
33 | import org.openjdk.jmh.annotations.Threads;
34 | import org.openjdk.jmh.annotations.Warmup;
35 | import org.openjdk.jmh.infra.Blackhole;
36 |
37 | import java.util.Random;
38 | import java.util.concurrent.TimeUnit;
39 |
40 | @BenchmarkMode(Mode.Throughput)
41 | @OutputTimeUnit(TimeUnit.SECONDS)
42 | @Warmup(iterations = 2, time = 5)
43 | @Measurement(iterations = 3, time = 5)
44 | @Fork(value = 1, jvmArgsAppend = {"--add-modules=jdk.incubator.vector", "--enable-preview", "-Djvector.experimental.enable_native_vectorization=true"})
45 | @State(Scope.Thread)
46 | public class SimilarityBench {
47 |
48 | @Param({"4", "8", "16", "1024"})
49 | int size = 1024;
50 |
51 | VectorFloat> A, B;
52 |
53 | @Setup(Level.Trial)
54 | public void setUp()
55 | {
56 | A = TestUtil.randomVector(new Random(), size);
57 | B = TestUtil.randomVector(new Random(), size);
58 | }
59 |
60 | @BenchmarkMode(Mode.Throughput)
61 | @OutputTimeUnit(TimeUnit.SECONDS)
62 | @Benchmark
63 | @Threads(8)
64 | public void testDotProduct8(Blackhole bh) {
65 | bh.consume(VectorUtil.dotProduct(A, B));
66 | }
67 |
68 |
69 | @BenchmarkMode(Mode.AverageTime)
70 | @OutputTimeUnit(TimeUnit.NANOSECONDS)
71 | @Benchmark
72 | @Threads(1)
73 | public void testDotProduct1(Blackhole bh) {
74 | bh.consume(VectorUtil.dotProduct(A, B));
75 | }
76 |
77 | public static void main(String[] args) throws Exception {
78 | org.openjdk.jmh.Main.main(args);
79 | }
80 | }
81 |
82 |
--------------------------------------------------------------------------------
/jvector-tests/src/test/java/io/github/jbellis/jvector/quantization/TestBinaryQuantization.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.quantization;
18 |
19 | import org.junit.Test;
20 |
21 | import com.carrotsearch.randomizedtesting.RandomizedTest;
22 | import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope;
23 | import io.github.jbellis.jvector.TestUtil;
24 | import io.github.jbellis.jvector.graph.ListRandomAccessVectorValues;
25 | import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
26 |
27 | import static io.github.jbellis.jvector.TestUtil.createRandomVectors;
28 | import static org.junit.jupiter.api.Assertions.assertEquals;
29 |
30 | @ThreadLeakScope(ThreadLeakScope.Scope.NONE)
31 | public class TestBinaryQuantization extends RandomizedTest
32 | {
33 | @Test
34 | public void testMutableImmutableBQEquality()
35 | {
36 | var vectors = createRandomVectors(512, 64);
37 | var ravv = new ListRandomAccessVectorValues(vectors, 64);
38 | var bq = new BinaryQuantization(ravv.dimension());
39 | var immutableCompressedVectors = bq.encodeAll(ravv);
40 | var mutableCompressedVectors = new MutableBQVectors(bq);
41 | for (int i = 0; i < ravv.size(); i++)
42 | {
43 | mutableCompressedVectors.encodeAndSet(i, ravv.getVector(i));
44 | }
45 | assertEquals(mutableCompressedVectors.count(), immutableCompressedVectors.count());
46 | var randomVector = TestUtil.randomVector(getRandom(), 64);
47 | for (VectorSimilarityFunction vsf : VectorSimilarityFunction.values())
48 | {
49 | var immutableScoreFunction = immutableCompressedVectors.scoreFunctionFor(randomVector, vsf);
50 | var mutableScoreFunction = mutableCompressedVectors.scoreFunctionFor(randomVector, vsf);
51 | for (int i = 0; i < ravv.size(); i++)
52 | {
53 | assertEquals(immutableScoreFunction.similarityTo(i), mutableScoreFunction.similarityTo(i));
54 | }
55 | }
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/jvector-tests/src/test/java/io/github/jbellis/jvector/util/TestAtomicFixedBitSet.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.util;
18 |
19 | import io.github.jbellis.jvector.TestUtil;
20 | import org.junit.Assert;
21 | import org.junit.Test;
22 |
23 | public class TestAtomicFixedBitSet extends BaseBitSetTestCase {
24 |
25 | @Override
26 | public AtomicFixedBitSet copyOf(BitSet bs, int length) {
27 | final AtomicFixedBitSet set = new AtomicFixedBitSet(length);
28 | for (int doc = bs.nextSetBit(0); doc != DocIdSetIterator.NO_MORE_DOCS; doc = bs.nextSetBit(doc + 1)) {
29 | set.set(doc);
30 | }
31 | return set;
32 | }
33 |
34 | @SuppressWarnings("NarrowCalculation")
35 | @Test
36 | public void testApproximateCardinality() {
37 | // The approximate cardinality works in such a way that it should be pretty accurate on a bitset
38 | // whose bits are uniformly distributed.
39 | final AtomicFixedBitSet set = new AtomicFixedBitSet(TestUtil.nextInt(random(), 100_000, 200_000));
40 | final int first = random().nextInt(10);
41 | final int interval = TestUtil.nextInt(random(), 10, 20);
42 | for (int i = first; i < set.length(); i += interval) {
43 | set.set(i);
44 | }
45 | final int cardinality = set.cardinality();
46 | Assert.assertEquals(cardinality, set.approximateCardinality(), cardinality / 20); // 5% error at most
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/jvector-tests/src/test/java/io/github/jbellis/jvector/util/TestThreadSafeGrowableBitSet.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.util;
18 |
19 | public class TestThreadSafeGrowableBitSet extends BaseBitSetTestCase {
20 |
21 | @Override
22 | public ThreadSafeGrowableBitSet copyOf(BitSet bs, int length) {
23 | final ThreadSafeGrowableBitSet set = new ThreadSafeGrowableBitSet(length);
24 | for (int doc = bs.nextSetBit(0); doc != DocIdSetIterator.NO_MORE_DOCS; doc = bs.nextSetBit(doc + 1)) {
25 | set.set(doc);
26 | }
27 | return set;
28 | }
29 | }
30 |
31 |
--------------------------------------------------------------------------------
/jvector-tests/src/test/java/io/github/jbellis/jvector/vector/TestMatrixUtil.java:
--------------------------------------------------------------------------------
1 | /*
2 | * All changes to the original code are Copyright DataStax, Inc.
3 | *
4 | * Please see the included license file for details.
5 | */
6 |
7 | /*
8 | * Original license:
9 | * Licensed to the Apache Software Foundation (ASF) under one or more
10 | * contributor license agreements. See the NOTICE file distributed with
11 | * this work for additional information regarding copyright ownership.
12 | * The ASF licenses this file to You under the Apache License, Version 2.0
13 | * (the "License"); you may not use this file except in compliance with
14 | * the License. You may obtain a copy of the License at
15 | *
16 | * http://www.apache.org/licenses/LICENSE-2.0
17 | *
18 | * Unless required by applicable law or agreed to in writing, software
19 | * distributed under the License is distributed on an "AS IS" BASIS,
20 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21 | * See the License for the specific language governing permissions and
22 | * limitations under the License.
23 | */
24 |
25 | package io.github.jbellis.jvector.vector;
26 |
27 | import org.junit.Test;
28 |
29 | import static org.junit.Assert.assertArrayEquals;
30 | import static org.junit.Assert.assertEquals;
31 |
32 | public class TestMatrixUtil {
33 | @Test
34 | public void testInvert() {
35 | var matrix = Matrix.from(new float[][] {{4, 7}, {2, 6}});
36 | var expected = Matrix.from(new float[][] {{0.6f, -0.7f}, {-0.2f, 0.4f}});
37 | assertEquals(expected, matrix.invert());
38 | }
39 |
40 | @Test(expected = IllegalArgumentException.class)
41 | public void testInvertNonSquareMatrix() {
42 | var matrix = Matrix.from(new float[][] {{1, 2, 3}, {4, 5, 6}});
43 | matrix.invert();
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/jvector-twenty/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
6 | 4.0.0
7 |
8 | io.github.jbellis
9 | jvector-parent
10 | ${revision}
11 |
12 | jvector-twenty
13 | Twenty
14 |
15 |
16 |
17 | org.apache.maven.plugins
18 | maven-compiler-plugin
19 | 3.11.0
20 |
21 | 20
22 |
23 |
24 | --add-modules
25 | jdk.incubator.vector
26 |
27 |
28 |
29 |
30 | org.apache.maven.plugins
31 | maven-surefire-plugin
32 | 3.1.2
33 |
34 | false
35 |
36 | --enable-preview
37 | --add-modules jdk.incubator.vector
38 | --enable-native-access=ALL-UNNAMED
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 | io.github.jbellis
49 | jvector-base
50 | ${project.version}
51 |
52 |
53 | commons-math3
54 | org.apache.commons
55 |
56 |
57 |
58 |
59 | org.junit.jupiter
60 | junit-jupiter-engine
61 | test
62 |
63 |
64 | com.carrotsearch.randomizedtesting
65 | randomizedtesting-runner
66 | test
67 |
68 |
69 | org.openjdk.jmh
70 | jmh-core
71 | test
72 |
73 |
74 | commons-math3
75 | org.apache.commons
76 |
77 |
78 |
79 |
80 | org.openjdk.jmh
81 | jmh-generator-annprocess
82 | test
83 |
84 |
85 |
86 |
--------------------------------------------------------------------------------
/jvector-twenty/src/main/java/io/github/jbellis/jvector/vector/PanamaVectorizationProvider.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright DataStax, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.github.jbellis.jvector.vector;
18 |
19 | import io.github.jbellis.jvector.vector.types.VectorTypeSupport;
20 | import jdk.incubator.vector.FloatVector;
21 |
22 | /**
23 | * Vectorization provider that uses on-heap arrays and SIMD operations through Panama SIMD API.
24 | */
25 | public class PanamaVectorizationProvider extends VectorizationProvider
26 | {
27 | static {
28 | System.setProperty("jdk.incubator.vector.VECTOR_ACCESS_OOB_CHECK", "0");
29 | }
30 |
31 | private final VectorUtilSupport vectorUtilSupport;
32 | private final VectorTypeSupport vectorTypeSupport;
33 |
34 | public PanamaVectorizationProvider() {
35 | this.vectorUtilSupport = new PanamaVectorUtilSupport();
36 | LOG.info("Preferred f32 species is " + FloatVector.SPECIES_PREFERRED.vectorBitSize());
37 | this.vectorTypeSupport = new ArrayVectorProvider();
38 | }
39 |
40 | @Override
41 | public VectorUtilSupport getVectorUtilSupport() {
42 | return vectorUtilSupport;
43 | }
44 |
45 | @Override
46 | public VectorTypeSupport getVectorTypeSupport() {
47 | return vectorTypeSupport;
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/rat-excludes.txt:
--------------------------------------------------------------------------------
1 | .github/workflows/unit-tests.yaml
2 | .github/workflows/tag-release.yml
3 | .mvn/wrapper/maven-wrapper.properties
4 | .mvn/jvm.config
5 | README.md
6 | UPGRADING.md
7 | CHANGELOG.md
8 | rat-excludes.txt
9 | pom.xml
10 | src/main/assembly/test-jar-with-dependencies.xml
11 | src/assembly/mrjar.xml
12 | src/assembly/sourcesjar.xml
13 | src/main/java/io/github/jbellis/jvector/vector/cnative/*
14 | src/main/resources/log4j2.xml
15 | scripts/test_node_setup.sh
16 | yaml-configs/*.yml
--------------------------------------------------------------------------------
/siftsmall/siftsmall_base.fvecs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datastax/jvector/c0f21804b60e0c0ad3c55ef6ba9781d5108df41d/siftsmall/siftsmall_base.fvecs
--------------------------------------------------------------------------------
/siftsmall/siftsmall_groundtruth.ivecs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datastax/jvector/c0f21804b60e0c0ad3c55ef6ba9781d5108df41d/siftsmall/siftsmall_groundtruth.ivecs
--------------------------------------------------------------------------------
/siftsmall/siftsmall_query.fvecs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datastax/jvector/c0f21804b60e0c0ad3c55ef6ba9781d5108df41d/siftsmall/siftsmall_query.fvecs
--------------------------------------------------------------------------------
/update_changelog.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Copyright DataStax, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# @author Madhavan Sridharan
#
# Regenerates CHANGELOG.md with the github-changelog-generator docker image.
#
# Requirements:
#   - docker on PATH
#   - GITHUB_TOKEN set in the environment
#
# Steps:
#   1. Find the first "## [x.y.z]" heading in CHANGELOG.md (the previous version).
#   2. Drop everything above that heading.
#   3. Run the generator with --since-tag <previous version>.
#   4. Strip the 3-line footer the generator appends.
set -euo pipefail

#######################################
# Prints a message to stderr and aborts the script.
# Arguments: $* - message
#######################################
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

command -v docker > /dev/null || die "Please ensure that docker is installed"

# Validate the token before touching CHANGELOG.md; otherwise a missing token would
# leave the file with its header already removed.
if [[ -z ${GITHUB_TOKEN-} ]]; then
  die "**WARNING** GITHUB_TOKEN is not currently set"
fi

cd -P -- "$(dirname -- "$0")" # switch to this dir

CHANGELOG_FILE=CHANGELOG.md
previous_version_line_number=$(
  awk '/## \[[0-9]+\.[0-9]+\.[0-9]+(-[a-zA-Z0-9\.]+)?\]/ {print NR; exit}' "$CHANGELOG_FILE"
)
[[ -n "$previous_version_line_number" ]] \
  || die "No '## [x.y.z]' version heading found in $CHANGELOG_FILE"

# Take the version from exactly the matched line: "## [1.2.3](...)" -> "1.2.3".
previous_version=$(
  sed -n "${previous_version_line_number}p" "$CHANGELOG_FILE" \
    | awk -F']' '{print $1}' \
    | cut -c 5-
)
printf 'previous_version: %s\n' "$previous_version"

# Remove the header so we can append the additions
tail -n "+${previous_version_line_number}" "$CHANGELOG_FILE" > "$CHANGELOG_FILE.tmp" \
  && mv -- "$CHANGELOG_FILE.tmp" "$CHANGELOG_FILE"

# Only request an interactive TTY when stdout is a terminal.
docker_args=(--rm -v "$(pwd)":/usr/local/src/your-app)
if [[ -t 1 ]]; then
  docker_args=(-it "${docker_args[@]}")
fi

docker run "${docker_args[@]}" \
  githubchangeloggenerator/github-changelog-generator \
  -u datastax -p jvector -t "$GITHUB_TOKEN" \
  --since-tag "$previous_version" \
  --output "$CHANGELOG_FILE" \
  --release-branch 'main' \
  --exclude-labels 'duplicate,question,invalid,wontfix'

# Remove the additional footer added by the changelog generator
head -n "$(( $(wc -l < "$CHANGELOG_FILE") - 3 ))" "$CHANGELOG_FILE" > "$CHANGELOG_FILE.tmp" \
  && mv -- "$CHANGELOG_FILE.tmp" "$CHANGELOG_FILE"
45 |
--------------------------------------------------------------------------------