@@ -0,0 +1,150 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include "vec.h"
+
+#include <emmintrin.h>
+#include <immintrin.h>
+
+#ifndef DOT7U_STRIDE_BYTES_LEN
+#define DOT7U_STRIDE_BYTES_LEN 32 // Must be a power of 2
+#endif
+
+#ifndef SQR7U_STRIDE_BYTES_LEN
+#define SQR7U_STRIDE_BYTES_LEN 32 // Must be a power of 2
+#endif
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#elif defined(__GNUC__) || defined(__clang__)
+#include <x86intrin.h>
+#endif
+
+// Multi-platform CPUID "intrinsic"; it takes as input a "functionNumber" (or "leaf", the eax register). "Subleaf"
+// is always 0. Output is stored in the passed output parameter: output[0] = eax, output[1] = ebx, output[2] = ecx,
+// output[3] = edx
+static inline void cpuid(int output[4], int functionNumber) {
+#if defined(__GNUC__) || defined(__clang__)
+    // use inline assembly, Gnu/AT&T syntax
+    int a, b, c, d;
+    __asm("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "a"(functionNumber), "c"(0) : );
+    output[0] = a;
+    output[1] = b;
+    output[2] = c;
+    output[3] = d;
+
+#elif defined(_MSC_VER)
+    __cpuidex(output, functionNumber, 0);
+#else
+    #error Unsupported compiler
+#endif
+}
+
+// Utility function to horizontally add 8 32-bit integers
+static inline int hsum_i32_8(const __m256i a) {
+    const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
+    const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
+    const __m128i sum64 = _mm_add_epi32(hi64, sum128);
+    const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
+    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
+}
+
+EXPORT int vec_caps() {
+    int cpuInfo[4] = {-1};
+    // Calling cpuid with 0x0 as the functionNumber argument
+    // gets the number of the highest valid function ID.
+    cpuid(cpuInfo, 0);
+    int functionIds = cpuInfo[0];
+    if (functionIds >= 7) {
+        cpuid(cpuInfo, 7);
+        int ebx = cpuInfo[1];
+        // The AVX2 feature flag is bit 5 of EBX (CPUID leaf 7, sub-leaf 0).
+        // We assume that all processors that have AVX2 also have FMA3.
+        return (ebx & (1 << 5)) != 0;
+    }
+    return 0;
+}
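+
+// Illustrative usage (a sketch, not part of this file's API): callers would
+// typically probe vec_caps() once, cache the result, and only invoke the AVX2
+// kernels below when it returns non-zero, e.g.
+//
+//   static int has_avx2 = -1;
+//   if (has_avx2 < 0) has_avx2 = vec_caps();
+//   int32_t d = has_avx2 ? dot7u(a, b, dims) : dot7u_scalar(a, b, dims);
+//
+// where dot7u_scalar is a hypothetical scalar fallback (see the sketch after
+// dot7u below).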
+
+static inline int32_t dot7u_inner(int8_t* a, int8_t* b, size_t dims) {
+    const __m256i ones = _mm256_set1_epi16(1);
+
+    // Init accumulator(s) with 0
+    __m256i acc1 = _mm256_setzero_si256();
+
+#pragma GCC unroll 4
+    for (size_t i = 0; i < dims; i += DOT7U_STRIDE_BYTES_LEN) {
+        // Load packed 8-bit integers
+        __m256i va1 = _mm256_loadu_si256((const __m256i *)(a + i));
+        __m256i vb1 = _mm256_loadu_si256((const __m256i *)(b + i));
+
+        // Perform multiplication and create 16-bit values
+        // Vertically multiply each unsigned 8-bit integer from va with the corresponding
+        // 8-bit integer from vb, producing intermediate signed 16-bit integers.
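+        // Note: maddubs saturates, but with 7-bit inputs (values in 0..127) each
+        // pair sum is at most 2 * 127 * 127 = 32258 < INT16_MAX, so no saturation occurs.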
+        const __m256i vab = _mm256_maddubs_epi16(va1, vb1);
+        // Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the results.
+        acc1 = _mm256_add_epi32(_mm256_madd_epi16(ones, vab), acc1);
+    }
+
+    // reduce (horizontally add all)
+    return hsum_i32_8(acc1);
+}
+
+EXPORT int32_t dot7u(int8_t* a, int8_t* b, size_t dims) {
+    int32_t res = 0;
+    size_t i = 0;
+    if (dims > DOT7U_STRIDE_BYTES_LEN) {
+        // Process the largest stride-aligned prefix with SIMD; the scalar
+        // tail loop below handles the remainder.
+        i += dims & ~(DOT7U_STRIDE_BYTES_LEN - 1);
+        res = dot7u_inner(a, b, i);
+    }
+    for (; i < dims; i++) {
+        res += a[i] * b[i];
+    }
+    return res;
+}
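+
+// For reference/testing (a minimal sketch, not part of the exported API), the
+// scalar equivalent that dot7u must agree with for any dims:
+//
+//   static int32_t dot7u_scalar(const int8_t* a, const int8_t* b, size_t dims) {
+//       int32_t res = 0;
+//       for (size_t i = 0; i < dims; i++) {
+//           res += a[i] * b[i];
+//       }
+//       return res;
+//   }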
+
+static inline int32_t sqr7u_inner(int8_t *a, int8_t *b, size_t dims) {
+    // Init accumulator(s) with 0
+    __m256i acc1 = _mm256_setzero_si256();
+
+    const __m256i ones = _mm256_set1_epi16(1);
+
+#pragma GCC unroll 4
+    for (size_t i = 0; i < dims; i += SQR7U_STRIDE_BYTES_LEN) {
+        // Load packed 8-bit integers
+        __m256i va1 = _mm256_loadu_si256((const __m256i *)(a + i));
+        __m256i vb1 = _mm256_loadu_si256((const __m256i *)(b + i));
+
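+        // Square the per-lane differences: sign(d, d) yields |d| (lanes where d is
+        // negative are negated), and with 7-bit inputs both maddubs operands are in
+        // 0..127, so each 16-bit pair sum is at most 2 * 127 * 127 = 32258 and the
+        // saturating multiply-add cannot overflow.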
+        const __m256i dist1 = _mm256_sub_epi8(va1, vb1);
+        const __m256i abs_dist1 = _mm256_sign_epi8(dist1, dist1);
+        const __m256i sqr1 = _mm256_maddubs_epi16(abs_dist1, abs_dist1);
+
+        acc1 = _mm256_add_epi32(_mm256_madd_epi16(ones, sqr1), acc1);
+    }
+
+    // reduce (accumulate all)
+    return hsum_i32_8(acc1);
+}
+
+EXPORT int32_t sqr7u(int8_t* a, int8_t* b, size_t dims) {
+    int32_t res = 0;
+    size_t i = 0;
+    if (dims > SQR7U_STRIDE_BYTES_LEN) {
+        // Process the largest stride-aligned prefix with SIMD; the scalar
+        // tail loop below handles the remainder.
+        i += dims & ~(SQR7U_STRIDE_BYTES_LEN - 1);
+        res = sqr7u_inner(a, b, i);
+    }
+    for (; i < dims; i++) {
+        int32_t dist = a[i] - b[i];
+        res += dist * dist;
+    }
+    return res;
+}
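+
+// Likewise, a minimal scalar reference for sqr7u (illustrative only):
+//
+//   static int32_t sqr7u_scalar(const int8_t* a, const int8_t* b, size_t dims) {
+//       int32_t res = 0;
+//       for (size_t i = 0; i < dims; i++) {
+//           int32_t dist = a[i] - b[i];
+//           res += dist * dist;
+//       }
+//       return res;
+//   }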
+