瀏覽代碼

Add a SIMD (AVX2) optimised vector distance function for int7 on x64 (#108088)

* Adding support for x64 to native vec library
* Fix: aarch64 sqr7u dims
* Fix: add symbol stripping (deb lintian)
---------
Co-authored-by: Chris Hegarty <62058229+ChrisHegarty@users.noreply.github.com>
Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
Lorenzo Dematté 1 年之前
父節點
當前提交
2e0f8d087c

+ 5 - 0
docs/changelog/108088.yaml

@@ -0,0 +1,5 @@
+pr: 108088
+summary: Add a SIMD (AVX2) optimised vector distance function for int7 on x64
+area: "Search"
+type: enhancement
+issues: []

+ 1 - 1
libs/native/libraries/build.gradle

@@ -18,7 +18,7 @@ configurations {
 }
 
 var zstdVersion = "1.5.5"
-var vecVersion = "1.0.6"
+var vecVersion = "1.0.8"
 
 repositories {
   exclusiveContent {

+ 9 - 1
libs/native/src/main/java/org/elasticsearch/nativeaccess/PosixNativeAccess.java

@@ -45,7 +45,15 @@ abstract class PosixNativeAccess extends AbstractNativeAccess {
     }
 
     static boolean isNativeVectorLibSupported() {
-        return Runtime.version().feature() >= 21 && isMacOrLinuxAarch64() && checkEnableSystemProperty();
+        return Runtime.version().feature() >= 21 && (isMacOrLinuxAarch64() || isLinuxAmd64()) && checkEnableSystemProperty();
+    }
+
+    /**
+     * Returns true iff the architecture is x64 (amd64) and the OS Linux (the OS we currently support for the native lib).
+     * <p>
+     * The check reads the {@code os.name} and {@code os.arch} system properties: the OS name must start with
+     * {@code "Linux"} and the architecture must be exactly {@code "amd64"}.
+     *
+     * @return {@code true} when both conditions hold, {@code false} otherwise
+     */
+    static boolean isLinuxAmd64() {
+        String name = System.getProperty("os.name");
+        return (name.startsWith("Linux")) && System.getProperty("os.arch").equals("amd64");
     }
 
     /** Returns true iff the OS is Mac or Linux, and the architecture is aarch64. */

+ 3 - 1
libs/native/src/test/java/org/elasticsearch/nativeaccess/VectorSimilarityFunctionsTests.java

@@ -37,7 +37,9 @@ public class VectorSimilarityFunctionsTests extends ESTestCase {
         var arch = System.getProperty("os.arch");
         var osName = System.getProperty("os.name");
 
-        if (jdkVersion >= 21 && arch.equals("aarch64") && (osName.startsWith("Mac") || osName.equals("Linux"))) {
+        if (jdkVersion >= 21
+            && ((arch.equals("aarch64") && (osName.startsWith("Mac") || osName.equals("Linux")))
+                || (arch.equals("amd64") && osName.equals("Linux")))) {
             assertThat(vectorSimilarityFunctions, isPresent());
             return true;
         } else {

+ 3 - 2
libs/vec/native/Dockerfile

@@ -4,6 +4,7 @@ RUN apt update
 RUN apt install -y gcc g++ openjdk-17-jdk
 COPY . /workspace
 WORKDIR /workspace
-RUN ./gradlew --quiet --console=plain clean vecSharedLibrary
+RUN ./gradlew --quiet --console=plain clean buildSharedLibrary
+RUN strip --strip-unneeded build/output/libvec.so
 
-CMD cat build/libs/vec/shared/libvec.so
+CMD cat build/output/libvec.so

+ 67 - 9
libs/vec/native/build.gradle

@@ -12,9 +12,10 @@ var os = org.gradle.internal.os.OperatingSystem.current()
 // To update this library run publish_vec_binaries.sh  ( or ./gradlew vecSharedLibrary )
 // Or
 // For local development, build the docker image with:
-//   docker build --platform linux/arm64 --progress=plain .
+//   docker build --platform linux/arm64 --progress=plain . (for aarch64)
+//   docker build --platform linux/amd64 --progress=plain . (for x64)
 // Grab the image id from the console output, then, e.g.
-//   docker run 9c9f36564c148b275aeecc42749e7b4580ded79dcf51ff6ccc008c8861e7a979 > build/libs/vec/shared/libvec.so
+//   docker run 9c9f36564c148b275aeecc42749e7b4580ded79dcf51ff6ccc008c8861e7a979 > build/libs/vec/shared/$arch/libvec.so
 //
 // To run tests and benchmarks on a locally built libvec,
 //  1. Temporarily comment out the download in libs/native/library/build.gradle
@@ -30,26 +31,83 @@ var os = org.gradle.internal.os.OperatingSystem.current()
 
 group = 'org.elasticsearch'
 
+def platformName = System.getProperty("os.arch");
+
 model {
+  platforms {
+    aarch64 {
+      architecture "aarch64"
+    }
+    amd64 {
+      architecture "x86-64"
+    }
+  }
   toolChains {
     gcc(Gcc) {
       target("aarch64") {
         cCompiler.executable = "/usr/bin/gcc"
+        cCompiler.withArguments { args -> args.addAll(["-O3", "-std=c99", "-march=armv8-a"]) }
+      }
+      target("amd64") {
+        cCompiler.executable = "/usr/bin/gcc"
+        cCompiler.withArguments { args -> args.addAll(["-O3", "-std=c99", "-march=core-avx2", "-Wno-incompatible-pointer-types"]) }
       }
     }
-    clang(Clang)
-  }
-  platforms {
-    aarch64 {
-      architecture "aarch64"
+    // Compiler arguments for the Microsoft Visual C++ toolchain.
+    cl(VisualCpp) {
+      eachPlatform { toolchain ->
+        def platform = toolchain.getPlatform()
+        // NOTE(review): the platforms declared in this build are "aarch64" and "amd64"; confirm that "x64" is
+        // really the platform name seen here, otherwise these arguments are never applied.
+        if (platform.name == "x64") {
+          // cl.exe enables AVX2 code generation with /arch:AVX2; it has no GCC-style -march option.
+          cCompiler.withArguments { args -> args.addAll(["/O2", "/LD", "/arch:AVX2"]) }
+        }
+      }
+    }
+    clang(Clang) {
+      target("amd64") {
+        cCompiler.withArguments { args -> args.addAll(["-O3", "-std=c99", "-march=core-avx2"]) }
+      }
     }
   }
   components {
     vec(NativeLibrarySpec) {
       targetPlatform "aarch64"
-      binaries.withType(SharedLibraryBinarySpec) {
-        cCompiler.args "-O3", "-std=c99", "-march=armv8-a"
+      targetPlatform "amd64"
+
+      sources {
+        c {
+          source {
+            srcDir "src/vec/c/${platformName}/"
+            include "*.c"
+          }
+          exportedHeaders {
+            srcDir "src/vec/headers/"
+          }
+        }
+      }
+    }
+  }
+}
+
+// Convenience task that builds the shared library variant matching the host architecture and copies the
+// linked binary into <buildDir>/output, so callers get one architecture-independent output location.
+// platformName is the value of the "os.arch" system property, read once earlier in this script.
+tasks.register('buildSharedLibrary') {
+  description = 'Assembles native shared library for the host architecture'
+  if (platformName.equals("aarch64")) {
+    dependsOn tasks.vecAarch64SharedLibrary
+    doLast {
+      // Copy whatever the aarch64 link task produced into the common output directory.
+      copy {
+        from tasks.linkVecAarch64SharedLibrary.outputs.files.files
+        into layout.buildDirectory.dir('output');
+        duplicatesStrategy = 'INCLUDE'
+      }
+    }
+  } else if (platformName.equals("amd64")) {
+    dependsOn tasks.vecAmd64SharedLibrary
+    doLast {
+      // Same as above, for the amd64 link task.
+      copy {
+        from tasks.linkVecAmd64SharedLibrary.outputs.files.files
+        into layout.buildDirectory.dir('output');
+        duplicatesStrategy = 'INCLUDE'
       }
     }
+  } else {
+    // Any other os.arch value has no matching target platform in this build.
+    throw new GradleException("Unsupported platform: " + platformName)
   }
 }

+ 11 - 5
libs/vec/native/publish_vec_binaries.sh

@@ -19,7 +19,7 @@ if [ -z "$ARTIFACTORY_API_KEY" ]; then
   exit 1;
 fi
 
-VERSION="1.0.6"
+VERSION="1.0.8"
 ARTIFACTORY_REPOSITORY="${ARTIFACTORY_REPOSITORY:-https://artifactory.elastic.dev/artifactory/elasticsearch-native/}"
 TEMP=$(mktemp -d)
 
@@ -29,16 +29,22 @@ if curl -sS -I --fail --location "${ARTIFACTORY_REPOSITORY}/org/elasticsearch/ve
 fi
 
 echo 'Building Darwin binary...'
-./gradlew --quiet --console=plain vecSharedLibrary
+./gradlew --quiet --console=plain vecAarch64SharedLibrary
 
 echo 'Building Linux binary...'
 DOCKER_IMAGE=$(docker build --platform linux/arm64 --quiet .)
-docker run $DOCKER_IMAGE > build/libs/vec/shared/libvec.so
+docker run $DOCKER_IMAGE > build/libs/vec/shared/aarch64/libvec.so
+
+echo 'Building Linux x64 binary...'
+DOCKER_IMAGE=$(docker build --platform linux/amd64 --quiet .)
+# Only the aarch64 library is built by Gradle earlier in this script, so the amd64 output directory may not
+# exist yet; create it before redirecting the container output into it.
+mkdir -p build/libs/vec/shared/amd64
+docker run --platform linux/amd64 $DOCKER_IMAGE > build/libs/vec/shared/amd64/libvec.so
 
 mkdir -p $TEMP/darwin-aarch64
 mkdir -p $TEMP/linux-aarch64
-cp build/libs/vec/shared/libvec.dylib $TEMP/darwin-aarch64/
-cp build/libs/vec/shared/libvec.so $TEMP/linux-aarch64/
+mkdir -p $TEMP/linux-x64
+cp build/libs/vec/shared/aarch64/libvec.dylib $TEMP/darwin-aarch64/
+cp build/libs/vec/shared/aarch64/libvec.so $TEMP/linux-aarch64/
+cp build/libs/vec/shared/amd64/libvec.so $TEMP/linux-x64/
 
 echo 'Uploading to Artifactory...'
 (cd $TEMP && zip -rq - .) | curl -sS -X PUT -H "X-JFrog-Art-Api: ${ARTIFACTORY_API_KEY}" --data-binary @- --location "${ARTIFACTORY_REPOSITORY}/org/elasticsearch/vec/${VERSION}/vec-${VERSION}.zip"

+ 1 - 1
libs/vec/native/src/vec/c/vec.c → libs/vec/native/src/vec/c/aarch64/vec.c

@@ -121,7 +121,7 @@ static inline int32_t sqr7u_inner(int8_t *a, int8_t *b, size_t dims) {
 EXPORT int32_t sqr7u(int8_t* a, int8_t* b, size_t dims) {
     int32_t res = 0;
     int i = 0;
-    if (i > SQR7U_STRIDE_BYTES_LEN) {
+    if (dims > SQR7U_STRIDE_BYTES_LEN) {
         i += dims & ~(SQR7U_STRIDE_BYTES_LEN - 1);
         res = sqr7u_inner(a, b, i);
     }

+ 150 - 0
libs/vec/native/src/vec/c/amd64/vec.c

@@ -0,0 +1,150 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include "vec.h"
+
+#include <emmintrin.h>
+#include <immintrin.h>
+
+#ifndef DOT7U_STRIDE_BYTES_LEN
+#define DOT7U_STRIDE_BYTES_LEN 32 // Must be a power of 2
+#endif
+
+#ifndef SQR7U_STRIDE_BYTES_LEN
+#define SQR7U_STRIDE_BYTES_LEN 32 // Must be a power of 2
+#endif
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#elif __GNUC__
+#include <x86intrin.h>
+#elif __clang__
+#include <x86intrin.h>
+#endif
+
+// Multi-platform CPUID "intrinsic"; it takes as input a "functionNumber" (or "leaf", the eax register). "Subleaf"
+// (the ecx register) is always 0. Output is stored in the passed output parameter: output[0] = eax,
+// output[1] = ebx, output[2] = ecx, output[3] = edx
+static inline void cpuid(int output[4], int functionNumber) {
+#if defined(__GNUC__) || defined(__clang__)
+    // use inline assembly, Gnu/AT&T syntax
+    int a, b, c, d;
+    // Inputs: eax = functionNumber, ecx = 0. Outputs: the four registers written by the cpuid instruction.
+    __asm("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "a"(functionNumber), "c"(0) : );
+    output[0] = a;
+    output[1] = b;
+    output[2] = c;
+    output[3] = d;
+
+#elif defined (_MSC_VER)
+    // MSVC compiler intrinsic: same leaf, subleaf 0, results written directly into output.
+    __cpuidex(output, functionNumber, 0);
+#else
+   #error Unsupported compiler
+#endif
+}
+
+// Utility function to horizontally add 8 32-bit integers
+// The reduction halves the number of live lanes at each step: 8 -> 4 -> 2 -> 1.
+static inline int hsum_i32_8(const __m256i a) {
+    // Add the upper 128-bit half to the lower one: 4 partial sums remain.
+    const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
+    // Bring the upper 64 bits (lanes 2 and 3) down and add them to lanes 0 and 1: 2 partial sums remain.
+    const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
+    const __m128i sum64 = _mm_add_epi32(hi64, sum128);
+    // Swap adjacent 32-bit lanes so that lane 0 receives lane 1, then add: lane 0 holds the total.
+    const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
+    // Extract lane 0 as a scalar.
+    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
+}
+
+// Reports the vector capabilities of the CPU: returns 1 when CPUID reports AVX2, 0 otherwise
+// (including when CPUID leaf 7 is not available).
+EXPORT int vec_caps() {
+    // Only the first element is set to -1 by this initialiser; the remaining elements are zero-initialised.
+    int cpuInfo[4] = {-1};
+    // Calling cpuid with 0x0 as the function number (leaf)
+    // gets the number of the highest valid function ID in eax.
+    cpuid(cpuInfo, 0);
+    int functionIds = cpuInfo[0];
+    if (functionIds >= 7) {
+        cpuid(cpuInfo, 7);
+        int ebx = cpuInfo[1];
+        // The AVX2 flag is bit 5 (zero-based) of ebx for leaf 7
+        // We assume that all processors that have AVX2 also have FMA3
+        return (ebx & (1 << 5)) != 0;
+    }
+    return 0;
+}
+
+// Computes the dot product of the first `dims` bytes of `a` and `b` with 256-bit AVX2 operations.
+// `dims` is consumed in steps of DOT7U_STRIDE_BYTES_LEN bytes, so it is expected to be a multiple of the
+// stride; each iteration reads a full stride from both inputs.
+// NOTE(review): the 16-bit intermediate sums are only exact when every element is in [0, 127], which the
+// "7u" in the name suggests - confirm against callers.
+static inline int32_t dot7u_inner(int8_t* a, int8_t* b, size_t dims) {
+    const __m256i ones = _mm256_set1_epi16(1);
+
+    // Init accumulator(s) with 0
+    __m256i acc1 = _mm256_setzero_si256();
+
+#pragma GCC unroll 4
+    for (size_t i = 0; i < dims; i += DOT7U_STRIDE_BYTES_LEN) {
+        // Unaligned loads of 32 packed 8-bit integers. The explicit casts hand the intrinsic the pointer
+        // type it is declared with instead of relying on an implicit incompatible-pointer conversion.
+        const __m256i va1 = _mm256_loadu_si256((const __m256i*)(a + i));
+        const __m256i vb1 = _mm256_loadu_si256((const __m256i*)(b + i));
+
+        // Multiply each unsigned 8-bit integer of va1 with the corresponding signed 8-bit integer of vb1
+        // and add adjacent pairs of products, producing (saturating) signed 16-bit integers.
+        const __m256i vab = _mm256_maddubs_epi16(va1, vb1);
+        // Multiply by 1 and add adjacent pairs: widens the 16-bit sums to 32 bits before accumulating.
+        acc1 = _mm256_add_epi32(_mm256_madd_epi16(ones, vab), acc1);
+    }
+
+    // reduce (horizontally add all)
+    return hsum_i32_8(acc1);
+}
+
+// Dot product over `dims` bytes of `a` and `b`.
+// The largest prefix whose length is a multiple of DOT7U_STRIDE_BYTES_LEN is handled by the vectorised
+// dot7u_inner; the remaining tail (fewer than one stride) is accumulated with a scalar loop.
+EXPORT int32_t dot7u(int8_t* a, int8_t* b, size_t dims) {
+    int32_t res = 0;
+    // Same type as dims, so the index is neither truncated nor compared across signedness.
+    size_t i = 0;
+    // >= so that an input of exactly one stride also takes the vectorised path (the result is identical).
+    if (dims >= DOT7U_STRIDE_BYTES_LEN) {
+        // Round dims down to a multiple of the stride (the stride must be a power of 2).
+        i = dims & ~(size_t)(DOT7U_STRIDE_BYTES_LEN - 1);
+        res = dot7u_inner(a, b, i);
+    }
+    for (; i < dims; i++) {
+        res += a[i] * b[i];
+    }
+    return res;
+}
+
+// Computes the sum of squared differences over the first `dims` bytes of `a` and `b` with 256-bit AVX2
+// operations. `dims` is consumed in steps of SQR7U_STRIDE_BYTES_LEN bytes, so it is expected to be a
+// multiple of the stride; each iteration reads a full stride from both inputs.
+// NOTE(review): the 8-bit difference and 16-bit intermediate sums are only exact when every element is
+// in [0, 127], which the "7u" in the name suggests - confirm against callers.
+static inline int32_t sqr7u_inner(int8_t *a, int8_t *b, size_t dims) {
+    // Init accumulator(s) with 0
+    __m256i acc1 = _mm256_setzero_si256();
+
+    const __m256i ones = _mm256_set1_epi16(1);
+
+#pragma GCC unroll 4
+    for (size_t i = 0; i < dims; i += SQR7U_STRIDE_BYTES_LEN) {
+        // Unaligned loads of 32 packed 8-bit integers. The explicit casts hand the intrinsic the pointer
+        // type it is declared with instead of relying on an implicit incompatible-pointer conversion.
+        const __m256i va1 = _mm256_loadu_si256((const __m256i*)(a + i));
+        const __m256i vb1 = _mm256_loadu_si256((const __m256i*)(b + i));
+
+        // Byte-wise difference.
+        const __m256i dist1 = _mm256_sub_epi8(va1, vb1);
+        // Applying a value's own sign to itself negates the negative lanes, i.e. yields the absolute value.
+        const __m256i abs_dist1 = _mm256_sign_epi8(dist1, dist1);
+        // Square each absolute difference and add adjacent pairs into signed 16-bit integers.
+        const __m256i sqr1 = _mm256_maddubs_epi16(abs_dist1, abs_dist1);
+
+        // Multiply by 1 and add adjacent pairs: widens the 16-bit sums to 32 bits before accumulating.
+        acc1 = _mm256_add_epi32(_mm256_madd_epi16(ones, sqr1), acc1);
+    }
+
+    // reduce (accumulate all)
+    return hsum_i32_8(acc1);
+}
+
+// Sum of squared differences over `dims` bytes of `a` and `b`.
+// The largest prefix whose length is a multiple of SQR7U_STRIDE_BYTES_LEN is handled by the vectorised
+// sqr7u_inner; the remaining tail (fewer than one stride) is accumulated with a scalar loop.
+EXPORT int32_t sqr7u(int8_t* a, int8_t* b, size_t dims) {
+    int32_t res = 0;
+    // Same type as dims, so the index is neither truncated nor compared across signedness.
+    size_t i = 0;
+    // >= so that an input of exactly one stride also takes the vectorised path (the result is identical).
+    if (dims >= SQR7U_STRIDE_BYTES_LEN) {
+        // Round dims down to a multiple of the stride (the stride must be a power of 2).
+        i = dims & ~(size_t)(SQR7U_STRIDE_BYTES_LEN - 1);
+        res = sqr7u_inner(a, b, i);
+    }
+    for (; i < dims; i++) {
+        int32_t dist = a[i] - b[i];
+        res += dist * dist;
+    }
+    return res;
+}
+

+ 1 - 1
libs/vec/native/src/vec/headers/vec.h

@@ -7,7 +7,7 @@
  */
 
 #ifdef _MSC_VER
-#define EXPORT extern "C" __declspec(dllexport)
+#define EXPORT __declspec(dllexport)
 #elif defined(__GNUC__) && !defined(__clang__)
 #define EXPORT __attribute__((externally_visible,visibility("default")))
 #elif __clang__

+ 3 - 1
libs/vec/src/test/java/org/elasticsearch/vec/AbstractVectorTestCase.java

@@ -39,7 +39,9 @@ public abstract class AbstractVectorTestCase extends ESTestCase {
         var arch = System.getProperty("os.arch");
         var osName = System.getProperty("os.name");
 
-        if (jdkVersion >= 21 && arch.equals("aarch64") && (osName.startsWith("Mac") || osName.equals("Linux"))) {
+        if (jdkVersion >= 21
+            && (arch.equals("aarch64") && (osName.startsWith("Mac") || osName.equals("Linux"))
+                || arch.equals("amd64") && osName.equals("Linux"))) {
             assertThat(factory, isPresent());
             return true;
         } else {