Bläddra i källkod

Fix diskbbq flush logic (#131470)

I accidentally broke recall on flush by allowing vectors to be
double-quantized. Additionally, we shouldn't use the first vector as the
centroid; doing so can harm recall significantly when there is just one
centroid.

Recall before this change:

```
index_name                             index_type  num_docs  index_time(ms)  force_merge_time(ms)  num_segments
-------------------------------------  ----------  --------  --------------  --------------------  ------------
corpus-dbpedia-entity-E5-small-0.fvec         ivf   1000000           25820                     0            14
corpus-dbpedia-entity-E5-small-0.fvec         ivf   1000000               0                 41693             0

index_name                             index_type  n_probe  latency(ms)  net_cpu_time(ms)  avg_cpu_count     QPS  recall    visited  filter_selectivity
-------------------------------------  ----------  -------  -----------  ----------------  -------------  ------  ------  ---------  ------------------
corpus-dbpedia-entity-E5-small-0.fvec         ivf       50        13.05              0.00           0.00   76.61    0.63  285267.44                1.00
corpus-dbpedia-entity-E5-small-0.fvec         ivf      150        31.92              0.00           0.00   31.33    0.68  629033.22                1.00
corpus-dbpedia-entity-E5-small-0.fvec         ivf      200        34.79              0.00           0.00   28.74    0.69  679699.13                1.00
corpus-dbpedia-entity-E5-small-0.fvec         ivf      500        39.40              0.00           0.00   25.38    0.71  794375.05                1.00
corpus-dbpedia-entity-E5-small-0.fvec         ivf     1000        45.99              0.00           0.00   21.74    0.72  940493.52                1.00
corpus-dbpedia-entity-E5-small-0.fvec         ivf       50         1.52              0.00           0.00  655.74    0.74   24201.82                1.00
corpus-dbpedia-entity-E5-small-0.fvec         ivf      150         2.94              0.00           0.00  340.43    0.85   67943.31                1.00
corpus-dbpedia-entity-E5-small-0.fvec         ivf      200         3.81              0.00           0.00  262.81    0.87   89575.99                1.00
corpus-dbpedia-entity-E5-small-0.fvec         ivf      500         7.67              0.00           0.00  130.38    0.93  213586.44                1.00
corpus-dbpedia-entity-E5-small-0.fvec         ivf     1000        14.85              0.00           0.00   67.33    0.96  402628.11                1.00
```

With this fix:

```
index_name                             index_type  num_docs  index_time(ms)  force_merge_time(ms)  num_segments
-------------------------------------  ----------  --------  --------------  --------------------  ------------
corpus-dbpedia-entity-E5-small-0.fvec         ivf   1000000           25304                     0            15
corpus-dbpedia-entity-E5-small-0.fvec         ivf   1000000               0                 42110             0

index_name                             index_type  n_probe  latency(ms)  net_cpu_time(ms)  avg_cpu_count     QPS  recall    visited  filter_selectivity
-------------------------------------  ----------  -------  -----------  ----------------  -------------  ------  ------  ---------  ------------------
corpus-dbpedia-entity-E5-small-0.fvec         ivf       50        12.63              0.00           0.00   79.18    0.89  285527.22                1.00
corpus-dbpedia-entity-E5-small-0.fvec         ivf      150        32.49              0.00           0.00   30.77    0.94  619783.37                1.00
corpus-dbpedia-entity-E5-small-0.fvec         ivf      200        35.46              0.00           0.00   28.20    0.95  667903.47                1.00
corpus-dbpedia-entity-E5-small-0.fvec         ivf      500        40.38              0.00           0.00   24.76    0.97  781959.74                1.00
corpus-dbpedia-entity-E5-small-0.fvec         ivf     1000        48.62              0.00           0.00   20.57    0.98  931017.40                1.00
corpus-dbpedia-entity-E5-small-0.fvec         ivf       50         1.55              0.00           0.00  643.09    0.74   23595.57                1.00
corpus-dbpedia-entity-E5-small-0.fvec         ivf      150         2.98              0.00           0.00  335.29    0.85   66299.43                1.00
corpus-dbpedia-entity-E5-small-0.fvec         ivf      200         3.81              0.00           0.00  262.64    0.87   87416.15                1.00
corpus-dbpedia-entity-E5-small-0.fvec         ivf      500         8.80              0.00           0.00  113.64    0.93  209061.37                1.00
corpus-dbpedia-entity-E5-small-0.fvec         ivf     1000        16.18              0.00           0.00   61.81    0.96  394906.29                1.00
```
Benjamin Trent 3 månader sedan
förälder
incheckning
cf5d40fa1f

+ 6 - 1
server/src/main/java/org/elasticsearch/index/codec/vectors/DefaultIVFVectorsWriter.java

@@ -420,6 +420,7 @@ public class DefaultIVFVectorsWriter extends IVFVectorsWriter {
         private final OptimizedScalarQuantizer quantizer;
         private final byte[] quantizedVector;
         private final int[] quantizedVectorScratch;
+        private final float[] floatVectorScratch;
         private OptimizedScalarQuantizer.QuantizationResult corrections;
         private float[] currentCentroid;
         private IntToIntFunction ordTransformer = null;
@@ -430,6 +431,7 @@ public class DefaultIVFVectorsWriter extends IVFVectorsWriter {
             this.vectorValues = vectorValues;
             this.quantizer = quantizer;
             this.quantizedVector = new byte[BQVectorUtils.discretize(dimension, 64) / 8];
+            this.floatVectorScratch = new float[dimension];
             this.quantizedVectorScratch = new int[dimension];
             this.corrections = null;
         }
@@ -454,7 +456,10 @@ public class DefaultIVFVectorsWriter extends IVFVectorsWriter {
             currOrd++;
             int ord = ordTransformer.apply(currOrd);
             float[] vector = vectorValues.vectorValue(ord);
-            corrections = quantizer.scalarQuantize(vector, quantizedVectorScratch, (byte) 1, currentCentroid);
+            // It's possible that the vectors are on-heap and we cannot mutate them as we may quantize twice
+            // due to overspill, so we copy the vector to a scratch array
+            System.arraycopy(vector, 0, floatVectorScratch, 0, vector.length);
+            corrections = quantizer.scalarQuantize(floatVectorScratch, quantizedVectorScratch, (byte) 1, currentCentroid);
             BQVectorUtils.packAsBinary(quantizedVectorScratch, quantizedVector);
             return quantizedVector;
         }

+ 12 - 2
server/src/main/java/org/elasticsearch/index/codec/vectors/cluster/HierarchicalKMeans.java

@@ -57,10 +57,20 @@ public class HierarchicalKMeans {
             return new KMeansIntermediate();
         }
 
-        // if we have a small number of vectors pick one and output that as the centroid
+        // if we have a small number of vectors calculate the centroid directly
         if (vectors.size() <= targetSize) {
             float[] centroid = new float[dimension];
-            System.arraycopy(vectors.vectorValue(0), 0, centroid, 0, dimension);
+            // sum the vectors
+            for (int i = 0; i < vectors.size(); i++) {
+                float[] vector = vectors.vectorValue(i);
+                for (int j = 0; j < dimension; j++) {
+                    centroid[j] += vector[j];
+                }
+            }
+            // average the vectors
+            for (int j = 0; j < dimension; j++) {
+                centroid[j] /= vectors.size();
+            }
             return new KMeansIntermediate(new float[][] { centroid }, new int[vectors.size()]);
         }