|  | @@ -13,10 +13,12 @@ import jdk.incubator.vector.ByteVector;
 | 
	
		
			
				|  |  |  import jdk.incubator.vector.FloatVector;
 | 
	
		
			
				|  |  |  import jdk.incubator.vector.IntVector;
 | 
	
		
			
				|  |  |  import jdk.incubator.vector.LongVector;
 | 
	
		
			
				|  |  | +import jdk.incubator.vector.VectorMask;
 | 
	
		
			
				|  |  |  import jdk.incubator.vector.VectorOperators;
 | 
	
		
			
				|  |  |  import jdk.incubator.vector.VectorShape;
 | 
	
		
			
				|  |  |  import jdk.incubator.vector.VectorSpecies;
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | +import org.apache.lucene.util.BitUtil;
 | 
	
		
			
				|  |  |  import org.apache.lucene.util.Constants;
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  public final class PanamaESVectorUtilSupport implements ESVectorUtilSupport {
 | 
	
	
		
			
				|  | @@ -51,11 +53,25 @@ public final class PanamaESVectorUtilSupport implements ESVectorUtilSupport {
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |      @Override
 | 
	
		
			
				|  |  |      public int ipByteBit(byte[] q, byte[] d) {
 | 
	
		
			
				|  |  | +        if (d.length >= 16 && HAS_FAST_INTEGER_VECTORS) {
 | 
	
		
			
				|  |  | +            if (VECTOR_BITSIZE >= 512) {
 | 
	
		
			
				|  |  | +                return ipByteBit512(q, d);
 | 
	
		
			
				|  |  | +            } else if (VECTOR_BITSIZE == 256) {
 | 
	
		
			
				|  |  | +                return ipByteBit256(q, d);
 | 
	
		
			
				|  |  | +            }
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  |          return DefaultESVectorUtilSupport.ipByteBitImpl(q, d);
 | 
	
		
			
				|  |  |      }
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |      @Override
 | 
	
		
			
				|  |  |      public float ipFloatBit(float[] q, byte[] d) {
 | 
	
		
			
				|  |  | +        if (q.length >= 16) {
 | 
	
		
			
				|  |  | +            if (VECTOR_BITSIZE >= 512) {
 | 
	
		
			
				|  |  | +                return ipFloatBit512(q, d);
 | 
	
		
			
				|  |  | +            } else if (VECTOR_BITSIZE == 256) {
 | 
	
		
			
				|  |  | +                return ipFloatBit256(q, d);
 | 
	
		
			
				|  |  | +            }
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  |          return DefaultESVectorUtilSupport.ipFloatBitImpl(q, d);
 | 
	
		
			
				|  |  |      }
 | 
	
		
			
				|  |  |  
 | 
	
	
		
			
				|  | @@ -170,6 +186,240 @@ public final class PanamaESVectorUtilSupport implements ESVectorUtilSupport {
 | 
	
		
			
				|  |  |          return subRet0 + (subRet1 << 1) + (subRet2 << 2) + (subRet3 << 3);
 | 
	
		
			
				|  |  |      }
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | +    private static final VectorSpecies<Integer> INT_SPECIES_512 = IntVector.SPECIES_512;
 | 
	
		
			
				|  |  | +    private static final VectorSpecies<Byte> BYTE_SPECIES_FOR_INT_512 = VectorSpecies.of(
 | 
	
		
			
				|  |  | +        byte.class,
 | 
	
		
			
				|  |  | +        VectorShape.forBitSize(INT_SPECIES_512.vectorBitSize() / Integer.BYTES)
 | 
	
		
			
				|  |  | +    );
 | 
	
		
			
				|  |  | +    private static final VectorSpecies<Integer> INT_SPECIES_256 = IntVector.SPECIES_256;
 | 
	
		
			
				|  |  | +    private static final VectorSpecies<Byte> BYTE_SPECIES_FOR_INT_256 = VectorSpecies.of(
 | 
	
		
			
				|  |  | +        byte.class,
 | 
	
		
			
				|  |  | +        VectorShape.forBitSize(INT_SPECIES_256.vectorBitSize() / Integer.BYTES)
 | 
	
		
			
				|  |  | +    );
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    private static int limit(int length, int sectionSize) {
 | 
	
		
			
				|  |  | +        return length - (length % sectionSize);
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    static int ipByteBit512(byte[] q, byte[] d) {
 | 
	
		
			
				|  |  | +        assert q.length == d.length * Byte.SIZE;
 | 
	
		
			
				|  |  | +        int i = 0;
 | 
	
		
			
				|  |  | +        int sum = 0;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        int sectionLength = INT_SPECIES_512.length() * 4;
 | 
	
		
			
				|  |  | +        if (q.length >= sectionLength) {
 | 
	
		
			
				|  |  | +            IntVector acc0 = IntVector.zero(INT_SPECIES_512);
 | 
	
		
			
				|  |  | +            IntVector acc1 = IntVector.zero(INT_SPECIES_512);
 | 
	
		
			
				|  |  | +            IntVector acc2 = IntVector.zero(INT_SPECIES_512);
 | 
	
		
			
				|  |  | +            IntVector acc3 = IntVector.zero(INT_SPECIES_512);
 | 
	
		
			
				|  |  | +            int limit = limit(q.length, sectionLength);
 | 
	
		
			
				|  |  | +            for (; i < limit; i += sectionLength) {
 | 
	
		
			
				|  |  | +                var vals0 = ByteVector.fromArray(BYTE_SPECIES_FOR_INT_512, q, i).castShape(INT_SPECIES_512, 0);
 | 
	
		
			
				|  |  | +                var vals1 = ByteVector.fromArray(BYTE_SPECIES_FOR_INT_512, q, i + INT_SPECIES_512.length()).castShape(INT_SPECIES_512, 0);
 | 
	
		
			
				|  |  | +                var vals2 = ByteVector.fromArray(BYTE_SPECIES_FOR_INT_512, q, i + INT_SPECIES_512.length() * 2)
 | 
	
		
			
				|  |  | +                    .castShape(INT_SPECIES_512, 0);
 | 
	
		
			
				|  |  | +                var vals3 = ByteVector.fromArray(BYTE_SPECIES_FOR_INT_512, q, i + INT_SPECIES_512.length() * 3)
 | 
	
		
			
				|  |  | +                    .castShape(INT_SPECIES_512, 0);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +                long maskBits = Long.reverse((long) BitUtil.VH_BE_LONG.get(d, i / 8));
 | 
	
		
			
				|  |  | +                var mask0 = VectorMask.fromLong(INT_SPECIES_512, maskBits);
 | 
	
		
			
				|  |  | +                var mask1 = VectorMask.fromLong(INT_SPECIES_512, maskBits >> 16);
 | 
	
		
			
				|  |  | +                var mask2 = VectorMask.fromLong(INT_SPECIES_512, maskBits >> 32);
 | 
	
		
			
				|  |  | +                var mask3 = VectorMask.fromLong(INT_SPECIES_512, maskBits >> 48);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +                acc0 = acc0.add(vals0, mask0);
 | 
	
		
			
				|  |  | +                acc1 = acc1.add(vals1, mask1);
 | 
	
		
			
				|  |  | +                acc2 = acc2.add(vals2, mask2);
 | 
	
		
			
				|  |  | +                acc3 = acc3.add(vals3, mask3);
 | 
	
		
			
				|  |  | +            }
 | 
	
		
			
				|  |  | +            sum += acc0.reduceLanes(VectorOperators.ADD) + acc1.reduceLanes(VectorOperators.ADD) + acc2.reduceLanes(VectorOperators.ADD)
 | 
	
		
			
				|  |  | +                + acc3.reduceLanes(VectorOperators.ADD);
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        sectionLength = INT_SPECIES_256.length();
 | 
	
		
			
				|  |  | +        if (q.length - i >= sectionLength) {
 | 
	
		
			
				|  |  | +            IntVector acc = IntVector.zero(INT_SPECIES_256);
 | 
	
		
			
				|  |  | +            int limit = limit(q.length, sectionLength);
 | 
	
		
			
				|  |  | +            for (; i < limit; i += sectionLength) {
 | 
	
		
			
				|  |  | +                var vals = ByteVector.fromArray(BYTE_SPECIES_FOR_INT_256, q, i).castShape(INT_SPECIES_256, 0);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +                long maskBits = Integer.reverse(d[i / 8]) >> 24;
 | 
	
		
			
				|  |  | +                var mask = VectorMask.fromLong(INT_SPECIES_256, maskBits);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +                acc = acc.add(vals, mask);
 | 
	
		
			
				|  |  | +            }
 | 
	
		
			
				|  |  | +            sum += acc.reduceLanes(VectorOperators.ADD);
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        // that should have got them all (q.length is a multiple of 8, which fits in a 256-bit vector)
 | 
	
		
			
				|  |  | +        assert i == q.length;
 | 
	
		
			
				|  |  | +        return sum;
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    static int ipByteBit256(byte[] q, byte[] d) {
 | 
	
		
			
				|  |  | +        assert q.length == d.length * Byte.SIZE;
 | 
	
		
			
				|  |  | +        int i = 0;
 | 
	
		
			
				|  |  | +        int sum = 0;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        int sectionLength = INT_SPECIES_256.length() * 4;
 | 
	
		
			
				|  |  | +        if (q.length >= sectionLength) {
 | 
	
		
			
				|  |  | +            IntVector acc0 = IntVector.zero(INT_SPECIES_256);
 | 
	
		
			
				|  |  | +            IntVector acc1 = IntVector.zero(INT_SPECIES_256);
 | 
	
		
			
				|  |  | +            IntVector acc2 = IntVector.zero(INT_SPECIES_256);
 | 
	
		
			
				|  |  | +            IntVector acc3 = IntVector.zero(INT_SPECIES_256);
 | 
	
		
			
				|  |  | +            int limit = limit(q.length, sectionLength);
 | 
	
		
			
				|  |  | +            for (; i < limit; i += sectionLength) {
 | 
	
		
			
				|  |  | +                var vals0 = ByteVector.fromArray(BYTE_SPECIES_FOR_INT_256, q, i).castShape(INT_SPECIES_256, 0);
 | 
	
		
			
				|  |  | +                var vals1 = ByteVector.fromArray(BYTE_SPECIES_FOR_INT_256, q, i + INT_SPECIES_256.length()).castShape(INT_SPECIES_256, 0);
 | 
	
		
			
				|  |  | +                var vals2 = ByteVector.fromArray(BYTE_SPECIES_FOR_INT_256, q, i + INT_SPECIES_256.length() * 2)
 | 
	
		
			
				|  |  | +                    .castShape(INT_SPECIES_256, 0);
 | 
	
		
			
				|  |  | +                var vals3 = ByteVector.fromArray(BYTE_SPECIES_FOR_INT_256, q, i + INT_SPECIES_256.length() * 3)
 | 
	
		
			
				|  |  | +                    .castShape(INT_SPECIES_256, 0);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +                long maskBits = Integer.reverse((int) BitUtil.VH_BE_INT.get(d, i / 8));
 | 
	
		
			
				|  |  | +                var mask0 = VectorMask.fromLong(INT_SPECIES_256, maskBits);
 | 
	
		
			
				|  |  | +                var mask1 = VectorMask.fromLong(INT_SPECIES_256, maskBits >> 8);
 | 
	
		
			
				|  |  | +                var mask2 = VectorMask.fromLong(INT_SPECIES_256, maskBits >> 16);
 | 
	
		
			
				|  |  | +                var mask3 = VectorMask.fromLong(INT_SPECIES_256, maskBits >> 24);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +                acc0 = acc0.add(vals0, mask0);
 | 
	
		
			
				|  |  | +                acc1 = acc1.add(vals1, mask1);
 | 
	
		
			
				|  |  | +                acc2 = acc2.add(vals2, mask2);
 | 
	
		
			
				|  |  | +                acc3 = acc3.add(vals3, mask3);
 | 
	
		
			
				|  |  | +            }
 | 
	
		
			
				|  |  | +            sum += acc0.reduceLanes(VectorOperators.ADD) + acc1.reduceLanes(VectorOperators.ADD) + acc2.reduceLanes(VectorOperators.ADD)
 | 
	
		
			
				|  |  | +                + acc3.reduceLanes(VectorOperators.ADD);
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        sectionLength = INT_SPECIES_256.length();
 | 
	
		
			
				|  |  | +        if (q.length - i >= sectionLength) {
 | 
	
		
			
				|  |  | +            IntVector acc = IntVector.zero(INT_SPECIES_256);
 | 
	
		
			
				|  |  | +            int limit = limit(q.length, sectionLength);
 | 
	
		
			
				|  |  | +            for (; i < limit; i += sectionLength) {
 | 
	
		
			
				|  |  | +                var vals = ByteVector.fromArray(BYTE_SPECIES_FOR_INT_256, q, i).castShape(INT_SPECIES_256, 0);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +                long maskBits = Integer.reverse(d[i / 8]) >> 24;
 | 
	
		
			
				|  |  | +                var mask = VectorMask.fromLong(INT_SPECIES_256, maskBits);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +                acc = acc.add(vals, mask);
 | 
	
		
			
				|  |  | +            }
 | 
	
		
			
				|  |  | +            sum += acc.reduceLanes(VectorOperators.ADD);
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        // that should have got them all (q.length is a multiple of 8, which fits in a 256-bit vector)
 | 
	
		
			
				|  |  | +        assert i == q.length;
 | 
	
		
			
				|  |  | +        return sum;
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    private static final VectorSpecies<Float> FLOAT_SPECIES_512 = FloatVector.SPECIES_512;
 | 
	
		
			
				|  |  | +    private static final VectorSpecies<Float> FLOAT_SPECIES_256 = FloatVector.SPECIES_256;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    static float ipFloatBit512(float[] q, byte[] d) {
 | 
	
		
			
				|  |  | +        assert q.length == d.length * Byte.SIZE;
 | 
	
		
			
				|  |  | +        int i = 0;
 | 
	
		
			
				|  |  | +        float sum = 0;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        int sectionLength = FLOAT_SPECIES_512.length() * 4;
 | 
	
		
			
				|  |  | +        if (q.length >= sectionLength) {
 | 
	
		
			
				|  |  | +            FloatVector acc0 = FloatVector.zero(FLOAT_SPECIES_512);
 | 
	
		
			
				|  |  | +            FloatVector acc1 = FloatVector.zero(FLOAT_SPECIES_512);
 | 
	
		
			
				|  |  | +            FloatVector acc2 = FloatVector.zero(FLOAT_SPECIES_512);
 | 
	
		
			
				|  |  | +            FloatVector acc3 = FloatVector.zero(FLOAT_SPECIES_512);
 | 
	
		
			
				|  |  | +            int limit = limit(q.length, sectionLength);
 | 
	
		
			
				|  |  | +            for (; i < limit; i += sectionLength) {
 | 
	
		
			
				|  |  | +                var floats0 = FloatVector.fromArray(FLOAT_SPECIES_512, q, i);
 | 
	
		
			
				|  |  | +                var floats1 = FloatVector.fromArray(FLOAT_SPECIES_512, q, i + FLOAT_SPECIES_512.length());
 | 
	
		
			
				|  |  | +                var floats2 = FloatVector.fromArray(FLOAT_SPECIES_512, q, i + FLOAT_SPECIES_512.length() * 2);
 | 
	
		
			
				|  |  | +                var floats3 = FloatVector.fromArray(FLOAT_SPECIES_512, q, i + FLOAT_SPECIES_512.length() * 3);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +                long maskBits = Long.reverse((long) BitUtil.VH_BE_LONG.get(d, i / 8));
 | 
	
		
			
				|  |  | +                var mask0 = VectorMask.fromLong(FLOAT_SPECIES_512, maskBits);
 | 
	
		
			
				|  |  | +                var mask1 = VectorMask.fromLong(FLOAT_SPECIES_512, maskBits >> 16);
 | 
	
		
			
				|  |  | +                var mask2 = VectorMask.fromLong(FLOAT_SPECIES_512, maskBits >> 32);
 | 
	
		
			
				|  |  | +                var mask3 = VectorMask.fromLong(FLOAT_SPECIES_512, maskBits >> 48);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +                acc0 = acc0.add(floats0, mask0);
 | 
	
		
			
				|  |  | +                acc1 = acc1.add(floats1, mask1);
 | 
	
		
			
				|  |  | +                acc2 = acc2.add(floats2, mask2);
 | 
	
		
			
				|  |  | +                acc3 = acc3.add(floats3, mask3);
 | 
	
		
			
				|  |  | +            }
 | 
	
		
			
				|  |  | +            sum += acc0.reduceLanes(VectorOperators.ADD) + acc1.reduceLanes(VectorOperators.ADD) + acc2.reduceLanes(VectorOperators.ADD)
 | 
	
		
			
				|  |  | +                + acc3.reduceLanes(VectorOperators.ADD);
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        sectionLength = FLOAT_SPECIES_256.length();
 | 
	
		
			
				|  |  | +        if (q.length - i >= sectionLength) {
 | 
	
		
			
				|  |  | +            FloatVector acc = FloatVector.zero(FLOAT_SPECIES_256);
 | 
	
		
			
				|  |  | +            int limit = limit(q.length, sectionLength);
 | 
	
		
			
				|  |  | +            for (; i < limit; i += sectionLength) {
 | 
	
		
			
				|  |  | +                var floats = FloatVector.fromArray(FLOAT_SPECIES_256, q, i);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +                long maskBits = Integer.reverse(d[i / 8]) >> 24;
 | 
	
		
			
				|  |  | +                var mask = VectorMask.fromLong(FLOAT_SPECIES_256, maskBits);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +                acc = acc.add(floats, mask);
 | 
	
		
			
				|  |  | +            }
 | 
	
		
			
				|  |  | +            sum += acc.reduceLanes(VectorOperators.ADD);
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        // that should have got them all (q.length is a multiple of 8, which fits in a 256-bit vector)
 | 
	
		
			
				|  |  | +        assert i == q.length;
 | 
	
		
			
				|  |  | +        return sum;
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    static float ipFloatBit256(float[] q, byte[] d) {
 | 
	
		
			
				|  |  | +        assert q.length == d.length * Byte.SIZE;
 | 
	
		
			
				|  |  | +        int i = 0;
 | 
	
		
			
				|  |  | +        float sum = 0;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        int sectionLength = FLOAT_SPECIES_256.length() * 4;
 | 
	
		
			
				|  |  | +        if (q.length >= sectionLength) {
 | 
	
		
			
				|  |  | +            FloatVector acc0 = FloatVector.zero(FLOAT_SPECIES_256);
 | 
	
		
			
				|  |  | +            FloatVector acc1 = FloatVector.zero(FLOAT_SPECIES_256);
 | 
	
		
			
				|  |  | +            FloatVector acc2 = FloatVector.zero(FLOAT_SPECIES_256);
 | 
	
		
			
				|  |  | +            FloatVector acc3 = FloatVector.zero(FLOAT_SPECIES_256);
 | 
	
		
			
				|  |  | +            int limit = limit(q.length, sectionLength);
 | 
	
		
			
				|  |  | +            for (; i < limit; i += sectionLength) {
 | 
	
		
			
				|  |  | +                var floats0 = FloatVector.fromArray(FLOAT_SPECIES_256, q, i);
 | 
	
		
			
				|  |  | +                var floats1 = FloatVector.fromArray(FLOAT_SPECIES_256, q, i + FLOAT_SPECIES_256.length());
 | 
	
		
			
				|  |  | +                var floats2 = FloatVector.fromArray(FLOAT_SPECIES_256, q, i + FLOAT_SPECIES_256.length() * 2);
 | 
	
		
			
				|  |  | +                var floats3 = FloatVector.fromArray(FLOAT_SPECIES_256, q, i + FLOAT_SPECIES_256.length() * 3);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +                long maskBits = Integer.reverse((int) BitUtil.VH_BE_INT.get(d, i / 8));
 | 
	
		
			
				|  |  | +                var mask0 = VectorMask.fromLong(FLOAT_SPECIES_256, maskBits);
 | 
	
		
			
				|  |  | +                var mask1 = VectorMask.fromLong(FLOAT_SPECIES_256, maskBits >> 8);
 | 
	
		
			
				|  |  | +                var mask2 = VectorMask.fromLong(FLOAT_SPECIES_256, maskBits >> 16);
 | 
	
		
			
				|  |  | +                var mask3 = VectorMask.fromLong(FLOAT_SPECIES_256, maskBits >> 24);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +                acc0 = acc0.add(floats0, mask0);
 | 
	
		
			
				|  |  | +                acc1 = acc1.add(floats1, mask1);
 | 
	
		
			
				|  |  | +                acc2 = acc2.add(floats2, mask2);
 | 
	
		
			
				|  |  | +                acc3 = acc3.add(floats3, mask3);
 | 
	
		
			
				|  |  | +            }
 | 
	
		
			
				|  |  | +            sum += acc0.reduceLanes(VectorOperators.ADD) + acc1.reduceLanes(VectorOperators.ADD) + acc2.reduceLanes(VectorOperators.ADD)
 | 
	
		
			
				|  |  | +                + acc3.reduceLanes(VectorOperators.ADD);
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        sectionLength = FLOAT_SPECIES_256.length();
 | 
	
		
			
				|  |  | +        if (q.length - i >= sectionLength) {
 | 
	
		
			
				|  |  | +            FloatVector acc = FloatVector.zero(FLOAT_SPECIES_256);
 | 
	
		
			
				|  |  | +            int limit = limit(q.length, sectionLength);
 | 
	
		
			
				|  |  | +            for (; i < limit; i += sectionLength) {
 | 
	
		
			
				|  |  | +                var floats = FloatVector.fromArray(FLOAT_SPECIES_256, q, i);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +                long maskBits = Integer.reverse(d[i / 8]) >> 24;
 | 
	
		
			
				|  |  | +                var mask = VectorMask.fromLong(FLOAT_SPECIES_256, maskBits);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +                acc = acc.add(floats, mask);
 | 
	
		
			
				|  |  | +            }
 | 
	
		
			
				|  |  | +            sum += acc.reduceLanes(VectorOperators.ADD);
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        // that should have got them all (q.length is a multiple of 8, which fits in a 256-bit vector)
 | 
	
		
			
				|  |  | +        assert i == q.length;
 | 
	
		
			
				|  |  | +        return sum;
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |      private static final VectorSpecies<Float> PREFERRED_FLOAT_SPECIES = FloatVector.SPECIES_PREFERRED;
 | 
	
		
			
				|  |  |      private static final VectorSpecies<Byte> BYTE_SPECIES_FOR_PREFFERED_FLOATS;
 | 
	
		
			
				|  |  |  
 | 
	
	
		
			
				|  | @@ -177,7 +427,7 @@ public final class PanamaESVectorUtilSupport implements ESVectorUtilSupport {
 | 
	
		
			
				|  |  |          VectorSpecies<Byte> byteForFloat;
 | 
	
		
			
				|  |  |          try {
 | 
	
		
			
				|  |  |              // calculate vector size to convert from single bytes to 4-byte floats
 | 
	
		
			
				|  |  | -            byteForFloat = VectorSpecies.of(byte.class, VectorShape.forBitSize(PREFERRED_FLOAT_SPECIES.vectorBitSize() / Integer.BYTES));
 | 
	
		
			
				|  |  | +            byteForFloat = VectorSpecies.of(byte.class, VectorShape.forBitSize(PREFERRED_FLOAT_SPECIES.vectorBitSize() / Float.BYTES));
 | 
	
		
			
				|  |  |          } catch (IllegalArgumentException e) {
 | 
	
		
			
				|  |  |              // can't get a byte vector size small enough, just use default impl
 | 
	
		
			
				|  |  |              byteForFloat = null;
 |