/*
 * Decompiled with CFR 0.152.
 */
package org.elasticsearch.simdvec.internal.vectorization;

import jdk.incubator.vector.ByteVector;
import jdk.incubator.vector.FloatVector;
import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.LongVector;
import jdk.incubator.vector.Vector;
import jdk.incubator.vector.VectorMask;
import jdk.incubator.vector.VectorOperators;
import jdk.incubator.vector.VectorShape;
import jdk.incubator.vector.VectorSpecies;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.Constants;
import org.elasticsearch.simdvec.internal.vectorization.ByteArrayUtils;
import org.elasticsearch.simdvec.internal.vectorization.DefaultESVectorUtilSupport;
import org.elasticsearch.simdvec.internal.vectorization.ESVectorUtilSupport;
import org.elasticsearch.simdvec.internal.vectorization.PanamaVectorConstants;

public final class PanamaESVectorUtilSupport
implements ESVectorUtilSupport {
    // All constants below are blank static finals: Java requires them to be assigned in a static
    // initializer, which is not part of the code visible here. The notes describe only how the
    // visible methods use them.
    // Vector width in bits; the public entry points compare it against 128, 256 and 512 to pick a kernel.
    static final int VECTOR_BITSIZE;
    // Float species used by the OSQ statistics, grid-point, loss and SOAR kernels.
    private static final VectorSpecies<Float> FLOAT_SPECIES;
    // Int species used to load the quantize[] array in calculateOSQGridPoints.
    private static final VectorSpecies<Integer> INTEGER_SPECIES;
    // Gate checked by ipByteBinByte and ipByteBit before taking a vectorized path.
    static final boolean HAS_FAST_INTEGER_VECTORS;
    // Byte species used for the loads in ipByteBin128 and the 128-bit remainder loop of ipByteBin256.
    private static final VectorSpecies<Byte> BYTE_SPECIES_128;
    // Byte species used for the loads in the main loop of ipByteBin256.
    private static final VectorSpecies<Byte> BYTE_SPECIES_256;
    // Int accumulator species of ipByteBit512's main loop.
    private static final VectorSpecies<Integer> INT_SPECIES_512;
    // Byte species whose loads are castShape'd to INT_SPECIES_512 (presumably the same lane count - TODO confirm).
    private static final VectorSpecies<Byte> BYTE_SPECIES_FOR_INT_512;
    // Int accumulator species of ipByteBit256 and of the remainder loop in ipByteBit512.
    private static final VectorSpecies<Integer> INT_SPECIES_256;
    // Byte species whose loads are castShape'd to INT_SPECIES_256 (presumably the same lane count - TODO confirm).
    private static final VectorSpecies<Byte> BYTE_SPECIES_FOR_INT_256;
    // Float species used by ipFloatBit512.
    private static final VectorSpecies<Float> FLOAT_SPECIES_512;
    // Not referenced in the visible part of the file; presumably used by ipFloatBit256.
    private static final VectorSpecies<Float> FLOAT_SPECIES_256;
    // ipFloatByte requires q to be at least this species' length before vectorizing.
    private static final VectorSpecies<Float> PREFERRED_FLOAT_SPECIES;
    // May be null: ipFloatByte falls back to the scalar implementation in that case.
    // NOTE(review): "PREFFERED" is misspelled; renaming would have to touch every user of the field.
    private static final VectorSpecies<Byte> BYTE_SPECIES_FOR_PREFFERED_FLOATS;
    // The remaining constants are not referenced in the visible part of the file.
    private static final VectorSpecies<Integer> INT_SPECIES_128;
    private static final IntVector SHIFTS_256;
    private static final IntVector HIGH_SHIFTS_128;
    private static final IntVector LOW_SHIFTS_128;
    private static final int[] SHIFTS;
    private static final VectorSpecies<Byte> PREFERRED_BYTE_SPECIES;

    /**
     * Lane-wise {@code a * b + c}.
     *
     * <p>Uses the fused {@link FloatVector#fma} only when {@code Constants.HAS_FAST_VECTOR_FMA} is set;
     * otherwise performs a separate multiply followed by an add.
     */
    private static FloatVector fma(FloatVector a, FloatVector b, FloatVector c) {
        return Constants.HAS_FAST_VECTOR_FMA ? a.fma(b, c) : a.mul(b).add(c);
    }

    /**
     * Scalar {@code a * b + c}.
     *
     * <p>Delegates to {@link Math#fma(float, float, float)} only when
     * {@code Constants.HAS_FAST_SCALAR_FMA} is set; otherwise uses a plain multiply and add.
     */
    private static float fma(float a, float b, float c) {
        return Constants.HAS_FAST_SCALAR_FMA ? Math.fma(a, b, c) : a * b + c;
    }

    /**
     * Dispatches to the widest available kernel for this computation.
     *
     * <p>The vectorized kernels are used only when {@code d} has at least 16 bytes and
     * {@code HAS_FAST_INTEGER_VECTORS} is set; every other case goes to the scalar default.
     */
    @Override
    public long ipByteBinByte(byte[] q, byte[] d) {
        boolean vectorizable = d.length >= 16 && HAS_FAST_INTEGER_VECTORS;
        if (!vectorizable) {
            return DefaultESVectorUtilSupport.ipByteBinByteImpl(q, d);
        }
        if (VECTOR_BITSIZE >= 256) {
            return ipByteBin256(q, d);
        }
        if (VECTOR_BITSIZE == 128) {
            return ipByteBin128(q, d);
        }
        // Any other vector width is handled by the scalar implementation.
        return DefaultESVectorUtilSupport.ipByteBinByteImpl(q, d);
    }

    /**
     * Dispatches to the 512-bit or 256-bit kernel, or to the scalar default.
     *
     * <p>The vectorized kernels are used only when {@code d} has at least 16 bytes and
     * {@code HAS_FAST_INTEGER_VECTORS} is set.
     */
    @Override
    public int ipByteBit(byte[] q, byte[] d) {
        boolean vectorizable = d.length >= 16 && HAS_FAST_INTEGER_VECTORS;
        if (!vectorizable) {
            return DefaultESVectorUtilSupport.ipByteBitImpl(q, d);
        }
        if (VECTOR_BITSIZE >= 512) {
            return ipByteBit512(q, d);
        }
        if (VECTOR_BITSIZE == 256) {
            return ipByteBit256(q, d);
        }
        // Narrower vectors have no dedicated kernel here.
        return DefaultESVectorUtilSupport.ipByteBitImpl(q, d);
    }

    /**
     * Dispatches to the 512-bit or 256-bit kernel, or to the scalar default.
     *
     * <p>The vectorized kernels are considered only when {@code q} has at least 16 elements.
     */
    @Override
    public float ipFloatBit(float[] q, byte[] d) {
        if (q.length < 16) {
            return DefaultESVectorUtilSupport.ipFloatBitImpl(q, d);
        }
        if (VECTOR_BITSIZE >= 512) {
            return ipFloatBit512(q, d);
        }
        if (VECTOR_BITSIZE == 256) {
            return ipFloatBit256(q, d);
        }
        // Narrower vectors have no dedicated kernel here.
        return DefaultESVectorUtilSupport.ipFloatBitImpl(q, d);
    }

    /**
     * Uses this class's vectorized implementation when a byte species matching the preferred float
     * species is available (non-null) and {@code q} is at least one preferred float vector long;
     * otherwise uses the scalar default.
     */
    @Override
    public float ipFloatByte(float[] q, byte[] d) {
        boolean vectorizable = BYTE_SPECIES_FOR_PREFFERED_FLOATS != null
            && q.length >= PREFERRED_FLOAT_SPECIES.length();
        return vectorizable
            ? PanamaESVectorUtilSupport.ipFloatByteImpl(q, d)
            : DefaultESVectorUtilSupport.ipFloatByteImpl(q, d);
    }

    /**
     * Subtracts {@code centroid} from {@code vector}, stores the difference in {@code centered}, and
     * gathers statistics over the centered values in the same pass.
     *
     * <p>Vectors longer than two {@code FLOAT_SPECIES} widths are processed with SIMD lanes first,
     * each lane keeping its own running mean / M2 (Welford-style update); the remaining elements, or
     * the whole input for short vectors, go through the scalar loop.
     *
     * @param vector   input values
     * @param centroid values to subtract; asserted to have the same length as {@code vector}
     * @param centered output array for {@code vector[i] - centroid[i]}; asserted to have the same length
     * @param stats    output: {@code [0]} mean, {@code [1]} accumulated squared deviation divided by
     *                 {@code vector.length}, {@code [2]} sum of squares, {@code [3]} minimum,
     *                 {@code [4]} maximum of the centered values
     */
    @Override
    public void centerAndCalculateOSQStatsEuclidean(float[] vector, float[] centroid, float[] centered, float[] stats) {
        assert vector.length == centroid.length;
        assert vector.length == centered.length;
        float vecMean = 0.0f;
        float vecVar = 0.0f;
        float norm2 = 0.0f;
        float min = Float.MAX_VALUE;
        float max = -Float.MAX_VALUE;
        // Must be initialized: the scalar loop below reads it even when the vector branch is skipped.
        int i = 0;
        int vectCount = 0;
        if (vector.length > 2 * FLOAT_SPECIES.length()) {
            final int loopBound = FLOAT_SPECIES.loopBound(vector.length);
            FloatVector vecMeanVec = FloatVector.zero(FLOAT_SPECIES);
            FloatVector m2Vec = FloatVector.zero(FLOAT_SPECIES);
            FloatVector norm2Vec = FloatVector.zero(FLOAT_SPECIES);
            FloatVector minVec = FloatVector.broadcast(FLOAT_SPECIES, Float.MAX_VALUE);
            FloatVector maxVec = FloatVector.broadcast(FLOAT_SPECIES, -Float.MAX_VALUE);
            int count = 0;
            for (; i < loopBound; i += FLOAT_SPECIES.length()) {
                ++count;
                FloatVector v = FloatVector.fromArray(FLOAT_SPECIES, vector, i);
                FloatVector c = FloatVector.fromArray(FLOAT_SPECIES, centroid, i);
                FloatVector centeredVec = v.sub(c);
                // Per-lane running mean / M2 update.
                FloatVector deltaVec = centeredVec.sub(vecMeanVec);
                norm2Vec = fma(centeredVec, centeredVec, norm2Vec);
                vecMeanVec = vecMeanVec.add(deltaVec.mul(1.0f / (float) count));
                FloatVector delta2Vec = centeredVec.sub(vecMeanVec);
                m2Vec = fma(deltaVec, delta2Vec, m2Vec);
                minVec = minVec.min(centeredVec);
                maxVec = maxVec.max(centeredVec);
                centeredVec.intoArray(centered, i);
            }
            min = minVec.reduceLanes(VectorOperators.MIN);
            max = maxVec.reduceLanes(VectorOperators.MAX);
            norm2 = norm2Vec.reduceLanes(VectorOperators.ADD);
            // Collapse the per-lane means into one mean and fold each lane's offset into M2.
            vecMean = vecMeanVec.reduceLanes(VectorOperators.ADD) / (float) FLOAT_SPECIES.length();
            FloatVector d2Mean = vecMeanVec.sub(vecMean);
            m2Vec = fma(d2Mean, d2Mean, m2Vec);
            vectCount = count * FLOAT_SPECIES.length();
            vecVar = m2Vec.reduceLanes(VectorOperators.ADD);
        }
        // Scalar pass over whatever the vector loop did not cover.
        float tailMean = 0.0f;
        float tailM2 = 0.0f;
        int tailCount = 0;
        for (; i < vector.length; ++i) {
            float value = vector[i] - centroid[i];
            centered[i] = value;
            float delta = value - tailMean;
            ++tailCount;
            tailMean += delta / (float) tailCount;
            float delta2 = value - tailMean;
            tailM2 = fma(delta, delta2, tailM2);
            min = Math.min(min, value);
            max = Math.max(max, value);
            norm2 = fma(value, value, norm2);
        }
        if (vectCount == 0) {
            vecMean = tailMean;
            vecVar = tailM2;
        } else if (tailCount > 0) {
            // Merge the vectorized and scalar partial results.
            // NOTE(review): the correction terms are not weighted by element counts as in the textbook
            // pairwise merge (M2a + M2b + delta^2 * na * nb / n) - confirm this weighting is intentional.
            int totalCount = tailCount + vectCount;
            assert totalCount == vector.length;
            float alpha = (float) vectCount / (float) totalCount;
            float beta = 1.0f - alpha;
            float completeMean = alpha * vecMean + beta * tailMean;
            float dMean2Lhs = (vecMean - completeMean) * (vecMean - completeMean);
            float dMean2Rhs = (tailMean - completeMean) * (tailMean - completeMean);
            vecVar = vecVar + dMean2Lhs + beta * (tailM2 + dMean2Rhs);
            vecMean = completeMean;
        }
        stats[0] = vecMean;
        stats[1] = vecVar / (float) vector.length;
        stats[2] = norm2;
        stats[3] = min;
        stats[4] = max;
    }

    /**
     * Subtracts {@code centroid} from {@code vector}, stores the difference in {@code centered}, and
     * gathers statistics over the centered values plus the dot product of {@code vector} and
     * {@code centroid} in the same pass.
     *
     * <p>Vectors longer than two {@code FLOAT_SPECIES} widths are processed with SIMD lanes first,
     * each lane keeping its own running mean / M2 (Welford-style update); the remaining elements, or
     * the whole input for short vectors, go through the scalar loop.
     *
     * @param vector   input values
     * @param centroid values to subtract; asserted to have the same length as {@code vector}
     * @param centered output array for {@code vector[i] - centroid[i]}; asserted to have the same length
     * @param stats    output: {@code [0]} mean, {@code [1]} accumulated squared deviation divided by
     *                 {@code vector.length}, {@code [2]} sum of squares, {@code [3]} minimum,
     *                 {@code [4]} maximum of the centered values, {@code [5]} dot product of
     *                 {@code vector} and {@code centroid}
     */
    @Override
    public void centerAndCalculateOSQStatsDp(float[] vector, float[] centroid, float[] centered, float[] stats) {
        assert vector.length == centroid.length;
        assert vector.length == centered.length;
        float vecMean = 0.0f;
        float vecVar = 0.0f;
        float norm2 = 0.0f;
        float min = Float.MAX_VALUE;
        float max = -Float.MAX_VALUE;
        float centroidDot = 0.0f;
        // Must be initialized: the scalar loop below reads it even when the vector branch is skipped.
        int i = 0;
        int vectCount = 0;
        if (vector.length > 2 * FLOAT_SPECIES.length()) {
            final int loopBound = FLOAT_SPECIES.loopBound(vector.length);
            FloatVector vecMeanVec = FloatVector.zero(FLOAT_SPECIES);
            FloatVector m2Vec = FloatVector.zero(FLOAT_SPECIES);
            FloatVector norm2Vec = FloatVector.zero(FLOAT_SPECIES);
            FloatVector minVec = FloatVector.broadcast(FLOAT_SPECIES, Float.MAX_VALUE);
            FloatVector maxVec = FloatVector.broadcast(FLOAT_SPECIES, -Float.MAX_VALUE);
            FloatVector centroidDotVec = FloatVector.zero(FLOAT_SPECIES);
            int count = 0;
            for (; i < loopBound; i += FLOAT_SPECIES.length()) {
                ++count;
                FloatVector v = FloatVector.fromArray(FLOAT_SPECIES, vector, i);
                FloatVector c = FloatVector.fromArray(FLOAT_SPECIES, centroid, i);
                centroidDotVec = fma(v, c, centroidDotVec);
                FloatVector centeredVec = v.sub(c);
                // Per-lane running mean / M2 update.
                FloatVector deltaVec = centeredVec.sub(vecMeanVec);
                norm2Vec = fma(centeredVec, centeredVec, norm2Vec);
                vecMeanVec = vecMeanVec.add(deltaVec.mul(1.0f / (float) count));
                FloatVector delta2Vec = centeredVec.sub(vecMeanVec);
                m2Vec = fma(deltaVec, delta2Vec, m2Vec);
                minVec = minVec.min(centeredVec);
                maxVec = maxVec.max(centeredVec);
                centeredVec.intoArray(centered, i);
            }
            min = minVec.reduceLanes(VectorOperators.MIN);
            max = maxVec.reduceLanes(VectorOperators.MAX);
            norm2 = norm2Vec.reduceLanes(VectorOperators.ADD);
            centroidDot = centroidDotVec.reduceLanes(VectorOperators.ADD);
            // Collapse the per-lane means into one mean and fold each lane's offset into M2.
            vecMean = vecMeanVec.reduceLanes(VectorOperators.ADD) / (float) FLOAT_SPECIES.length();
            FloatVector d2Mean = vecMeanVec.sub(vecMean);
            m2Vec = fma(d2Mean, d2Mean, m2Vec);
            vectCount = count * FLOAT_SPECIES.length();
            vecVar = m2Vec.reduceLanes(VectorOperators.ADD);
        }
        // Scalar pass over whatever the vector loop did not cover.
        float tailMean = 0.0f;
        float tailM2 = 0.0f;
        int tailCount = 0;
        for (; i < vector.length; ++i) {
            centroidDot = fma(vector[i], centroid[i], centroidDot);
            float value = vector[i] - centroid[i];
            centered[i] = value;
            float delta = value - tailMean;
            ++tailCount;
            tailMean += delta / (float) tailCount;
            float delta2 = value - tailMean;
            tailM2 = fma(delta, delta2, tailM2);
            min = Math.min(min, value);
            max = Math.max(max, value);
            norm2 = fma(value, value, norm2);
        }
        if (vectCount == 0) {
            vecMean = tailMean;
            vecVar = tailM2;
        } else if (tailCount > 0) {
            // Merge the vectorized and scalar partial results.
            // NOTE(review): the correction terms are not weighted by element counts as in the textbook
            // pairwise merge (M2a + M2b + delta^2 * na * nb / n) - confirm this weighting is intentional.
            int totalCount = tailCount + vectCount;
            assert totalCount == vector.length;
            float alpha = (float) vectCount / (float) totalCount;
            float beta = 1.0f - alpha;
            float completeMean = alpha * vecMean + beta * tailMean;
            float dMean2Lhs = (vecMean - completeMean) * (vecMean - completeMean);
            float dMean2Rhs = (tailMean - completeMean) * (tailMean - completeMean);
            vecVar = vecVar + dMean2Lhs + beta * (tailM2 + dMean2Rhs);
            vecMean = completeMean;
        }
        stats[0] = vecMean;
        stats[1] = vecVar / (float) vector.length;
        stats[2] = norm2;
        stats[3] = min;
        stats[4] = max;
        stats[5] = centroidDot;
    }

    /**
     * Accumulates five sums over {@code target} and its quantized positions.
     *
     * <p>With {@code s = quantize[i] / (points - 1)} and {@code x = target[i]}, the outputs are:
     * {@code pts[0] = sum((1-s)^2)}, {@code pts[1] = sum((1-s)*s)}, {@code pts[2] = sum(s^2)},
     * {@code pts[3] = sum(x*(1-s))}, {@code pts[4] = sum(x*s)}.
     *
     * @param target   values being quantized
     * @param quantize quantization index per element of {@code target}
     * @param points   number of grid points; indices are scaled by {@code 1 / (points - 1)}
     * @param pts      output array receiving the five sums
     */
    @Override
    public void calculateOSQGridPoints(float[] target, int[] quantize, int points, float[] pts) {
        float daa = 0.0f;
        float dab = 0.0f;
        float dbb = 0.0f;
        float dax = 0.0f;
        float dbx = 0.0f;
        float invPmOnes = 1.0f / ((float) points - 1.0f);
        // Must be initialized: the scalar loop below reads it even when the vector branch is skipped.
        int i = 0;
        if (target.length > 2 * FLOAT_SPECIES.length()) {
            final int loopBound = FLOAT_SPECIES.loopBound(target.length);
            FloatVector daaVec = FloatVector.zero(FLOAT_SPECIES);
            FloatVector dabVec = FloatVector.zero(FLOAT_SPECIES);
            FloatVector dbbVec = FloatVector.zero(FLOAT_SPECIES);
            FloatVector daxVec = FloatVector.zero(FLOAT_SPECIES);
            FloatVector dbxVec = FloatVector.zero(FLOAT_SPECIES);
            FloatVector ones = FloatVector.broadcast(FLOAT_SPECIES, 1.0f);
            FloatVector invPmOnesVec = FloatVector.broadcast(FLOAT_SPECIES, invPmOnes);
            for (; i < loopBound; i += FLOAT_SPECIES.length()) {
                FloatVector v = FloatVector.fromArray(FLOAT_SPECIES, target, i);
                FloatVector oVec = IntVector.fromArray(INTEGER_SPECIES, quantize, i)
                    .convert(VectorOperators.I2F, 0)
                    .reinterpretAsFloats();
                FloatVector sVec = oVec.mul(invPmOnesVec);
                FloatVector smVec = ones.sub(sVec);
                daaVec = fma(smVec, smVec, daaVec);
                dabVec = fma(smVec, sVec, dabVec);
                dbbVec = fma(sVec, sVec, dbbVec);
                daxVec = fma(v, smVec, daxVec);
                dbxVec = fma(v, sVec, dbxVec);
            }
            daa = daaVec.reduceLanes(VectorOperators.ADD);
            dab = dabVec.reduceLanes(VectorOperators.ADD);
            dbb = dbbVec.reduceLanes(VectorOperators.ADD);
            dax = daxVec.reduceLanes(VectorOperators.ADD);
            dbx = dbxVec.reduceLanes(VectorOperators.ADD);
        }
        // Scalar pass over whatever the vector loop did not cover.
        for (; i < target.length; ++i) {
            float k = quantize[i];
            float s = k * invPmOnes;
            float ms = 1.0f - s;
            daa = fma(ms, ms, daa);
            dab = fma(ms, s, dab);
            dbb = fma(s, s, dbb);
            dax = fma(ms, target[i], dax);
            dbx = fma(s, target[i], dbx);
        }
        pts[0] = daa;
        pts[1] = dab;
        pts[2] = dbb;
        pts[3] = dax;
        pts[4] = dbx;
    }

    /**
     * Quantizes {@code target} onto the grid {@code lowerInterval + k * step} and returns the loss of
     * that quantization.
     *
     * <p>Each value is clamped to {@code [lowerInterval, upperInterval]}, its rounded grid index is
     * written to {@code quantize}, and with {@code d = x - dequantized(x)} the result is
     * {@code (1 - lambda) * (sum(x*d))^2 / norm2 + lambda * sum(d^2)}.
     *
     * @param target        values to quantize
     * @param lowerInterval lower clamp bound and grid origin
     * @param upperInterval upper clamp bound
     * @param step          grid spacing used to de-quantize
     * @param invStep       factor applied to {@code x - lowerInterval} before rounding
     *                      (presumably {@code 1 / step} - TODO confirm with callers)
     * @param norm2         divisor of the {@code (sum(x*d))^2} term
     * @param lambda        weight between the two loss terms
     * @param quantize      output: grid index per element of {@code target}
     * @return the loss value described above
     */
    @Override
    public float calculateOSQLoss(float[] target, float lowerInterval, float upperInterval, float step, float invStep, float norm2, float lambda, int[] quantize) {
        float a = lowerInterval;
        float b = upperInterval;
        float xe = 0.0f;
        float e = 0.0f;
        // Must be initialized: the scalar loop below reads it even when the vector branch is skipped.
        int i = 0;
        if (target.length > 2 * FLOAT_SPECIES.length()) {
            final int loopBound = FLOAT_SPECIES.loopBound(target.length);
            FloatVector xeVec = FloatVector.zero(FLOAT_SPECIES);
            FloatVector eVec = FloatVector.zero(FLOAT_SPECIES);
            for (; i < loopBound; i += FLOAT_SPECIES.length()) {
                FloatVector v = FloatVector.fromArray(FLOAT_SPECIES, target, i);
                FloatVector vClamped = v.max(a).min(b);
                // Adding 0.5 before the float-to-int conversion rounds to the nearest index.
                IntVector xiqint = vClamped.sub(a)
                    .mul(invStep)
                    .add(0.5f)
                    .convert(VectorOperators.F2I, 0)
                    .reinterpretAsInts();
                xiqint.intoArray(quantize, i);
                FloatVector quantizeVec = xiqint.convert(VectorOperators.I2F, 0).reinterpretAsFloats();
                FloatVector xiq = quantizeVec.mul(step).add(a);
                FloatVector xiiq = v.sub(xiq);
                xeVec = fma(v, xiiq, xeVec);
                eVec = fma(xiiq, xiiq, eVec);
            }
            e = eVec.reduceLanes(VectorOperators.ADD);
            xe = xeVec.reduceLanes(VectorOperators.ADD);
        }
        // Scalar pass over whatever the vector loop did not cover.
        for (; i < target.length; ++i) {
            float clamped = Math.min(Math.max(target[i], a), b);
            quantize[i] = Math.round((clamped - a) * invStep);
            float xiq = fma(step, quantize[i], a);
            float xiiq = target[i] - xiq;
            e = fma(xiiq, xiiq, e);
            xe = fma(target[i], xiiq, xe);
        }
        return (1.0f - lambda) * xe * xe / norm2 + lambda * e;
    }

    /**
     * Computes {@code dsq + soarLambda * proj^2 / rnorm}, where with {@code djk = v1 - centroid}:
     * {@code dsq = sum(djk^2)} and {@code proj = sum(djk * originalResidual)}.
     *
     * <p>Inputs longer than two {@code FLOAT_SPECIES} widths use a two-way unrolled SIMD loop, then a
     * single-vector loop, and finally the scalar loop for the remaining elements.
     *
     * @param v1               vector to measure
     * @param centroid         centroid subtracted from {@code v1}; asserted to have the same length
     * @param originalResidual vector the difference is projected onto; asserted to have the same length
     * @param soarLambda       weight of the projection term
     * @param rnorm            divisor of the projection term
     * @return the combined distance value
     */
    @Override
    public float soarDistance(float[] v1, float[] centroid, float[] originalResidual, float soarLambda, float rnorm) {
        assert v1.length == centroid.length;
        assert v1.length == originalResidual.length;
        float proj = 0.0f;
        float dsq = 0.0f;
        // Must be initialized: the scalar loop below reads it even when the vector branch is skipped.
        int i = 0;
        if (v1.length > 2 * FLOAT_SPECIES.length()) {
            final int lanes = FLOAT_SPECIES.length();
            final int loopBound = FLOAT_SPECIES.loopBound(v1.length);
            FloatVector projVec1 = FloatVector.zero(FLOAT_SPECIES);
            FloatVector projVec2 = FloatVector.zero(FLOAT_SPECIES);
            FloatVector acc1 = FloatVector.zero(FLOAT_SPECIES);
            FloatVector acc2 = FloatVector.zero(FLOAT_SPECIES);
            // Two vectors per iteration, with independent accumulators for each half.
            int unrolledLimit = loopBound - lanes;
            for (; i < unrolledLimit; i += 2 * lanes) {
                FloatVector v1Vec0 = FloatVector.fromArray(FLOAT_SPECIES, v1, i);
                FloatVector centroidVec0 = FloatVector.fromArray(FLOAT_SPECIES, centroid, i);
                FloatVector originalResidualVec0 = FloatVector.fromArray(FLOAT_SPECIES, originalResidual, i);
                FloatVector djkVec0 = v1Vec0.sub(centroidVec0);
                projVec1 = fma(djkVec0, originalResidualVec0, projVec1);
                acc1 = fma(djkVec0, djkVec0, acc1);

                FloatVector v1Vec1 = FloatVector.fromArray(FLOAT_SPECIES, v1, i + lanes);
                FloatVector centroidVec1 = FloatVector.fromArray(FLOAT_SPECIES, centroid, i + lanes);
                FloatVector originalResidualVec1 = FloatVector.fromArray(FLOAT_SPECIES, originalResidual, i + lanes);
                FloatVector djkVec1 = v1Vec1.sub(centroidVec1);
                projVec2 = fma(djkVec1, originalResidualVec1, projVec2);
                acc2 = fma(djkVec1, djkVec1, acc2);
            }
            // Any full vector left over after the unrolled loop.
            for (; i < loopBound; i += lanes) {
                FloatVector v1Vec = FloatVector.fromArray(FLOAT_SPECIES, v1, i);
                FloatVector centroidVec = FloatVector.fromArray(FLOAT_SPECIES, centroid, i);
                FloatVector originalResidualVec = FloatVector.fromArray(FLOAT_SPECIES, originalResidual, i);
                FloatVector djkVec = v1Vec.sub(centroidVec);
                projVec1 = fma(djkVec, originalResidualVec, projVec1);
                acc1 = fma(djkVec, djkVec, acc1);
            }
            proj += projVec1.add(projVec2).reduceLanes(VectorOperators.ADD);
            dsq += acc1.add(acc2).reduceLanes(VectorOperators.ADD);
        }
        // Scalar pass over whatever the vector loops did not cover.
        for (; i < v1.length; ++i) {
            float djk = v1[i] - centroid[i];
            proj = fma(djk, originalResidual[i], proj);
            dsq = fma(djk, djk, dsq);
        }
        return dsq + soarLambda * proj * proj / rnorm;
    }

    /**
     * Bit-wise inner product between {@code d} and four bit planes stored back to back in {@code q}.
     *
     * <p>Plane {@code k} of {@code q} starts at offset {@code k * d.length}. For each plane the number
     * of set bits in {@code plane & d} is counted, and the result is
     * {@code count0 + (count1 << 1) + (count2 << 2) + (count3 << 3)}.
     *
     * <p>Processing order: 256-bit vectors (only when {@code d} spans at least two of them), then
     * 128-bit vectors, then single bytes.
     *
     * @param q four concatenated planes; indexed up to {@code 4 * d.length - 1}
     * @param d bit vector to intersect with every plane
     * @return the weighted sum of the per-plane bit counts
     */
    static long ipByteBin256(byte[] q, byte[] d) {
        long subRet0 = 0L;
        long subRet1 = 0L;
        long subRet2 = 0L;
        long subRet3 = 0L;
        // Must be initialized: the code below reads it even when the 256-bit branch is skipped.
        int i = 0;
        if (d.length >= ByteVector.SPECIES_256.vectorByteSize() * 2) {
            int limit = ByteVector.SPECIES_256.loopBound(d.length);
            LongVector sum0 = LongVector.zero(LongVector.SPECIES_256);
            LongVector sum1 = LongVector.zero(LongVector.SPECIES_256);
            LongVector sum2 = LongVector.zero(LongVector.SPECIES_256);
            LongVector sum3 = LongVector.zero(LongVector.SPECIES_256);
            for (; i < limit; i += ByteVector.SPECIES_256.length()) {
                LongVector vq0 = ByteVector.fromArray(BYTE_SPECIES_256, q, i).reinterpretAsLongs();
                LongVector vq1 = ByteVector.fromArray(BYTE_SPECIES_256, q, i + d.length).reinterpretAsLongs();
                LongVector vq2 = ByteVector.fromArray(BYTE_SPECIES_256, q, i + d.length * 2).reinterpretAsLongs();
                LongVector vq3 = ByteVector.fromArray(BYTE_SPECIES_256, q, i + d.length * 3).reinterpretAsLongs();
                LongVector vd = ByteVector.fromArray(BYTE_SPECIES_256, d, i).reinterpretAsLongs();
                sum0 = sum0.add(vq0.and(vd).lanewise(VectorOperators.BIT_COUNT));
                sum1 = sum1.add(vq1.and(vd).lanewise(VectorOperators.BIT_COUNT));
                sum2 = sum2.add(vq2.and(vd).lanewise(VectorOperators.BIT_COUNT));
                sum3 = sum3.add(vq3.and(vd).lanewise(VectorOperators.BIT_COUNT));
            }
            subRet0 += sum0.reduceLanes(VectorOperators.ADD);
            subRet1 += sum1.reduceLanes(VectorOperators.ADD);
            subRet2 += sum2.reduceLanes(VectorOperators.ADD);
            subRet3 += sum3.reduceLanes(VectorOperators.ADD);
        }
        // Use 128-bit vectors for what is left if at least one of them still fits.
        if (d.length - i >= ByteVector.SPECIES_128.vectorByteSize()) {
            LongVector sum0 = LongVector.zero(LongVector.SPECIES_128);
            LongVector sum1 = LongVector.zero(LongVector.SPECIES_128);
            LongVector sum2 = LongVector.zero(LongVector.SPECIES_128);
            LongVector sum3 = LongVector.zero(LongVector.SPECIES_128);
            int limit = ByteVector.SPECIES_128.loopBound(d.length);
            for (; i < limit; i += ByteVector.SPECIES_128.length()) {
                LongVector vq0 = ByteVector.fromArray(BYTE_SPECIES_128, q, i).reinterpretAsLongs();
                LongVector vq1 = ByteVector.fromArray(BYTE_SPECIES_128, q, i + d.length).reinterpretAsLongs();
                LongVector vq2 = ByteVector.fromArray(BYTE_SPECIES_128, q, i + d.length * 2).reinterpretAsLongs();
                LongVector vq3 = ByteVector.fromArray(BYTE_SPECIES_128, q, i + d.length * 3).reinterpretAsLongs();
                LongVector vd = ByteVector.fromArray(BYTE_SPECIES_128, d, i).reinterpretAsLongs();
                sum0 = sum0.add(vq0.and(vd).lanewise(VectorOperators.BIT_COUNT));
                sum1 = sum1.add(vq1.and(vd).lanewise(VectorOperators.BIT_COUNT));
                sum2 = sum2.add(vq2.and(vd).lanewise(VectorOperators.BIT_COUNT));
                sum3 = sum3.add(vq3.and(vd).lanewise(VectorOperators.BIT_COUNT));
            }
            subRet0 += sum0.reduceLanes(VectorOperators.ADD);
            subRet1 += sum1.reduceLanes(VectorOperators.ADD);
            subRet2 += sum2.reduceLanes(VectorOperators.ADD);
            subRet3 += sum3.reduceLanes(VectorOperators.ADD);
        }
        // Scalar pass; the 0xFF mask undoes the sign extension of the byte operands.
        for (; i < d.length; ++i) {
            subRet0 += Integer.bitCount(q[i] & d[i] & 0xFF);
            subRet1 += Integer.bitCount(q[i + d.length] & d[i] & 0xFF);
            subRet2 += Integer.bitCount(q[i + 2 * d.length] & d[i] & 0xFF);
            subRet3 += Integer.bitCount(q[i + 3 * d.length] & d[i] & 0xFF);
        }
        return subRet0 + (subRet1 << 1) + (subRet2 << 2) + (subRet3 << 3);
    }

    /**
     * 128-bit variant of the bit-plane inner product.
     *
     * <p>{@code q} holds four planes of {@code d.length} bytes each, back to back. The set bits of
     * {@code plane & d} are counted per plane and combined as
     * {@code count0 + (count1 << 1) + (count2 << 2) + (count3 << 3)}.
     *
     * @param q four concatenated planes; indexed up to {@code 4 * d.length - 1}
     * @param d bit vector to intersect with every plane
     * @return the weighted sum of the per-plane bit counts
     */
    public static long ipByteBin128(byte[] q, byte[] d) {
        IntVector plane0 = IntVector.zero(IntVector.SPECIES_128);
        IntVector plane1 = IntVector.zero(IntVector.SPECIES_128);
        IntVector plane2 = IntVector.zero(IntVector.SPECIES_128);
        IntVector plane3 = IntVector.zero(IntVector.SPECIES_128);
        final int vectorEnd = ByteVector.SPECIES_128.loopBound(d.length);
        final int stride = ByteVector.SPECIES_128.length();

        int pos = 0;
        while (pos < vectorEnd) {
            IntVector docBits = ByteVector.fromArray(BYTE_SPECIES_128, d, pos).reinterpretAsInts();
            IntVector query0 = ByteVector.fromArray(BYTE_SPECIES_128, q, pos).reinterpretAsInts();
            IntVector query1 = ByteVector.fromArray(BYTE_SPECIES_128, q, pos + d.length).reinterpretAsInts();
            IntVector query2 = ByteVector.fromArray(BYTE_SPECIES_128, q, pos + d.length * 2).reinterpretAsInts();
            IntVector query3 = ByteVector.fromArray(BYTE_SPECIES_128, q, pos + d.length * 3).reinterpretAsInts();
            plane0 = plane0.add(docBits.and(query0).lanewise(VectorOperators.BIT_COUNT));
            plane1 = plane1.add(docBits.and(query1).lanewise(VectorOperators.BIT_COUNT));
            plane2 = plane2.add(docBits.and(query2).lanewise(VectorOperators.BIT_COUNT));
            plane3 = plane3.add(docBits.and(query3).lanewise(VectorOperators.BIT_COUNT));
            pos += stride;
        }

        long count0 = plane0.reduceLanes(VectorOperators.ADD);
        long count1 = plane1.reduceLanes(VectorOperators.ADD);
        long count2 = plane2.reduceLanes(VectorOperators.ADD);
        long count3 = plane3.reduceLanes(VectorOperators.ADD);

        // Remaining bytes one at a time; the 0xFF mask undoes byte sign extension.
        for (; pos < d.length; pos++) {
            byte docByte = d[pos];
            count0 += Integer.bitCount(docByte & q[pos] & 0xFF);
            count1 += Integer.bitCount(docByte & q[pos + d.length] & 0xFF);
            count2 += Integer.bitCount(docByte & q[pos + 2 * d.length] & 0xFF);
            count3 += Integer.bitCount(docByte & q[pos + 3 * d.length] & 0xFF);
        }
        return count0 + (count1 << 1) + (count2 << 2) + (count3 << 3);
    }

    /**
     * Rounds {@code length} down to a multiple of {@code sectionSize}.
     *
     * @param length      total number of elements
     * @param sectionSize size of one section; a value of zero raises {@link ArithmeticException}
     * @return {@code length} minus the remainder of {@code length / sectionSize}
     */
    private static int limit(int length, int sectionSize) {
        final int remainder = length % sectionSize;
        return length - remainder;
    }

    /**
     * Sums the bytes of {@code q} whose corresponding bit in {@code d} is set, using 512-bit int
     * accumulators for the bulk and 256-bit accumulators for the remainder.
     *
     * <p>{@code q} holds one byte per bit of {@code d} (asserted: {@code q.length == d.length * 8}).
     * Bits of {@code d} are consumed most-significant first within each byte: the mask words are read
     * big-endian and then bit-reversed so that mask lane 0 lines up with the first loaded {@code q} byte.
     *
     * @param q one byte value per bit of {@code d}
     * @param d packed selection bits
     * @return the sum of the selected {@code q} bytes
     */
    static int ipByteBit512(byte[] q, byte[] d) {
        assert q.length == d.length * Byte.SIZE;
        // Must be initialized: the code below reads it even when the 512-bit branch is skipped.
        int i = 0;
        int sum = 0;

        // Main loop: four INT_SPECIES_512 vectors of q per iteration, masked by one 64-bit word of d.
        int sectionLength = INT_SPECIES_512.length() * 4;
        if (q.length >= sectionLength) {
            IntVector acc0 = IntVector.zero(INT_SPECIES_512);
            IntVector acc1 = IntVector.zero(INT_SPECIES_512);
            IntVector acc2 = IntVector.zero(INT_SPECIES_512);
            IntVector acc3 = IntVector.zero(INT_SPECIES_512);
            int end = limit(q.length, sectionLength);
            for (; i < end; i += sectionLength) {
                Vector<Integer> vals0 = ByteVector.fromArray(BYTE_SPECIES_FOR_INT_512, q, i)
                    .castShape(INT_SPECIES_512, 0);
                Vector<Integer> vals1 = ByteVector.fromArray(BYTE_SPECIES_FOR_INT_512, q, i + INT_SPECIES_512.length())
                    .castShape(INT_SPECIES_512, 0);
                Vector<Integer> vals2 = ByteVector.fromArray(BYTE_SPECIES_FOR_INT_512, q, i + INT_SPECIES_512.length() * 2)
                    .castShape(INT_SPECIES_512, 0);
                Vector<Integer> vals3 = ByteVector.fromArray(BYTE_SPECIES_FOR_INT_512, q, i + INT_SPECIES_512.length() * 3)
                    .castShape(INT_SPECIES_512, 0);

                // VarHandle.get is signature-polymorphic: the (long) cast selects the return type.
                long maskBits = Long.reverse((long) BitUtil.VH_BE_LONG.get(d, i / 8));
                VectorMask<Integer> mask0 = VectorMask.fromLong(INT_SPECIES_512, maskBits);
                VectorMask<Integer> mask1 = VectorMask.fromLong(INT_SPECIES_512, maskBits >> 16);
                VectorMask<Integer> mask2 = VectorMask.fromLong(INT_SPECIES_512, maskBits >> 32);
                VectorMask<Integer> mask3 = VectorMask.fromLong(INT_SPECIES_512, maskBits >> 48);

                acc0 = acc0.add(vals0, mask0);
                acc1 = acc1.add(vals1, mask1);
                acc2 = acc2.add(vals2, mask2);
                acc3 = acc3.add(vals3, mask3);
            }
            sum += acc0.reduceLanes(VectorOperators.ADD)
                + acc1.reduceLanes(VectorOperators.ADD)
                + acc2.reduceLanes(VectorOperators.ADD)
                + acc3.reduceLanes(VectorOperators.ADD);
        }

        // Remainder: one INT_SPECIES_256 vector of q per byte of d.
        sectionLength = INT_SPECIES_256.length();
        if (q.length - i >= sectionLength) {
            IntVector acc = IntVector.zero(INT_SPECIES_256);
            int end = limit(q.length, sectionLength);
            for (; i < end; i += sectionLength) {
                Vector<Integer> vals = ByteVector.fromArray(BYTE_SPECIES_FOR_INT_256, q, i)
                    .castShape(INT_SPECIES_256, 0);
                // Reverse the byte's bits and bring them down from the top of the int.
                long maskBits = Integer.reverse(d[i / 8]) >> 24;
                VectorMask<Integer> mask = VectorMask.fromLong(INT_SPECIES_256, maskBits);
                acc = acc.add(vals, mask);
            }
            sum += acc.reduceLanes(VectorOperators.ADD);
        }

        // Both loops together are expected to have consumed all of q.
        assert i == q.length;
        return sum;
    }

    /**
     * Sums the bytes of {@code q} whose corresponding bit in {@code d} is set, using 256-bit int
     * accumulators.
     *
     * <p>{@code q} holds one byte per bit of {@code d} (asserted: {@code q.length == d.length * 8}).
     * Bits of {@code d} are consumed most-significant first within each byte: the mask words are read
     * big-endian and then bit-reversed so that mask lane 0 lines up with the first loaded {@code q} byte.
     *
     * @param q one byte value per bit of {@code d}
     * @param d packed selection bits
     * @return the sum of the selected {@code q} bytes
     */
    static int ipByteBit256(byte[] q, byte[] d) {
        assert q.length == d.length * Byte.SIZE;
        // Must be initialized: the code below reads it even when the unrolled branch is skipped.
        int i = 0;
        int sum = 0;

        // Main loop: four INT_SPECIES_256 vectors of q per iteration, masked by one 32-bit word of d.
        int sectionLength = INT_SPECIES_256.length() * 4;
        if (q.length >= sectionLength) {
            IntVector acc0 = IntVector.zero(INT_SPECIES_256);
            IntVector acc1 = IntVector.zero(INT_SPECIES_256);
            IntVector acc2 = IntVector.zero(INT_SPECIES_256);
            IntVector acc3 = IntVector.zero(INT_SPECIES_256);
            int end = limit(q.length, sectionLength);
            for (; i < end; i += sectionLength) {
                Vector<Integer> vals0 = ByteVector.fromArray(BYTE_SPECIES_FOR_INT_256, q, i)
                    .castShape(INT_SPECIES_256, 0);
                Vector<Integer> vals1 = ByteVector.fromArray(BYTE_SPECIES_FOR_INT_256, q, i + INT_SPECIES_256.length())
                    .castShape(INT_SPECIES_256, 0);
                Vector<Integer> vals2 = ByteVector.fromArray(BYTE_SPECIES_FOR_INT_256, q, i + INT_SPECIES_256.length() * 2)
                    .castShape(INT_SPECIES_256, 0);
                Vector<Integer> vals3 = ByteVector.fromArray(BYTE_SPECIES_FOR_INT_256, q, i + INT_SPECIES_256.length() * 3)
                    .castShape(INT_SPECIES_256, 0);

                // VarHandle.get is signature-polymorphic: the (int) cast selects the return type.
                long maskBits = Integer.reverse((int) BitUtil.VH_BE_INT.get(d, i / 8));
                VectorMask<Integer> mask0 = VectorMask.fromLong(INT_SPECIES_256, maskBits);
                VectorMask<Integer> mask1 = VectorMask.fromLong(INT_SPECIES_256, maskBits >> 8);
                VectorMask<Integer> mask2 = VectorMask.fromLong(INT_SPECIES_256, maskBits >> 16);
                VectorMask<Integer> mask3 = VectorMask.fromLong(INT_SPECIES_256, maskBits >> 24);

                acc0 = acc0.add(vals0, mask0);
                acc1 = acc1.add(vals1, mask1);
                acc2 = acc2.add(vals2, mask2);
                acc3 = acc3.add(vals3, mask3);
            }
            sum += acc0.reduceLanes(VectorOperators.ADD)
                + acc1.reduceLanes(VectorOperators.ADD)
                + acc2.reduceLanes(VectorOperators.ADD)
                + acc3.reduceLanes(VectorOperators.ADD);
        }

        // Remainder: one INT_SPECIES_256 vector of q per byte of d.
        sectionLength = INT_SPECIES_256.length();
        if (q.length - i >= sectionLength) {
            IntVector acc = IntVector.zero(INT_SPECIES_256);
            int end = limit(q.length, sectionLength);
            for (; i < end; i += sectionLength) {
                Vector<Integer> vals = ByteVector.fromArray(BYTE_SPECIES_FOR_INT_256, q, i)
                    .castShape(INT_SPECIES_256, 0);
                // Reverse the byte's bits and bring them down from the top of the int.
                long maskBits = Integer.reverse(d[i / 8]) >> 24;
                VectorMask<Integer> mask = VectorMask.fromLong(INT_SPECIES_256, maskBits);
                acc = acc.add(vals, mask);
            }
            sum += acc.reduceLanes(VectorOperators.ADD);
        }

        // Both loops together are expected to have consumed all of q.
        assert i == q.length;
        return sum;
    }

    /**
     * Computes the inner product of a float vector with a bit-packed vector: the sum of every
     * {@code q[k]} whose corresponding bit in {@code d} is set.
     * <p>
     * The main loop consumes four 512-bit float vectors (64 floats, i.e. 8 bytes of {@code d}) per
     * iteration; any remainder is consumed one 256-bit float vector (8 floats, 1 byte of {@code d})
     * at a time. Within each byte of {@code d} the most significant bit selects the lowest-indexed
     * float, which is why the bits are reversed before being used as a lane mask.
     *
     * @param q the float vector; asserted to hold exactly {@code d.length * 8} elements
     * @param d the bit-packed vector, eight dimensions per byte
     * @return the sum of the selected elements of {@code q}
     */
    static float ipFloatBit512(float[] q, byte[] d) {
        assert (q.length == d.length * 8);
        // Must start at 0: the unrolled loop below is skipped entirely when q is shorter than one section.
        int i = 0;
        float sum = 0.0f;
        int sectionLength = FLOAT_SPECIES_512.length() * 4;
        if (q.length >= sectionLength) {
            FloatVector acc0 = FloatVector.zero(FLOAT_SPECIES_512);
            FloatVector acc1 = FloatVector.zero(FLOAT_SPECIES_512);
            FloatVector acc2 = FloatVector.zero(FLOAT_SPECIES_512);
            FloatVector acc3 = FloatVector.zero(FLOAT_SPECIES_512);
            int limit = PanamaESVectorUtilSupport.limit(q.length, sectionLength);
            for (; i < limit; i += sectionLength) {
                FloatVector floats0 = FloatVector.fromArray(FLOAT_SPECIES_512, q, i);
                FloatVector floats1 = FloatVector.fromArray(FLOAT_SPECIES_512, q, i + FLOAT_SPECIES_512.length());
                FloatVector floats2 = FloatVector.fromArray(FLOAT_SPECIES_512, q, i + FLOAT_SPECIES_512.length() * 2);
                FloatVector floats3 = FloatVector.fromArray(FLOAT_SPECIES_512, q, i + FLOAT_SPECIES_512.length() * 3);
                // VarHandle.get is signature-polymorphic: the cast is what makes it return a long.
                // Reversing puts the first dimension's bit at position 0, matching lane 0 of the mask.
                long maskBits = Long.reverse((long) BitUtil.VH_BE_LONG.get(d, i / 8));
                // Each mask only looks at its lowest 16 bits (one per lane); higher bits are ignored.
                VectorMask<Float> mask0 = VectorMask.fromLong(FLOAT_SPECIES_512, maskBits);
                VectorMask<Float> mask1 = VectorMask.fromLong(FLOAT_SPECIES_512, maskBits >> 16);
                VectorMask<Float> mask2 = VectorMask.fromLong(FLOAT_SPECIES_512, maskBits >> 32);
                VectorMask<Float> mask3 = VectorMask.fromLong(FLOAT_SPECIES_512, maskBits >> 48);
                acc0 = acc0.add(floats0, mask0);
                acc1 = acc1.add(floats1, mask1);
                acc2 = acc2.add(floats2, mask2);
                acc3 = acc3.add(floats3, mask3);
            }
            sum += acc0.reduceLanes(VectorOperators.ADD) + acc1.reduceLanes(VectorOperators.ADD) + acc2.reduceLanes(VectorOperators.ADD) + acc3.reduceLanes(VectorOperators.ADD);
        }
        // Remainder: one byte of d (8 dimensions) per 256-bit float vector.
        sectionLength = FLOAT_SPECIES_256.length();
        if (q.length - i >= sectionLength) {
            FloatVector acc = FloatVector.zero(FLOAT_SPECIES_256);
            int limit = PanamaESVectorUtilSupport.limit(q.length, sectionLength);
            while (i < limit) {
                FloatVector floats = FloatVector.fromArray(FLOAT_SPECIES_256, q, i);
                // The reversed byte lands in the top 8 bits of the int; shift it down to bits 0..7.
                long maskBits = Integer.reverse(d[i / 8]) >> 24;
                VectorMask<Float> mask = VectorMask.fromLong(FLOAT_SPECIES_256, maskBits);
                acc = acc.add(floats, mask);
                i += sectionLength;
            }
            sum += acc.reduceLanes(VectorOperators.ADD);
        }
        assert (i == q.length);
        return sum;
    }

    /**
     * Computes the inner product of a float vector with a bit-packed vector: the sum of every
     * {@code q[k]} whose corresponding bit in {@code d} is set.
     * <p>
     * The main loop consumes four 256-bit float vectors (32 floats, i.e. 4 bytes of {@code d}) per
     * iteration; any remainder is consumed one 256-bit float vector (8 floats, 1 byte of {@code d})
     * at a time. Within each byte of {@code d} the most significant bit selects the lowest-indexed
     * float, which is why the bits are reversed before being used as a lane mask.
     *
     * @param q the float vector; asserted to hold exactly {@code d.length * 8} elements
     * @param d the bit-packed vector, eight dimensions per byte
     * @return the sum of the selected elements of {@code q}
     */
    static float ipFloatBit256(float[] q, byte[] d) {
        assert (q.length == d.length * 8);
        // Must start at 0: the unrolled loop below is skipped entirely when q is shorter than one section.
        int i = 0;
        float sum = 0.0f;
        int sectionLength = FLOAT_SPECIES_256.length() * 4;
        if (q.length >= sectionLength) {
            FloatVector acc0 = FloatVector.zero(FLOAT_SPECIES_256);
            FloatVector acc1 = FloatVector.zero(FLOAT_SPECIES_256);
            FloatVector acc2 = FloatVector.zero(FLOAT_SPECIES_256);
            FloatVector acc3 = FloatVector.zero(FLOAT_SPECIES_256);
            int limit = PanamaESVectorUtilSupport.limit(q.length, sectionLength);
            for (; i < limit; i += sectionLength) {
                FloatVector floats0 = FloatVector.fromArray(FLOAT_SPECIES_256, q, i);
                FloatVector floats1 = FloatVector.fromArray(FLOAT_SPECIES_256, q, i + FLOAT_SPECIES_256.length());
                FloatVector floats2 = FloatVector.fromArray(FLOAT_SPECIES_256, q, i + FLOAT_SPECIES_256.length() * 2);
                FloatVector floats3 = FloatVector.fromArray(FLOAT_SPECIES_256, q, i + FLOAT_SPECIES_256.length() * 3);
                // VarHandle.get is signature-polymorphic: the cast is what makes it return an int.
                // Reversing puts the first dimension's bit at position 0, matching lane 0 of the mask.
                long maskBits = Integer.reverse((int) BitUtil.VH_BE_INT.get(d, i / 8));
                // Each mask only looks at its lowest 8 bits (one per lane); higher bits are ignored.
                VectorMask<Float> mask0 = VectorMask.fromLong(FLOAT_SPECIES_256, maskBits);
                VectorMask<Float> mask1 = VectorMask.fromLong(FLOAT_SPECIES_256, maskBits >> 8);
                VectorMask<Float> mask2 = VectorMask.fromLong(FLOAT_SPECIES_256, maskBits >> 16);
                VectorMask<Float> mask3 = VectorMask.fromLong(FLOAT_SPECIES_256, maskBits >> 24);
                acc0 = acc0.add(floats0, mask0);
                acc1 = acc1.add(floats1, mask1);
                acc2 = acc2.add(floats2, mask2);
                acc3 = acc3.add(floats3, mask3);
            }
            sum += acc0.reduceLanes(VectorOperators.ADD) + acc1.reduceLanes(VectorOperators.ADD) + acc2.reduceLanes(VectorOperators.ADD) + acc3.reduceLanes(VectorOperators.ADD);
        }
        // Remainder: one byte of d (8 dimensions) per 256-bit float vector.
        sectionLength = FLOAT_SPECIES_256.length();
        if (q.length - i >= sectionLength) {
            FloatVector acc = FloatVector.zero(FLOAT_SPECIES_256);
            int limit = PanamaESVectorUtilSupport.limit(q.length, sectionLength);
            while (i < limit) {
                FloatVector floats = FloatVector.fromArray(FLOAT_SPECIES_256, q, i);
                // The reversed byte lands in the top 8 bits of the int; shift it down to bits 0..7.
                long maskBits = Integer.reverse(d[i / 8]) >> 24;
                VectorMask<Float> mask = VectorMask.fromLong(FLOAT_SPECIES_256, maskBits);
                acc = acc.add(floats, mask);
                i += sectionLength;
            }
            sum += acc.reduceLanes(VectorOperators.ADD);
        }
        assert (i == q.length);
        return sum;
    }

    /**
     * Computes the inner product of a float vector and a byte vector, treating each byte of
     * {@code d} as a signed value converted to float.
     * <p>
     * Full chunks are processed with the preferred float species using fused multiply-add; the
     * elements left over after the last full chunk are accumulated with scalar arithmetic.
     *
     * @param q the float vector; its length drives the iteration
     * @param d the byte vector, read at the same indices as {@code q}
     * @return the sum of {@code q[k] * d[k]} over all indices of {@code q}
     */
    public static float ipFloatByteImpl(float[] q, byte[] d) {
        assert (BYTE_SPECIES_FOR_PREFFERED_FLOATS != null);
        final int lanes = PREFERRED_FLOAT_SPECIES.length();
        final int vectorEnd = PREFERRED_FLOAT_SPECIES.loopBound(q.length);
        FloatVector partial = FloatVector.zero(PREFERRED_FLOAT_SPECIES);
        int pos = 0;
        while (pos < vectorEnd) {
            FloatVector queryChunk = FloatVector.fromArray(PREFERRED_FLOAT_SPECIES, q, pos);
            // Widen the bytes so they occupy the same number of lanes as the float chunk.
            Vector<Float> docChunk = ByteVector.fromArray(BYTE_SPECIES_FOR_PREFFERED_FLOATS, d, pos).castShape(PREFERRED_FLOAT_SPECIES, 0);
            partial = queryChunk.fma(docChunk, partial);
            pos += lanes;
        }
        float total = partial.reduceLanes(VectorOperators.ADD);
        // Scalar tail for the elements that did not fill a whole vector.
        for (; pos < q.length; pos++) {
            total += q[pos] * (float) d[pos];
        }
        return total;
    }

    /**
     * Quantizes a float vector onto an integer grid spanning {@code [lowInterval, upperInterval]}.
     * <p>
     * Each component is clamped to the interval, mapped linearly so that {@code lowInterval}
     * becomes 0 and {@code upperInterval} becomes {@code (1 << bits) - 1}, rounded to the nearest
     * integer and written to {@code destination} at the same index.
     *
     * @param vector        the values to quantize
     * @param destination   receives one quantized value per element of {@code vector}
     * @param lowInterval   lower clamp bound, mapped to 0
     * @param upperInterval upper clamp bound, mapped to the largest step
     * @param bits          number of bits per quantized value; determines the number of steps
     * @return the sum of all quantized values written to {@code destination}
     */
    @Override
    public int quantizeVectorWithIntervals(float[] vector, int[] destination, float lowInterval, float upperInterval, byte bits) {
        float nSteps = (1 << bits) - 1;
        float invStep = nSteps / (upperInterval - lowInterval);
        int sumQuery = 0;
        // Must start at 0: short vectors skip the SIMD loop and are handled entirely by the scalar loop.
        int i = 0;
        if (vector.length > 2 * FLOAT_SPECIES.length()) {
            int limit = FLOAT_SPECIES.loopBound(vector.length);
            FloatVector lowVec = FloatVector.broadcast(FLOAT_SPECIES, lowInterval);
            FloatVector upperVec = FloatVector.broadcast(FLOAT_SPECIES, upperInterval);
            FloatVector invStepVec = FloatVector.broadcast(FLOAT_SPECIES, invStep);
            for (; i < limit; i += FLOAT_SPECIES.length()) {
                FloatVector v = FloatVector.fromArray(FLOAT_SPECIES, vector, i);
                FloatVector xi = v.max(lowVec).min(upperVec);
                // Adding 0.5 before the float-to-int conversion rounds the scaled value to nearest.
                IntVector assignment = xi.sub(lowVec).mul(invStepVec).add(0.5f).convert(VectorOperators.F2I, 0).reinterpretAsInts();
                sumQuery += assignment.reduceLanes(VectorOperators.ADD);
                assignment.intoArray(destination, i);
            }
        }
        // Scalar path: the tail after the SIMD loop, or the whole vector when it was too short.
        while (i < vector.length) {
            float xi = Math.min(Math.max(vector[i], lowInterval), upperInterval);
            int assignment = Math.round((xi - lowInterval) * invStep);
            sumQuery += assignment;
            destination[i] = assignment;
            ++i;
        }
        return sumQuery;
    }

    /**
     * Computes the squared Euclidean distance from {@code query} to four vectors in one pass.
     * <p>
     * Full chunks are processed with the preferred float species; leftover elements are folded
     * into the reduced sums with scalar fused multiply-add.
     *
     * @param query     the reference vector; its length drives the iteration
     * @param v0        first vector to compare against
     * @param v1        second vector to compare against
     * @param v2        third vector to compare against
     * @param v3        fourth vector to compare against
     * @param distances receives the four squared distances at indices 0 to 3
     */
    @Override
    public void squareDistanceBulk(float[] query, float[] v0, float[] v1, float[] v2, float[] v3, float[] distances) {
        FloatVector sumSq0 = FloatVector.zero(FLOAT_SPECIES);
        FloatVector sumSq1 = FloatVector.zero(FLOAT_SPECIES);
        FloatVector sumSq2 = FloatVector.zero(FLOAT_SPECIES);
        FloatVector sumSq3 = FloatVector.zero(FLOAT_SPECIES);
        final int vectorEnd = FLOAT_SPECIES.loopBound(query.length);
        int pos = 0;
        while (pos < vectorEnd) {
            FloatVector queryChunk = FloatVector.fromArray(FLOAT_SPECIES, query, pos);
            FloatVector delta0 = queryChunk.sub(FloatVector.fromArray(FLOAT_SPECIES, v0, pos));
            FloatVector delta1 = queryChunk.sub(FloatVector.fromArray(FLOAT_SPECIES, v1, pos));
            FloatVector delta2 = queryChunk.sub(FloatVector.fromArray(FLOAT_SPECIES, v2, pos));
            FloatVector delta3 = queryChunk.sub(FloatVector.fromArray(FLOAT_SPECIES, v3, pos));
            sumSq0 = PanamaESVectorUtilSupport.fma(delta0, delta0, sumSq0);
            sumSq1 = PanamaESVectorUtilSupport.fma(delta1, delta1, sumSq1);
            sumSq2 = PanamaESVectorUtilSupport.fma(delta2, delta2, sumSq2);
            sumSq3 = PanamaESVectorUtilSupport.fma(delta3, delta3, sumSq3);
            pos += FLOAT_SPECIES.length();
        }
        float total0 = sumSq0.reduceLanes(VectorOperators.ADD);
        float total1 = sumSq1.reduceLanes(VectorOperators.ADD);
        float total2 = sumSq2.reduceLanes(VectorOperators.ADD);
        float total3 = sumSq3.reduceLanes(VectorOperators.ADD);
        // Scalar tail for the elements that did not fill a whole vector.
        for (; pos < query.length; pos++) {
            float reference = query[pos];
            float delta0 = reference - v0[pos];
            float delta1 = reference - v1[pos];
            float delta2 = reference - v2[pos];
            float delta3 = reference - v3[pos];
            total0 = PanamaESVectorUtilSupport.fma(delta0, delta0, total0);
            total1 = PanamaESVectorUtilSupport.fma(delta1, delta1, total1);
            total2 = PanamaESVectorUtilSupport.fma(delta2, delta2, total2);
            total3 = PanamaESVectorUtilSupport.fma(delta3, delta3, total3);
        }
        distances[0] = total0;
        distances[1] = total1;
        distances[2] = total2;
        distances[3] = total3;
    }

    /**
     * Computes a SOAR-style distance from {@code v1} to four centroids in one pass.
     * <p>
     * For each centroid {@code c} the result is
     * {@code |v1 - c|^2 + soarLambda * dot(v1 - c, originalResidual)^2 / rnorm}. Full chunks are
     * processed with the preferred float species; leftover elements are folded into the reduced
     * sums with scalar fused multiply-add.
     *
     * @param v1               the vector being measured; its length drives the iteration
     * @param c0               first centroid
     * @param c1               second centroid
     * @param c2               third centroid
     * @param c3               fourth centroid
     * @param originalResidual the vector that each difference {@code v1 - c} is projected onto
     * @param soarLambda       weight applied to the squared projection term
     * @param rnorm            divisor applied to the weighted squared projection
     * @param distances        receives the four results at indices 0 to 3
     */
    @Override
    public void soarDistanceBulk(float[] v1, float[] c0, float[] c1, float[] c2, float[] c3, float[] originalResidual, float soarLambda, float rnorm, float[] distances) {
        FloatVector dotAcc0 = FloatVector.zero(FLOAT_SPECIES);
        FloatVector dotAcc1 = FloatVector.zero(FLOAT_SPECIES);
        FloatVector dotAcc2 = FloatVector.zero(FLOAT_SPECIES);
        FloatVector dotAcc3 = FloatVector.zero(FLOAT_SPECIES);
        FloatVector sqAcc0 = FloatVector.zero(FLOAT_SPECIES);
        FloatVector sqAcc1 = FloatVector.zero(FLOAT_SPECIES);
        FloatVector sqAcc2 = FloatVector.zero(FLOAT_SPECIES);
        FloatVector sqAcc3 = FloatVector.zero(FLOAT_SPECIES);
        final int vectorEnd = FLOAT_SPECIES.loopBound(v1.length);
        int pos = 0;
        while (pos < vectorEnd) {
            FloatVector point = FloatVector.fromArray(FLOAT_SPECIES, v1, pos);
            FloatVector delta0 = point.sub(FloatVector.fromArray(FLOAT_SPECIES, c0, pos));
            FloatVector delta1 = point.sub(FloatVector.fromArray(FLOAT_SPECIES, c1, pos));
            FloatVector delta2 = point.sub(FloatVector.fromArray(FLOAT_SPECIES, c2, pos));
            FloatVector delta3 = point.sub(FloatVector.fromArray(FLOAT_SPECIES, c3, pos));
            FloatVector residual = FloatVector.fromArray(FLOAT_SPECIES, originalResidual, pos);
            // Projection of each difference onto the residual.
            dotAcc0 = PanamaESVectorUtilSupport.fma(delta0, residual, dotAcc0);
            dotAcc1 = PanamaESVectorUtilSupport.fma(delta1, residual, dotAcc1);
            dotAcc2 = PanamaESVectorUtilSupport.fma(delta2, residual, dotAcc2);
            dotAcc3 = PanamaESVectorUtilSupport.fma(delta3, residual, dotAcc3);
            // Squared length of each difference.
            sqAcc0 = PanamaESVectorUtilSupport.fma(delta0, delta0, sqAcc0);
            sqAcc1 = PanamaESVectorUtilSupport.fma(delta1, delta1, sqAcc1);
            sqAcc2 = PanamaESVectorUtilSupport.fma(delta2, delta2, sqAcc2);
            sqAcc3 = PanamaESVectorUtilSupport.fma(delta3, delta3, sqAcc3);
            pos += FLOAT_SPECIES.length();
        }
        float dot0 = dotAcc0.reduceLanes(VectorOperators.ADD);
        float sq0 = sqAcc0.reduceLanes(VectorOperators.ADD);
        float dot1 = dotAcc1.reduceLanes(VectorOperators.ADD);
        float sq1 = sqAcc1.reduceLanes(VectorOperators.ADD);
        float dot2 = dotAcc2.reduceLanes(VectorOperators.ADD);
        float sq2 = sqAcc2.reduceLanes(VectorOperators.ADD);
        float dot3 = dotAcc3.reduceLanes(VectorOperators.ADD);
        float sq3 = sqAcc3.reduceLanes(VectorOperators.ADD);
        // Scalar tail for the elements that did not fill a whole vector.
        for (; pos < v1.length; pos++) {
            float point = v1[pos];
            float delta0 = point - c0[pos];
            float delta1 = point - c1[pos];
            float delta2 = point - c2[pos];
            float delta3 = point - c3[pos];
            float residual = originalResidual[pos];
            dot0 = PanamaESVectorUtilSupport.fma(delta0, residual, dot0);
            dot1 = PanamaESVectorUtilSupport.fma(delta1, residual, dot1);
            dot2 = PanamaESVectorUtilSupport.fma(delta2, residual, dot2);
            dot3 = PanamaESVectorUtilSupport.fma(delta3, residual, dot3);
            sq0 = PanamaESVectorUtilSupport.fma(delta0, delta0, sq0);
            sq1 = PanamaESVectorUtilSupport.fma(delta1, delta1, sq1);
            sq2 = PanamaESVectorUtilSupport.fma(delta2, delta2, sq2);
            sq3 = PanamaESVectorUtilSupport.fma(delta3, delta3, sq3);
        }
        distances[0] = sq0 + soarLambda * dot0 * dot0 / rnorm;
        distances[1] = sq1 + soarLambda * dot1 * dot1 / rnorm;
        distances[2] = sq2 + soarLambda * dot2 * dot2 / rnorm;
        distances[3] = sq3 + soarLambda * dot3 * dot3 / rnorm;
    }

    /**
     * Packs a vector of single-bit integers into bytes, choosing an implementation by vector size.
     * <p>
     * The 256-bit routine is used when the preferred vector size is at least 256 bits and the
     * 128-bit routine when it is exactly 128 bits. The default implementation is used for inputs
     * shorter than 8 elements, when fast integer vectors are unavailable, or for any other size.
     *
     * @param vector the values to pack
     * @param packed receives the packed bytes
     */
    @Override
    public void packAsBinary(int[] vector, byte[] packed) {
        if (vector.length < 8 || !HAS_FAST_INTEGER_VECTORS) {
            DefaultESVectorUtilSupport.packAsBinaryImpl(vector, packed);
            return;
        }
        if (VECTOR_BITSIZE >= 256) {
            this.packAsBinary256(vector, packed);
        } else if (VECTOR_BITSIZE == 128) {
            this.packAsBinary128(vector, packed);
        } else {
            DefaultESVectorUtilSupport.packAsBinaryImpl(vector, packed);
        }
    }

    /**
     * Packs single-bit integers into bytes using 256-bit integer vectors, eight values per byte.
     * <p>
     * Within each group of eight, the first value becomes the most significant bit of the output
     * byte. A trailing group with fewer than eight values is packed with scalar code into one
     * extra byte, filling from the most significant bit downwards.
     *
     * @param vector the values to pack; the scalar tail asserts each is 0 or 1
     * @param packed receives one byte per (possibly partial) group of eight values
     */
    private void packAsBinary256(int[] vector, byte[] packed) {
        final int vectorEnd = INT_SPECIES_256.loopBound(vector.length);
        int pos = 0;
        int out = 0;
        for (; pos < vectorEnd; pos += INT_SPECIES_256.length(), out++) {
            // Shift every lane to its bit position, then OR the lanes together into one byte.
            IntVector shifted = IntVector.fromArray(INT_SPECIES_256, vector, pos).lanewise(VectorOperators.LSHL, SHIFTS_256);
            int combined = shifted.reduceLanes(VectorOperators.OR);
            packed[out] = (byte) combined;
        }
        if (pos != vector.length) {
            // Partial final group: pack the remaining values one bit at a time.
            byte lastByte = 0;
            int bit = 7;
            while (bit >= 0 && pos < vector.length) {
                assert (vector[pos] == 0 || vector[pos] == 1);
                lastByte = (byte)(lastByte | (byte)((vector[pos] & 1) << bit));
                pos++;
                bit--;
            }
            packed[out] = lastByte;
        }
    }

    /**
     * Packs single-bit integers into bytes using 128-bit integer vectors, eight values per byte.
     * <p>
     * Each output byte is built from two 4-lane vectors: the first supplies the high bits and the
     * second the low bits. Within each group of eight, the first value becomes the most
     * significant bit. A trailing group with fewer than eight values is packed with scalar code
     * into one extra byte, filling from the most significant bit downwards.
     *
     * @param vector the values to pack; the scalar tail asserts each is 0 or 1
     * @param packed receives one byte per (possibly partial) group of eight values
     */
    private void packAsBinary128(int[] vector, byte[] packed) {
        final int lanes = INT_SPECIES_128.length();
        // Each iteration reads two vectors, so stop one vector short of the loop bound.
        final int vectorEnd = INT_SPECIES_128.loopBound(vector.length) - INT_SPECIES_128.length();
        int pos = 0;
        int out = 0;
        for (; pos < vectorEnd; pos += 2 * lanes, out++) {
            IntVector highBits = IntVector.fromArray(INT_SPECIES_128, vector, pos).lanewise(VectorOperators.LSHL, HIGH_SHIFTS_128);
            IntVector lowBits = IntVector.fromArray(INT_SPECIES_128, vector, pos + lanes).lanewise(VectorOperators.LSHL, LOW_SHIFTS_128);
            int combined = highBits.lanewise(VectorOperators.OR, lowBits).reduceLanes(VectorOperators.OR);
            packed[out] = (byte) combined;
        }
        if (pos != vector.length) {
            // Partial final group: pack the remaining values one bit at a time.
            byte lastByte = 0;
            int bit = 7;
            while (bit >= 0 && pos < vector.length) {
                assert (vector[pos] == 0 || vector[pos] == 1);
                lastByte = (byte)(lastByte | (byte)((vector[pos] & 1) << bit));
                pos++;
                bit--;
            }
            packed[out] = lastByte;
        }
    }

    /**
     * Packs {@code vector} into {@code packed} by delegating directly to
     * {@link DefaultESVectorUtilSupport#packDibitImpl(int[], byte[])}; this class provides no
     * vectorized variant of the operation.
     *
     * @param vector the values to pack
     * @param packed receives the packed output
     */
    @Override
    public void packDibit(int[] vector, byte[] packed) {
        DefaultESVectorUtilSupport.packDibitImpl(vector, packed);
    }

    /**
     * Transposes a vector of half-byte values into bit planes, choosing an implementation by
     * vector size.
     * <p>
     * The 256-bit routine is used when the preferred vector size is at least 256 bits and the
     * 128-bit routine when it is exactly 128 bits. The default implementation is used for inputs
     * shorter than 8 elements, when fast integer vectors are unavailable, or for any other size.
     *
     * @param q              the values to transpose
     * @param quantQueryByte receives the transposed bytes
     */
    @Override
    public void transposeHalfByte(int[] q, byte[] quantQueryByte) {
        if (q.length < 8 || !HAS_FAST_INTEGER_VECTORS) {
            DefaultESVectorUtilSupport.transposeHalfByteImpl(q, quantQueryByte);
            return;
        }
        if (VECTOR_BITSIZE >= 256) {
            this.transposeHalfByte256(q, quantQueryByte);
        } else if (VECTOR_BITSIZE == 128) {
            this.transposeHalfByte128(q, quantQueryByte);
        } else {
            DefaultESVectorUtilSupport.transposeHalfByteImpl(q, quantQueryByte);
        }
    }

    /**
     * Splits 4-bit values into four bit planes using 256-bit integer vectors.
     * <p>
     * For every group of eight input values, bit {@code k} (0 to 3) of each value is gathered
     * into one byte, with the first value of the group in the most significant bit. The byte for
     * plane {@code k} is written to the {@code k}-th quarter of {@code quantQueryByte}. A trailing
     * group with fewer than eight values is handled with scalar code.
     *
     * @param q              the values to transpose; only their lowest four bits are used
     * @param quantQueryByte output buffer, written as four consecutive quarter-length planes
     */
    private void transposeHalfByte256(int[] q, byte[] quantQueryByte) {
        final int vectorEnd = INT_SPECIES_256.loopBound(q.length);
        int pos = 0;
        int out = 0;
        for (; pos < vectorEnd; pos += INT_SPECIES_256.length(), out++) {
            IntVector values = IntVector.fromArray(INT_SPECIES_256, q, pos);
            // Isolate each bit of the half-byte in its own vector of 0/1 lanes.
            IntVector bit0 = values.and(1);
            IntVector bit1 = values.lanewise(VectorOperators.ASHR, 1).and(1);
            IntVector bit2 = values.lanewise(VectorOperators.ASHR, 2).and(1);
            IntVector bit3 = values.lanewise(VectorOperators.ASHR, 3).and(1);
            // Shift every lane to its bit position and OR the lanes together into one byte.
            int plane0 = bit0.lanewise(VectorOperators.LSHL, SHIFTS_256).reduceLanes(VectorOperators.OR);
            int plane1 = bit1.lanewise(VectorOperators.LSHL, SHIFTS_256).reduceLanes(VectorOperators.OR);
            int plane2 = bit2.lanewise(VectorOperators.LSHL, SHIFTS_256).reduceLanes(VectorOperators.OR);
            int plane3 = bit3.lanewise(VectorOperators.LSHL, SHIFTS_256).reduceLanes(VectorOperators.OR);
            quantQueryByte[out] = (byte) plane0;
            quantQueryByte[out + quantQueryByte.length / 4] = (byte) plane1;
            quantQueryByte[out + quantQueryByte.length / 2] = (byte) plane2;
            quantQueryByte[out + 3 * quantQueryByte.length / 4] = (byte) plane3;
        }
        if (pos == q.length) {
            return;
        }
        // Partial final group: gather the remaining values one bit position at a time.
        int plane0 = 0;
        int plane1 = 0;
        int plane2 = 0;
        int plane3 = 0;
        for (int shift = 7; pos < q.length; shift--, pos++) {
            plane0 |= (q[pos] & 1) << shift;
            plane1 |= ((q[pos] >> 1) & 1) << shift;
            plane2 |= ((q[pos] >> 2) & 1) << shift;
            plane3 |= ((q[pos] >> 3) & 1) << shift;
        }
        quantQueryByte[out] = (byte) plane0;
        quantQueryByte[out + quantQueryByte.length / 4] = (byte) plane1;
        quantQueryByte[out + quantQueryByte.length / 2] = (byte) plane2;
        quantQueryByte[out + 3 * quantQueryByte.length / 4] = (byte) plane3;
    }

    /**
     * Splits 4-bit values into four bit planes using 128-bit integer vectors.
     * <p>
     * For every group of eight input values, bit {@code k} (0 to 3) of each value is gathered
     * into one byte, with the first value of the group in the most significant bit. Each byte is
     * assembled from two 4-lane vectors: the first supplies the high bits and the second the low
     * bits. The byte for plane {@code k} is written to the {@code k}-th quarter of
     * {@code quantQueryByte}. A trailing group with fewer than eight values is handled with
     * scalar code.
     *
     * @param q              the values to transpose; only their lowest four bits are used
     * @param quantQueryByte output buffer, written as four consecutive quarter-length planes
     */
    private void transposeHalfByte128(int[] q, byte[] quantQueryByte) {
        // Each iteration reads two vectors, so stop one vector short of the loop bound.
        final int vectorEnd = INT_SPECIES_128.loopBound(q.length) - INT_SPECIES_128.length();
        int pos = 0;
        int out = 0;
        for (; pos < vectorEnd; pos += 2 * INT_SPECIES_128.length(), out++) {
            // First four values of the group: they occupy the high bits of each output byte.
            IntVector values = IntVector.fromArray(INT_SPECIES_128, q, pos);
            IntVector high0 = values.and(1).lanewise(VectorOperators.LSHL, HIGH_SHIFTS_128);
            IntVector high1 = values.lanewise(VectorOperators.ASHR, 1).and(1).lanewise(VectorOperators.LSHL, HIGH_SHIFTS_128);
            IntVector high2 = values.lanewise(VectorOperators.ASHR, 2).and(1).lanewise(VectorOperators.LSHL, HIGH_SHIFTS_128);
            IntVector high3 = values.lanewise(VectorOperators.ASHR, 3).and(1).lanewise(VectorOperators.LSHL, HIGH_SHIFTS_128);
            // Last four values of the group: they occupy the low bits of each output byte.
            values = IntVector.fromArray(INT_SPECIES_128, q, pos + INT_SPECIES_128.length());
            IntVector low0 = values.and(1).lanewise(VectorOperators.LSHL, LOW_SHIFTS_128);
            IntVector low1 = values.lanewise(VectorOperators.ASHR, 1).and(1).lanewise(VectorOperators.LSHL, LOW_SHIFTS_128);
            IntVector low2 = values.lanewise(VectorOperators.ASHR, 2).and(1).lanewise(VectorOperators.LSHL, LOW_SHIFTS_128);
            IntVector low3 = values.lanewise(VectorOperators.ASHR, 3).and(1).lanewise(VectorOperators.LSHL, LOW_SHIFTS_128);
            int plane0 = high0.lanewise(VectorOperators.OR, low0).reduceLanes(VectorOperators.OR);
            int plane1 = high1.lanewise(VectorOperators.OR, low1).reduceLanes(VectorOperators.OR);
            int plane2 = high2.lanewise(VectorOperators.OR, low2).reduceLanes(VectorOperators.OR);
            int plane3 = high3.lanewise(VectorOperators.OR, low3).reduceLanes(VectorOperators.OR);
            quantQueryByte[out] = (byte) plane0;
            quantQueryByte[out + quantQueryByte.length / 4] = (byte) plane1;
            quantQueryByte[out + quantQueryByte.length / 2] = (byte) plane2;
            quantQueryByte[out + 3 * quantQueryByte.length / 4] = (byte) plane3;
        }
        if (pos == q.length) {
            return;
        }
        // Partial final group: gather the remaining values one bit position at a time.
        int plane0 = 0;
        int plane1 = 0;
        int plane2 = 0;
        int plane3 = 0;
        for (int shift = 7; pos < q.length; shift--, pos++) {
            plane0 |= (q[pos] & 1) << shift;
            plane1 |= ((q[pos] >> 1) & 1) << shift;
            plane2 |= ((q[pos] >> 2) & 1) << shift;
            plane3 |= ((q[pos] >> 3) & 1) << shift;
        }
        quantQueryByte[out] = (byte) plane0;
        quantQueryByte[out + quantQueryByte.length / 4] = (byte) plane1;
        quantQueryByte[out + quantQueryByte.length / 2] = (byte) plane2;
        quantQueryByte[out + 3 * quantQueryByte.length / 4] = (byte) plane3;
    }

    /**
     * Searches a region of a byte array for the first occurrence of {@code marker}.
     * <p>
     * Whole chunks of the preferred byte species are compared in bulk; whatever is left after
     * the last whole chunk is searched with {@link ByteArrayUtils#indexOf}.
     *
     * @param bytes  the array to search
     * @param offset index in {@code bytes} at which the search region starts
     * @param length number of bytes in the search region
     * @param marker the byte value to look for
     * @return for a match found by the bulk comparison, its position relative to {@code offset};
     *         for a match in the leftover bytes, the bulk-processed length plus the value
     *         returned by {@code ByteArrayUtils.indexOf} (presumably also relative to the start
     *         it was given; confirm against that helper); {@code -1} if there is no match
     */
    @Override
    public int indexOf(byte[] bytes, int offset, int length, byte marker) {
        final ByteVector needle = ByteVector.broadcast(PREFERRED_BYTE_SPECIES, marker);
        final int vectorEnd = PREFERRED_BYTE_SPECIES.loopBound(length);
        int pos = 0;
        while (pos < vectorEnd) {
            VectorMask<Byte> hits = ByteVector.fromArray(PREFERRED_BYTE_SPECIES, bytes, offset + pos).eq(needle);
            if (hits.anyTrue()) {
                return pos + hits.firstTrue();
            }
            pos += PREFERRED_BYTE_SPECIES.length();
        }
        if (vectorEnd < length) {
            // Leftover bytes that did not fill a whole vector.
            int leftover = length - vectorEnd;
            int found = ByteArrayUtils.indexOf(bytes, offset + vectorEnd, leftover, marker);
            if (found >= 0) {
                return vectorEnd + found;
            }
        }
        return -1;
    }

    // Class initializer: assigns every static final species and shift constant used by this class.
    static {
        VectorSpecies byteForFloat;
        // Preferred sizes and species are taken from PanamaVectorConstants.
        VECTOR_BITSIZE = PanamaVectorConstants.PREFERRED_VECTOR_BITSIZE;
        FLOAT_SPECIES = PanamaVectorConstants.PREFERRED_FLOAT_SPECIES;
        INTEGER_SPECIES = PanamaVectorConstants.PREFERRED_INTEGER_SPECIES;
        HAS_FAST_INTEGER_VECTORS = PanamaVectorConstants.ENABLE_INTEGER_VECTORS;
        BYTE_SPECIES_128 = ByteVector.SPECIES_128;
        BYTE_SPECIES_256 = ByteVector.SPECIES_256;
        INT_SPECIES_512 = IntVector.SPECIES_512;
        // A byte species with a quarter of the int species' bit size has the same number of lanes,
        // so a byte vector of this species can be cast lane-for-lane to the int species.
        BYTE_SPECIES_FOR_INT_512 = VectorSpecies.of(Byte.TYPE, (VectorShape)VectorShape.forBitSize((int)(INT_SPECIES_512.vectorBitSize() / 4)));
        INT_SPECIES_256 = IntVector.SPECIES_256;
        BYTE_SPECIES_FOR_INT_256 = VectorSpecies.of(Byte.TYPE, (VectorShape)VectorShape.forBitSize((int)(INT_SPECIES_256.vectorBitSize() / 4)));
        FLOAT_SPECIES_512 = FloatVector.SPECIES_512;
        FLOAT_SPECIES_256 = FloatVector.SPECIES_256;
        PREFERRED_FLOAT_SPECIES = PanamaVectorConstants.PREFERRED_FLOAT_SPECIES;
        // Same quarter-size trick for the preferred float species. No vector shape may exist for
        // the resulting bit size, in which case the species is left null; ipFloatByteImpl asserts
        // that it is non-null before using it.
        try {
            byteForFloat = VectorSpecies.of(Byte.TYPE, (VectorShape)VectorShape.forBitSize((int)(PREFERRED_FLOAT_SPECIES.vectorBitSize() / 4)));
        }
        catch (IllegalArgumentException e) {
            byteForFloat = null;
        }
        BYTE_SPECIES_FOR_PREFFERED_FLOATS = byteForFloat;
        INT_SPECIES_128 = IntVector.SPECIES_128;
        // Per-lane left shifts that place the first of eight values in the most significant bit.
        int[] shifts = new int[]{7, 6, 5, 4, 3, 2, 1, 0};
        if (VECTOR_BITSIZE == 128) {
            // 128-bit int vectors hold four lanes, so the eight shifts are split into a high half
            // (7..4) and a low half (3..0); the 256-bit constant is unused and left null.
            HIGH_SHIFTS_128 = IntVector.fromArray(INT_SPECIES_128, (int[])shifts, (int)0);
            LOW_SHIFTS_128 = IntVector.fromArray(INT_SPECIES_128, (int[])shifts, (int)INT_SPECIES_128.length());
            SHIFTS_256 = null;
        } else {
            // Any other preferred size: all eight shifts fit in one 256-bit int vector and the
            // 128-bit constants are left null.
            SHIFTS_256 = IntVector.fromArray(INT_SPECIES_256, (int[])shifts, (int)0);
            HIGH_SHIFTS_128 = null;
            LOW_SHIFTS_128 = null;
        }
        SHIFTS = new int[]{7, 6, 5, 4, 3, 2, 1, 0};
        PREFERRED_BYTE_SPECIES = PanamaVectorConstants.PREFERRED_BYTE_SPECIES;
    }
}

