/*
 * Copyright (c) 2026 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__aarch64__)

#include "arm_gemm/arm_gemm.hpp"
#include "arm_common/internal/utils.hpp"
#include "arm_common/bfloat.hpp"

#include <cassert>
#include <limits>

namespace arm_gemm {

void sme_gemv_fp32bf16fp32_dot_8VL (
    const float *A_ptr, const bfloat16 *B_ptr, float *output_ptr,
    size_t N, size_t K,
    const float *bias, Activation act, bool
)
{
    // Parameter block for the inline assembly below. The assembly addresses it
    // via the %x[args_ptr] operand plus the %[offsetof_maxval] /
    // %[offsetof_minval] immediates (presumably bound to &args and
    // offsetof(KernelArgs, ...) in the operand list, which is outside this
    // view -- confirm).
    struct KernelArgs {
        // Upper clamp bound. Defaults to +infinity; overwritten with act.param1
        // for BoundedReLU. The assembly broadcasts it (ld1rw) and applies it
        // with fmin, but only when bit 1 of 'flags' is set.
        float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
        // Lower clamp bound. Defaults to -infinity; set to 0 for ReLU and
        // BoundedReLU. Broadcast and applied with fmax under the same flag bit.
        float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
        // Copy of the B_ptr function argument (assigned right after this
        // declaration).
        const bfloat16 *B_ptr = {};
        // NOTE(review): the two fields below are zero-initialised and are not
        // written or read anywhere in the visible part of this function --
        // confirm whether the remainder of the kernel uses them.
        size_t output_offset = {};
        unsigned int input_initial_col = {};
    } args;

    unsigned long flags=0;
    args.B_ptr = B_ptr;
    switch(act.type) {
        default:
        case Activation::Type::None:
            break;
        case Activation::Type::BoundedReLU:
            args.maxval = static_cast<float>(act.param1);
            /* fall through */
        case Activation::Type::ReLU:
            args.minval = 0;
            flags |= 0x2;
            break;
    }
    __asm__ __volatile__(
      ".inst 0xd503477f  // SMSTART ZA\n"
      "mov x26, #0x4\n"
      "mov x25, %x[bias]\n"
      "cntw x24\n"
      "ptrue p3.b\n"
      "add x23, %x[N], x24\n"
      "sub x23, x23, #0x1\n"
      "udiv x23, x23, x24\n"
      "1:"  // Column loop
      "cmp x23, #0x8\n"
      "bge 50f\n"
      "cmp x23, #0x6\n"
      "bgt 43f\n"
      "beq 36f\n"
      "cmp x23, #0x4\n"
      "bgt 29f\n"
      "beq 22f\n"
      "cmp x23, #0x2\n"
      "bgt 15f\n"
      "beq 8f\n"
      "mov x22, %x[K]\n"
      "mov x21, %x[A_ptr]\n"
      "whilelt p2.s, XZR, %x[N]\n"
      "cbz x25, 2f\n"
      "ld1w { z24.s }, p3/Z, [x25]\n"
      "b 3f\n"
      "2:"  // Width 1: no bias
      "fmov z24.s, #0\n"
      "3:"  // Width 1: setup done
      "cmp x22, #0x8\n"
      "ble 5f\n"
      "4:"  // Width 1: Multiply loop: Main loop head
      "whilelt p1.s, XZR, x22\n"
      "whilelt p0.s, x26, x22\n"
      "ldnt1h { z1.h }, p3/Z, [%x[B_ptr]]\n"
      "ld1rqw { z0.s }, p1/Z, [x21]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      "sub x22, x22, #0x8\n"
      "ld1rqw { z16.s }, p0/Z, [x21, #16]\n"
      "cmp x22, #0x8\n"
      "add x21, x21, #0x20\n"
      "ldnt1h { z2.h }, p3/Z, [%x[B_ptr]]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x658aac00  // bfcvt z0.h, p3/M, z0.s\n"
      "ldnt1h { z3.h }, p3/Z, [%x[B_ptr]]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x658aae10  // bfcvt z16.h, p3/M, z16.s\n"
      "ldnt1h { z4.h }, p3/Z, [%x[B_ptr]]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      "uzp1 z0.h, z0.h, z0.h\n"
      "uzp1 z16.h, z16.h, z16.h\n"
      "trn1 z0.d, z0.d, z16.d\n"
      ".inst 0x64604038  // bfdot z24.s, z1.h, z0.h[0]\n"
      ".inst 0x64684058  // bfdot z24.s, z2.h, z0.h[1]\n"
      ".inst 0x64704078  // bfdot z24.s, z3.h, z0.h[2]\n"
      ".inst 0x64784098  // bfdot z24.s, z4.h, z0.h[3]\n"
      "bgt 4b\n"
      "5:"  // Width 1: Multiply loop: Single iteration only
      "whilelt p1.s, XZR, x22\n"
      "whilelt p0.s, x26, x22\n"
      "ldnt1h { z5.h }, p3/Z, [%x[B_ptr]]\n"
      "ld1rqw { z0.s }, p1/Z, [x21]\n"
      "subs x22, x22, #0x2\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      "ld1rqw { z16.s }, p0/Z, [x21, #16]\n"
      ".inst 0x658aac00  // bfcvt z0.h, p3/M, z0.s\n"
      ".inst 0x658aae10  // bfcvt z16.h, p3/M, z16.s\n"
      "uzp1 z0.h, z0.h, z0.h\n"
      "uzp1 z16.h, z16.h, z16.h\n"
      "trn1 z0.d, z0.d, z16.d\n"
      ".inst 0x646040b8  // bfdot z24.s, z5.h, z0.h[0]\n"
      "ble 6f\n"
      "ldnt1h { z6.h }, p3/Z, [%x[B_ptr]]\n"
      "subs x22, x22, #0x2\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x646840d8  // bfdot z24.s, z6.h, z0.h[1]\n"
      "ble 6f\n"
      "ldnt1h { z7.h }, p3/Z, [%x[B_ptr]]\n"
      "subs x22, x22, #0x2\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x647040f8  // bfdot z24.s, z7.h, z0.h[2]\n"
      "ble 6f\n"
      "ldnt1h { z8.h }, p3/Z, [%x[B_ptr]]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x64784118  // bfdot z24.s, z8.h, z0.h[3]\n"
      "6:"  // Width 1: Multiply loop: multiply skip
      "tbz %x[flags], #1, 7f\n"
      "add x21, %x[args_ptr], %[offsetof_maxval]\n"
      "add x20, %x[args_ptr], %[offsetof_minval]\n"
      "ld1rw { z17.s }, p3/Z, [x21]\n"
      "ld1rw { z16.s }, p3/Z, [x20]\n"
      "fmin z24.s, p3/M, z24.s, z17.s\n"
      "fmax z24.s, p3/M, z24.s, z16.s\n"
      "7:"  // Width 1: No activation
      "st1w { z24.s }, p2, [%x[output_ptr]]\n"
      "addvl %x[output_ptr], %x[output_ptr], #1\n"
      "b 57f\n"
      "8:"  // Width 2
      "sub x20, %x[N], x24\n"
      "mov x22, %x[K]\n"
      "mov x21, %x[A_ptr]\n"
      "whilelt p2.s, XZR, x20\n"
      "cbz x25, 9f\n"
      "ld1w { z24.s }, p3/Z, [x25]\n"
      "ld1w { z25.s }, p3/Z, [x25, #1, MUL VL]\n"
      "b 10f\n"
      "9:"  // Width 2: no bias
      "fmov z24.s, #0\n"
      "fmov z25.s, #0\n"
      "10:"  // Width 2: setup done
      "cmp x22, #0x8\n"
      "ble 12f\n"
      "11:"  // Width 2: Multiply loop: Main loop head
      "whilelt p1.s, XZR, x22\n"
      "whilelt p0.s, x26, x22\n"
      "ldnt1h { z1.h }, p3/Z, [%x[B_ptr]]\n"
      "ld1rqw { z0.s }, p1/Z, [x21]\n"
      "sub x22, x22, #0x8\n"
      "ld1rqw { z16.s }, p0/Z, [x21, #16]\n"
      "cmp x22, #0x8\n"
      "add x21, x21, #0x20\n"
      "ldnt1h { z2.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x658aac00  // bfcvt z0.h, p3/M, z0.s\n"
      "ldnt1h { z3.h }, p3/Z, [%x[B_ptr]]\n"
      ".inst 0x658aae10  // bfcvt z16.h, p3/M, z16.s\n"
      "ldnt1h { z4.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      "ldnt1h { z5.h }, p3/Z, [%x[B_ptr]]\n"
      "ldnt1h { z6.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      "uzp1 z0.h, z0.h, z0.h\n"
      "ldnt1h { z7.h }, p3/Z, [%x[B_ptr]]\n"
      "uzp1 z16.h, z16.h, z16.h\n"
      "ldnt1h { z8.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      "trn1 z0.d, z0.d, z16.d\n"
      ".inst 0x64604038  // bfdot z24.s, z1.h, z0.h[0]\n"
      ".inst 0x64604059  // bfdot z25.s, z2.h, z0.h[0]\n"
      ".inst 0x64684078  // bfdot z24.s, z3.h, z0.h[1]\n"
      ".inst 0x64684099  // bfdot z25.s, z4.h, z0.h[1]\n"
      ".inst 0x647040b8  // bfdot z24.s, z5.h, z0.h[2]\n"
      ".inst 0x647040d9  // bfdot z25.s, z6.h, z0.h[2]\n"
      ".inst 0x647840f8  // bfdot z24.s, z7.h, z0.h[3]\n"
      ".inst 0x64784119  // bfdot z25.s, z8.h, z0.h[3]\n"
      "bgt 11b\n"
      "12:"  // Width 2: Multiply loop: Single iteration only
      "whilelt p1.s, XZR, x22\n"
      "whilelt p0.s, x26, x22\n"
      "ldnt1h { z9.h }, p3/Z, [%x[B_ptr]]\n"
      "ld1rqw { z0.s }, p1/Z, [x21]\n"
      "subs x22, x22, #0x2\n"
      "ld1rqw { z16.s }, p0/Z, [x21, #16]\n"
      "ldnt1h { z10.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x658aac00  // bfcvt z0.h, p3/M, z0.s\n"
      ".inst 0x658aae10  // bfcvt z16.h, p3/M, z16.s\n"
      "uzp1 z0.h, z0.h, z0.h\n"
      "uzp1 z16.h, z16.h, z16.h\n"
      "trn1 z0.d, z0.d, z16.d\n"
      ".inst 0x64604138  // bfdot z24.s, z9.h, z0.h[0]\n"
      ".inst 0x64604159  // bfdot z25.s, z10.h, z0.h[0]\n"
      "ble 13f\n"
      "ldnt1h { z11.h }, p3/Z, [%x[B_ptr]]\n"
      "subs x22, x22, #0x2\n"
      "ldnt1h { z12.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x64684178  // bfdot z24.s, z11.h, z0.h[1]\n"
      ".inst 0x64684199  // bfdot z25.s, z12.h, z0.h[1]\n"
      "ble 13f\n"
      "ldnt1h { z13.h }, p3/Z, [%x[B_ptr]]\n"
      "subs x22, x22, #0x2\n"
      "ldnt1h { z14.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x647041b8  // bfdot z24.s, z13.h, z0.h[2]\n"
      ".inst 0x647041d9  // bfdot z25.s, z14.h, z0.h[2]\n"
      "ble 13f\n"
      "ldnt1h { z15.h }, p3/Z, [%x[B_ptr]]\n"
      "ldnt1h { z16.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x647841f8  // bfdot z24.s, z15.h, z0.h[3]\n"
      ".inst 0x64784219  // bfdot z25.s, z16.h, z0.h[3]\n"
      "13:"  // Width 2: Multiply loop: multiply skip
      "tbz %x[flags], #1, 14f\n"
      "add x21, %x[args_ptr], %[offsetof_maxval]\n"
      "add x20, %x[args_ptr], %[offsetof_minval]\n"
      "ld1rw { z17.s }, p3/Z, [x21]\n"
      "ld1rw { z16.s }, p3/Z, [x20]\n"
      "fmin z24.s, p3/M, z24.s, z17.s\n"
      "fmin z25.s, p3/M, z25.s, z17.s\n"
      "fmax z24.s, p3/M, z24.s, z16.s\n"
      "fmax z25.s, p3/M, z25.s, z16.s\n"
      "14:"  // Width 2: No activation
      "st1w { z24.s }, p3, [%x[output_ptr]]\n"
      "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
      "addvl %x[output_ptr], %x[output_ptr], #2\n"
      "b 57f\n"
      "15:"  // Width 3
      "mov x20, #0x2\n"
      "mov x22, %x[K]\n"
      "msub x20, x24, x20, %x[N]\n"
      "mov x21, %x[A_ptr]\n"
      "whilelt p2.s, XZR, x20\n"
      "cbz x25, 16f\n"
      "ld1w { z24.s }, p3/Z, [x25]\n"
      "ld1w { z25.s }, p3/Z, [x25, #1, MUL VL]\n"
      "ld1w { z26.s }, p3/Z, [x25, #2, MUL VL]\n"
      "b 17f\n"
      "16:"  // Width 3: no bias
      "fmov z24.s, #0\n"
      "fmov z25.s, #0\n"
      "fmov z26.s, #0\n"
      "17:"  // Width 3: setup done
      "cmp x22, #0x8\n"
      "ble 19f\n"
      "18:"  // Width 3: Multiply loop: Main loop head
      "whilelt p1.s, XZR, x22\n"
      "whilelt p0.s, x26, x22\n"
      "ldnt1h { z1.h }, p3/Z, [%x[B_ptr]]\n"
      "ld1rqw { z0.s }, p1/Z, [x21]\n"
      "sub x22, x22, #0x8\n"
      "ld1rqw { z16.s }, p0/Z, [x21, #16]\n"
      "cmp x22, #0x8\n"
      "add x21, x21, #0x20\n"
      "ldnt1h { z2.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      ".inst 0x658aac00  // bfcvt z0.h, p3/M, z0.s\n"
      "ldnt1h { z3.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x658aae10  // bfcvt z16.h, p3/M, z16.s\n"
      "ldnt1h { z4.h }, p3/Z, [%x[B_ptr]]\n"
      "ldnt1h { z5.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z6.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      "uzp1 z0.h, z0.h, z0.h\n"
      "ldnt1h { z7.h }, p3/Z, [%x[B_ptr]]\n"
      "uzp1 z16.h, z16.h, z16.h\n"
      "ldnt1h { z8.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z9.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      "ldnt1h { z10.h }, p3/Z, [%x[B_ptr]]\n"
      "ldnt1h { z11.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "trn1 z0.d, z0.d, z16.d\n"
      "ldnt1h { z12.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x64604038  // bfdot z24.s, z1.h, z0.h[0]\n"
      ".inst 0x64604059  // bfdot z25.s, z2.h, z0.h[0]\n"
      ".inst 0x6460407a  // bfdot z26.s, z3.h, z0.h[0]\n"
      ".inst 0x64684098  // bfdot z24.s, z4.h, z0.h[1]\n"
      ".inst 0x646840b9  // bfdot z25.s, z5.h, z0.h[1]\n"
      ".inst 0x646840da  // bfdot z26.s, z6.h, z0.h[1]\n"
      ".inst 0x647040f8  // bfdot z24.s, z7.h, z0.h[2]\n"
      ".inst 0x64704119  // bfdot z25.s, z8.h, z0.h[2]\n"
      ".inst 0x6470413a  // bfdot z26.s, z9.h, z0.h[2]\n"
      ".inst 0x64784158  // bfdot z24.s, z10.h, z0.h[3]\n"
      ".inst 0x64784179  // bfdot z25.s, z11.h, z0.h[3]\n"
      ".inst 0x6478419a  // bfdot z26.s, z12.h, z0.h[3]\n"
      "bgt 18b\n"
      "19:"  // Width 3: Multiply loop: Single iteration only
      "whilelt p1.s, XZR, x22\n"
      "whilelt p0.s, x26, x22\n"
      "ldnt1h { z13.h }, p3/Z, [%x[B_ptr]]\n"
      "ld1rqw { z0.s }, p1/Z, [x21]\n"
      "subs x22, x22, #0x2\n"
      "ld1rqw { z16.s }, p0/Z, [x21, #16]\n"
      "ldnt1h { z14.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      ".inst 0x658aac00  // bfcvt z0.h, p3/M, z0.s\n"
      "ldnt1h { z15.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x658aae10  // bfcvt z16.h, p3/M, z16.s\n"
      "uzp1 z0.h, z0.h, z0.h\n"
      "uzp1 z16.h, z16.h, z16.h\n"
      "trn1 z0.d, z0.d, z16.d\n"
      ".inst 0x646041b8  // bfdot z24.s, z13.h, z0.h[0]\n"
      ".inst 0x646041d9  // bfdot z25.s, z14.h, z0.h[0]\n"
      ".inst 0x646041fa  // bfdot z26.s, z15.h, z0.h[0]\n"
      "ble 20f\n"
      "ldnt1h { z16.h }, p3/Z, [%x[B_ptr]]\n"
      "subs x22, x22, #0x2\n"
      "ldnt1h { z17.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z18.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x64684218  // bfdot z24.s, z16.h, z0.h[1]\n"
      ".inst 0x64684239  // bfdot z25.s, z17.h, z0.h[1]\n"
      ".inst 0x6468425a  // bfdot z26.s, z18.h, z0.h[1]\n"
      "ble 20f\n"
      "ldnt1h { z19.h }, p3/Z, [%x[B_ptr]]\n"
      "subs x22, x22, #0x2\n"
      "ldnt1h { z20.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z21.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x64704278  // bfdot z24.s, z19.h, z0.h[2]\n"
      ".inst 0x64704299  // bfdot z25.s, z20.h, z0.h[2]\n"
      ".inst 0x647042ba  // bfdot z26.s, z21.h, z0.h[2]\n"
      "ble 20f\n"
      "ldnt1h { z22.h }, p3/Z, [%x[B_ptr]]\n"
      "ldnt1h { z23.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z1.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x647842d8  // bfdot z24.s, z22.h, z0.h[3]\n"
      ".inst 0x647842f9  // bfdot z25.s, z23.h, z0.h[3]\n"
      ".inst 0x6478403a  // bfdot z26.s, z1.h, z0.h[3]\n"
      "20:"  // Width 3: Multiply loop: multiply skip
      "tbz %x[flags], #1, 21f\n"
      "add x21, %x[args_ptr], %[offsetof_maxval]\n"
      "add x20, %x[args_ptr], %[offsetof_minval]\n"
      "ld1rw { z17.s }, p3/Z, [x21]\n"
      "ld1rw { z16.s }, p3/Z, [x20]\n"
      "fmin z24.s, p3/M, z24.s, z17.s\n"
      "fmin z25.s, p3/M, z25.s, z17.s\n"
      "fmin z26.s, p3/M, z26.s, z17.s\n"
      "fmax z24.s, p3/M, z24.s, z16.s\n"
      "fmax z25.s, p3/M, z25.s, z16.s\n"
      "fmax z26.s, p3/M, z26.s, z16.s\n"
      "21:"  // Width 3: No activation
      "st1w { z24.s }, p3, [%x[output_ptr]]\n"
      "st1w { z25.s }, p3, [%x[output_ptr], #1, MUL VL]\n"
      "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
      "addvl %x[output_ptr], %x[output_ptr], #3\n"
      "b 57f\n"
      "22:"  // Width 4
      "mov x20, #0x3\n"
      "mov x22, %x[K]\n"
      "msub x20, x24, x20, %x[N]\n"
      "mov x21, %x[A_ptr]\n"
      "whilelt p2.s, XZR, x20\n"
      "cbz x25, 23f\n"
      "ld1w { z24.s }, p3/Z, [x25]\n"
      "ld1w { z25.s }, p3/Z, [x25, #1, MUL VL]\n"
      "ld1w { z26.s }, p3/Z, [x25, #2, MUL VL]\n"
      "ld1w { z27.s }, p3/Z, [x25, #3, MUL VL]\n"
      "b 24f\n"
      "23:"  // Width 4: no bias
      "fmov z24.s, #0\n"
      "fmov z25.s, #0\n"
      "fmov z26.s, #0\n"
      "fmov z27.s, #0\n"
      "24:"  // Width 4: setup done
      "cmp x22, #0x8\n"
      "ble 26f\n"
      "25:"  // Width 4: Multiply loop: Main loop head
      "whilelt p1.s, XZR, x22\n"
      "whilelt p0.s, x26, x22\n"
      "ldnt1h { z1.h }, p3/Z, [%x[B_ptr]]\n"
      "ld1rqw { z0.s }, p1/Z, [x21]\n"
      "sub x22, x22, #0x8\n"
      "ld1rqw { z16.s }, p0/Z, [x21, #16]\n"
      "cmp x22, #0x8\n"
      "add x21, x21, #0x20\n"
      "ldnt1h { z2.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      ".inst 0x658aac00  // bfcvt z0.h, p3/M, z0.s\n"
      "ldnt1h { z3.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      ".inst 0x658aae10  // bfcvt z16.h, p3/M, z16.s\n"
      "ldnt1h { z4.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      "ldnt1h { z5.h }, p3/Z, [%x[B_ptr]]\n"
      "ldnt1h { z6.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "uzp1 z0.h, z0.h, z0.h\n"
      "ldnt1h { z7.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "uzp1 z16.h, z16.h, z16.h\n"
      "ldnt1h { z8.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      "ldnt1h { z9.h }, p3/Z, [%x[B_ptr]]\n"
      "ldnt1h { z10.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z11.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "trn1 z0.d, z0.d, z16.d\n"
      "ldnt1h { z12.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      "ldnt1h { z13.h }, p3/Z, [%x[B_ptr]]\n"
      "ldnt1h { z14.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z15.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      ".inst 0x64604038  // bfdot z24.s, z1.h, z0.h[0]\n"
      ".inst 0x64604059  // bfdot z25.s, z2.h, z0.h[0]\n"
      "ldnt1h { z16.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x6460407a  // bfdot z26.s, z3.h, z0.h[0]\n"
      ".inst 0x6460409b  // bfdot z27.s, z4.h, z0.h[0]\n"
      ".inst 0x646840b8  // bfdot z24.s, z5.h, z0.h[1]\n"
      ".inst 0x646840d9  // bfdot z25.s, z6.h, z0.h[1]\n"
      ".inst 0x646840fa  // bfdot z26.s, z7.h, z0.h[1]\n"
      ".inst 0x6468411b  // bfdot z27.s, z8.h, z0.h[1]\n"
      ".inst 0x64704138  // bfdot z24.s, z9.h, z0.h[2]\n"
      ".inst 0x64704159  // bfdot z25.s, z10.h, z0.h[2]\n"
      ".inst 0x6470417a  // bfdot z26.s, z11.h, z0.h[2]\n"
      ".inst 0x6470419b  // bfdot z27.s, z12.h, z0.h[2]\n"
      ".inst 0x647841b8  // bfdot z24.s, z13.h, z0.h[3]\n"
      ".inst 0x647841d9  // bfdot z25.s, z14.h, z0.h[3]\n"
      ".inst 0x647841fa  // bfdot z26.s, z15.h, z0.h[3]\n"
      ".inst 0x6478421b  // bfdot z27.s, z16.h, z0.h[3]\n"
      "bgt 25b\n"
      "26:"  // Width 4: Multiply loop: Single iteration only
      "whilelt p1.s, XZR, x22\n"
      "whilelt p0.s, x26, x22\n"
      "ldnt1h { z17.h }, p3/Z, [%x[B_ptr]]\n"
      "ld1rqw { z0.s }, p1/Z, [x21]\n"
      "subs x22, x22, #0x2\n"
      "ld1rqw { z16.s }, p0/Z, [x21, #16]\n"
      "ldnt1h { z18.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      ".inst 0x658aac00  // bfcvt z0.h, p3/M, z0.s\n"
      "ldnt1h { z19.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      ".inst 0x658aae10  // bfcvt z16.h, p3/M, z16.s\n"
      "ldnt1h { z20.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      "uzp1 z0.h, z0.h, z0.h\n"
      "uzp1 z16.h, z16.h, z16.h\n"
      "trn1 z0.d, z0.d, z16.d\n"
      ".inst 0x64604238  // bfdot z24.s, z17.h, z0.h[0]\n"
      ".inst 0x64604259  // bfdot z25.s, z18.h, z0.h[0]\n"
      ".inst 0x6460427a  // bfdot z26.s, z19.h, z0.h[0]\n"
      ".inst 0x6460429b  // bfdot z27.s, z20.h, z0.h[0]\n"
      "ble 27f\n"
      "ldnt1h { z21.h }, p3/Z, [%x[B_ptr]]\n"
      "subs x22, x22, #0x2\n"
      "ldnt1h { z22.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z23.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "ldnt1h { z1.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      ".inst 0x646842b8  // bfdot z24.s, z21.h, z0.h[1]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x646842d9  // bfdot z25.s, z22.h, z0.h[1]\n"
      ".inst 0x646842fa  // bfdot z26.s, z23.h, z0.h[1]\n"
      ".inst 0x6468403b  // bfdot z27.s, z1.h, z0.h[1]\n"
      "ble 27f\n"
      "ldnt1h { z2.h }, p3/Z, [%x[B_ptr]]\n"
      "subs x22, x22, #0x2\n"
      "ldnt1h { z3.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z4.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "ldnt1h { z5.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      ".inst 0x64704058  // bfdot z24.s, z2.h, z0.h[2]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x64704079  // bfdot z25.s, z3.h, z0.h[2]\n"
      ".inst 0x6470409a  // bfdot z26.s, z4.h, z0.h[2]\n"
      ".inst 0x647040bb  // bfdot z27.s, z5.h, z0.h[2]\n"
      "ble 27f\n"
      "ldnt1h { z6.h }, p3/Z, [%x[B_ptr]]\n"
      "ldnt1h { z7.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z8.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "ldnt1h { z9.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      ".inst 0x647840d8  // bfdot z24.s, z6.h, z0.h[3]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x647840f9  // bfdot z25.s, z7.h, z0.h[3]\n"
      ".inst 0x6478411a  // bfdot z26.s, z8.h, z0.h[3]\n"
      ".inst 0x6478413b  // bfdot z27.s, z9.h, z0.h[3]\n"
      "27:"  // Width 4: Multiply loop: multiply skip
      "tbz %x[flags], #1, 28f\n"
      "add x21, %x[args_ptr], %[offsetof_maxval]\n"
      "add x20, %x[args_ptr], %[offsetof_minval]\n"
      "ld1rw { z17.s }, p3/Z, [x21]\n"
      "ld1rw { z16.s }, p3/Z, [x20]\n"
      "fmin z24.s, p3/M, z24.s, z17.s\n"
      "fmin z25.s, p3/M, z25.s, z17.s\n"
      "fmin z26.s, p3/M, z26.s, z17.s\n"
      "fmin z27.s, p3/M, z27.s, z17.s\n"
      "fmax z24.s, p3/M, z24.s, z16.s\n"
      "fmax z25.s, p3/M, z25.s, z16.s\n"
      "fmax z26.s, p3/M, z26.s, z16.s\n"
      "fmax z27.s, p3/M, z27.s, z16.s\n"
      "28:"  // Width 4: No activation
      "st1w { z24.s }, p3, [%x[output_ptr]]\n"
      "st1w { z25.s }, p3, [%x[output_ptr], #1, MUL VL]\n"
      "st1w { z26.s }, p3, [%x[output_ptr], #2, MUL VL]\n"
      "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
      "addvl %x[output_ptr], %x[output_ptr], #4\n"
      "b 57f\n"
      "29:"  // Width 5
      "mov x20, #0x4\n"
      "mov x22, %x[K]\n"
      "msub x20, x24, x20, %x[N]\n"
      "mov x21, %x[A_ptr]\n"
      "whilelt p2.s, XZR, x20\n"
      "cbz x25, 30f\n"
      "ld1w { z24.s }, p3/Z, [x25]\n"
      "ld1w { z25.s }, p3/Z, [x25, #1, MUL VL]\n"
      "ld1w { z26.s }, p3/Z, [x25, #2, MUL VL]\n"
      "ld1w { z27.s }, p3/Z, [x25, #3, MUL VL]\n"
      "ld1w { z28.s }, p3/Z, [x25, #4, MUL VL]\n"
      "b 31f\n"
      "30:"  // Width 5: no bias
      "fmov z24.s, #0\n"
      "fmov z25.s, #0\n"
      "fmov z26.s, #0\n"
      "fmov z27.s, #0\n"
      "fmov z28.s, #0\n"
      "31:"  // Width 5: setup done
      "cmp x22, #0x8\n"
      "ble 33f\n"
      "32:"  // Width 5: Multiply loop: Main loop head
      "whilelt p1.s, XZR, x22\n"
      "whilelt p0.s, x26, x22\n"
      "ldnt1h { z1.h }, p3/Z, [%x[B_ptr]]\n"
      "ld1rqw { z0.s }, p1/Z, [x21]\n"
      "sub x22, x22, #0x8\n"
      "ld1rqw { z16.s }, p0/Z, [x21, #16]\n"
      "cmp x22, #0x8\n"
      "add x21, x21, #0x20\n"
      "ldnt1h { z2.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      ".inst 0x658aac00  // bfcvt z0.h, p3/M, z0.s\n"
      "ldnt1h { z3.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      ".inst 0x658aae10  // bfcvt z16.h, p3/M, z16.s\n"
      "ldnt1h { z4.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      "ldnt1h { z5.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      "ldnt1h { z6.h }, p3/Z, [%x[B_ptr]]\n"
      "uzp1 z0.h, z0.h, z0.h\n"
      "ldnt1h { z7.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "uzp1 z16.h, z16.h, z16.h\n"
      "ldnt1h { z8.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "ldnt1h { z9.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      "ldnt1h { z10.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      "ldnt1h { z11.h }, p3/Z, [%x[B_ptr]]\n"
      "trn1 z0.d, z0.d, z16.d\n"
      "ldnt1h { z12.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z13.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "ldnt1h { z14.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      "ldnt1h { z15.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x64604038  // bfdot z24.s, z1.h, z0.h[0]\n"
      ".inst 0x64604059  // bfdot z25.s, z2.h, z0.h[0]\n"
      "ldnt1h { z16.h }, p3/Z, [%x[B_ptr]]\n"
      ".inst 0x6460407a  // bfdot z26.s, z3.h, z0.h[0]\n"
      ".inst 0x6460409b  // bfdot z27.s, z4.h, z0.h[0]\n"
      "ldnt1h { z17.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      ".inst 0x646040bc  // bfdot z28.s, z5.h, z0.h[0]\n"
      "ldnt1h { z18.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "ldnt1h { z19.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      ".inst 0x646840d8  // bfdot z24.s, z6.h, z0.h[1]\n"
      ".inst 0x646840f9  // bfdot z25.s, z7.h, z0.h[1]\n"
      "ldnt1h { z20.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x6468411a  // bfdot z26.s, z8.h, z0.h[1]\n"
      ".inst 0x6468413b  // bfdot z27.s, z9.h, z0.h[1]\n"
      ".inst 0x6468415c  // bfdot z28.s, z10.h, z0.h[1]\n"
      ".inst 0x64704178  // bfdot z24.s, z11.h, z0.h[2]\n"
      ".inst 0x64704199  // bfdot z25.s, z12.h, z0.h[2]\n"
      ".inst 0x647041ba  // bfdot z26.s, z13.h, z0.h[2]\n"
      ".inst 0x647041db  // bfdot z27.s, z14.h, z0.h[2]\n"
      ".inst 0x647041fc  // bfdot z28.s, z15.h, z0.h[2]\n"
      ".inst 0x64784218  // bfdot z24.s, z16.h, z0.h[3]\n"
      ".inst 0x64784239  // bfdot z25.s, z17.h, z0.h[3]\n"
      ".inst 0x6478425a  // bfdot z26.s, z18.h, z0.h[3]\n"
      ".inst 0x6478427b  // bfdot z27.s, z19.h, z0.h[3]\n"
      ".inst 0x6478429c  // bfdot z28.s, z20.h, z0.h[3]\n"
      "bgt 32b\n"
      "33:"  // Width 5: Multiply loop: Single iteration only
      "whilelt p1.s, XZR, x22\n"
      "whilelt p0.s, x26, x22\n"
      "ldnt1h { z21.h }, p3/Z, [%x[B_ptr]]\n"
      "ld1rqw { z0.s }, p1/Z, [x21]\n"
      "subs x22, x22, #0x2\n"
      "ld1rqw { z16.s }, p0/Z, [x21, #16]\n"
      "ldnt1h { z22.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      ".inst 0x658aac00  // bfcvt z0.h, p3/M, z0.s\n"
      "ldnt1h { z23.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      ".inst 0x658aae10  // bfcvt z16.h, p3/M, z16.s\n"
      "ldnt1h { z1.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      "ldnt1h { z2.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      "uzp1 z0.h, z0.h, z0.h\n"
      "uzp1 z16.h, z16.h, z16.h\n"
      "trn1 z0.d, z0.d, z16.d\n"
      ".inst 0x646042b8  // bfdot z24.s, z21.h, z0.h[0]\n"
      ".inst 0x646042d9  // bfdot z25.s, z22.h, z0.h[0]\n"
      ".inst 0x646042fa  // bfdot z26.s, z23.h, z0.h[0]\n"
      ".inst 0x6460403b  // bfdot z27.s, z1.h, z0.h[0]\n"
      ".inst 0x6460405c  // bfdot z28.s, z2.h, z0.h[0]\n"
      "ble 34f\n"
      "ldnt1h { z3.h }, p3/Z, [%x[B_ptr]]\n"
      "subs x22, x22, #0x2\n"
      "ldnt1h { z4.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z5.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "ldnt1h { z6.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      ".inst 0x64684078  // bfdot z24.s, z3.h, z0.h[1]\n"
      "ldnt1h { z7.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      ".inst 0x64684099  // bfdot z25.s, z4.h, z0.h[1]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x646840ba  // bfdot z26.s, z5.h, z0.h[1]\n"
      ".inst 0x646840db  // bfdot z27.s, z6.h, z0.h[1]\n"
      ".inst 0x646840fc  // bfdot z28.s, z7.h, z0.h[1]\n"
      "ble 34f\n"
      "ldnt1h { z8.h }, p3/Z, [%x[B_ptr]]\n"
      "subs x22, x22, #0x2\n"
      "ldnt1h { z9.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z10.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "ldnt1h { z11.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      ".inst 0x64704118  // bfdot z24.s, z8.h, z0.h[2]\n"
      "ldnt1h { z12.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      ".inst 0x64704139  // bfdot z25.s, z9.h, z0.h[2]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x6470415a  // bfdot z26.s, z10.h, z0.h[2]\n"
      ".inst 0x6470417b  // bfdot z27.s, z11.h, z0.h[2]\n"
      ".inst 0x6470419c  // bfdot z28.s, z12.h, z0.h[2]\n"
      "ble 34f\n"
      "ldnt1h { z13.h }, p3/Z, [%x[B_ptr]]\n"
      "ldnt1h { z14.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z15.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "ldnt1h { z16.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      ".inst 0x647841b8  // bfdot z24.s, z13.h, z0.h[3]\n"
      "ldnt1h { z17.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      ".inst 0x647841d9  // bfdot z25.s, z14.h, z0.h[3]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x647841fa  // bfdot z26.s, z15.h, z0.h[3]\n"
      ".inst 0x6478421b  // bfdot z27.s, z16.h, z0.h[3]\n"
      ".inst 0x6478423c  // bfdot z28.s, z17.h, z0.h[3]\n"
      "34:"  // Width 5: Multiply loop: multiply skip
      "tbz %x[flags], #1, 35f\n"
      "add x21, %x[args_ptr], %[offsetof_maxval]\n"
      "add x20, %x[args_ptr], %[offsetof_minval]\n"
      "ld1rw { z17.s }, p3/Z, [x21]\n"
      "ld1rw { z16.s }, p3/Z, [x20]\n"
      "fmin z24.s, p3/M, z24.s, z17.s\n"
      "fmin z25.s, p3/M, z25.s, z17.s\n"
      "fmin z26.s, p3/M, z26.s, z17.s\n"
      "fmin z27.s, p3/M, z27.s, z17.s\n"
      "fmin z28.s, p3/M, z28.s, z17.s\n"
      "fmax z24.s, p3/M, z24.s, z16.s\n"
      "fmax z25.s, p3/M, z25.s, z16.s\n"
      "fmax z26.s, p3/M, z26.s, z16.s\n"
      "fmax z27.s, p3/M, z27.s, z16.s\n"
      "fmax z28.s, p3/M, z28.s, z16.s\n"
      "35:"  // Width 5: No activation
      "st1w { z24.s }, p3, [%x[output_ptr]]\n"
      "st1w { z25.s }, p3, [%x[output_ptr], #1, MUL VL]\n"
      "st1w { z26.s }, p3, [%x[output_ptr], #2, MUL VL]\n"
      "st1w { z27.s }, p3, [%x[output_ptr], #3, MUL VL]\n"
      "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n"
      "addvl %x[output_ptr], %x[output_ptr], #5\n"
      "b 57f\n"
      "36:"  // Width 6
      "mov x20, #0x5\n"
      "mov x22, %x[K]\n"
      "msub x20, x24, x20, %x[N]\n"
      "mov x21, %x[A_ptr]\n"
      "whilelt p2.s, XZR, x20\n"
      "cbz x25, 37f\n"
      "ld1w { z24.s }, p3/Z, [x25]\n"
      "ld1w { z25.s }, p3/Z, [x25, #1, MUL VL]\n"
      "ld1w { z26.s }, p3/Z, [x25, #2, MUL VL]\n"
      "ld1w { z27.s }, p3/Z, [x25, #3, MUL VL]\n"
      "ld1w { z28.s }, p3/Z, [x25, #4, MUL VL]\n"
      "ld1w { z29.s }, p3/Z, [x25, #5, MUL VL]\n"
      "b 38f\n"
      "37:"  // Width 6: no bias
      "fmov z24.s, #0\n"
      "fmov z25.s, #0\n"
      "fmov z26.s, #0\n"
      "fmov z27.s, #0\n"
      "fmov z28.s, #0\n"
      "fmov z29.s, #0\n"
      "38:"  // Width 6: setup done
      "cmp x22, #0x8\n"
      "ble 40f\n"
      "39:"  // Width 6: Multiply loop: Main loop head
      "whilelt p1.s, XZR, x22\n"
      "whilelt p0.s, x26, x22\n"
      "ldnt1h { z1.h }, p3/Z, [%x[B_ptr]]\n"
      "ld1rqw { z0.s }, p1/Z, [x21]\n"
      "sub x22, x22, #0x8\n"
      "ld1rqw { z16.s }, p0/Z, [x21, #16]\n"
      "cmp x22, #0x8\n"
      "add x21, x21, #0x20\n"
      "ldnt1h { z2.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      ".inst 0x658aac00  // bfcvt z0.h, p3/M, z0.s\n"
      "ldnt1h { z3.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      ".inst 0x658aae10  // bfcvt z16.h, p3/M, z16.s\n"
      "ldnt1h { z4.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      "ldnt1h { z5.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      "ldnt1h { z6.h }, p3/Z, [%x[B_ptr], #5, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      "uzp1 z0.h, z0.h, z0.h\n"
      "ldnt1h { z7.h }, p3/Z, [%x[B_ptr]]\n"
      "uzp1 z16.h, z16.h, z16.h\n"
      "ldnt1h { z8.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z9.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "ldnt1h { z10.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      "ldnt1h { z11.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      "trn1 z0.d, z0.d, z16.d\n"
      "ldnt1h { z12.h }, p3/Z, [%x[B_ptr], #5, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      "ldnt1h { z13.h }, p3/Z, [%x[B_ptr]]\n"
      "ldnt1h { z14.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z15.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      ".inst 0x64604038  // bfdot z24.s, z1.h, z0.h[0]\n"
      ".inst 0x64604059  // bfdot z25.s, z2.h, z0.h[0]\n"
      "ldnt1h { z16.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      ".inst 0x6460407a  // bfdot z26.s, z3.h, z0.h[0]\n"
      ".inst 0x6460409b  // bfdot z27.s, z4.h, z0.h[0]\n"
      "ldnt1h { z17.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      ".inst 0x646040bc  // bfdot z28.s, z5.h, z0.h[0]\n"
      ".inst 0x646040dd  // bfdot z29.s, z6.h, z0.h[0]\n"
      "ldnt1h { z18.h }, p3/Z, [%x[B_ptr], #5, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      "ldnt1h { z19.h }, p3/Z, [%x[B_ptr]]\n"
      ".inst 0x646840f8  // bfdot z24.s, z7.h, z0.h[1]\n"
      ".inst 0x64684119  // bfdot z25.s, z8.h, z0.h[1]\n"
      "ldnt1h { z20.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      ".inst 0x6468413a  // bfdot z26.s, z9.h, z0.h[1]\n"
      ".inst 0x6468415b  // bfdot z27.s, z10.h, z0.h[1]\n"
      "ldnt1h { z21.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      ".inst 0x6468417c  // bfdot z28.s, z11.h, z0.h[1]\n"
      ".inst 0x6468419d  // bfdot z29.s, z12.h, z0.h[1]\n"
      "ldnt1h { z22.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      "ldnt1h { z23.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      ".inst 0x647041b8  // bfdot z24.s, z13.h, z0.h[2]\n"
      ".inst 0x647041d9  // bfdot z25.s, z14.h, z0.h[2]\n"
      "ldnt1h { z1.h }, p3/Z, [%x[B_ptr], #5, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x647041fa  // bfdot z26.s, z15.h, z0.h[2]\n"
      ".inst 0x6470421b  // bfdot z27.s, z16.h, z0.h[2]\n"
      ".inst 0x6470423c  // bfdot z28.s, z17.h, z0.h[2]\n"
      ".inst 0x6470425d  // bfdot z29.s, z18.h, z0.h[2]\n"
      ".inst 0x64784278  // bfdot z24.s, z19.h, z0.h[3]\n"
      ".inst 0x64784299  // bfdot z25.s, z20.h, z0.h[3]\n"
      ".inst 0x647842ba  // bfdot z26.s, z21.h, z0.h[3]\n"
      ".inst 0x647842db  // bfdot z27.s, z22.h, z0.h[3]\n"
      ".inst 0x647842fc  // bfdot z28.s, z23.h, z0.h[3]\n"
      ".inst 0x6478403d  // bfdot z29.s, z1.h, z0.h[3]\n"
      "bgt 39b\n"
      "40:"  // Width 6: Multiply loop: Single iteration only
      "whilelt p1.s, XZR, x22\n"
      "whilelt p0.s, x26, x22\n"
      "ldnt1h { z2.h }, p3/Z, [%x[B_ptr]]\n"
      "ld1rqw { z0.s }, p1/Z, [x21]\n"
      "subs x22, x22, #0x2\n"
      "ld1rqw { z16.s }, p0/Z, [x21, #16]\n"
      "ldnt1h { z3.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      ".inst 0x658aac00  // bfcvt z0.h, p3/M, z0.s\n"
      "ldnt1h { z4.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      ".inst 0x658aae10  // bfcvt z16.h, p3/M, z16.s\n"
      "ldnt1h { z5.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      "ldnt1h { z6.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      "ldnt1h { z7.h }, p3/Z, [%x[B_ptr], #5, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      "uzp1 z0.h, z0.h, z0.h\n"
      "uzp1 z16.h, z16.h, z16.h\n"
      "trn1 z0.d, z0.d, z16.d\n"
      ".inst 0x64604058  // bfdot z24.s, z2.h, z0.h[0]\n"
      ".inst 0x64604079  // bfdot z25.s, z3.h, z0.h[0]\n"
      ".inst 0x6460409a  // bfdot z26.s, z4.h, z0.h[0]\n"
      ".inst 0x646040bb  // bfdot z27.s, z5.h, z0.h[0]\n"
      ".inst 0x646040dc  // bfdot z28.s, z6.h, z0.h[0]\n"
      ".inst 0x646040fd  // bfdot z29.s, z7.h, z0.h[0]\n"
      "ble 41f\n"
      "ldnt1h { z8.h }, p3/Z, [%x[B_ptr]]\n"
      "subs x22, x22, #0x2\n"
      "ldnt1h { z9.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z10.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "ldnt1h { z11.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      ".inst 0x64684118  // bfdot z24.s, z8.h, z0.h[1]\n"
      "ldnt1h { z12.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      ".inst 0x64684139  // bfdot z25.s, z9.h, z0.h[1]\n"
      "ldnt1h { z13.h }, p3/Z, [%x[B_ptr], #5, MUL VL]\n"
      ".inst 0x6468415a  // bfdot z26.s, z10.h, z0.h[1]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x6468417b  // bfdot z27.s, z11.h, z0.h[1]\n"
      ".inst 0x6468419c  // bfdot z28.s, z12.h, z0.h[1]\n"
      ".inst 0x646841bd  // bfdot z29.s, z13.h, z0.h[1]\n"
      "ble 41f\n"
      "ldnt1h { z14.h }, p3/Z, [%x[B_ptr]]\n"
      "subs x22, x22, #0x2\n"
      "ldnt1h { z15.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z16.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "ldnt1h { z17.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      ".inst 0x647041d8  // bfdot z24.s, z14.h, z0.h[2]\n"
      "ldnt1h { z18.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      ".inst 0x647041f9  // bfdot z25.s, z15.h, z0.h[2]\n"
      "ldnt1h { z19.h }, p3/Z, [%x[B_ptr], #5, MUL VL]\n"
      ".inst 0x6470421a  // bfdot z26.s, z16.h, z0.h[2]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x6470423b  // bfdot z27.s, z17.h, z0.h[2]\n"
      ".inst 0x6470425c  // bfdot z28.s, z18.h, z0.h[2]\n"
      ".inst 0x6470427d  // bfdot z29.s, z19.h, z0.h[2]\n"
      "ble 41f\n"
      "ldnt1h { z20.h }, p3/Z, [%x[B_ptr]]\n"
      "ldnt1h { z21.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z22.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "ldnt1h { z23.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      ".inst 0x64784298  // bfdot z24.s, z20.h, z0.h[3]\n"
      "ldnt1h { z1.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      ".inst 0x647842b9  // bfdot z25.s, z21.h, z0.h[3]\n"
      "ldnt1h { z2.h }, p3/Z, [%x[B_ptr], #5, MUL VL]\n"
      ".inst 0x647842da  // bfdot z26.s, z22.h, z0.h[3]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x647842fb  // bfdot z27.s, z23.h, z0.h[3]\n"
      ".inst 0x6478403c  // bfdot z28.s, z1.h, z0.h[3]\n"
      ".inst 0x6478405d  // bfdot z29.s, z2.h, z0.h[3]\n"
      "41:"  // Width 6: Multiply loop: multiply skip
      "tbz %x[flags], #1, 42f\n"
      "add x21, %x[args_ptr], %[offsetof_maxval]\n"
      "add x20, %x[args_ptr], %[offsetof_minval]\n"
      "ld1rw { z17.s }, p3/Z, [x21]\n"
      "ld1rw { z16.s }, p3/Z, [x20]\n"
      "fmin z24.s, p3/M, z24.s, z17.s\n"
      "fmin z25.s, p3/M, z25.s, z17.s\n"
      "fmin z26.s, p3/M, z26.s, z17.s\n"
      "fmin z27.s, p3/M, z27.s, z17.s\n"
      "fmin z28.s, p3/M, z28.s, z17.s\n"
      "fmin z29.s, p3/M, z29.s, z17.s\n"
      "fmax z24.s, p3/M, z24.s, z16.s\n"
      "fmax z25.s, p3/M, z25.s, z16.s\n"
      "fmax z26.s, p3/M, z26.s, z16.s\n"
      "fmax z27.s, p3/M, z27.s, z16.s\n"
      "fmax z28.s, p3/M, z28.s, z16.s\n"
      "fmax z29.s, p3/M, z29.s, z16.s\n"
      "42:"  // Width 6: No activation
      "st1w { z24.s }, p3, [%x[output_ptr]]\n"
      "st1w { z25.s }, p3, [%x[output_ptr], #1, MUL VL]\n"
      "st1w { z26.s }, p3, [%x[output_ptr], #2, MUL VL]\n"
      "st1w { z27.s }, p3, [%x[output_ptr], #3, MUL VL]\n"
      "st1w { z28.s }, p3, [%x[output_ptr], #4, MUL VL]\n"
      "st1w { z29.s }, p2, [%x[output_ptr], #5, MUL VL]\n"
      "addvl %x[output_ptr], %x[output_ptr], #6\n"
      "b 57f\n"
      "43:"  // Width 7
      "mov x20, #0x6\n"
      "mov x22, %x[K]\n"
      "msub x20, x24, x20, %x[N]\n"
      "mov x21, %x[A_ptr]\n"
      "whilelt p2.s, XZR, x20\n"
      "cbz x25, 44f\n"
      "ld1w { z24.s }, p3/Z, [x25]\n"
      "ld1w { z25.s }, p3/Z, [x25, #1, MUL VL]\n"
      "ld1w { z26.s }, p3/Z, [x25, #2, MUL VL]\n"
      "ld1w { z27.s }, p3/Z, [x25, #3, MUL VL]\n"
      "ld1w { z28.s }, p3/Z, [x25, #4, MUL VL]\n"
      "ld1w { z29.s }, p3/Z, [x25, #5, MUL VL]\n"
      "ld1w { z30.s }, p3/Z, [x25, #6, MUL VL]\n"
      "b 45f\n"
      "44:"  // Width 7: no bias
      "fmov z24.s, #0\n"
      "fmov z25.s, #0\n"
      "fmov z26.s, #0\n"
      "fmov z27.s, #0\n"
      "fmov z28.s, #0\n"
      "fmov z29.s, #0\n"
      "fmov z30.s, #0\n"
      "45:"  // Width 7: setup done
      "cmp x22, #0x8\n"
      "ble 47f\n"
      "46:"  // Width 7: Multiply loop: Main loop head
      "whilelt p1.s, XZR, x22\n"
      "whilelt p0.s, x26, x22\n"
      "ldnt1h { z1.h }, p3/Z, [%x[B_ptr]]\n"
      "ld1rqw { z0.s }, p1/Z, [x21]\n"
      "sub x22, x22, #0x8\n"
      "ld1rqw { z16.s }, p0/Z, [x21, #16]\n"
      "cmp x22, #0x8\n"
      "add x21, x21, #0x20\n"
      "ldnt1h { z2.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      ".inst 0x658aac00  // bfcvt z0.h, p3/M, z0.s\n"
      "ldnt1h { z3.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      ".inst 0x658aae10  // bfcvt z16.h, p3/M, z16.s\n"
      "ldnt1h { z4.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      "ldnt1h { z5.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      "ldnt1h { z6.h }, p3/Z, [%x[B_ptr], #5, MUL VL]\n"
      "uzp1 z0.h, z0.h, z0.h\n"
      "ldnt1h { z7.h }, p3/Z, [%x[B_ptr], #6, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      "uzp1 z16.h, z16.h, z16.h\n"
      "ldnt1h { z8.h }, p3/Z, [%x[B_ptr]]\n"
      "ldnt1h { z9.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z10.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "ldnt1h { z11.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      "trn1 z0.d, z0.d, z16.d\n"
      "ldnt1h { z12.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      "ldnt1h { z13.h }, p3/Z, [%x[B_ptr], #5, MUL VL]\n"
      "ldnt1h { z14.h }, p3/Z, [%x[B_ptr], #6, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      "ldnt1h { z15.h }, p3/Z, [%x[B_ptr]]\n"
      ".inst 0x64604038  // bfdot z24.s, z1.h, z0.h[0]\n"
      ".inst 0x64604059  // bfdot z25.s, z2.h, z0.h[0]\n"
      "ldnt1h { z16.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      ".inst 0x6460407a  // bfdot z26.s, z3.h, z0.h[0]\n"
      ".inst 0x6460409b  // bfdot z27.s, z4.h, z0.h[0]\n"
      "ldnt1h { z17.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      ".inst 0x646040bc  // bfdot z28.s, z5.h, z0.h[0]\n"
      ".inst 0x646040dd  // bfdot z29.s, z6.h, z0.h[0]\n"
      "ldnt1h { z18.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      ".inst 0x646040fe  // bfdot z30.s, z7.h, z0.h[0]\n"
      "ldnt1h { z19.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      ".inst 0x64684118  // bfdot z24.s, z8.h, z0.h[1]\n"
      ".inst 0x64684139  // bfdot z25.s, z9.h, z0.h[1]\n"
      "ldnt1h { z20.h }, p3/Z, [%x[B_ptr], #5, MUL VL]\n"
      ".inst 0x6468415a  // bfdot z26.s, z10.h, z0.h[1]\n"
      ".inst 0x6468417b  // bfdot z27.s, z11.h, z0.h[1]\n"
      "ldnt1h { z21.h }, p3/Z, [%x[B_ptr], #6, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x6468419c  // bfdot z28.s, z12.h, z0.h[1]\n"
      ".inst 0x646841bd  // bfdot z29.s, z13.h, z0.h[1]\n"
      "ldnt1h { z22.h }, p3/Z, [%x[B_ptr]]\n"
      ".inst 0x646841de  // bfdot z30.s, z14.h, z0.h[1]\n"
      "ldnt1h { z23.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      ".inst 0x647041f8  // bfdot z24.s, z15.h, z0.h[2]\n"
      ".inst 0x64704219  // bfdot z25.s, z16.h, z0.h[2]\n"
      "ldnt1h { z1.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      ".inst 0x6470423a  // bfdot z26.s, z17.h, z0.h[2]\n"
      ".inst 0x6470425b  // bfdot z27.s, z18.h, z0.h[2]\n"
      "ldnt1h { z2.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      ".inst 0x6470427c  // bfdot z28.s, z19.h, z0.h[2]\n"
      ".inst 0x6470429d  // bfdot z29.s, z20.h, z0.h[2]\n"
      "ldnt1h { z3.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      ".inst 0x647042be  // bfdot z30.s, z21.h, z0.h[2]\n"
      "ldnt1h { z4.h }, p3/Z, [%x[B_ptr], #5, MUL VL]\n"
      "ldnt1h { z5.h }, p3/Z, [%x[B_ptr], #6, MUL VL]\n"
      ".inst 0x647842d8  // bfdot z24.s, z22.h, z0.h[3]\n"
      ".inst 0x647842f9  // bfdot z25.s, z23.h, z0.h[3]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x6478403a  // bfdot z26.s, z1.h, z0.h[3]\n"
      ".inst 0x6478405b  // bfdot z27.s, z2.h, z0.h[3]\n"
      ".inst 0x6478407c  // bfdot z28.s, z3.h, z0.h[3]\n"
      ".inst 0x6478409d  // bfdot z29.s, z4.h, z0.h[3]\n"
      ".inst 0x647840be  // bfdot z30.s, z5.h, z0.h[3]\n"
      "bgt 46b\n"
      "47:"  // Width 7: Multiply loop: Single iteration only
      "whilelt p1.s, XZR, x22\n"
      "whilelt p0.s, x26, x22\n"
      "ldnt1h { z6.h }, p3/Z, [%x[B_ptr]]\n"
      "ld1rqw { z0.s }, p1/Z, [x21]\n"
      "subs x22, x22, #0x2\n"
      "ld1rqw { z16.s }, p0/Z, [x21, #16]\n"
      "ldnt1h { z7.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      ".inst 0x658aac00  // bfcvt z0.h, p3/M, z0.s\n"
      "ldnt1h { z8.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      ".inst 0x658aae10  // bfcvt z16.h, p3/M, z16.s\n"
      "ldnt1h { z9.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      "ldnt1h { z10.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      "ldnt1h { z11.h }, p3/Z, [%x[B_ptr], #5, MUL VL]\n"
      "uzp1 z0.h, z0.h, z0.h\n"
      "ldnt1h { z12.h }, p3/Z, [%x[B_ptr], #6, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      "uzp1 z16.h, z16.h, z16.h\n"
      "trn1 z0.d, z0.d, z16.d\n"
      ".inst 0x646040d8  // bfdot z24.s, z6.h, z0.h[0]\n"
      ".inst 0x646040f9  // bfdot z25.s, z7.h, z0.h[0]\n"
      ".inst 0x6460411a  // bfdot z26.s, z8.h, z0.h[0]\n"
      ".inst 0x6460413b  // bfdot z27.s, z9.h, z0.h[0]\n"
      ".inst 0x6460415c  // bfdot z28.s, z10.h, z0.h[0]\n"
      ".inst 0x6460417d  // bfdot z29.s, z11.h, z0.h[0]\n"
      ".inst 0x6460419e  // bfdot z30.s, z12.h, z0.h[0]\n"
      "ble 48f\n"
      "ldnt1h { z13.h }, p3/Z, [%x[B_ptr]]\n"
      "subs x22, x22, #0x2\n"
      "ldnt1h { z14.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z15.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "ldnt1h { z16.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      ".inst 0x646841b8  // bfdot z24.s, z13.h, z0.h[1]\n"
      "ldnt1h { z17.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      ".inst 0x646841d9  // bfdot z25.s, z14.h, z0.h[1]\n"
      "ldnt1h { z18.h }, p3/Z, [%x[B_ptr], #5, MUL VL]\n"
      ".inst 0x646841fa  // bfdot z26.s, z15.h, z0.h[1]\n"
      "ldnt1h { z19.h }, p3/Z, [%x[B_ptr], #6, MUL VL]\n"
      ".inst 0x6468421b  // bfdot z27.s, z16.h, z0.h[1]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x6468423c  // bfdot z28.s, z17.h, z0.h[1]\n"
      ".inst 0x6468425d  // bfdot z29.s, z18.h, z0.h[1]\n"
      ".inst 0x6468427e  // bfdot z30.s, z19.h, z0.h[1]\n"
      "ble 48f\n"
      "ldnt1h { z20.h }, p3/Z, [%x[B_ptr]]\n"
      "subs x22, x22, #0x2\n"
      "ldnt1h { z21.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z22.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "ldnt1h { z23.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      ".inst 0x64704298  // bfdot z24.s, z20.h, z0.h[2]\n"
      "ldnt1h { z1.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      ".inst 0x647042b9  // bfdot z25.s, z21.h, z0.h[2]\n"
      "ldnt1h { z2.h }, p3/Z, [%x[B_ptr], #5, MUL VL]\n"
      ".inst 0x647042da  // bfdot z26.s, z22.h, z0.h[2]\n"
      "ldnt1h { z3.h }, p3/Z, [%x[B_ptr], #6, MUL VL]\n"
      ".inst 0x647042fb  // bfdot z27.s, z23.h, z0.h[2]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x6470403c  // bfdot z28.s, z1.h, z0.h[2]\n"
      ".inst 0x6470405d  // bfdot z29.s, z2.h, z0.h[2]\n"
      ".inst 0x6470407e  // bfdot z30.s, z3.h, z0.h[2]\n"
      "ble 48f\n"
      "ldnt1h { z4.h }, p3/Z, [%x[B_ptr]]\n"
      "ldnt1h { z5.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z6.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "ldnt1h { z7.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      ".inst 0x64784098  // bfdot z24.s, z4.h, z0.h[3]\n"
      "ldnt1h { z8.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      ".inst 0x647840b9  // bfdot z25.s, z5.h, z0.h[3]\n"
      "ldnt1h { z9.h }, p3/Z, [%x[B_ptr], #5, MUL VL]\n"
      ".inst 0x647840da  // bfdot z26.s, z6.h, z0.h[3]\n"
      "ldnt1h { z10.h }, p3/Z, [%x[B_ptr], #6, MUL VL]\n"
      ".inst 0x647840fb  // bfdot z27.s, z7.h, z0.h[3]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x6478411c  // bfdot z28.s, z8.h, z0.h[3]\n"
      ".inst 0x6478413d  // bfdot z29.s, z9.h, z0.h[3]\n"
      ".inst 0x6478415e  // bfdot z30.s, z10.h, z0.h[3]\n"
      "48:"  // Width 7: Multiply loop: multiply skip
      "tbz %x[flags], #1, 49f\n"
      "add x21, %x[args_ptr], %[offsetof_maxval]\n"
      "add x20, %x[args_ptr], %[offsetof_minval]\n"
      "ld1rw { z17.s }, p3/Z, [x21]\n"
      "ld1rw { z16.s }, p3/Z, [x20]\n"
      "fmin z24.s, p3/M, z24.s, z17.s\n"
      "fmin z25.s, p3/M, z25.s, z17.s\n"
      "fmin z26.s, p3/M, z26.s, z17.s\n"
      "fmin z27.s, p3/M, z27.s, z17.s\n"
      "fmin z28.s, p3/M, z28.s, z17.s\n"
      "fmin z29.s, p3/M, z29.s, z17.s\n"
      "fmin z30.s, p3/M, z30.s, z17.s\n"
      "fmax z24.s, p3/M, z24.s, z16.s\n"
      "fmax z25.s, p3/M, z25.s, z16.s\n"
      "fmax z26.s, p3/M, z26.s, z16.s\n"
      "fmax z27.s, p3/M, z27.s, z16.s\n"
      "fmax z28.s, p3/M, z28.s, z16.s\n"
      "fmax z29.s, p3/M, z29.s, z16.s\n"
      "fmax z30.s, p3/M, z30.s, z16.s\n"
      "49:"  // Width 7: No activation
      "st1w { z24.s }, p3, [%x[output_ptr]]\n"
      "st1w { z25.s }, p3, [%x[output_ptr], #1, MUL VL]\n"
      "st1w { z26.s }, p3, [%x[output_ptr], #2, MUL VL]\n"
      "st1w { z27.s }, p3, [%x[output_ptr], #3, MUL VL]\n"
      "st1w { z28.s }, p3, [%x[output_ptr], #4, MUL VL]\n"
      "st1w { z29.s }, p3, [%x[output_ptr], #5, MUL VL]\n"
      "st1w { z30.s }, p2, [%x[output_ptr], #6, MUL VL]\n"
      "addvl %x[output_ptr], %x[output_ptr], #7\n"
      "b 57f\n"
      "50:"  // Width 8
      "mov x20, #0x7\n"
      "mov x22, %x[K]\n"
      "msub x20, x24, x20, %x[N]\n"
      "mov x21, %x[A_ptr]\n"
      "whilelt p2.s, XZR, x20\n"
      "cbz x25, 51f\n"
      "ld1w { z24.s }, p3/Z, [x25]\n"
      "ld1w { z25.s }, p3/Z, [x25, #1, MUL VL]\n"
      "ld1w { z26.s }, p3/Z, [x25, #2, MUL VL]\n"
      "ld1w { z27.s }, p3/Z, [x25, #3, MUL VL]\n"
      "ld1w { z28.s }, p3/Z, [x25, #4, MUL VL]\n"
      "ld1w { z29.s }, p3/Z, [x25, #5, MUL VL]\n"
      "ld1w { z30.s }, p3/Z, [x25, #6, MUL VL]\n"
      "ld1w { z31.s }, p3/Z, [x25, #7, MUL VL]\n"
      "incb x25, ALL, MUL #8\n"
      "b 52f\n"
      "51:"  // Width 8: no bias
      "fmov z24.s, #0\n"
      "fmov z25.s, #0\n"
      "fmov z26.s, #0\n"
      "fmov z27.s, #0\n"
      "fmov z28.s, #0\n"
      "fmov z29.s, #0\n"
      "fmov z30.s, #0\n"
      "fmov z31.s, #0\n"
      "52:"  // Width 8: setup done
      "cmp x22, #0x8\n"
      "ble 54f\n"
      "53:"  // Width 8: Multiply loop: Main loop head
      "whilelt p1.s, XZR, x22\n"
      "whilelt p0.s, x26, x22\n"
      "ldnt1h { z1.h }, p3/Z, [%x[B_ptr]]\n"
      "ld1rqw { z0.s }, p1/Z, [x21]\n"
      "sub x22, x22, #0x8\n"
      "ld1rqw { z16.s }, p0/Z, [x21, #16]\n"
      "cmp x22, #0x8\n"
      "add x21, x21, #0x20\n"
      "ldnt1h { z2.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      ".inst 0x658aac00  // bfcvt z0.h, p3/M, z0.s\n"
      "ldnt1h { z3.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      ".inst 0x658aae10  // bfcvt z16.h, p3/M, z16.s\n"
      "ldnt1h { z4.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      "ldnt1h { z5.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      "ldnt1h { z6.h }, p3/Z, [%x[B_ptr], #5, MUL VL]\n"
      "uzp1 z0.h, z0.h, z0.h\n"
      "ldnt1h { z7.h }, p3/Z, [%x[B_ptr], #6, MUL VL]\n"
      "uzp1 z16.h, z16.h, z16.h\n"
      "ldnt1h { z8.h }, p3/Z, [%x[B_ptr], #7, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      "ldnt1h { z9.h }, p3/Z, [%x[B_ptr]]\n"
      "ldnt1h { z10.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z11.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "trn1 z0.d, z0.d, z16.d\n"
      "ldnt1h { z12.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      "ldnt1h { z13.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      "ldnt1h { z14.h }, p3/Z, [%x[B_ptr], #5, MUL VL]\n"
      "ldnt1h { z15.h }, p3/Z, [%x[B_ptr], #6, MUL VL]\n"
      ".inst 0x64604038  // bfdot z24.s, z1.h, z0.h[0]\n"
      ".inst 0x64604059  // bfdot z25.s, z2.h, z0.h[0]\n"
      "ldnt1h { z16.h }, p3/Z, [%x[B_ptr], #7, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x6460407a  // bfdot z26.s, z3.h, z0.h[0]\n"
      ".inst 0x6460409b  // bfdot z27.s, z4.h, z0.h[0]\n"
      "ldnt1h { z17.h }, p3/Z, [%x[B_ptr]]\n"
      ".inst 0x646040bc  // bfdot z28.s, z5.h, z0.h[0]\n"
      ".inst 0x646040dd  // bfdot z29.s, z6.h, z0.h[0]\n"
      "ldnt1h { z18.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      ".inst 0x646040fe  // bfdot z30.s, z7.h, z0.h[0]\n"
      ".inst 0x6460411f  // bfdot z31.s, z8.h, z0.h[0]\n"
      "ldnt1h { z19.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      ".inst 0x64684138  // bfdot z24.s, z9.h, z0.h[1]\n"
      ".inst 0x64684159  // bfdot z25.s, z10.h, z0.h[1]\n"
      "ldnt1h { z20.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      ".inst 0x6468417a  // bfdot z26.s, z11.h, z0.h[1]\n"
      ".inst 0x6468419b  // bfdot z27.s, z12.h, z0.h[1]\n"
      "ldnt1h { z21.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      ".inst 0x646841bc  // bfdot z28.s, z13.h, z0.h[1]\n"
      ".inst 0x646841dd  // bfdot z29.s, z14.h, z0.h[1]\n"
      "ldnt1h { z22.h }, p3/Z, [%x[B_ptr], #5, MUL VL]\n"
      ".inst 0x646841fe  // bfdot z30.s, z15.h, z0.h[1]\n"
      ".inst 0x6468421f  // bfdot z31.s, z16.h, z0.h[1]\n"
      "ldnt1h { z23.h }, p3/Z, [%x[B_ptr], #6, MUL VL]\n"
      "ldnt1h { z1.h }, p3/Z, [%x[B_ptr], #7, MUL VL]\n"
      ".inst 0x64704238  // bfdot z24.s, z17.h, z0.h[2]\n"
      ".inst 0x64704259  // bfdot z25.s, z18.h, z0.h[2]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x6470427a  // bfdot z26.s, z19.h, z0.h[2]\n"
      ".inst 0x6470429b  // bfdot z27.s, z20.h, z0.h[2]\n"
      "ldnt1h { z2.h }, p3/Z, [%x[B_ptr]]\n"
      ".inst 0x647042bc  // bfdot z28.s, z21.h, z0.h[2]\n"
      ".inst 0x647042dd  // bfdot z29.s, z22.h, z0.h[2]\n"
      "ldnt1h { z3.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      ".inst 0x647042fe  // bfdot z30.s, z23.h, z0.h[2]\n"
      ".inst 0x6470403f  // bfdot z31.s, z1.h, z0.h[2]\n"
      "ldnt1h { z4.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "ldnt1h { z5.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      ".inst 0x64784058  // bfdot z24.s, z2.h, z0.h[3]\n"
      "ldnt1h { z6.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      ".inst 0x64784079  // bfdot z25.s, z3.h, z0.h[3]\n"
      "ldnt1h { z7.h }, p3/Z, [%x[B_ptr], #5, MUL VL]\n"
      ".inst 0x6478409a  // bfdot z26.s, z4.h, z0.h[3]\n"
      "ldnt1h { z8.h }, p3/Z, [%x[B_ptr], #6, MUL VL]\n"
      ".inst 0x647840bb  // bfdot z27.s, z5.h, z0.h[3]\n"
      "ldnt1h { z9.h }, p3/Z, [%x[B_ptr], #7, MUL VL]\n"
      ".inst 0x647840dc  // bfdot z28.s, z6.h, z0.h[3]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x647840fd  // bfdot z29.s, z7.h, z0.h[3]\n"
      ".inst 0x6478411e  // bfdot z30.s, z8.h, z0.h[3]\n"
      ".inst 0x6478413f  // bfdot z31.s, z9.h, z0.h[3]\n"
      "bgt 53b\n"
      "54:"  // Width 8: Multiply loop: Single iteration only
      "whilelt p1.s, XZR, x22\n"
      "whilelt p0.s, x26, x22\n"
      "ldnt1h { z10.h }, p3/Z, [%x[B_ptr]]\n"
      "ld1rqw { z0.s }, p1/Z, [x21]\n"
      "subs x22, x22, #0x2\n"
      "ld1rqw { z18.s }, p0/Z, [x21, #16]\n"
      "ldnt1h { z11.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      ".inst 0x658aac00  // bfcvt z0.h, p3/M, z0.s\n"
      "ldnt1h { z12.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      ".inst 0x658aae52  // bfcvt z18.h, p3/M, z18.s\n"
      "ldnt1h { z13.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      "ldnt1h { z14.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      "ldnt1h { z15.h }, p3/Z, [%x[B_ptr], #5, MUL VL]\n"
      "uzp1 z0.h, z0.h, z0.h\n"
      "ldnt1h { z16.h }, p3/Z, [%x[B_ptr], #6, MUL VL]\n"
      "uzp1 z18.h, z18.h, z18.h\n"
      "ldnt1h { z17.h }, p3/Z, [%x[B_ptr], #7, MUL VL]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      "trn1 z0.d, z0.d, z18.d\n"
      ".inst 0x64604158  // bfdot z24.s, z10.h, z0.h[0]\n"
      ".inst 0x64604179  // bfdot z25.s, z11.h, z0.h[0]\n"
      ".inst 0x6460419a  // bfdot z26.s, z12.h, z0.h[0]\n"
      ".inst 0x646041bb  // bfdot z27.s, z13.h, z0.h[0]\n"
      ".inst 0x646041dc  // bfdot z28.s, z14.h, z0.h[0]\n"
      ".inst 0x646041fd  // bfdot z29.s, z15.h, z0.h[0]\n"
      ".inst 0x6460421e  // bfdot z30.s, z16.h, z0.h[0]\n"
      ".inst 0x6460423f  // bfdot z31.s, z17.h, z0.h[0]\n"
      "ble 55f\n"
      "ldnt1h { z18.h }, p3/Z, [%x[B_ptr]]\n"
      "subs x22, x22, #0x2\n"
      "ldnt1h { z19.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z20.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "ldnt1h { z21.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      ".inst 0x64684258  // bfdot z24.s, z18.h, z0.h[1]\n"
      "ldnt1h { z22.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      ".inst 0x64684279  // bfdot z25.s, z19.h, z0.h[1]\n"
      "ldnt1h { z23.h }, p3/Z, [%x[B_ptr], #5, MUL VL]\n"
      ".inst 0x6468429a  // bfdot z26.s, z20.h, z0.h[1]\n"
      "ldnt1h { z1.h }, p3/Z, [%x[B_ptr], #6, MUL VL]\n"
      ".inst 0x646842bb  // bfdot z27.s, z21.h, z0.h[1]\n"
      "ldnt1h { z2.h }, p3/Z, [%x[B_ptr], #7, MUL VL]\n"
      ".inst 0x646842dc  // bfdot z28.s, z22.h, z0.h[1]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x646842fd  // bfdot z29.s, z23.h, z0.h[1]\n"
      ".inst 0x6468403e  // bfdot z30.s, z1.h, z0.h[1]\n"
      ".inst 0x6468405f  // bfdot z31.s, z2.h, z0.h[1]\n"
      "ble 55f\n"
      "ldnt1h { z3.h }, p3/Z, [%x[B_ptr]]\n"
      "subs x22, x22, #0x2\n"
      "ldnt1h { z4.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z5.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "ldnt1h { z6.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      ".inst 0x64704078  // bfdot z24.s, z3.h, z0.h[2]\n"
      "ldnt1h { z7.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      ".inst 0x64704099  // bfdot z25.s, z4.h, z0.h[2]\n"
      "ldnt1h { z8.h }, p3/Z, [%x[B_ptr], #5, MUL VL]\n"
      ".inst 0x647040ba  // bfdot z26.s, z5.h, z0.h[2]\n"
      "ldnt1h { z9.h }, p3/Z, [%x[B_ptr], #6, MUL VL]\n"
      ".inst 0x647040db  // bfdot z27.s, z6.h, z0.h[2]\n"
      "ldnt1h { z10.h }, p3/Z, [%x[B_ptr], #7, MUL VL]\n"
      ".inst 0x647040fc  // bfdot z28.s, z7.h, z0.h[2]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x6470411d  // bfdot z29.s, z8.h, z0.h[2]\n"
      ".inst 0x6470413e  // bfdot z30.s, z9.h, z0.h[2]\n"
      ".inst 0x6470415f  // bfdot z31.s, z10.h, z0.h[2]\n"
      "ble 55f\n"
      "ldnt1h { z11.h }, p3/Z, [%x[B_ptr]]\n"
      "ldnt1h { z12.h }, p3/Z, [%x[B_ptr], #1, MUL VL]\n"
      "ldnt1h { z13.h }, p3/Z, [%x[B_ptr], #2, MUL VL]\n"
      "ldnt1h { z14.h }, p3/Z, [%x[B_ptr], #3, MUL VL]\n"
      ".inst 0x64784178  // bfdot z24.s, z11.h, z0.h[3]\n"
      "ldnt1h { z15.h }, p3/Z, [%x[B_ptr], #4, MUL VL]\n"
      ".inst 0x64784199  // bfdot z25.s, z12.h, z0.h[3]\n"
      "ldnt1h { z16.h }, p3/Z, [%x[B_ptr], #5, MUL VL]\n"
      ".inst 0x647841ba  // bfdot z26.s, z13.h, z0.h[3]\n"
      "ldnt1h { z17.h }, p3/Z, [%x[B_ptr], #6, MUL VL]\n"
      ".inst 0x647841db  // bfdot z27.s, z14.h, z0.h[3]\n"
      "ldnt1h { z18.h }, p3/Z, [%x[B_ptr], #7, MUL VL]\n"
      ".inst 0x647841fc  // bfdot z28.s, z15.h, z0.h[3]\n"
      "addvl %x[B_ptr], %x[B_ptr], #8\n"
      ".inst 0x6478421d  // bfdot z29.s, z16.h, z0.h[3]\n"
      ".inst 0x6478423e  // bfdot z30.s, z17.h, z0.h[3]\n"
      ".inst 0x6478425f  // bfdot z31.s, z18.h, z0.h[3]\n"
      "55:"  // Width 8: Multiply loop: multiply skip
      "tbz %x[flags], #1, 56f\n"
      "add x21, %x[args_ptr], %[offsetof_maxval]\n"
      "add x20, %x[args_ptr], %[offsetof_minval]\n"
      "ld1rw { z17.s }, p3/Z, [x21]\n"
      "ld1rw { z16.s }, p3/Z, [x20]\n"
      "fmin z24.s, p3/M, z24.s, z17.s\n"
      "fmin z25.s, p3/M, z25.s, z17.s\n"
      "fmin z26.s, p3/M, z26.s, z17.s\n"
      "fmin z27.s, p3/M, z27.s, z17.s\n"
      "fmin z28.s, p3/M, z28.s, z17.s\n"
      "fmin z29.s, p3/M, z29.s, z17.s\n"
      "fmin z30.s, p3/M, z30.s, z17.s\n"
      "fmin z31.s, p3/M, z31.s, z17.s\n"
      "fmax z24.s, p3/M, z24.s, z16.s\n"
      "fmax z25.s, p3/M, z25.s, z16.s\n"
      "fmax z26.s, p3/M, z26.s, z16.s\n"
      "fmax z27.s, p3/M, z27.s, z16.s\n"
      "fmax z28.s, p3/M, z28.s, z16.s\n"
      "fmax z29.s, p3/M, z29.s, z16.s\n"
      "fmax z30.s, p3/M, z30.s, z16.s\n"
      "fmax z31.s, p3/M, z31.s, z16.s\n"
      "56:"  // Width 8: No activation
      "subs x23, x23, #0x8\n"
      "st1w { z24.s }, p3, [%x[output_ptr]]\n"
      "sub %x[N], %x[N], x24, LSL #3\n"
      "st1w { z25.s }, p3, [%x[output_ptr], #1, MUL VL]\n"
      "st1w { z26.s }, p3, [%x[output_ptr], #2, MUL VL]\n"
      "st1w { z27.s }, p3, [%x[output_ptr], #3, MUL VL]\n"
      "st1w { z28.s }, p3, [%x[output_ptr], #4, MUL VL]\n"
      "st1w { z29.s }, p3, [%x[output_ptr], #5, MUL VL]\n"
      "st1w { z30.s }, p3, [%x[output_ptr], #6, MUL VL]\n"
      "st1w { z31.s }, p2, [%x[output_ptr], #7, MUL VL]\n"
      "addvl %x[output_ptr], %x[output_ptr], #8\n"
      "bgt 1b\n"
      "57:"  // Exit
      ".inst 0xd503467f  // SMSTOP\n"
      : [B_ptr] "+&r" (B_ptr), [N] "+&r" (N), [output_ptr] "+&r" (output_ptr)
      : [A_ptr] "r" (A_ptr), [K] "r" (K), [args_ptr] "r" (&args), [bias] "r" (bias), [flags] "r" (flags), [offsetof_maxval] "I" (offsetof(KernelArgs, maxval)), [offsetof_minval] "I" (offsetof(KernelArgs, minval))
      : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
    );
}

} // namespace arm_gemm

#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__aarch64__)

