/*
 * Copyright (c) 2025-2026 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#if (defined(ENABLE_FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)) && defined(ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__aarch64__)

#include "arm_gemm/arm_gemm.hpp"
#include "arm_common/internal/utils.hpp"
#include <cassert>
#include <limits>

namespace arm_gemm {

void sve_ffhybrid_fp16fp32fp16_mla_6x4VL (
    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<__fp16> A_arg,
    size_t M, size_t N, const __fp16 *B_ptr, size_t B_stride, IndirectOutputArg<__fp16> output_arg,
    const __fp16 *bias, Activation act, bool accumulate
)
{
    // Argument block for the inline assembly that follows. The assembly
    // addresses these fields as [args_ptr + offsetof_<field>]; the operand list
    // that binds those names is outside this view (presumably args_ptr is &ka
    // and each offsetof_* is offsetof(KernelArgs, <field>) -- confirm there).
    struct KernelArgs {
        // Upper clamp bound applied with fmin when flags bit 1 is set. Stays
        // +infinity unless the activation is BoundedReLU (then act.param1).
        float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
        // Lower clamp bound applied with fmax when flags bit 1 is set. Stays
        // -infinity unless the activation is ReLU or BoundedReLU (then 0).
        float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
        // Copy of the num_strings argument; the assembly loads it as a 32-bit
        // value and runs the string loop until its counter equals it.
        unsigned int num_strings = {};
        // Copy of the string_lengths argument; entry i is loaded as the
        // multiply-loop count for string i (consumed 8 per main-loop pass).
        const unsigned int *string_lengths = {};
        // Copy of the N argument; seeds the remaining-column counter from which
        // the per-vector store/load predicates are generated.
        size_t N = {};
        // Copy of the B_ptr argument; copied into cur_B_ptr before each
        // column loop starts.
        const __fp16 *B_ptr = {};
        // Scratch owned by the assembly: B position for the current column-loop
        // iteration, advanced by 2 * B_stride __fp16 elements per iteration.
        const __fp16 *cur_B_ptr = {};
        // Copy of the B_stride argument; added (shifted left by 1, i.e. counted
        // in 2-byte elements) to the first B pointer to obtain the second one.
        size_t B_stride = {};
        // indirect.offset for indirect output, direct.stride otherwise. In the
        // visible multi-row path it is shifted left by 1 and added to the
        // output pointer to reach the next row.
        size_t output_offset = {};
        // indirect.start_col; only assigned for indirect input. Added (in
        // 2-byte elements) to the row pointers of the first string only.
        size_t input_initial_col = {};
        // indirect.start_row for indirect input (used to index the row-pointer
        // table, 8 bytes per entry); direct.stride otherwise (used as the row
        // stride in 2-byte elements).
        size_t input_offset = {};
        // indirect.ptr for indirect output, direct.base otherwise; loaded as
        // the running output pointer.
        void *output_ptr = {};
        // Copy of the bias argument; when null the assembly skips the bias load
        // and either reloads the existing output (flags bit 0) or zeroes the
        // accumulators.
        const __fp16 *bias = {};
    } ka;

    unsigned long flags=0;
    void *input_ptr;

    if (output_arg.is_indirect) {
        ka.output_ptr=(void *)(output_arg.indirect.ptr);
        ka.output_offset=output_arg.indirect.offset;
        flags |= 0x4;
    } else {
        ka.output_ptr=(void *)(output_arg.direct.base);
        ka.output_offset=output_arg.direct.stride;
    }

    if (A_arg.is_indirect) {
        input_ptr=(void *)(A_arg.indirect.ptr);
        ka.input_offset=A_arg.indirect.start_row;
        ka.input_initial_col=A_arg.indirect.start_col;
        flags |= 0x8;
    } else {
        assert(num_strings==1);
        input_ptr=(void *)(A_arg.direct.base);
        ka.input_offset=A_arg.direct.stride;
    }
    if (accumulate) {
        flags |= 0x1;
    }
    ka.num_strings = num_strings;
    ka.string_lengths = string_lengths;
    ka.N = N;
    ka.B_ptr = B_ptr;
    ka.bias = bias;
    ka.B_stride = B_stride;
    switch(act.type) {
        default:
        case Activation::Type::None:
            break;
        case Activation::Type::BoundedReLU:
            ka.maxval = static_cast<float>(act.param1);
            /* fall through */
        case Activation::Type::ReLU:
            ka.minval = 0;
            flags |= 0x2;
            break;
    }
    __asm__ __volatile__(
      "ptrue p5.b\n"
      "1:"  // Row loop
      "cmp %x[M], #0x6\n"
      "bge 71f\n"
      "cmp %x[M], #0x4\n"
      "bgt 57f\n"
      "beq 43f\n"
      "cmp %x[M], #0x2\n"
      "bgt 29f\n"
      "beq 15f\n"
      "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
      "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
      "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
      "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
      "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
      "2:"  // Height 1: Column loop
      "ldr x10, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
      "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
      "cnth x21\n"
      "add x9, x10, x20, LSL #1\n"
      "add x20, x9, x20, LSL #1\n"
      "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
      "cmp x13, x21\n"
      "bgt 3f\n"
      "mov x9, x10\n"
      "3:"  // Height 1: B setup done
      "mov x20, #0\n"
      "whilelt p4.s, x20, x13\n"
      "incw x20\n"
      "whilelt p3.s, x20, x13\n"
      "incw x20\n"
      "whilelt p2.s, x20, x13\n"
      "incw x20\n"
      "whilelt p1.s, x20, x13\n"
      "cbz x12, 4f\n"
      "ld1h { z8.s }, p4/Z, [x12]\n"
      "ld1h { z9.s }, p3/Z, [x12, #1, MUL VL]\n"
      "ld1h { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
      "ld1h { z11.s }, p1/Z, [x12, #3, MUL VL]\n"
      "addvl x12, x12, #2\n"
      "fcvt z8.s, p5/m, z8.h\n"
      "fcvt z9.s, p5/m, z9.h\n"
      "fcvt z10.s, p5/m, z10.h\n"
      "fcvt z11.s, p5/m, z11.h\n"
      "b 6f\n"
      "4:"  // Height 1: no bias
      "tbz %x[flags], #0, 5f\n"
      "ld1h { z8.s }, p4/Z, [x11]\n"
      "ld1h { z9.s }, p3/Z, [x11, #1, MUL VL]\n"
      "ld1h { z10.s }, p2/Z, [x11, #2, MUL VL]\n"
      "ld1h { z11.s }, p1/Z, [x11, #3, MUL VL]\n"
      "fcvt z8.s, p5/m, z8.h\n"
      "fcvt z9.s, p5/m, z9.h\n"
      "fcvt z10.s, p5/m, z10.h\n"
      "fcvt z11.s, p5/m, z11.h\n"
      "b 6f\n"
      "5:"  // Height 1: no accumulate
      "mov z8.b, #0\n"
      "mov z9.b, #0\n"
      "mov z10.b, #0\n"
      "mov z11.b, #0\n"
      "6:"  // Height 1: setup done
      "mov x28, #0\n"
      "7:"  // Height 1: String loop
      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
      "ldr w27, [x20, x28, LSL #0x2]\n"
      "tbz %x[flags], #3, 8f\n"
      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
      "add x20, x20, x21, LSL #3\n"
      "ldr x26, [x20, #0]\n"
      "cbnz x28, 9f\n"
      "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
      "add x26, x26, x20, LSL #1\n"
      "b 9f\n"
      "8:"  // Height 1: setup direct input
      "mov x26, %x[input_ptr]\n"
      "9:"  // Height 1: input setup done
      "cmp x27, #0x8\n"
      "ble 11f\n"
      "10:"  // Height 1: Multiply loop: Main loop head
      "whilelt p0.h, XZR, x27\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      "sub x27, x27, #0x8\n"
      "cmp x27, #0x8\n"
      "ld1rqh { z0.h }, p0/Z, [x26]\n"
      "add x26, x26, #0x10\n"
      ".inst 0x64a040c8  // fmlalb z8.s, z6.h, z0.h[0]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a040e9  // fmlalb z9.s, z7.h, z0.h[0]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a040ca  // fmlalb z10.s, z6.h, z0.h[0]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64a040eb  // fmlalb z11.s, z7.h, z0.h[0]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a048c8  // fmlalb z8.s, z6.h, z0.h[1]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a048e9  // fmlalb z9.s, z7.h, z0.h[1]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a048ca  // fmlalb z10.s, z6.h, z0.h[1]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64a048eb  // fmlalb z11.s, z7.h, z0.h[1]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a840c8  // fmlalb z8.s, z6.h, z0.h[2]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a840e9  // fmlalb z9.s, z7.h, z0.h[2]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a840ca  // fmlalb z10.s, z6.h, z0.h[2]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64a840eb  // fmlalb z11.s, z7.h, z0.h[2]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a848c8  // fmlalb z8.s, z6.h, z0.h[3]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a848e9  // fmlalb z9.s, z7.h, z0.h[3]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a848ca  // fmlalb z10.s, z6.h, z0.h[3]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64a848eb  // fmlalb z11.s, z7.h, z0.h[3]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b040c8  // fmlalb z8.s, z6.h, z0.h[4]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b040e9  // fmlalb z9.s, z7.h, z0.h[4]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b040ca  // fmlalb z10.s, z6.h, z0.h[4]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64b040eb  // fmlalb z11.s, z7.h, z0.h[4]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b048c8  // fmlalb z8.s, z6.h, z0.h[5]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b048e9  // fmlalb z9.s, z7.h, z0.h[5]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b048ca  // fmlalb z10.s, z6.h, z0.h[5]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64b048eb  // fmlalb z11.s, z7.h, z0.h[5]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b840c8  // fmlalb z8.s, z6.h, z0.h[6]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b840e9  // fmlalb z9.s, z7.h, z0.h[6]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b840ca  // fmlalb z10.s, z6.h, z0.h[6]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64b840eb  // fmlalb z11.s, z7.h, z0.h[6]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b848c8  // fmlalb z8.s, z6.h, z0.h[7]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b848e9  // fmlalb z9.s, z7.h, z0.h[7]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b848ca  // fmlalb z10.s, z6.h, z0.h[7]\n"
      ".inst 0x64b848eb  // fmlalb z11.s, z7.h, z0.h[7]\n"
      "bgt 10b\n"
      "11:"  // Height 1: Multiply loop: Single iteration only
      "whilelt p0.h, XZR, x27\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      "ld1rqh { z0.h }, p0/Z, [x26]\n"
      ".inst 0x64a040c8  // fmlalb z8.s, z6.h, z0.h[0]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a040e9  // fmlalb z9.s, z7.h, z0.h[0]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a040ca  // fmlalb z10.s, z6.h, z0.h[0]\n"
      ".inst 0x64a040eb  // fmlalb z11.s, z7.h, z0.h[0]\n"
      "ble 12f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a048c8  // fmlalb z8.s, z6.h, z0.h[1]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a048e9  // fmlalb z9.s, z7.h, z0.h[1]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a048ca  // fmlalb z10.s, z6.h, z0.h[1]\n"
      ".inst 0x64a048eb  // fmlalb z11.s, z7.h, z0.h[1]\n"
      "ble 12f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a840c8  // fmlalb z8.s, z6.h, z0.h[2]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a840e9  // fmlalb z9.s, z7.h, z0.h[2]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a840ca  // fmlalb z10.s, z6.h, z0.h[2]\n"
      ".inst 0x64a840eb  // fmlalb z11.s, z7.h, z0.h[2]\n"
      "ble 12f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a848c8  // fmlalb z8.s, z6.h, z0.h[3]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a848e9  // fmlalb z9.s, z7.h, z0.h[3]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a848ca  // fmlalb z10.s, z6.h, z0.h[3]\n"
      ".inst 0x64a848eb  // fmlalb z11.s, z7.h, z0.h[3]\n"
      "ble 12f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b040c8  // fmlalb z8.s, z6.h, z0.h[4]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b040e9  // fmlalb z9.s, z7.h, z0.h[4]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b040ca  // fmlalb z10.s, z6.h, z0.h[4]\n"
      ".inst 0x64b040eb  // fmlalb z11.s, z7.h, z0.h[4]\n"
      "ble 12f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b048c8  // fmlalb z8.s, z6.h, z0.h[5]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b048e9  // fmlalb z9.s, z7.h, z0.h[5]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b048ca  // fmlalb z10.s, z6.h, z0.h[5]\n"
      ".inst 0x64b048eb  // fmlalb z11.s, z7.h, z0.h[5]\n"
      "ble 12f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b840c8  // fmlalb z8.s, z6.h, z0.h[6]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b840e9  // fmlalb z9.s, z7.h, z0.h[6]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b840ca  // fmlalb z10.s, z6.h, z0.h[6]\n"
      ".inst 0x64b840eb  // fmlalb z11.s, z7.h, z0.h[6]\n"
      "ble 12f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b848c8  // fmlalb z8.s, z6.h, z0.h[7]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b848e9  // fmlalb z9.s, z7.h, z0.h[7]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b848ca  // fmlalb z10.s, z6.h, z0.h[7]\n"
      ".inst 0x64b848eb  // fmlalb z11.s, z7.h, z0.h[7]\n"
      "12:"  // Height 1: Multiply loop: multiply skip
      "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
      "add x28, x28, #0x1\n"
      "cmp x28, x20\n"
      "bne 7b\n"
      "tbz %x[flags], #1, 13f\n"
      "add x21, %x[args_ptr], %[offsetof_maxval]\n"
      "add x20, %x[args_ptr], %[offsetof_minval]\n"
      "ld1rw { z17.s }, p5/Z, [x21]\n"
      "ld1rw { z16.s }, p5/Z, [x20]\n"
      "fmin z8.s, p5/M, z8.s, z17.s\n"
      "fmin z9.s, p5/M, z9.s, z17.s\n"
      "fmin z10.s, p5/M, z10.s, z17.s\n"
      "fmin z11.s, p5/M, z11.s, z17.s\n"
      "fmax z8.s, p5/M, z8.s, z16.s\n"
      "fmax z9.s, p5/M, z9.s, z16.s\n"
      "fmax z10.s, p5/M, z10.s, z16.s\n"
      "fmax z11.s, p5/M, z11.s, z16.s\n"
      "13:"  // Height 1: No activation
      "fcvt z8.h, p5/m, z8.s\n"
      "fcvt z9.h, p5/m, z9.s\n"
      "fcvt z10.h, p5/m, z10.s\n"
      "fcvt z11.h, p5/m, z11.s\n"
      "st1h { z8.s }, p4, [x11]\n"
      "st1h { z9.s }, p3, [x11, #1, MUL VL]\n"
      "st1h { z10.s }, p2, [x11, #2, MUL VL]\n"
      "st1h { z11.s }, p1, [x11, #3, MUL VL]\n"
      "addvl x11, x11, #2\n"
      "decw x13, ALL, MUL #4\n"
      "cmp x13, XZR\n"
      "bgt 2b\n"
      "b 86f\n"
      "15:"  // Height 2
      "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
      "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
      "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
      "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
      "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
      "16:"  // Height 2: Column loop
      "ldr x10, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
      "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
      "cnth x21\n"
      "add x9, x10, x20, LSL #1\n"
      "add x20, x9, x20, LSL #1\n"
      "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
      "cmp x13, x21\n"
      "bgt 17f\n"
      "mov x9, x10\n"
      "17:"  // Height 2: B setup done
      "mov x20, #0\n"
      "whilelt p4.s, x20, x13\n"
      "incw x20\n"
      "whilelt p3.s, x20, x13\n"
      "incw x20\n"
      "whilelt p2.s, x20, x13\n"
      "incw x20\n"
      "whilelt p1.s, x20, x13\n"
      "cbz x12, 18f\n"
      "ld1h { z8.s }, p4/Z, [x12]\n"
      "ld1h { z9.s }, p3/Z, [x12, #1, MUL VL]\n"
      "ld1h { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
      "ld1h { z11.s }, p1/Z, [x12, #3, MUL VL]\n"
      "addvl x12, x12, #2\n"
      "fcvt z8.s, p5/m, z8.h\n"
      "fcvt z9.s, p5/m, z9.h\n"
      "fcvt z10.s, p5/m, z10.h\n"
      "fcvt z11.s, p5/m, z11.h\n"
      "mov z12.d, z8.d\n"
      "mov z13.d, z9.d\n"
      "mov z14.d, z10.d\n"
      "mov z15.d, z11.d\n"
      "b 20f\n"
      "18:"  // Height 2: no bias
      "tbz %x[flags], #0, 19f\n"
      "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
      "ld1h { z8.s }, p4/Z, [x11]\n"
      "ld1h { z9.s }, p3/Z, [x11, #1, MUL VL]\n"
      "ld1h { z10.s }, p2/Z, [x11, #2, MUL VL]\n"
      "ld1h { z11.s }, p1/Z, [x11, #3, MUL VL]\n"
      "add x20, x11, x20, LSL #1\n"
      "ld1h { z12.s }, p4/Z, [x20]\n"
      "ld1h { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
      "ld1h { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
      "fcvt z8.s, p5/m, z8.h\n"
      "ld1h { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
      "fcvt z9.s, p5/m, z9.h\n"
      "fcvt z10.s, p5/m, z10.h\n"
      "fcvt z11.s, p5/m, z11.h\n"
      "fcvt z12.s, p5/m, z12.h\n"
      "fcvt z13.s, p5/m, z13.h\n"
      "fcvt z14.s, p5/m, z14.h\n"
      "fcvt z15.s, p5/m, z15.h\n"
      "b 20f\n"
      "19:"  // Height 2: no accumulate
      "mov z8.b, #0\n"
      "mov z9.b, #0\n"
      "mov z10.b, #0\n"
      "mov z11.b, #0\n"
      "mov z12.b, #0\n"
      "mov z13.b, #0\n"
      "mov z14.b, #0\n"
      "mov z15.b, #0\n"
      "20:"  // Height 2: setup done
      "mov x28, #0\n"
      "21:"  // Height 2: String loop
      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
      "ldr w27, [x20, x28, LSL #0x2]\n"
      "tbz %x[flags], #3, 22f\n"
      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
      "add x20, x20, x21, LSL #3\n"
      "ldr x26, [x20, #0]\n"
      "ldr x25, [x20, #0x8]\n"
      "cbnz x28, 23f\n"
      "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
      "add x26, x26, x20, LSL #1\n"
      "add x25, x25, x20, LSL #1\n"
      "b 23f\n"
      "22:"  // Height 2: setup direct input
      "mov x26, %x[input_ptr]\n"
      "add x25, x26, x21, LSL #1\n"
      "23:"  // Height 2: input setup done
      "cmp x27, #0x8\n"
      "ble 25f\n"
      "24:"  // Height 2: Multiply loop: Main loop head
      "whilelt p0.h, XZR, x27\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      "sub x27, x27, #0x8\n"
      "cmp x27, #0x8\n"
      "ld1rqh { z0.h }, p0/Z, [x26]\n"
      "add x26, x26, #0x10\n"
      "ld1rqh { z1.h }, p0/Z, [x25]\n"
      "add x25, x25, #0x10\n"
      ".inst 0x64a040c8  // fmlalb z8.s, z6.h, z0.h[0]\n"
      ".inst 0x64a040e9  // fmlalb z9.s, z7.h, z0.h[0]\n"
      ".inst 0x64a140cc  // fmlalb z12.s, z6.h, z1.h[0]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a140ed  // fmlalb z13.s, z7.h, z1.h[0]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a040ca  // fmlalb z10.s, z6.h, z0.h[0]\n"
      ".inst 0x64a140ce  // fmlalb z14.s, z6.h, z1.h[0]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64a040eb  // fmlalb z11.s, z7.h, z0.h[0]\n"
      ".inst 0x64a140ef  // fmlalb z15.s, z7.h, z1.h[0]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a048c8  // fmlalb z8.s, z6.h, z0.h[1]\n"
      ".inst 0x64a148cc  // fmlalb z12.s, z6.h, z1.h[1]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a048e9  // fmlalb z9.s, z7.h, z0.h[1]\n"
      ".inst 0x64a148ed  // fmlalb z13.s, z7.h, z1.h[1]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a048ca  // fmlalb z10.s, z6.h, z0.h[1]\n"
      ".inst 0x64a148ce  // fmlalb z14.s, z6.h, z1.h[1]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64a048eb  // fmlalb z11.s, z7.h, z0.h[1]\n"
      ".inst 0x64a148ef  // fmlalb z15.s, z7.h, z1.h[1]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a840c8  // fmlalb z8.s, z6.h, z0.h[2]\n"
      ".inst 0x64a940cc  // fmlalb z12.s, z6.h, z1.h[2]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a840e9  // fmlalb z9.s, z7.h, z0.h[2]\n"
      ".inst 0x64a940ed  // fmlalb z13.s, z7.h, z1.h[2]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a840ca  // fmlalb z10.s, z6.h, z0.h[2]\n"
      ".inst 0x64a940ce  // fmlalb z14.s, z6.h, z1.h[2]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64a840eb  // fmlalb z11.s, z7.h, z0.h[2]\n"
      ".inst 0x64a940ef  // fmlalb z15.s, z7.h, z1.h[2]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a848c8  // fmlalb z8.s, z6.h, z0.h[3]\n"
      ".inst 0x64a948cc  // fmlalb z12.s, z6.h, z1.h[3]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a848e9  // fmlalb z9.s, z7.h, z0.h[3]\n"
      ".inst 0x64a948ed  // fmlalb z13.s, z7.h, z1.h[3]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a848ca  // fmlalb z10.s, z6.h, z0.h[3]\n"
      ".inst 0x64a948ce  // fmlalb z14.s, z6.h, z1.h[3]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64a848eb  // fmlalb z11.s, z7.h, z0.h[3]\n"
      ".inst 0x64a948ef  // fmlalb z15.s, z7.h, z1.h[3]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b040c8  // fmlalb z8.s, z6.h, z0.h[4]\n"
      ".inst 0x64b140cc  // fmlalb z12.s, z6.h, z1.h[4]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b040e9  // fmlalb z9.s, z7.h, z0.h[4]\n"
      ".inst 0x64b140ed  // fmlalb z13.s, z7.h, z1.h[4]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b040ca  // fmlalb z10.s, z6.h, z0.h[4]\n"
      ".inst 0x64b140ce  // fmlalb z14.s, z6.h, z1.h[4]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64b040eb  // fmlalb z11.s, z7.h, z0.h[4]\n"
      ".inst 0x64b140ef  // fmlalb z15.s, z7.h, z1.h[4]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b048c8  // fmlalb z8.s, z6.h, z0.h[5]\n"
      ".inst 0x64b148cc  // fmlalb z12.s, z6.h, z1.h[5]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b048e9  // fmlalb z9.s, z7.h, z0.h[5]\n"
      ".inst 0x64b148ed  // fmlalb z13.s, z7.h, z1.h[5]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b048ca  // fmlalb z10.s, z6.h, z0.h[5]\n"
      ".inst 0x64b148ce  // fmlalb z14.s, z6.h, z1.h[5]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64b048eb  // fmlalb z11.s, z7.h, z0.h[5]\n"
      ".inst 0x64b148ef  // fmlalb z15.s, z7.h, z1.h[5]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b840c8  // fmlalb z8.s, z6.h, z0.h[6]\n"
      ".inst 0x64b940cc  // fmlalb z12.s, z6.h, z1.h[6]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b840e9  // fmlalb z9.s, z7.h, z0.h[6]\n"
      ".inst 0x64b940ed  // fmlalb z13.s, z7.h, z1.h[6]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b840ca  // fmlalb z10.s, z6.h, z0.h[6]\n"
      ".inst 0x64b940ce  // fmlalb z14.s, z6.h, z1.h[6]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64b840eb  // fmlalb z11.s, z7.h, z0.h[6]\n"
      ".inst 0x64b940ef  // fmlalb z15.s, z7.h, z1.h[6]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b848c8  // fmlalb z8.s, z6.h, z0.h[7]\n"
      ".inst 0x64b948cc  // fmlalb z12.s, z6.h, z1.h[7]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b848e9  // fmlalb z9.s, z7.h, z0.h[7]\n"
      ".inst 0x64b948ed  // fmlalb z13.s, z7.h, z1.h[7]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b848ca  // fmlalb z10.s, z6.h, z0.h[7]\n"
      ".inst 0x64b948ce  // fmlalb z14.s, z6.h, z1.h[7]\n"
      ".inst 0x64b848eb  // fmlalb z11.s, z7.h, z0.h[7]\n"
      ".inst 0x64b948ef  // fmlalb z15.s, z7.h, z1.h[7]\n"
      "bgt 24b\n"
      "25:"  // Height 2: Multiply loop: Single iteration only
      "whilelt p0.h, XZR, x27\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      "ld1rqh { z0.h }, p0/Z, [x26]\n"
      "ld1rqh { z1.h }, p0/Z, [x25]\n"
      ".inst 0x64a040c8  // fmlalb z8.s, z6.h, z0.h[0]\n"
      ".inst 0x64a140cc  // fmlalb z12.s, z6.h, z1.h[0]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a040e9  // fmlalb z9.s, z7.h, z0.h[0]\n"
      ".inst 0x64a140ed  // fmlalb z13.s, z7.h, z1.h[0]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a040ca  // fmlalb z10.s, z6.h, z0.h[0]\n"
      ".inst 0x64a140ce  // fmlalb z14.s, z6.h, z1.h[0]\n"
      ".inst 0x64a040eb  // fmlalb z11.s, z7.h, z0.h[0]\n"
      ".inst 0x64a140ef  // fmlalb z15.s, z7.h, z1.h[0]\n"
      "ble 26f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a048c8  // fmlalb z8.s, z6.h, z0.h[1]\n"
      ".inst 0x64a148cc  // fmlalb z12.s, z6.h, z1.h[1]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a048e9  // fmlalb z9.s, z7.h, z0.h[1]\n"
      ".inst 0x64a148ed  // fmlalb z13.s, z7.h, z1.h[1]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a048ca  // fmlalb z10.s, z6.h, z0.h[1]\n"
      ".inst 0x64a148ce  // fmlalb z14.s, z6.h, z1.h[1]\n"
      ".inst 0x64a048eb  // fmlalb z11.s, z7.h, z0.h[1]\n"
      ".inst 0x64a148ef  // fmlalb z15.s, z7.h, z1.h[1]\n"
      "ble 26f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a840c8  // fmlalb z8.s, z6.h, z0.h[2]\n"
      ".inst 0x64a940cc  // fmlalb z12.s, z6.h, z1.h[2]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a840e9  // fmlalb z9.s, z7.h, z0.h[2]\n"
      ".inst 0x64a940ed  // fmlalb z13.s, z7.h, z1.h[2]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a840ca  // fmlalb z10.s, z6.h, z0.h[2]\n"
      ".inst 0x64a940ce  // fmlalb z14.s, z6.h, z1.h[2]\n"
      ".inst 0x64a840eb  // fmlalb z11.s, z7.h, z0.h[2]\n"
      ".inst 0x64a940ef  // fmlalb z15.s, z7.h, z1.h[2]\n"
      "ble 26f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a848c8  // fmlalb z8.s, z6.h, z0.h[3]\n"
      ".inst 0x64a948cc  // fmlalb z12.s, z6.h, z1.h[3]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a848e9  // fmlalb z9.s, z7.h, z0.h[3]\n"
      ".inst 0x64a948ed  // fmlalb z13.s, z7.h, z1.h[3]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a848ca  // fmlalb z10.s, z6.h, z0.h[3]\n"
      ".inst 0x64a948ce  // fmlalb z14.s, z6.h, z1.h[3]\n"
      ".inst 0x64a848eb  // fmlalb z11.s, z7.h, z0.h[3]\n"
      ".inst 0x64a948ef  // fmlalb z15.s, z7.h, z1.h[3]\n"
      "ble 26f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b040c8  // fmlalb z8.s, z6.h, z0.h[4]\n"
      ".inst 0x64b140cc  // fmlalb z12.s, z6.h, z1.h[4]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b040e9  // fmlalb z9.s, z7.h, z0.h[4]\n"
      ".inst 0x64b140ed  // fmlalb z13.s, z7.h, z1.h[4]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b040ca  // fmlalb z10.s, z6.h, z0.h[4]\n"
      ".inst 0x64b140ce  // fmlalb z14.s, z6.h, z1.h[4]\n"
      ".inst 0x64b040eb  // fmlalb z11.s, z7.h, z0.h[4]\n"
      ".inst 0x64b140ef  // fmlalb z15.s, z7.h, z1.h[4]\n"
      "ble 26f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b048c8  // fmlalb z8.s, z6.h, z0.h[5]\n"
      ".inst 0x64b148cc  // fmlalb z12.s, z6.h, z1.h[5]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b048e9  // fmlalb z9.s, z7.h, z0.h[5]\n"
      ".inst 0x64b148ed  // fmlalb z13.s, z7.h, z1.h[5]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b048ca  // fmlalb z10.s, z6.h, z0.h[5]\n"
      ".inst 0x64b148ce  // fmlalb z14.s, z6.h, z1.h[5]\n"
      ".inst 0x64b048eb  // fmlalb z11.s, z7.h, z0.h[5]\n"
      ".inst 0x64b148ef  // fmlalb z15.s, z7.h, z1.h[5]\n"
      "ble 26f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b840c8  // fmlalb z8.s, z6.h, z0.h[6]\n"
      ".inst 0x64b940cc  // fmlalb z12.s, z6.h, z1.h[6]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b840e9  // fmlalb z9.s, z7.h, z0.h[6]\n"
      ".inst 0x64b940ed  // fmlalb z13.s, z7.h, z1.h[6]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b840ca  // fmlalb z10.s, z6.h, z0.h[6]\n"
      ".inst 0x64b940ce  // fmlalb z14.s, z6.h, z1.h[6]\n"
      ".inst 0x64b840eb  // fmlalb z11.s, z7.h, z0.h[6]\n"
      ".inst 0x64b940ef  // fmlalb z15.s, z7.h, z1.h[6]\n"
      "ble 26f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b848c8  // fmlalb z8.s, z6.h, z0.h[7]\n"
      ".inst 0x64b948cc  // fmlalb z12.s, z6.h, z1.h[7]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b848e9  // fmlalb z9.s, z7.h, z0.h[7]\n"
      ".inst 0x64b948ed  // fmlalb z13.s, z7.h, z1.h[7]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b848ca  // fmlalb z10.s, z6.h, z0.h[7]\n"
      ".inst 0x64b948ce  // fmlalb z14.s, z6.h, z1.h[7]\n"
      ".inst 0x64b848eb  // fmlalb z11.s, z7.h, z0.h[7]\n"
      ".inst 0x64b948ef  // fmlalb z15.s, z7.h, z1.h[7]\n"
      "26:"  // Height 2: Multiply loop: multiply skip
      "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
      "add x28, x28, #0x1\n"
      "cmp x28, x20\n"
      "bne 21b\n"
      "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
      "add x26, x11, x20, LSL #1\n"
      "tbz %x[flags], #1, 27f\n"
      "add x21, %x[args_ptr], %[offsetof_maxval]\n"
      "add x20, %x[args_ptr], %[offsetof_minval]\n"
      "ld1rw { z17.s }, p5/Z, [x21]\n"
      "ld1rw { z16.s }, p5/Z, [x20]\n"
      "fmin z8.s, p5/M, z8.s, z17.s\n"
      "fmin z9.s, p5/M, z9.s, z17.s\n"
      "fmin z10.s, p5/M, z10.s, z17.s\n"
      "fmin z11.s, p5/M, z11.s, z17.s\n"
      "fmin z12.s, p5/M, z12.s, z17.s\n"
      "fmin z13.s, p5/M, z13.s, z17.s\n"
      "fmin z14.s, p5/M, z14.s, z17.s\n"
      "fmin z15.s, p5/M, z15.s, z17.s\n"
      "fmax z8.s, p5/M, z8.s, z16.s\n"
      "fmax z9.s, p5/M, z9.s, z16.s\n"
      "fmax z10.s, p5/M, z10.s, z16.s\n"
      "fmax z11.s, p5/M, z11.s, z16.s\n"
      "fmax z12.s, p5/M, z12.s, z16.s\n"
      "fmax z13.s, p5/M, z13.s, z16.s\n"
      "fmax z14.s, p5/M, z14.s, z16.s\n"
      "fmax z15.s, p5/M, z15.s, z16.s\n"
      "27:"  // Height 2: No activation
      "fcvt z8.h, p5/m, z8.s\n"
      "fcvt z9.h, p5/m, z9.s\n"
      "fcvt z10.h, p5/m, z10.s\n"
      "fcvt z11.h, p5/m, z11.s\n"
      "fcvt z12.h, p5/m, z12.s\n"
      "fcvt z13.h, p5/m, z13.s\n"
      "fcvt z14.h, p5/m, z14.s\n"
      "fcvt z15.h, p5/m, z15.s\n"
      "st1h { z8.s }, p4, [x11]\n"
      "st1h { z9.s }, p3, [x11, #1, MUL VL]\n"
      "st1h { z10.s }, p2, [x11, #2, MUL VL]\n"
      "st1h { z11.s }, p1, [x11, #3, MUL VL]\n"
      "addvl x11, x11, #2\n"
      "st1h { z12.s }, p4, [x26]\n"
      "st1h { z13.s }, p3, [x26, #1, MUL VL]\n"
      "st1h { z14.s }, p2, [x26, #2, MUL VL]\n"
      "st1h { z15.s }, p1, [x26, #3, MUL VL]\n"
      "decw x13, ALL, MUL #4\n"
      "cmp x13, XZR\n"
      "bgt 16b\n"
      "b 86f\n"
      "29:"  // Height 3
      "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
      "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
      "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
      "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
      "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
      "30:"  // Height 3: Column loop
      "ldr x10, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
      "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
      "cnth x21\n"
      "add x9, x10, x20, LSL #1\n"
      "add x20, x9, x20, LSL #1\n"
      "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
      "cmp x13, x21\n"
      "bgt 31f\n"
      "mov x9, x10\n"
      "31:"  // Height 3: B setup done
      "mov x20, #0\n"
      "whilelt p4.s, x20, x13\n"
      "incw x20\n"
      "whilelt p3.s, x20, x13\n"
      "incw x20\n"
      "whilelt p2.s, x20, x13\n"
      "incw x20\n"
      "whilelt p1.s, x20, x13\n"
      "cbz x12, 32f\n"
      "ld1h { z8.s }, p4/Z, [x12]\n"
      "ld1h { z9.s }, p3/Z, [x12, #1, MUL VL]\n"
      "ld1h { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
      "ld1h { z11.s }, p1/Z, [x12, #3, MUL VL]\n"
      "addvl x12, x12, #2\n"
      "fcvt z8.s, p5/m, z8.h\n"
      "fcvt z9.s, p5/m, z9.h\n"
      "fcvt z10.s, p5/m, z10.h\n"
      "fcvt z11.s, p5/m, z11.h\n"
      "mov z12.d, z8.d\n"
      "mov z16.d, z8.d\n"
      "mov z13.d, z9.d\n"
      "mov z14.d, z10.d\n"
      "mov z17.d, z9.d\n"
      "mov z15.d, z11.d\n"
      "mov z18.d, z10.d\n"
      "mov z19.d, z11.d\n"
      "b 34f\n"
      "32:"  // Height 3: no bias
      "tbz %x[flags], #0, 33f\n"
      "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
      "ld1h { z8.s }, p4/Z, [x11]\n"
      "ld1h { z9.s }, p3/Z, [x11, #1, MUL VL]\n"
      "ld1h { z10.s }, p2/Z, [x11, #2, MUL VL]\n"
      "ld1h { z11.s }, p1/Z, [x11, #3, MUL VL]\n"
      "add x21, x11, x20, LSL #1\n"
      "add x20, x21, x20, LSL #1\n"
      "ld1h { z12.s }, p4/Z, [x21]\n"
      "ld1h { z13.s }, p3/Z, [x21, #1, MUL VL]\n"
      "ld1h { z14.s }, p2/Z, [x21, #2, MUL VL]\n"
      "fcvt z8.s, p5/m, z8.h\n"
      "ld1h { z15.s }, p1/Z, [x21, #3, MUL VL]\n"
      "ld1h { z16.s }, p4/Z, [x20]\n"
      "fcvt z9.s, p5/m, z9.h\n"
      "fcvt z10.s, p5/m, z10.h\n"
      "ld1h { z17.s }, p3/Z, [x20, #1, MUL VL]\n"
      "ld1h { z18.s }, p2/Z, [x20, #2, MUL VL]\n"
      "fcvt z11.s, p5/m, z11.h\n"
      "fcvt z12.s, p5/m, z12.h\n"
      "ld1h { z19.s }, p1/Z, [x20, #3, MUL VL]\n"
      "fcvt z13.s, p5/m, z13.h\n"
      "fcvt z14.s, p5/m, z14.h\n"
      "fcvt z15.s, p5/m, z15.h\n"
      "fcvt z16.s, p5/m, z16.h\n"
      "fcvt z17.s, p5/m, z17.h\n"
      "fcvt z18.s, p5/m, z18.h\n"
      "fcvt z19.s, p5/m, z19.h\n"
      "b 34f\n"
      "33:"  // Height 3: no accumulate
      "mov z8.b, #0\n"
      "mov z9.b, #0\n"
      "mov z10.b, #0\n"
      "mov z11.b, #0\n"
      "mov z12.b, #0\n"
      "mov z13.b, #0\n"
      "mov z14.b, #0\n"
      "mov z15.b, #0\n"
      "mov z16.b, #0\n"
      "mov z17.b, #0\n"
      "mov z18.b, #0\n"
      "mov z19.b, #0\n"
      "34:"  // Height 3: setup done
      "mov x28, #0\n"
      "35:"  // Height 3: String loop
      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
      "ldr w27, [x20, x28, LSL #0x2]\n"
      "tbz %x[flags], #3, 36f\n"
      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
      "add x20, x20, x21, LSL #3\n"
      "ldr x26, [x20, #0]\n"
      "ldr x25, [x20, #0x8]\n"
      "ldr x24, [x20, #0x10]\n"
      "cbnz x28, 37f\n"
      "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
      "add x26, x26, x20, LSL #1\n"
      "add x25, x25, x20, LSL #1\n"
      "add x24, x24, x20, LSL #1\n"
      "b 37f\n"
      "36:"  // Height 3: setup direct input
      "mov x26, %x[input_ptr]\n"
      "add x25, x26, x21, LSL #1\n"
      "add x24, x25, x21, LSL #1\n"
      "37:"  // Height 3: input setup done
      "cmp x27, #0x8\n"
      "ble 39f\n"
      "38:"  // Height 3: Multiply loop: Main loop head
      "whilelt p0.h, XZR, x27\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      "sub x27, x27, #0x8\n"
      "cmp x27, #0x8\n"
      "ld1rqh { z0.h }, p0/Z, [x26]\n"
      "add x26, x26, #0x10\n"
      "ld1rqh { z1.h }, p0/Z, [x25]\n"
      "add x25, x25, #0x10\n"
      "ld1rqh { z2.h }, p0/Z, [x24]\n"
      "add x24, x24, #0x10\n"
      ".inst 0x64a040c8  // fmlalb z8.s, z6.h, z0.h[0]\n"
      ".inst 0x64a040e9  // fmlalb z9.s, z7.h, z0.h[0]\n"
      ".inst 0x64a140cc  // fmlalb z12.s, z6.h, z1.h[0]\n"
      ".inst 0x64a240d0  // fmlalb z16.s, z6.h, z2.h[0]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a140ed  // fmlalb z13.s, z7.h, z1.h[0]\n"
      ".inst 0x64a240f1  // fmlalb z17.s, z7.h, z2.h[0]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a040ca  // fmlalb z10.s, z6.h, z0.h[0]\n"
      ".inst 0x64a140ce  // fmlalb z14.s, z6.h, z1.h[0]\n"
      ".inst 0x64a240d2  // fmlalb z18.s, z6.h, z2.h[0]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64a040eb  // fmlalb z11.s, z7.h, z0.h[0]\n"
      ".inst 0x64a140ef  // fmlalb z15.s, z7.h, z1.h[0]\n"
      ".inst 0x64a240f3  // fmlalb z19.s, z7.h, z2.h[0]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a048c8  // fmlalb z8.s, z6.h, z0.h[1]\n"
      ".inst 0x64a148cc  // fmlalb z12.s, z6.h, z1.h[1]\n"
      ".inst 0x64a248d0  // fmlalb z16.s, z6.h, z2.h[1]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a048e9  // fmlalb z9.s, z7.h, z0.h[1]\n"
      ".inst 0x64a148ed  // fmlalb z13.s, z7.h, z1.h[1]\n"
      ".inst 0x64a248f1  // fmlalb z17.s, z7.h, z2.h[1]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a048ca  // fmlalb z10.s, z6.h, z0.h[1]\n"
      ".inst 0x64a148ce  // fmlalb z14.s, z6.h, z1.h[1]\n"
      ".inst 0x64a248d2  // fmlalb z18.s, z6.h, z2.h[1]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64a048eb  // fmlalb z11.s, z7.h, z0.h[1]\n"
      ".inst 0x64a148ef  // fmlalb z15.s, z7.h, z1.h[1]\n"
      ".inst 0x64a248f3  // fmlalb z19.s, z7.h, z2.h[1]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a840c8  // fmlalb z8.s, z6.h, z0.h[2]\n"
      ".inst 0x64a940cc  // fmlalb z12.s, z6.h, z1.h[2]\n"
      ".inst 0x64aa40d0  // fmlalb z16.s, z6.h, z2.h[2]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a840e9  // fmlalb z9.s, z7.h, z0.h[2]\n"
      ".inst 0x64a940ed  // fmlalb z13.s, z7.h, z1.h[2]\n"
      ".inst 0x64aa40f1  // fmlalb z17.s, z7.h, z2.h[2]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a840ca  // fmlalb z10.s, z6.h, z0.h[2]\n"
      ".inst 0x64a940ce  // fmlalb z14.s, z6.h, z1.h[2]\n"
      ".inst 0x64aa40d2  // fmlalb z18.s, z6.h, z2.h[2]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64a840eb  // fmlalb z11.s, z7.h, z0.h[2]\n"
      ".inst 0x64a940ef  // fmlalb z15.s, z7.h, z1.h[2]\n"
      ".inst 0x64aa40f3  // fmlalb z19.s, z7.h, z2.h[2]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a848c8  // fmlalb z8.s, z6.h, z0.h[3]\n"
      ".inst 0x64a948cc  // fmlalb z12.s, z6.h, z1.h[3]\n"
      ".inst 0x64aa48d0  // fmlalb z16.s, z6.h, z2.h[3]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a848e9  // fmlalb z9.s, z7.h, z0.h[3]\n"
      ".inst 0x64a948ed  // fmlalb z13.s, z7.h, z1.h[3]\n"
      ".inst 0x64aa48f1  // fmlalb z17.s, z7.h, z2.h[3]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a848ca  // fmlalb z10.s, z6.h, z0.h[3]\n"
      ".inst 0x64a948ce  // fmlalb z14.s, z6.h, z1.h[3]\n"
      ".inst 0x64aa48d2  // fmlalb z18.s, z6.h, z2.h[3]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64a848eb  // fmlalb z11.s, z7.h, z0.h[3]\n"
      ".inst 0x64a948ef  // fmlalb z15.s, z7.h, z1.h[3]\n"
      ".inst 0x64aa48f3  // fmlalb z19.s, z7.h, z2.h[3]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b040c8  // fmlalb z8.s, z6.h, z0.h[4]\n"
      ".inst 0x64b140cc  // fmlalb z12.s, z6.h, z1.h[4]\n"
      ".inst 0x64b240d0  // fmlalb z16.s, z6.h, z2.h[4]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b040e9  // fmlalb z9.s, z7.h, z0.h[4]\n"
      ".inst 0x64b140ed  // fmlalb z13.s, z7.h, z1.h[4]\n"
      ".inst 0x64b240f1  // fmlalb z17.s, z7.h, z2.h[4]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b040ca  // fmlalb z10.s, z6.h, z0.h[4]\n"
      ".inst 0x64b140ce  // fmlalb z14.s, z6.h, z1.h[4]\n"
      ".inst 0x64b240d2  // fmlalb z18.s, z6.h, z2.h[4]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64b040eb  // fmlalb z11.s, z7.h, z0.h[4]\n"
      ".inst 0x64b140ef  // fmlalb z15.s, z7.h, z1.h[4]\n"
      ".inst 0x64b240f3  // fmlalb z19.s, z7.h, z2.h[4]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b048c8  // fmlalb z8.s, z6.h, z0.h[5]\n"
      ".inst 0x64b148cc  // fmlalb z12.s, z6.h, z1.h[5]\n"
      ".inst 0x64b248d0  // fmlalb z16.s, z6.h, z2.h[5]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b048e9  // fmlalb z9.s, z7.h, z0.h[5]\n"
      ".inst 0x64b148ed  // fmlalb z13.s, z7.h, z1.h[5]\n"
      ".inst 0x64b248f1  // fmlalb z17.s, z7.h, z2.h[5]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b048ca  // fmlalb z10.s, z6.h, z0.h[5]\n"
      ".inst 0x64b148ce  // fmlalb z14.s, z6.h, z1.h[5]\n"
      ".inst 0x64b248d2  // fmlalb z18.s, z6.h, z2.h[5]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64b048eb  // fmlalb z11.s, z7.h, z0.h[5]\n"
      ".inst 0x64b148ef  // fmlalb z15.s, z7.h, z1.h[5]\n"
      ".inst 0x64b248f3  // fmlalb z19.s, z7.h, z2.h[5]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b840c8  // fmlalb z8.s, z6.h, z0.h[6]\n"
      ".inst 0x64b940cc  // fmlalb z12.s, z6.h, z1.h[6]\n"
      ".inst 0x64ba40d0  // fmlalb z16.s, z6.h, z2.h[6]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b840e9  // fmlalb z9.s, z7.h, z0.h[6]\n"
      ".inst 0x64b940ed  // fmlalb z13.s, z7.h, z1.h[6]\n"
      ".inst 0x64ba40f1  // fmlalb z17.s, z7.h, z2.h[6]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b840ca  // fmlalb z10.s, z6.h, z0.h[6]\n"
      ".inst 0x64b940ce  // fmlalb z14.s, z6.h, z1.h[6]\n"
      ".inst 0x64ba40d2  // fmlalb z18.s, z6.h, z2.h[6]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64b840eb  // fmlalb z11.s, z7.h, z0.h[6]\n"
      ".inst 0x64b940ef  // fmlalb z15.s, z7.h, z1.h[6]\n"
      ".inst 0x64ba40f3  // fmlalb z19.s, z7.h, z2.h[6]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b848c8  // fmlalb z8.s, z6.h, z0.h[7]\n"
      ".inst 0x64b948cc  // fmlalb z12.s, z6.h, z1.h[7]\n"
      ".inst 0x64ba48d0  // fmlalb z16.s, z6.h, z2.h[7]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b848e9  // fmlalb z9.s, z7.h, z0.h[7]\n"
      ".inst 0x64b948ed  // fmlalb z13.s, z7.h, z1.h[7]\n"
      ".inst 0x64ba48f1  // fmlalb z17.s, z7.h, z2.h[7]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b848ca  // fmlalb z10.s, z6.h, z0.h[7]\n"
      ".inst 0x64b948ce  // fmlalb z14.s, z6.h, z1.h[7]\n"
      ".inst 0x64ba48d2  // fmlalb z18.s, z6.h, z2.h[7]\n"
      ".inst 0x64b848eb  // fmlalb z11.s, z7.h, z0.h[7]\n"
      ".inst 0x64b948ef  // fmlalb z15.s, z7.h, z1.h[7]\n"
      ".inst 0x64ba48f3  // fmlalb z19.s, z7.h, z2.h[7]\n"
      "bgt 38b\n"
      "39:"  // Height 3: Multiply loop: Single iteration only
      "whilelt p0.h, XZR, x27\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      "ld1rqh { z0.h }, p0/Z, [x26]\n"
      "ld1rqh { z1.h }, p0/Z, [x25]\n"
      "ld1rqh { z2.h }, p0/Z, [x24]\n"
      ".inst 0x64a040c8  // fmlalb z8.s, z6.h, z0.h[0]\n"
      ".inst 0x64a140cc  // fmlalb z12.s, z6.h, z1.h[0]\n"
      ".inst 0x64a040e9  // fmlalb z9.s, z7.h, z0.h[0]\n"
      ".inst 0x64a140ed  // fmlalb z13.s, z7.h, z1.h[0]\n"
      ".inst 0x64a240d0  // fmlalb z16.s, z6.h, z2.h[0]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a240f1  // fmlalb z17.s, z7.h, z2.h[0]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a040ca  // fmlalb z10.s, z6.h, z0.h[0]\n"
      ".inst 0x64a140ce  // fmlalb z14.s, z6.h, z1.h[0]\n"
      ".inst 0x64a240d2  // fmlalb z18.s, z6.h, z2.h[0]\n"
      ".inst 0x64a040eb  // fmlalb z11.s, z7.h, z0.h[0]\n"
      ".inst 0x64a140ef  // fmlalb z15.s, z7.h, z1.h[0]\n"
      ".inst 0x64a240f3  // fmlalb z19.s, z7.h, z2.h[0]\n"
      "ble 40f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a048c8  // fmlalb z8.s, z6.h, z0.h[1]\n"
      ".inst 0x64a148cc  // fmlalb z12.s, z6.h, z1.h[1]\n"
      ".inst 0x64a248d0  // fmlalb z16.s, z6.h, z2.h[1]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a048e9  // fmlalb z9.s, z7.h, z0.h[1]\n"
      ".inst 0x64a148ed  // fmlalb z13.s, z7.h, z1.h[1]\n"
      ".inst 0x64a248f1  // fmlalb z17.s, z7.h, z2.h[1]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a048ca  // fmlalb z10.s, z6.h, z0.h[1]\n"
      ".inst 0x64a148ce  // fmlalb z14.s, z6.h, z1.h[1]\n"
      ".inst 0x64a248d2  // fmlalb z18.s, z6.h, z2.h[1]\n"
      ".inst 0x64a048eb  // fmlalb z11.s, z7.h, z0.h[1]\n"
      ".inst 0x64a148ef  // fmlalb z15.s, z7.h, z1.h[1]\n"
      ".inst 0x64a248f3  // fmlalb z19.s, z7.h, z2.h[1]\n"
      "ble 40f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a840c8  // fmlalb z8.s, z6.h, z0.h[2]\n"
      ".inst 0x64a940cc  // fmlalb z12.s, z6.h, z1.h[2]\n"
      ".inst 0x64aa40d0  // fmlalb z16.s, z6.h, z2.h[2]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a840e9  // fmlalb z9.s, z7.h, z0.h[2]\n"
      ".inst 0x64a940ed  // fmlalb z13.s, z7.h, z1.h[2]\n"
      ".inst 0x64aa40f1  // fmlalb z17.s, z7.h, z2.h[2]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a840ca  // fmlalb z10.s, z6.h, z0.h[2]\n"
      ".inst 0x64a940ce  // fmlalb z14.s, z6.h, z1.h[2]\n"
      ".inst 0x64aa40d2  // fmlalb z18.s, z6.h, z2.h[2]\n"
      ".inst 0x64a840eb  // fmlalb z11.s, z7.h, z0.h[2]\n"
      ".inst 0x64a940ef  // fmlalb z15.s, z7.h, z1.h[2]\n"
      ".inst 0x64aa40f3  // fmlalb z19.s, z7.h, z2.h[2]\n"
      "ble 40f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a848c8  // fmlalb z8.s, z6.h, z0.h[3]\n"
      ".inst 0x64a948cc  // fmlalb z12.s, z6.h, z1.h[3]\n"
      ".inst 0x64aa48d0  // fmlalb z16.s, z6.h, z2.h[3]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a848e9  // fmlalb z9.s, z7.h, z0.h[3]\n"
      ".inst 0x64a948ed  // fmlalb z13.s, z7.h, z1.h[3]\n"
      ".inst 0x64aa48f1  // fmlalb z17.s, z7.h, z2.h[3]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a848ca  // fmlalb z10.s, z6.h, z0.h[3]\n"
      ".inst 0x64a948ce  // fmlalb z14.s, z6.h, z1.h[3]\n"
      ".inst 0x64aa48d2  // fmlalb z18.s, z6.h, z2.h[3]\n"
      ".inst 0x64a848eb  // fmlalb z11.s, z7.h, z0.h[3]\n"
      ".inst 0x64a948ef  // fmlalb z15.s, z7.h, z1.h[3]\n"
      ".inst 0x64aa48f3  // fmlalb z19.s, z7.h, z2.h[3]\n"
      "ble 40f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b040c8  // fmlalb z8.s, z6.h, z0.h[4]\n"
      ".inst 0x64b140cc  // fmlalb z12.s, z6.h, z1.h[4]\n"
      ".inst 0x64b240d0  // fmlalb z16.s, z6.h, z2.h[4]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b040e9  // fmlalb z9.s, z7.h, z0.h[4]\n"
      ".inst 0x64b140ed  // fmlalb z13.s, z7.h, z1.h[4]\n"
      ".inst 0x64b240f1  // fmlalb z17.s, z7.h, z2.h[4]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b040ca  // fmlalb z10.s, z6.h, z0.h[4]\n"
      ".inst 0x64b140ce  // fmlalb z14.s, z6.h, z1.h[4]\n"
      ".inst 0x64b240d2  // fmlalb z18.s, z6.h, z2.h[4]\n"
      ".inst 0x64b040eb  // fmlalb z11.s, z7.h, z0.h[4]\n"
      ".inst 0x64b140ef  // fmlalb z15.s, z7.h, z1.h[4]\n"
      ".inst 0x64b240f3  // fmlalb z19.s, z7.h, z2.h[4]\n"
      "ble 40f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b048c8  // fmlalb z8.s, z6.h, z0.h[5]\n"
      ".inst 0x64b148cc  // fmlalb z12.s, z6.h, z1.h[5]\n"
      ".inst 0x64b248d0  // fmlalb z16.s, z6.h, z2.h[5]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b048e9  // fmlalb z9.s, z7.h, z0.h[5]\n"
      ".inst 0x64b148ed  // fmlalb z13.s, z7.h, z1.h[5]\n"
      ".inst 0x64b248f1  // fmlalb z17.s, z7.h, z2.h[5]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b048ca  // fmlalb z10.s, z6.h, z0.h[5]\n"
      ".inst 0x64b148ce  // fmlalb z14.s, z6.h, z1.h[5]\n"
      ".inst 0x64b248d2  // fmlalb z18.s, z6.h, z2.h[5]\n"
      ".inst 0x64b048eb  // fmlalb z11.s, z7.h, z0.h[5]\n"
      ".inst 0x64b148ef  // fmlalb z15.s, z7.h, z1.h[5]\n"
      ".inst 0x64b248f3  // fmlalb z19.s, z7.h, z2.h[5]\n"
      "ble 40f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b840c8  // fmlalb z8.s, z6.h, z0.h[6]\n"
      ".inst 0x64b940cc  // fmlalb z12.s, z6.h, z1.h[6]\n"
      ".inst 0x64ba40d0  // fmlalb z16.s, z6.h, z2.h[6]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b840e9  // fmlalb z9.s, z7.h, z0.h[6]\n"
      ".inst 0x64b940ed  // fmlalb z13.s, z7.h, z1.h[6]\n"
      ".inst 0x64ba40f1  // fmlalb z17.s, z7.h, z2.h[6]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b840ca  // fmlalb z10.s, z6.h, z0.h[6]\n"
      ".inst 0x64b940ce  // fmlalb z14.s, z6.h, z1.h[6]\n"
      ".inst 0x64ba40d2  // fmlalb z18.s, z6.h, z2.h[6]\n"
      ".inst 0x64b840eb  // fmlalb z11.s, z7.h, z0.h[6]\n"
      ".inst 0x64b940ef  // fmlalb z15.s, z7.h, z1.h[6]\n"
      ".inst 0x64ba40f3  // fmlalb z19.s, z7.h, z2.h[6]\n"
      "ble 40f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b848c8  // fmlalb z8.s, z6.h, z0.h[7]\n"
      ".inst 0x64b948cc  // fmlalb z12.s, z6.h, z1.h[7]\n"
      ".inst 0x64ba48d0  // fmlalb z16.s, z6.h, z2.h[7]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b848e9  // fmlalb z9.s, z7.h, z0.h[7]\n"
      ".inst 0x64b948ed  // fmlalb z13.s, z7.h, z1.h[7]\n"
      ".inst 0x64ba48f1  // fmlalb z17.s, z7.h, z2.h[7]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b848ca  // fmlalb z10.s, z6.h, z0.h[7]\n"
      ".inst 0x64b948ce  // fmlalb z14.s, z6.h, z1.h[7]\n"
      ".inst 0x64ba48d2  // fmlalb z18.s, z6.h, z2.h[7]\n"
      ".inst 0x64b848eb  // fmlalb z11.s, z7.h, z0.h[7]\n"
      ".inst 0x64b948ef  // fmlalb z15.s, z7.h, z1.h[7]\n"
      ".inst 0x64ba48f3  // fmlalb z19.s, z7.h, z2.h[7]\n"
      "40:"  // Height 3: Multiply loop: multiply skip
      "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
      "add x28, x28, #0x1\n"
      "cmp x28, x20\n"
      "bne 35b\n"
      "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
      "add x26, x11, x20, LSL #1\n"
      "add x25, x26, x20, LSL #1\n"
      "tbz %x[flags], #1, 41f\n"
      "add x21, %x[args_ptr], %[offsetof_maxval]\n"
      "add x20, %x[args_ptr], %[offsetof_minval]\n"
      "ld1rw { z21.s }, p5/Z, [x21]\n"
      "ld1rw { z20.s }, p5/Z, [x20]\n"
      "fmin z8.s, p5/M, z8.s, z21.s\n"
      "fmin z9.s, p5/M, z9.s, z21.s\n"
      "fmin z10.s, p5/M, z10.s, z21.s\n"
      "fmin z11.s, p5/M, z11.s, z21.s\n"
      "fmin z12.s, p5/M, z12.s, z21.s\n"
      "fmin z13.s, p5/M, z13.s, z21.s\n"
      "fmin z14.s, p5/M, z14.s, z21.s\n"
      "fmin z15.s, p5/M, z15.s, z21.s\n"
      "fmin z16.s, p5/M, z16.s, z21.s\n"
      "fmin z17.s, p5/M, z17.s, z21.s\n"
      "fmin z18.s, p5/M, z18.s, z21.s\n"
      "fmin z19.s, p5/M, z19.s, z21.s\n"
      "fmax z8.s, p5/M, z8.s, z20.s\n"
      "fmax z9.s, p5/M, z9.s, z20.s\n"
      "fmax z10.s, p5/M, z10.s, z20.s\n"
      "fmax z11.s, p5/M, z11.s, z20.s\n"
      "fmax z12.s, p5/M, z12.s, z20.s\n"
      "fmax z13.s, p5/M, z13.s, z20.s\n"
      "fmax z14.s, p5/M, z14.s, z20.s\n"
      "fmax z15.s, p5/M, z15.s, z20.s\n"
      "fmax z16.s, p5/M, z16.s, z20.s\n"
      "fmax z17.s, p5/M, z17.s, z20.s\n"
      "fmax z18.s, p5/M, z18.s, z20.s\n"
      "fmax z19.s, p5/M, z19.s, z20.s\n"
      "41:"  // Height 3: No activation
      "fcvt z8.h, p5/m, z8.s\n"
      "fcvt z9.h, p5/m, z9.s\n"
      "fcvt z10.h, p5/m, z10.s\n"
      "fcvt z11.h, p5/m, z11.s\n"
      "fcvt z12.h, p5/m, z12.s\n"
      "fcvt z13.h, p5/m, z13.s\n"
      "fcvt z14.h, p5/m, z14.s\n"
      "fcvt z15.h, p5/m, z15.s\n"
      "st1h { z8.s }, p4, [x11]\n"
      "fcvt z16.h, p5/m, z16.s\n"
      "fcvt z17.h, p5/m, z17.s\n"
      "st1h { z9.s }, p3, [x11, #1, MUL VL]\n"
      "fcvt z18.h, p5/m, z18.s\n"
      "fcvt z19.h, p5/m, z19.s\n"
      "st1h { z10.s }, p2, [x11, #2, MUL VL]\n"
      "st1h { z11.s }, p1, [x11, #3, MUL VL]\n"
      "addvl x11, x11, #2\n"
      "st1h { z12.s }, p4, [x26]\n"
      "st1h { z13.s }, p3, [x26, #1, MUL VL]\n"
      "st1h { z14.s }, p2, [x26, #2, MUL VL]\n"
      "st1h { z15.s }, p1, [x26, #3, MUL VL]\n"
      "st1h { z16.s }, p4, [x25]\n"
      "st1h { z17.s }, p3, [x25, #1, MUL VL]\n"
      "st1h { z18.s }, p2, [x25, #2, MUL VL]\n"
      "st1h { z19.s }, p1, [x25, #3, MUL VL]\n"
      "decw x13, ALL, MUL #4\n"
      "cmp x13, XZR\n"
      "bgt 30b\n"
      "b 86f\n"
      "43:"  // Height 4
      "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
      "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
      "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
      "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
      "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
      "44:"  // Height 4: Column loop
      "ldr x10, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
      "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
      "cnth x21\n"
      "add x9, x10, x20, LSL #1\n"
      "add x20, x9, x20, LSL #1\n"
      "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
      "cmp x13, x21\n"
      "bgt 45f\n"
      "mov x9, x10\n"
      "45:"  // Height 4: B setup done
      "mov x20, #0\n"
      "whilelt p4.s, x20, x13\n"
      "incw x20\n"
      "whilelt p3.s, x20, x13\n"
      "incw x20\n"
      "whilelt p2.s, x20, x13\n"
      "incw x20\n"
      "whilelt p1.s, x20, x13\n"
      "cbz x12, 46f\n"
      "ld1h { z8.s }, p4/Z, [x12]\n"
      "ld1h { z9.s }, p3/Z, [x12, #1, MUL VL]\n"
      "ld1h { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
      "ld1h { z11.s }, p1/Z, [x12, #3, MUL VL]\n"
      "addvl x12, x12, #2\n"
      "fcvt z8.s, p5/m, z8.h\n"
      "fcvt z9.s, p5/m, z9.h\n"
      "fcvt z10.s, p5/m, z10.h\n"
      "fcvt z11.s, p5/m, z11.h\n"
      "mov z12.d, z8.d\n"
      "mov z16.d, z8.d\n"
      "mov z20.d, z8.d\n"
      "mov z13.d, z9.d\n"
      "mov z14.d, z10.d\n"
      "mov z15.d, z11.d\n"
      "mov z17.d, z9.d\n"
      "mov z18.d, z10.d\n"
      "mov z19.d, z11.d\n"
      "mov z21.d, z9.d\n"
      "mov z22.d, z10.d\n"
      "mov z23.d, z11.d\n"
      "b 48f\n"
      "46:"  // Height 4: no bias
      "tbz %x[flags], #0, 47f\n"
      "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
      "ld1h { z8.s }, p4/Z, [x11]\n"
      "ld1h { z9.s }, p3/Z, [x11, #1, MUL VL]\n"
      "ld1h { z10.s }, p2/Z, [x11, #2, MUL VL]\n"
      "ld1h { z11.s }, p1/Z, [x11, #3, MUL VL]\n"
      "add x22, x11, x20, LSL #1\n"
      "add x21, x22, x20, LSL #1\n"
      "add x20, x21, x20, LSL #1\n"
      "ld1h { z12.s }, p4/Z, [x22]\n"
      "ld1h { z13.s }, p3/Z, [x22, #1, MUL VL]\n"
      "fcvt z8.s, p5/m, z8.h\n"
      "ld1h { z14.s }, p2/Z, [x22, #2, MUL VL]\n"
      "ld1h { z15.s }, p1/Z, [x22, #3, MUL VL]\n"
      "fcvt z9.s, p5/m, z9.h\n"
      "fcvt z10.s, p5/m, z10.h\n"
      "ld1h { z16.s }, p4/Z, [x21]\n"
      "ld1h { z17.s }, p3/Z, [x21, #1, MUL VL]\n"
      "fcvt z11.s, p5/m, z11.h\n"
      "ld1h { z18.s }, p2/Z, [x21, #2, MUL VL]\n"
      "ld1h { z19.s }, p1/Z, [x21, #3, MUL VL]\n"
      "fcvt z12.s, p5/m, z12.h\n"
      "fcvt z13.s, p5/m, z13.h\n"
      "ld1h { z20.s }, p4/Z, [x20]\n"
      "ld1h { z21.s }, p3/Z, [x20, #1, MUL VL]\n"
      "fcvt z14.s, p5/m, z14.h\n"
      "fcvt z15.s, p5/m, z15.h\n"
      "ld1h { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
      "ld1h { z23.s }, p1/Z, [x20, #3, MUL VL]\n"
      "fcvt z16.s, p5/m, z16.h\n"
      "fcvt z17.s, p5/m, z17.h\n"
      "fcvt z18.s, p5/m, z18.h\n"
      "fcvt z19.s, p5/m, z19.h\n"
      "fcvt z20.s, p5/m, z20.h\n"
      "fcvt z21.s, p5/m, z21.h\n"
      "fcvt z22.s, p5/m, z22.h\n"
      "fcvt z23.s, p5/m, z23.h\n"
      "b 48f\n"
      "47:"  // Height 4: no accumulate
      "mov z8.b, #0\n"
      "mov z9.b, #0\n"
      "mov z10.b, #0\n"
      "mov z11.b, #0\n"
      "mov z12.b, #0\n"
      "mov z13.b, #0\n"
      "mov z14.b, #0\n"
      "mov z15.b, #0\n"
      "mov z16.b, #0\n"
      "mov z17.b, #0\n"
      "mov z18.b, #0\n"
      "mov z19.b, #0\n"
      "mov z20.b, #0\n"
      "mov z21.b, #0\n"
      "mov z22.b, #0\n"
      "mov z23.b, #0\n"
      "48:"  // Height 4: setup done
      "mov x28, #0\n"
      "49:"  // Height 4: String loop
      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
      "ldr w27, [x20, x28, LSL #0x2]\n"
      "tbz %x[flags], #3, 50f\n"
      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
      "add x20, x20, x21, LSL #3\n"
      "ldr x26, [x20, #0]\n"
      "ldr x25, [x20, #0x8]\n"
      "ldr x24, [x20, #0x10]\n"
      "ldr x23, [x20, #0x18]\n"
      "cbnz x28, 51f\n"
      "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
      "add x26, x26, x20, LSL #1\n"
      "add x25, x25, x20, LSL #1\n"
      "add x24, x24, x20, LSL #1\n"
      "add x23, x23, x20, LSL #1\n"
      "b 51f\n"
      "50:"  // Height 4: setup direct input
      "mov x26, %x[input_ptr]\n"
      "add x25, x26, x21, LSL #1\n"
      "add x24, x25, x21, LSL #1\n"
      "add x23, x24, x21, LSL #1\n"
      "51:"  // Height 4: input setup done
      "cmp x27, #0x8\n"
      "ble 53f\n"
      "52:"  // Height 4: Multiply loop: Main loop head
      "whilelt p0.h, XZR, x27\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      "sub x27, x27, #0x8\n"
      "cmp x27, #0x8\n"
      "ld1rqh { z0.h }, p0/Z, [x26]\n"
      "add x26, x26, #0x10\n"
      "ld1rqh { z1.h }, p0/Z, [x25]\n"
      "add x25, x25, #0x10\n"
      "ld1rqh { z2.h }, p0/Z, [x24]\n"
      "ld1rqh { z3.h }, p0/Z, [x23]\n"
      "add x24, x24, #0x10\n"
      "add x23, x23, #0x10\n"
      ".inst 0x64a040c8  // fmlalb z8.s, z6.h, z0.h[0]\n"
      ".inst 0x64a040e9  // fmlalb z9.s, z7.h, z0.h[0]\n"
      ".inst 0x64a140cc  // fmlalb z12.s, z6.h, z1.h[0]\n"
      ".inst 0x64a240d0  // fmlalb z16.s, z6.h, z2.h[0]\n"
      ".inst 0x64a140ed  // fmlalb z13.s, z7.h, z1.h[0]\n"
      ".inst 0x64a340d4  // fmlalb z20.s, z6.h, z3.h[0]\n"
      ".inst 0x64a240f1  // fmlalb z17.s, z7.h, z2.h[0]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a340f5  // fmlalb z21.s, z7.h, z3.h[0]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a040ca  // fmlalb z10.s, z6.h, z0.h[0]\n"
      ".inst 0x64a140ce  // fmlalb z14.s, z6.h, z1.h[0]\n"
      ".inst 0x64a240d2  // fmlalb z18.s, z6.h, z2.h[0]\n"
      ".inst 0x64a340d6  // fmlalb z22.s, z6.h, z3.h[0]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64a040eb  // fmlalb z11.s, z7.h, z0.h[0]\n"
      ".inst 0x64a140ef  // fmlalb z15.s, z7.h, z1.h[0]\n"
      ".inst 0x64a240f3  // fmlalb z19.s, z7.h, z2.h[0]\n"
      ".inst 0x64a340f7  // fmlalb z23.s, z7.h, z3.h[0]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a048c8  // fmlalb z8.s, z6.h, z0.h[1]\n"
      ".inst 0x64a148cc  // fmlalb z12.s, z6.h, z1.h[1]\n"
      ".inst 0x64a248d0  // fmlalb z16.s, z6.h, z2.h[1]\n"
      ".inst 0x64a348d4  // fmlalb z20.s, z6.h, z3.h[1]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a048e9  // fmlalb z9.s, z7.h, z0.h[1]\n"
      ".inst 0x64a148ed  // fmlalb z13.s, z7.h, z1.h[1]\n"
      ".inst 0x64a248f1  // fmlalb z17.s, z7.h, z2.h[1]\n"
      ".inst 0x64a348f5  // fmlalb z21.s, z7.h, z3.h[1]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a048ca  // fmlalb z10.s, z6.h, z0.h[1]\n"
      ".inst 0x64a148ce  // fmlalb z14.s, z6.h, z1.h[1]\n"
      ".inst 0x64a248d2  // fmlalb z18.s, z6.h, z2.h[1]\n"
      ".inst 0x64a348d6  // fmlalb z22.s, z6.h, z3.h[1]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64a048eb  // fmlalb z11.s, z7.h, z0.h[1]\n"
      ".inst 0x64a148ef  // fmlalb z15.s, z7.h, z1.h[1]\n"
      ".inst 0x64a248f3  // fmlalb z19.s, z7.h, z2.h[1]\n"
      ".inst 0x64a348f7  // fmlalb z23.s, z7.h, z3.h[1]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a840c8  // fmlalb z8.s, z6.h, z0.h[2]\n"
      ".inst 0x64a940cc  // fmlalb z12.s, z6.h, z1.h[2]\n"
      ".inst 0x64aa40d0  // fmlalb z16.s, z6.h, z2.h[2]\n"
      ".inst 0x64ab40d4  // fmlalb z20.s, z6.h, z3.h[2]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a840e9  // fmlalb z9.s, z7.h, z0.h[2]\n"
      ".inst 0x64a940ed  // fmlalb z13.s, z7.h, z1.h[2]\n"
      ".inst 0x64aa40f1  // fmlalb z17.s, z7.h, z2.h[2]\n"
      ".inst 0x64ab40f5  // fmlalb z21.s, z7.h, z3.h[2]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a840ca  // fmlalb z10.s, z6.h, z0.h[2]\n"
      ".inst 0x64a940ce  // fmlalb z14.s, z6.h, z1.h[2]\n"
      ".inst 0x64aa40d2  // fmlalb z18.s, z6.h, z2.h[2]\n"
      ".inst 0x64ab40d6  // fmlalb z22.s, z6.h, z3.h[2]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64a840eb  // fmlalb z11.s, z7.h, z0.h[2]\n"
      ".inst 0x64a940ef  // fmlalb z15.s, z7.h, z1.h[2]\n"
      ".inst 0x64aa40f3  // fmlalb z19.s, z7.h, z2.h[2]\n"
      ".inst 0x64ab40f7  // fmlalb z23.s, z7.h, z3.h[2]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a848c8  // fmlalb z8.s, z6.h, z0.h[3]\n"
      ".inst 0x64a948cc  // fmlalb z12.s, z6.h, z1.h[3]\n"
      ".inst 0x64aa48d0  // fmlalb z16.s, z6.h, z2.h[3]\n"
      ".inst 0x64ab48d4  // fmlalb z20.s, z6.h, z3.h[3]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a848e9  // fmlalb z9.s, z7.h, z0.h[3]\n"
      ".inst 0x64a948ed  // fmlalb z13.s, z7.h, z1.h[3]\n"
      ".inst 0x64aa48f1  // fmlalb z17.s, z7.h, z2.h[3]\n"
      ".inst 0x64ab48f5  // fmlalb z21.s, z7.h, z3.h[3]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a848ca  // fmlalb z10.s, z6.h, z0.h[3]\n"
      ".inst 0x64a948ce  // fmlalb z14.s, z6.h, z1.h[3]\n"
      ".inst 0x64aa48d2  // fmlalb z18.s, z6.h, z2.h[3]\n"
      ".inst 0x64ab48d6  // fmlalb z22.s, z6.h, z3.h[3]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64a848eb  // fmlalb z11.s, z7.h, z0.h[3]\n"
      ".inst 0x64a948ef  // fmlalb z15.s, z7.h, z1.h[3]\n"
      ".inst 0x64aa48f3  // fmlalb z19.s, z7.h, z2.h[3]\n"
      ".inst 0x64ab48f7  // fmlalb z23.s, z7.h, z3.h[3]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b040c8  // fmlalb z8.s, z6.h, z0.h[4]\n"
      ".inst 0x64b140cc  // fmlalb z12.s, z6.h, z1.h[4]\n"
      ".inst 0x64b240d0  // fmlalb z16.s, z6.h, z2.h[4]\n"
      ".inst 0x64b340d4  // fmlalb z20.s, z6.h, z3.h[4]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b040e9  // fmlalb z9.s, z7.h, z0.h[4]\n"
      ".inst 0x64b140ed  // fmlalb z13.s, z7.h, z1.h[4]\n"
      ".inst 0x64b240f1  // fmlalb z17.s, z7.h, z2.h[4]\n"
      ".inst 0x64b340f5  // fmlalb z21.s, z7.h, z3.h[4]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b040ca  // fmlalb z10.s, z6.h, z0.h[4]\n"
      ".inst 0x64b140ce  // fmlalb z14.s, z6.h, z1.h[4]\n"
      ".inst 0x64b240d2  // fmlalb z18.s, z6.h, z2.h[4]\n"
      ".inst 0x64b340d6  // fmlalb z22.s, z6.h, z3.h[4]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64b040eb  // fmlalb z11.s, z7.h, z0.h[4]\n"
      ".inst 0x64b140ef  // fmlalb z15.s, z7.h, z1.h[4]\n"
      ".inst 0x64b240f3  // fmlalb z19.s, z7.h, z2.h[4]\n"
      ".inst 0x64b340f7  // fmlalb z23.s, z7.h, z3.h[4]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b048c8  // fmlalb z8.s, z6.h, z0.h[5]\n"
      ".inst 0x64b148cc  // fmlalb z12.s, z6.h, z1.h[5]\n"
      ".inst 0x64b248d0  // fmlalb z16.s, z6.h, z2.h[5]\n"
      ".inst 0x64b348d4  // fmlalb z20.s, z6.h, z3.h[5]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b048e9  // fmlalb z9.s, z7.h, z0.h[5]\n"
      ".inst 0x64b148ed  // fmlalb z13.s, z7.h, z1.h[5]\n"
      ".inst 0x64b248f1  // fmlalb z17.s, z7.h, z2.h[5]\n"
      ".inst 0x64b348f5  // fmlalb z21.s, z7.h, z3.h[5]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b048ca  // fmlalb z10.s, z6.h, z0.h[5]\n"
      ".inst 0x64b148ce  // fmlalb z14.s, z6.h, z1.h[5]\n"
      ".inst 0x64b248d2  // fmlalb z18.s, z6.h, z2.h[5]\n"
      ".inst 0x64b348d6  // fmlalb z22.s, z6.h, z3.h[5]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64b048eb  // fmlalb z11.s, z7.h, z0.h[5]\n"
      ".inst 0x64b148ef  // fmlalb z15.s, z7.h, z1.h[5]\n"
      ".inst 0x64b248f3  // fmlalb z19.s, z7.h, z2.h[5]\n"
      ".inst 0x64b348f7  // fmlalb z23.s, z7.h, z3.h[5]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b840c8  // fmlalb z8.s, z6.h, z0.h[6]\n"
      ".inst 0x64b940cc  // fmlalb z12.s, z6.h, z1.h[6]\n"
      ".inst 0x64ba40d0  // fmlalb z16.s, z6.h, z2.h[6]\n"
      ".inst 0x64bb40d4  // fmlalb z20.s, z6.h, z3.h[6]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b840e9  // fmlalb z9.s, z7.h, z0.h[6]\n"
      ".inst 0x64b940ed  // fmlalb z13.s, z7.h, z1.h[6]\n"
      ".inst 0x64ba40f1  // fmlalb z17.s, z7.h, z2.h[6]\n"
      ".inst 0x64bb40f5  // fmlalb z21.s, z7.h, z3.h[6]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b840ca  // fmlalb z10.s, z6.h, z0.h[6]\n"
      ".inst 0x64b940ce  // fmlalb z14.s, z6.h, z1.h[6]\n"
      ".inst 0x64ba40d2  // fmlalb z18.s, z6.h, z2.h[6]\n"
      ".inst 0x64bb40d6  // fmlalb z22.s, z6.h, z3.h[6]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64b840eb  // fmlalb z11.s, z7.h, z0.h[6]\n"
      ".inst 0x64b940ef  // fmlalb z15.s, z7.h, z1.h[6]\n"
      ".inst 0x64ba40f3  // fmlalb z19.s, z7.h, z2.h[6]\n"
      ".inst 0x64bb40f7  // fmlalb z23.s, z7.h, z3.h[6]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b848c8  // fmlalb z8.s, z6.h, z0.h[7]\n"
      ".inst 0x64b948cc  // fmlalb z12.s, z6.h, z1.h[7]\n"
      ".inst 0x64ba48d0  // fmlalb z16.s, z6.h, z2.h[7]\n"
      ".inst 0x64bb48d4  // fmlalb z20.s, z6.h, z3.h[7]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b848e9  // fmlalb z9.s, z7.h, z0.h[7]\n"
      ".inst 0x64b948ed  // fmlalb z13.s, z7.h, z1.h[7]\n"
      ".inst 0x64ba48f1  // fmlalb z17.s, z7.h, z2.h[7]\n"
      ".inst 0x64bb48f5  // fmlalb z21.s, z7.h, z3.h[7]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b848ca  // fmlalb z10.s, z6.h, z0.h[7]\n"
      ".inst 0x64b948ce  // fmlalb z14.s, z6.h, z1.h[7]\n"
      ".inst 0x64ba48d2  // fmlalb z18.s, z6.h, z2.h[7]\n"
      ".inst 0x64bb48d6  // fmlalb z22.s, z6.h, z3.h[7]\n"
      ".inst 0x64b848eb  // fmlalb z11.s, z7.h, z0.h[7]\n"
      ".inst 0x64b948ef  // fmlalb z15.s, z7.h, z1.h[7]\n"
      ".inst 0x64ba48f3  // fmlalb z19.s, z7.h, z2.h[7]\n"
      ".inst 0x64bb48f7  // fmlalb z23.s, z7.h, z3.h[7]\n"
      "bgt 52b\n"
      "53:"  // Height 4: Multiply loop: Single iteration only
      "whilelt p0.h, XZR, x27\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      "ld1rqh { z0.h }, p0/Z, [x26]\n"
      "ld1rqh { z1.h }, p0/Z, [x25]\n"
      "ld1rqh { z2.h }, p0/Z, [x24]\n"
      "ld1rqh { z3.h }, p0/Z, [x23]\n"
      ".inst 0x64a040c8  // fmlalb z8.s, z6.h, z0.h[0]\n"
      ".inst 0x64a140cc  // fmlalb z12.s, z6.h, z1.h[0]\n"
      ".inst 0x64a040e9  // fmlalb z9.s, z7.h, z0.h[0]\n"
      ".inst 0x64a140ed  // fmlalb z13.s, z7.h, z1.h[0]\n"
      ".inst 0x64a240d0  // fmlalb z16.s, z6.h, z2.h[0]\n"
      ".inst 0x64a340d4  // fmlalb z20.s, z6.h, z3.h[0]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a240f1  // fmlalb z17.s, z7.h, z2.h[0]\n"
      ".inst 0x64a340f5  // fmlalb z21.s, z7.h, z3.h[0]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a040ca  // fmlalb z10.s, z6.h, z0.h[0]\n"
      ".inst 0x64a140ce  // fmlalb z14.s, z6.h, z1.h[0]\n"
      ".inst 0x64a240d2  // fmlalb z18.s, z6.h, z2.h[0]\n"
      ".inst 0x64a340d6  // fmlalb z22.s, z6.h, z3.h[0]\n"
      ".inst 0x64a040eb  // fmlalb z11.s, z7.h, z0.h[0]\n"
      ".inst 0x64a140ef  // fmlalb z15.s, z7.h, z1.h[0]\n"
      ".inst 0x64a240f3  // fmlalb z19.s, z7.h, z2.h[0]\n"
      ".inst 0x64a340f7  // fmlalb z23.s, z7.h, z3.h[0]\n"
      "ble 54f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a048c8  // fmlalb z8.s, z6.h, z0.h[1]\n"
      ".inst 0x64a148cc  // fmlalb z12.s, z6.h, z1.h[1]\n"
      ".inst 0x64a248d0  // fmlalb z16.s, z6.h, z2.h[1]\n"
      ".inst 0x64a348d4  // fmlalb z20.s, z6.h, z3.h[1]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a048e9  // fmlalb z9.s, z7.h, z0.h[1]\n"
      ".inst 0x64a148ed  // fmlalb z13.s, z7.h, z1.h[1]\n"
      ".inst 0x64a248f1  // fmlalb z17.s, z7.h, z2.h[1]\n"
      ".inst 0x64a348f5  // fmlalb z21.s, z7.h, z3.h[1]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a048ca  // fmlalb z10.s, z6.h, z0.h[1]\n"
      ".inst 0x64a148ce  // fmlalb z14.s, z6.h, z1.h[1]\n"
      ".inst 0x64a248d2  // fmlalb z18.s, z6.h, z2.h[1]\n"
      ".inst 0x64a348d6  // fmlalb z22.s, z6.h, z3.h[1]\n"
      ".inst 0x64a048eb  // fmlalb z11.s, z7.h, z0.h[1]\n"
      ".inst 0x64a148ef  // fmlalb z15.s, z7.h, z1.h[1]\n"
      ".inst 0x64a248f3  // fmlalb z19.s, z7.h, z2.h[1]\n"
      ".inst 0x64a348f7  // fmlalb z23.s, z7.h, z3.h[1]\n"
      "ble 54f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a840c8  // fmlalb z8.s, z6.h, z0.h[2]\n"
      ".inst 0x64a940cc  // fmlalb z12.s, z6.h, z1.h[2]\n"
      ".inst 0x64aa40d0  // fmlalb z16.s, z6.h, z2.h[2]\n"
      ".inst 0x64ab40d4  // fmlalb z20.s, z6.h, z3.h[2]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a840e9  // fmlalb z9.s, z7.h, z0.h[2]\n"
      ".inst 0x64a940ed  // fmlalb z13.s, z7.h, z1.h[2]\n"
      ".inst 0x64aa40f1  // fmlalb z17.s, z7.h, z2.h[2]\n"
      ".inst 0x64ab40f5  // fmlalb z21.s, z7.h, z3.h[2]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a840ca  // fmlalb z10.s, z6.h, z0.h[2]\n"
      ".inst 0x64a940ce  // fmlalb z14.s, z6.h, z1.h[2]\n"
      ".inst 0x64aa40d2  // fmlalb z18.s, z6.h, z2.h[2]\n"
      ".inst 0x64ab40d6  // fmlalb z22.s, z6.h, z3.h[2]\n"
      ".inst 0x64a840eb  // fmlalb z11.s, z7.h, z0.h[2]\n"
      ".inst 0x64a940ef  // fmlalb z15.s, z7.h, z1.h[2]\n"
      ".inst 0x64aa40f3  // fmlalb z19.s, z7.h, z2.h[2]\n"
      ".inst 0x64ab40f7  // fmlalb z23.s, z7.h, z3.h[2]\n"
      "ble 54f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a848c8  // fmlalb z8.s, z6.h, z0.h[3]\n"
      ".inst 0x64a948cc  // fmlalb z12.s, z6.h, z1.h[3]\n"
      ".inst 0x64aa48d0  // fmlalb z16.s, z6.h, z2.h[3]\n"
      ".inst 0x64ab48d4  // fmlalb z20.s, z6.h, z3.h[3]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a848e9  // fmlalb z9.s, z7.h, z0.h[3]\n"
      ".inst 0x64a948ed  // fmlalb z13.s, z7.h, z1.h[3]\n"
      ".inst 0x64aa48f1  // fmlalb z17.s, z7.h, z2.h[3]\n"
      ".inst 0x64ab48f5  // fmlalb z21.s, z7.h, z3.h[3]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a848ca  // fmlalb z10.s, z6.h, z0.h[3]\n"
      ".inst 0x64a948ce  // fmlalb z14.s, z6.h, z1.h[3]\n"
      ".inst 0x64aa48d2  // fmlalb z18.s, z6.h, z2.h[3]\n"
      ".inst 0x64ab48d6  // fmlalb z22.s, z6.h, z3.h[3]\n"
      ".inst 0x64a848eb  // fmlalb z11.s, z7.h, z0.h[3]\n"
      ".inst 0x64a948ef  // fmlalb z15.s, z7.h, z1.h[3]\n"
      ".inst 0x64aa48f3  // fmlalb z19.s, z7.h, z2.h[3]\n"
      ".inst 0x64ab48f7  // fmlalb z23.s, z7.h, z3.h[3]\n"
      "ble 54f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b040c8  // fmlalb z8.s, z6.h, z0.h[4]\n"
      ".inst 0x64b140cc  // fmlalb z12.s, z6.h, z1.h[4]\n"
      ".inst 0x64b240d0  // fmlalb z16.s, z6.h, z2.h[4]\n"
      ".inst 0x64b340d4  // fmlalb z20.s, z6.h, z3.h[4]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b040e9  // fmlalb z9.s, z7.h, z0.h[4]\n"
      ".inst 0x64b140ed  // fmlalb z13.s, z7.h, z1.h[4]\n"
      ".inst 0x64b240f1  // fmlalb z17.s, z7.h, z2.h[4]\n"
      ".inst 0x64b340f5  // fmlalb z21.s, z7.h, z3.h[4]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b040ca  // fmlalb z10.s, z6.h, z0.h[4]\n"
      ".inst 0x64b140ce  // fmlalb z14.s, z6.h, z1.h[4]\n"
      ".inst 0x64b240d2  // fmlalb z18.s, z6.h, z2.h[4]\n"
      ".inst 0x64b340d6  // fmlalb z22.s, z6.h, z3.h[4]\n"
      ".inst 0x64b040eb  // fmlalb z11.s, z7.h, z0.h[4]\n"
      ".inst 0x64b140ef  // fmlalb z15.s, z7.h, z1.h[4]\n"
      ".inst 0x64b240f3  // fmlalb z19.s, z7.h, z2.h[4]\n"
      ".inst 0x64b340f7  // fmlalb z23.s, z7.h, z3.h[4]\n"
      "ble 54f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b048c8  // fmlalb z8.s, z6.h, z0.h[5]\n"
      ".inst 0x64b148cc  // fmlalb z12.s, z6.h, z1.h[5]\n"
      ".inst 0x64b248d0  // fmlalb z16.s, z6.h, z2.h[5]\n"
      ".inst 0x64b348d4  // fmlalb z20.s, z6.h, z3.h[5]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b048e9  // fmlalb z9.s, z7.h, z0.h[5]\n"
      ".inst 0x64b148ed  // fmlalb z13.s, z7.h, z1.h[5]\n"
      ".inst 0x64b248f1  // fmlalb z17.s, z7.h, z2.h[5]\n"
      ".inst 0x64b348f5  // fmlalb z21.s, z7.h, z3.h[5]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b048ca  // fmlalb z10.s, z6.h, z0.h[5]\n"
      ".inst 0x64b148ce  // fmlalb z14.s, z6.h, z1.h[5]\n"
      ".inst 0x64b248d2  // fmlalb z18.s, z6.h, z2.h[5]\n"
      ".inst 0x64b348d6  // fmlalb z22.s, z6.h, z3.h[5]\n"
      ".inst 0x64b048eb  // fmlalb z11.s, z7.h, z0.h[5]\n"
      ".inst 0x64b148ef  // fmlalb z15.s, z7.h, z1.h[5]\n"
      ".inst 0x64b248f3  // fmlalb z19.s, z7.h, z2.h[5]\n"
      ".inst 0x64b348f7  // fmlalb z23.s, z7.h, z3.h[5]\n"
      "ble 54f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b840c8  // fmlalb z8.s, z6.h, z0.h[6]\n"
      ".inst 0x64b940cc  // fmlalb z12.s, z6.h, z1.h[6]\n"
      ".inst 0x64ba40d0  // fmlalb z16.s, z6.h, z2.h[6]\n"
      ".inst 0x64bb40d4  // fmlalb z20.s, z6.h, z3.h[6]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b840e9  // fmlalb z9.s, z7.h, z0.h[6]\n"
      ".inst 0x64b940ed  // fmlalb z13.s, z7.h, z1.h[6]\n"
      ".inst 0x64ba40f1  // fmlalb z17.s, z7.h, z2.h[6]\n"
      ".inst 0x64bb40f5  // fmlalb z21.s, z7.h, z3.h[6]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b840ca  // fmlalb z10.s, z6.h, z0.h[6]\n"
      ".inst 0x64b940ce  // fmlalb z14.s, z6.h, z1.h[6]\n"
      ".inst 0x64ba40d2  // fmlalb z18.s, z6.h, z2.h[6]\n"
      ".inst 0x64bb40d6  // fmlalb z22.s, z6.h, z3.h[6]\n"
      ".inst 0x64b840eb  // fmlalb z11.s, z7.h, z0.h[6]\n"
      ".inst 0x64b940ef  // fmlalb z15.s, z7.h, z1.h[6]\n"
      ".inst 0x64ba40f3  // fmlalb z19.s, z7.h, z2.h[6]\n"
      ".inst 0x64bb40f7  // fmlalb z23.s, z7.h, z3.h[6]\n"
      "ble 54f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b848c8  // fmlalb z8.s, z6.h, z0.h[7]\n"
      ".inst 0x64b948cc  // fmlalb z12.s, z6.h, z1.h[7]\n"
      ".inst 0x64ba48d0  // fmlalb z16.s, z6.h, z2.h[7]\n"
      ".inst 0x64bb48d4  // fmlalb z20.s, z6.h, z3.h[7]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b848e9  // fmlalb z9.s, z7.h, z0.h[7]\n"
      ".inst 0x64b948ed  // fmlalb z13.s, z7.h, z1.h[7]\n"
      ".inst 0x64ba48f1  // fmlalb z17.s, z7.h, z2.h[7]\n"
      ".inst 0x64bb48f5  // fmlalb z21.s, z7.h, z3.h[7]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b848ca  // fmlalb z10.s, z6.h, z0.h[7]\n"
      ".inst 0x64b948ce  // fmlalb z14.s, z6.h, z1.h[7]\n"
      ".inst 0x64ba48d2  // fmlalb z18.s, z6.h, z2.h[7]\n"
      ".inst 0x64bb48d6  // fmlalb z22.s, z6.h, z3.h[7]\n"
      ".inst 0x64b848eb  // fmlalb z11.s, z7.h, z0.h[7]\n"
      ".inst 0x64b948ef  // fmlalb z15.s, z7.h, z1.h[7]\n"
      ".inst 0x64ba48f3  // fmlalb z19.s, z7.h, z2.h[7]\n"
      ".inst 0x64bb48f7  // fmlalb z23.s, z7.h, z3.h[7]\n"
      "54:"  // Height 4: Multiply loop: multiply skip
      "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
      "add x28, x28, #0x1\n"
      "cmp x28, x20\n"
      "bne 49b\n"
      "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
      "add x26, x11, x20, LSL #1\n"
      "add x25, x26, x20, LSL #1\n"
      "add x24, x25, x20, LSL #1\n"
      "tbz %x[flags], #1, 55f\n"
      "add x21, %x[args_ptr], %[offsetof_maxval]\n"
      "add x20, %x[args_ptr], %[offsetof_minval]\n"
      "ld1rw { z25.s }, p5/Z, [x21]\n"
      "ld1rw { z24.s }, p5/Z, [x20]\n"
      "fmin z8.s, p5/M, z8.s, z25.s\n"
      "fmin z9.s, p5/M, z9.s, z25.s\n"
      "fmin z10.s, p5/M, z10.s, z25.s\n"
      "fmin z11.s, p5/M, z11.s, z25.s\n"
      "fmin z12.s, p5/M, z12.s, z25.s\n"
      "fmin z13.s, p5/M, z13.s, z25.s\n"
      "fmin z14.s, p5/M, z14.s, z25.s\n"
      "fmin z15.s, p5/M, z15.s, z25.s\n"
      "fmin z16.s, p5/M, z16.s, z25.s\n"
      "fmin z17.s, p5/M, z17.s, z25.s\n"
      "fmin z18.s, p5/M, z18.s, z25.s\n"
      "fmin z19.s, p5/M, z19.s, z25.s\n"
      "fmin z20.s, p5/M, z20.s, z25.s\n"
      "fmin z21.s, p5/M, z21.s, z25.s\n"
      "fmin z22.s, p5/M, z22.s, z25.s\n"
      "fmin z23.s, p5/M, z23.s, z25.s\n"
      "fmax z8.s, p5/M, z8.s, z24.s\n"
      "fmax z9.s, p5/M, z9.s, z24.s\n"
      "fmax z10.s, p5/M, z10.s, z24.s\n"
      "fmax z11.s, p5/M, z11.s, z24.s\n"
      "fmax z12.s, p5/M, z12.s, z24.s\n"
      "fmax z13.s, p5/M, z13.s, z24.s\n"
      "fmax z14.s, p5/M, z14.s, z24.s\n"
      "fmax z15.s, p5/M, z15.s, z24.s\n"
      "fmax z16.s, p5/M, z16.s, z24.s\n"
      "fmax z17.s, p5/M, z17.s, z24.s\n"
      "fmax z18.s, p5/M, z18.s, z24.s\n"
      "fmax z19.s, p5/M, z19.s, z24.s\n"
      "fmax z20.s, p5/M, z20.s, z24.s\n"
      "fmax z21.s, p5/M, z21.s, z24.s\n"
      "fmax z22.s, p5/M, z22.s, z24.s\n"
      "fmax z23.s, p5/M, z23.s, z24.s\n"
      "55:"  // Height 4: No activation
      "fcvt z8.h, p5/m, z8.s\n"
      "fcvt z9.h, p5/m, z9.s\n"
      "fcvt z10.h, p5/m, z10.s\n"
      "fcvt z11.h, p5/m, z11.s\n"
      "fcvt z12.h, p5/m, z12.s\n"
      "fcvt z13.h, p5/m, z13.s\n"
      "fcvt z14.h, p5/m, z14.s\n"
      "fcvt z15.h, p5/m, z15.s\n"
      "st1h { z8.s }, p4, [x11]\n"
      "fcvt z16.h, p5/m, z16.s\n"
      "fcvt z17.h, p5/m, z17.s\n"
      "st1h { z9.s }, p3, [x11, #1, MUL VL]\n"
      "fcvt z18.h, p5/m, z18.s\n"
      "fcvt z19.h, p5/m, z19.s\n"
      "st1h { z10.s }, p2, [x11, #2, MUL VL]\n"
      "fcvt z20.h, p5/m, z20.s\n"
      "fcvt z21.h, p5/m, z21.s\n"
      "st1h { z11.s }, p1, [x11, #3, MUL VL]\n"
      "addvl x11, x11, #2\n"
      "fcvt z22.h, p5/m, z22.s\n"
      "fcvt z23.h, p5/m, z23.s\n"
      "st1h { z12.s }, p4, [x26]\n"
      "st1h { z13.s }, p3, [x26, #1, MUL VL]\n"
      "st1h { z14.s }, p2, [x26, #2, MUL VL]\n"
      "st1h { z15.s }, p1, [x26, #3, MUL VL]\n"
      "st1h { z16.s }, p4, [x25]\n"
      "st1h { z17.s }, p3, [x25, #1, MUL VL]\n"
      "st1h { z18.s }, p2, [x25, #2, MUL VL]\n"
      "st1h { z19.s }, p1, [x25, #3, MUL VL]\n"
      "st1h { z20.s }, p4, [x24]\n"
      "st1h { z21.s }, p3, [x24, #1, MUL VL]\n"
      "st1h { z22.s }, p2, [x24, #2, MUL VL]\n"
      "st1h { z23.s }, p1, [x24, #3, MUL VL]\n"
      "decw x13, ALL, MUL #4\n"
      "cmp x13, XZR\n"
      "bgt 44b\n"
      "b 86f\n"
      "57:"  // Height 5
      "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
      "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
      "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
      "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
      "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
      "58:"  // Height 5: Column loop
      "ldr x10, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
      "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
      "cnth x21\n"
      "add x9, x10, x20, LSL #1\n"
      "add x20, x9, x20, LSL #1\n"
      "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
      "cmp x13, x21\n"
      "bgt 59f\n"
      "mov x9, x10\n"
      "59:"  // Height 5: B setup done
      "mov x20, #0\n"
      "whilelt p4.s, x20, x13\n"
      "incw x20\n"
      "whilelt p3.s, x20, x13\n"
      "incw x20\n"
      "whilelt p2.s, x20, x13\n"
      "incw x20\n"
      "whilelt p1.s, x20, x13\n"
      "cbz x12, 60f\n"
      "ld1h { z8.s }, p4/Z, [x12]\n"
      "ld1h { z9.s }, p3/Z, [x12, #1, MUL VL]\n"
      "ld1h { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
      "ld1h { z11.s }, p1/Z, [x12, #3, MUL VL]\n"
      "addvl x12, x12, #2\n"
      "fcvt z8.s, p5/m, z8.h\n"
      "fcvt z9.s, p5/m, z9.h\n"
      "fcvt z10.s, p5/m, z10.h\n"
      "fcvt z11.s, p5/m, z11.h\n"
      "mov z12.d, z8.d\n"
      "mov z16.d, z8.d\n"
      "mov z20.d, z8.d\n"
      "mov z13.d, z9.d\n"
      "mov z14.d, z10.d\n"
      "mov z15.d, z11.d\n"
      "mov z17.d, z9.d\n"
      "mov z18.d, z10.d\n"
      "mov z19.d, z11.d\n"
      "mov z21.d, z9.d\n"
      "mov z22.d, z10.d\n"
      "mov z23.d, z11.d\n"
      "mov z24.d, z8.d\n"
      "mov z25.d, z9.d\n"
      "mov z26.d, z10.d\n"
      "mov z27.d, z11.d\n"
      "b 62f\n"
      "60:"  // Height 5: no bias
      "tbz %x[flags], #0, 61f\n"
      "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
      "ld1h { z8.s }, p4/Z, [x11]\n"
      "ld1h { z9.s }, p3/Z, [x11, #1, MUL VL]\n"
      "ld1h { z10.s }, p2/Z, [x11, #2, MUL VL]\n"
      "ld1h { z11.s }, p1/Z, [x11, #3, MUL VL]\n"
      "add x23, x11, x20, LSL #1\n"
      "add x22, x23, x20, LSL #1\n"
      "add x21, x22, x20, LSL #1\n"
      "ld1h { z12.s }, p4/Z, [x23]\n"
      "ld1h { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
      "fcvt z8.s, p5/m, z8.h\n"
      "add x20, x21, x20, LSL #1\n"
      "ld1h { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
      "ld1h { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
      "fcvt z9.s, p5/m, z9.h\n"
      "ld1h { z16.s }, p4/Z, [x22]\n"
      "ld1h { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
      "fcvt z10.s, p5/m, z10.h\n"
      "fcvt z11.s, p5/m, z11.h\n"
      "ld1h { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
      "ld1h { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
      "fcvt z12.s, p5/m, z12.h\n"
      "fcvt z13.s, p5/m, z13.h\n"
      "ld1h { z20.s }, p4/Z, [x21]\n"
      "ld1h { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
      "fcvt z14.s, p5/m, z14.h\n"
      "fcvt z15.s, p5/m, z15.h\n"
      "ld1h { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
      "ld1h { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
      "fcvt z16.s, p5/m, z16.h\n"
      "fcvt z17.s, p5/m, z17.h\n"
      "ld1h { z24.s }, p4/Z, [x20]\n"
      "ld1h { z25.s }, p3/Z, [x20, #1, MUL VL]\n"
      "fcvt z18.s, p5/m, z18.h\n"
      "fcvt z19.s, p5/m, z19.h\n"
      "ld1h { z26.s }, p2/Z, [x20, #2, MUL VL]\n"
      "ld1h { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
      "fcvt z20.s, p5/m, z20.h\n"
      "fcvt z21.s, p5/m, z21.h\n"
      "fcvt z22.s, p5/m, z22.h\n"
      "fcvt z23.s, p5/m, z23.h\n"
      "fcvt z24.s, p5/m, z24.h\n"
      "fcvt z25.s, p5/m, z25.h\n"
      "fcvt z26.s, p5/m, z26.h\n"
      "fcvt z27.s, p5/m, z27.h\n"
      "b 62f\n"
      "61:"  // Height 5: no accumulate
      "mov z8.b, #0\n"
      "mov z9.b, #0\n"
      "mov z10.b, #0\n"
      "mov z11.b, #0\n"
      "mov z12.b, #0\n"
      "mov z13.b, #0\n"
      "mov z14.b, #0\n"
      "mov z15.b, #0\n"
      "mov z16.b, #0\n"
      "mov z17.b, #0\n"
      "mov z18.b, #0\n"
      "mov z19.b, #0\n"
      "mov z20.b, #0\n"
      "mov z21.b, #0\n"
      "mov z22.b, #0\n"
      "mov z23.b, #0\n"
      "mov z24.b, #0\n"
      "mov z25.b, #0\n"
      "mov z26.b, #0\n"
      "mov z27.b, #0\n"
      "62:"  // Height 5: setup done
      "mov x28, #0\n"
      "63:"  // Height 5: String loop
      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
      "ldr w27, [x20, x28, LSL #0x2]\n"
      "tbz %x[flags], #3, 64f\n"
      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
      "add x20, x20, x21, LSL #3\n"
      "ldr x26, [x20, #0]\n"
      "ldr x25, [x20, #0x8]\n"
      "ldr x24, [x20, #0x10]\n"
      "ldr x23, [x20, #0x18]\n"
      "ldr x22, [x20, #0x20]\n"
      "cbnz x28, 65f\n"
      "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
      "add x26, x26, x20, LSL #1\n"
      "add x25, x25, x20, LSL #1\n"
      "add x24, x24, x20, LSL #1\n"
      "add x23, x23, x20, LSL #1\n"
      "add x22, x22, x20, LSL #1\n"
      "b 65f\n"
      "64:"  // Height 5: setup direct input
      "mov x26, %x[input_ptr]\n"
      "add x25, x26, x21, LSL #1\n"
      "add x24, x25, x21, LSL #1\n"
      "add x23, x24, x21, LSL #1\n"
      "add x22, x23, x21, LSL #1\n"
      "65:"  // Height 5: input setup done
      "cmp x27, #0x8\n"
      "ble 67f\n"
      "66:"  // Height 5: Multiply loop: Main loop head
      "whilelt p0.h, XZR, x27\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      "sub x27, x27, #0x8\n"
      "cmp x27, #0x8\n"
      "ld1rqh { z0.h }, p0/Z, [x26]\n"
      "add x26, x26, #0x10\n"
      "ld1rqh { z1.h }, p0/Z, [x25]\n"
      "add x25, x25, #0x10\n"
      "ld1rqh { z2.h }, p0/Z, [x24]\n"
      "ld1rqh { z3.h }, p0/Z, [x23]\n"
      "ld1rqh { z4.h }, p0/Z, [x22]\n"
      "add x24, x24, #0x10\n"
      "add x23, x23, #0x10\n"
      ".inst 0x64a040c8  // fmlalb z8.s, z6.h, z0.h[0]\n"
      ".inst 0x64a040e9  // fmlalb z9.s, z7.h, z0.h[0]\n"
      "add x22, x22, #0x10\n"
      ".inst 0x64a140cc  // fmlalb z12.s, z6.h, z1.h[0]\n"
      ".inst 0x64a240d0  // fmlalb z16.s, z6.h, z2.h[0]\n"
      ".inst 0x64a340d4  // fmlalb z20.s, z6.h, z3.h[0]\n"
      ".inst 0x64a440d8  // fmlalb z24.s, z6.h, z4.h[0]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a140ed  // fmlalb z13.s, z7.h, z1.h[0]\n"
      ".inst 0x64a240f1  // fmlalb z17.s, z7.h, z2.h[0]\n"
      ".inst 0x64a340f5  // fmlalb z21.s, z7.h, z3.h[0]\n"
      ".inst 0x64a440f9  // fmlalb z25.s, z7.h, z4.h[0]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a040ca  // fmlalb z10.s, z6.h, z0.h[0]\n"
      ".inst 0x64a140ce  // fmlalb z14.s, z6.h, z1.h[0]\n"
      ".inst 0x64a240d2  // fmlalb z18.s, z6.h, z2.h[0]\n"
      ".inst 0x64a340d6  // fmlalb z22.s, z6.h, z3.h[0]\n"
      ".inst 0x64a440da  // fmlalb z26.s, z6.h, z4.h[0]\n"
      ".inst 0x64a040eb  // fmlalb z11.s, z7.h, z0.h[0]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64a140ef  // fmlalb z15.s, z7.h, z1.h[0]\n"
      ".inst 0x64a240f3  // fmlalb z19.s, z7.h, z2.h[0]\n"
      ".inst 0x64a340f7  // fmlalb z23.s, z7.h, z3.h[0]\n"
      ".inst 0x64a440fb  // fmlalb z27.s, z7.h, z4.h[0]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a048c8  // fmlalb z8.s, z6.h, z0.h[1]\n"
      ".inst 0x64a148cc  // fmlalb z12.s, z6.h, z1.h[1]\n"
      ".inst 0x64a248d0  // fmlalb z16.s, z6.h, z2.h[1]\n"
      ".inst 0x64a348d4  // fmlalb z20.s, z6.h, z3.h[1]\n"
      ".inst 0x64a448d8  // fmlalb z24.s, z6.h, z4.h[1]\n"
      ".inst 0x64a048e9  // fmlalb z9.s, z7.h, z0.h[1]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a148ed  // fmlalb z13.s, z7.h, z1.h[1]\n"
      ".inst 0x64a248f1  // fmlalb z17.s, z7.h, z2.h[1]\n"
      ".inst 0x64a348f5  // fmlalb z21.s, z7.h, z3.h[1]\n"
      ".inst 0x64a448f9  // fmlalb z25.s, z7.h, z4.h[1]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a048ca  // fmlalb z10.s, z6.h, z0.h[1]\n"
      ".inst 0x64a148ce  // fmlalb z14.s, z6.h, z1.h[1]\n"
      ".inst 0x64a248d2  // fmlalb z18.s, z6.h, z2.h[1]\n"
      ".inst 0x64a348d6  // fmlalb z22.s, z6.h, z3.h[1]\n"
      ".inst 0x64a448da  // fmlalb z26.s, z6.h, z4.h[1]\n"
      ".inst 0x64a048eb  // fmlalb z11.s, z7.h, z0.h[1]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64a148ef  // fmlalb z15.s, z7.h, z1.h[1]\n"
      ".inst 0x64a248f3  // fmlalb z19.s, z7.h, z2.h[1]\n"
      ".inst 0x64a348f7  // fmlalb z23.s, z7.h, z3.h[1]\n"
      ".inst 0x64a448fb  // fmlalb z27.s, z7.h, z4.h[1]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a840c8  // fmlalb z8.s, z6.h, z0.h[2]\n"
      ".inst 0x64a940cc  // fmlalb z12.s, z6.h, z1.h[2]\n"
      ".inst 0x64aa40d0  // fmlalb z16.s, z6.h, z2.h[2]\n"
      ".inst 0x64ab40d4  // fmlalb z20.s, z6.h, z3.h[2]\n"
      ".inst 0x64ac40d8  // fmlalb z24.s, z6.h, z4.h[2]\n"
      ".inst 0x64a840e9  // fmlalb z9.s, z7.h, z0.h[2]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a940ed  // fmlalb z13.s, z7.h, z1.h[2]\n"
      ".inst 0x64aa40f1  // fmlalb z17.s, z7.h, z2.h[2]\n"
      ".inst 0x64ab40f5  // fmlalb z21.s, z7.h, z3.h[2]\n"
      ".inst 0x64ac40f9  // fmlalb z25.s, z7.h, z4.h[2]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a840ca  // fmlalb z10.s, z6.h, z0.h[2]\n"
      ".inst 0x64a940ce  // fmlalb z14.s, z6.h, z1.h[2]\n"
      ".inst 0x64aa40d2  // fmlalb z18.s, z6.h, z2.h[2]\n"
      ".inst 0x64ab40d6  // fmlalb z22.s, z6.h, z3.h[2]\n"
      ".inst 0x64ac40da  // fmlalb z26.s, z6.h, z4.h[2]\n"
      ".inst 0x64a840eb  // fmlalb z11.s, z7.h, z0.h[2]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64a940ef  // fmlalb z15.s, z7.h, z1.h[2]\n"
      ".inst 0x64aa40f3  // fmlalb z19.s, z7.h, z2.h[2]\n"
      ".inst 0x64ab40f7  // fmlalb z23.s, z7.h, z3.h[2]\n"
      ".inst 0x64ac40fb  // fmlalb z27.s, z7.h, z4.h[2]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a848c8  // fmlalb z8.s, z6.h, z0.h[3]\n"
      ".inst 0x64a948cc  // fmlalb z12.s, z6.h, z1.h[3]\n"
      ".inst 0x64aa48d0  // fmlalb z16.s, z6.h, z2.h[3]\n"
      ".inst 0x64ab48d4  // fmlalb z20.s, z6.h, z3.h[3]\n"
      ".inst 0x64ac48d8  // fmlalb z24.s, z6.h, z4.h[3]\n"
      ".inst 0x64a848e9  // fmlalb z9.s, z7.h, z0.h[3]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a948ed  // fmlalb z13.s, z7.h, z1.h[3]\n"
      ".inst 0x64aa48f1  // fmlalb z17.s, z7.h, z2.h[3]\n"
      ".inst 0x64ab48f5  // fmlalb z21.s, z7.h, z3.h[3]\n"
      ".inst 0x64ac48f9  // fmlalb z25.s, z7.h, z4.h[3]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a848ca  // fmlalb z10.s, z6.h, z0.h[3]\n"
      ".inst 0x64a948ce  // fmlalb z14.s, z6.h, z1.h[3]\n"
      ".inst 0x64aa48d2  // fmlalb z18.s, z6.h, z2.h[3]\n"
      ".inst 0x64ab48d6  // fmlalb z22.s, z6.h, z3.h[3]\n"
      ".inst 0x64ac48da  // fmlalb z26.s, z6.h, z4.h[3]\n"
      ".inst 0x64a848eb  // fmlalb z11.s, z7.h, z0.h[3]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64a948ef  // fmlalb z15.s, z7.h, z1.h[3]\n"
      ".inst 0x64aa48f3  // fmlalb z19.s, z7.h, z2.h[3]\n"
      ".inst 0x64ab48f7  // fmlalb z23.s, z7.h, z3.h[3]\n"
      ".inst 0x64ac48fb  // fmlalb z27.s, z7.h, z4.h[3]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b040c8  // fmlalb z8.s, z6.h, z0.h[4]\n"
      ".inst 0x64b140cc  // fmlalb z12.s, z6.h, z1.h[4]\n"
      ".inst 0x64b240d0  // fmlalb z16.s, z6.h, z2.h[4]\n"
      ".inst 0x64b340d4  // fmlalb z20.s, z6.h, z3.h[4]\n"
      ".inst 0x64b440d8  // fmlalb z24.s, z6.h, z4.h[4]\n"
      ".inst 0x64b040e9  // fmlalb z9.s, z7.h, z0.h[4]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b140ed  // fmlalb z13.s, z7.h, z1.h[4]\n"
      ".inst 0x64b240f1  // fmlalb z17.s, z7.h, z2.h[4]\n"
      ".inst 0x64b340f5  // fmlalb z21.s, z7.h, z3.h[4]\n"
      ".inst 0x64b440f9  // fmlalb z25.s, z7.h, z4.h[4]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b040ca  // fmlalb z10.s, z6.h, z0.h[4]\n"
      ".inst 0x64b140ce  // fmlalb z14.s, z6.h, z1.h[4]\n"
      ".inst 0x64b240d2  // fmlalb z18.s, z6.h, z2.h[4]\n"
      ".inst 0x64b340d6  // fmlalb z22.s, z6.h, z3.h[4]\n"
      ".inst 0x64b440da  // fmlalb z26.s, z6.h, z4.h[4]\n"
      ".inst 0x64b040eb  // fmlalb z11.s, z7.h, z0.h[4]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64b140ef  // fmlalb z15.s, z7.h, z1.h[4]\n"
      ".inst 0x64b240f3  // fmlalb z19.s, z7.h, z2.h[4]\n"
      ".inst 0x64b340f7  // fmlalb z23.s, z7.h, z3.h[4]\n"
      ".inst 0x64b440fb  // fmlalb z27.s, z7.h, z4.h[4]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b048c8  // fmlalb z8.s, z6.h, z0.h[5]\n"
      ".inst 0x64b148cc  // fmlalb z12.s, z6.h, z1.h[5]\n"
      ".inst 0x64b248d0  // fmlalb z16.s, z6.h, z2.h[5]\n"
      ".inst 0x64b348d4  // fmlalb z20.s, z6.h, z3.h[5]\n"
      ".inst 0x64b448d8  // fmlalb z24.s, z6.h, z4.h[5]\n"
      ".inst 0x64b048e9  // fmlalb z9.s, z7.h, z0.h[5]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b148ed  // fmlalb z13.s, z7.h, z1.h[5]\n"
      ".inst 0x64b248f1  // fmlalb z17.s, z7.h, z2.h[5]\n"
      ".inst 0x64b348f5  // fmlalb z21.s, z7.h, z3.h[5]\n"
      ".inst 0x64b448f9  // fmlalb z25.s, z7.h, z4.h[5]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b048ca  // fmlalb z10.s, z6.h, z0.h[5]\n"
      ".inst 0x64b148ce  // fmlalb z14.s, z6.h, z1.h[5]\n"
      ".inst 0x64b248d2  // fmlalb z18.s, z6.h, z2.h[5]\n"
      ".inst 0x64b348d6  // fmlalb z22.s, z6.h, z3.h[5]\n"
      ".inst 0x64b448da  // fmlalb z26.s, z6.h, z4.h[5]\n"
      ".inst 0x64b048eb  // fmlalb z11.s, z7.h, z0.h[5]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64b148ef  // fmlalb z15.s, z7.h, z1.h[5]\n"
      ".inst 0x64b248f3  // fmlalb z19.s, z7.h, z2.h[5]\n"
      ".inst 0x64b348f7  // fmlalb z23.s, z7.h, z3.h[5]\n"
      ".inst 0x64b448fb  // fmlalb z27.s, z7.h, z4.h[5]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b840c8  // fmlalb z8.s, z6.h, z0.h[6]\n"
      ".inst 0x64b940cc  // fmlalb z12.s, z6.h, z1.h[6]\n"
      ".inst 0x64ba40d0  // fmlalb z16.s, z6.h, z2.h[6]\n"
      ".inst 0x64bb40d4  // fmlalb z20.s, z6.h, z3.h[6]\n"
      ".inst 0x64bc40d8  // fmlalb z24.s, z6.h, z4.h[6]\n"
      ".inst 0x64b840e9  // fmlalb z9.s, z7.h, z0.h[6]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b940ed  // fmlalb z13.s, z7.h, z1.h[6]\n"
      ".inst 0x64ba40f1  // fmlalb z17.s, z7.h, z2.h[6]\n"
      ".inst 0x64bb40f5  // fmlalb z21.s, z7.h, z3.h[6]\n"
      ".inst 0x64bc40f9  // fmlalb z25.s, z7.h, z4.h[6]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b840ca  // fmlalb z10.s, z6.h, z0.h[6]\n"
      ".inst 0x64b940ce  // fmlalb z14.s, z6.h, z1.h[6]\n"
      ".inst 0x64ba40d2  // fmlalb z18.s, z6.h, z2.h[6]\n"
      ".inst 0x64bb40d6  // fmlalb z22.s, z6.h, z3.h[6]\n"
      ".inst 0x64bc40da  // fmlalb z26.s, z6.h, z4.h[6]\n"
      ".inst 0x64b840eb  // fmlalb z11.s, z7.h, z0.h[6]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64b940ef  // fmlalb z15.s, z7.h, z1.h[6]\n"
      ".inst 0x64ba40f3  // fmlalb z19.s, z7.h, z2.h[6]\n"
      ".inst 0x64bb40f7  // fmlalb z23.s, z7.h, z3.h[6]\n"
      ".inst 0x64bc40fb  // fmlalb z27.s, z7.h, z4.h[6]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b848c8  // fmlalb z8.s, z6.h, z0.h[7]\n"
      ".inst 0x64b948cc  // fmlalb z12.s, z6.h, z1.h[7]\n"
      ".inst 0x64ba48d0  // fmlalb z16.s, z6.h, z2.h[7]\n"
      ".inst 0x64bb48d4  // fmlalb z20.s, z6.h, z3.h[7]\n"
      ".inst 0x64bc48d8  // fmlalb z24.s, z6.h, z4.h[7]\n"
      ".inst 0x64b848e9  // fmlalb z9.s, z7.h, z0.h[7]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b948ed  // fmlalb z13.s, z7.h, z1.h[7]\n"
      ".inst 0x64ba48f1  // fmlalb z17.s, z7.h, z2.h[7]\n"
      ".inst 0x64bb48f5  // fmlalb z21.s, z7.h, z3.h[7]\n"
      ".inst 0x64bc48f9  // fmlalb z25.s, z7.h, z4.h[7]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b848ca  // fmlalb z10.s, z6.h, z0.h[7]\n"
      ".inst 0x64b948ce  // fmlalb z14.s, z6.h, z1.h[7]\n"
      ".inst 0x64ba48d2  // fmlalb z18.s, z6.h, z2.h[7]\n"
      ".inst 0x64bb48d6  // fmlalb z22.s, z6.h, z3.h[7]\n"
      ".inst 0x64bc48da  // fmlalb z26.s, z6.h, z4.h[7]\n"
      ".inst 0x64b848eb  // fmlalb z11.s, z7.h, z0.h[7]\n"
      ".inst 0x64b948ef  // fmlalb z15.s, z7.h, z1.h[7]\n"
      ".inst 0x64ba48f3  // fmlalb z19.s, z7.h, z2.h[7]\n"
      ".inst 0x64bb48f7  // fmlalb z23.s, z7.h, z3.h[7]\n"
      ".inst 0x64bc48fb  // fmlalb z27.s, z7.h, z4.h[7]\n"
      "bgt 66b\n"
      "67:"  // Height 5: Multiply loop: Single iteration only
      "whilelt p0.h, XZR, x27\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      "ld1rqh { z0.h }, p0/Z, [x26]\n"
      "ld1rqh { z1.h }, p0/Z, [x25]\n"
      "ld1rqh { z2.h }, p0/Z, [x24]\n"
      "ld1rqh { z3.h }, p0/Z, [x23]\n"
      "ld1rqh { z4.h }, p0/Z, [x22]\n"
      ".inst 0x64a040c8  // fmlalb z8.s, z6.h, z0.h[0]\n"
      ".inst 0x64a140cc  // fmlalb z12.s, z6.h, z1.h[0]\n"
      ".inst 0x64a040e9  // fmlalb z9.s, z7.h, z0.h[0]\n"
      ".inst 0x64a140ed  // fmlalb z13.s, z7.h, z1.h[0]\n"
      ".inst 0x64a240d0  // fmlalb z16.s, z6.h, z2.h[0]\n"
      ".inst 0x64a340d4  // fmlalb z20.s, z6.h, z3.h[0]\n"
      ".inst 0x64a440d8  // fmlalb z24.s, z6.h, z4.h[0]\n"
      ".inst 0x64a240f1  // fmlalb z17.s, z7.h, z2.h[0]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a340f5  // fmlalb z21.s, z7.h, z3.h[0]\n"
      ".inst 0x64a440f9  // fmlalb z25.s, z7.h, z4.h[0]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a040ca  // fmlalb z10.s, z6.h, z0.h[0]\n"
      ".inst 0x64a140ce  // fmlalb z14.s, z6.h, z1.h[0]\n"
      ".inst 0x64a240d2  // fmlalb z18.s, z6.h, z2.h[0]\n"
      ".inst 0x64a340d6  // fmlalb z22.s, z6.h, z3.h[0]\n"
      ".inst 0x64a440da  // fmlalb z26.s, z6.h, z4.h[0]\n"
      ".inst 0x64a040eb  // fmlalb z11.s, z7.h, z0.h[0]\n"
      ".inst 0x64a140ef  // fmlalb z15.s, z7.h, z1.h[0]\n"
      ".inst 0x64a240f3  // fmlalb z19.s, z7.h, z2.h[0]\n"
      ".inst 0x64a340f7  // fmlalb z23.s, z7.h, z3.h[0]\n"
      ".inst 0x64a440fb  // fmlalb z27.s, z7.h, z4.h[0]\n"
      "ble 68f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a048c8  // fmlalb z8.s, z6.h, z0.h[1]\n"
      ".inst 0x64a148cc  // fmlalb z12.s, z6.h, z1.h[1]\n"
      ".inst 0x64a248d0  // fmlalb z16.s, z6.h, z2.h[1]\n"
      ".inst 0x64a348d4  // fmlalb z20.s, z6.h, z3.h[1]\n"
      ".inst 0x64a448d8  // fmlalb z24.s, z6.h, z4.h[1]\n"
      ".inst 0x64a048e9  // fmlalb z9.s, z7.h, z0.h[1]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a148ed  // fmlalb z13.s, z7.h, z1.h[1]\n"
      ".inst 0x64a248f1  // fmlalb z17.s, z7.h, z2.h[1]\n"
      ".inst 0x64a348f5  // fmlalb z21.s, z7.h, z3.h[1]\n"
      ".inst 0x64a448f9  // fmlalb z25.s, z7.h, z4.h[1]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a048ca  // fmlalb z10.s, z6.h, z0.h[1]\n"
      ".inst 0x64a148ce  // fmlalb z14.s, z6.h, z1.h[1]\n"
      ".inst 0x64a248d2  // fmlalb z18.s, z6.h, z2.h[1]\n"
      ".inst 0x64a348d6  // fmlalb z22.s, z6.h, z3.h[1]\n"
      ".inst 0x64a448da  // fmlalb z26.s, z6.h, z4.h[1]\n"
      ".inst 0x64a048eb  // fmlalb z11.s, z7.h, z0.h[1]\n"
      ".inst 0x64a148ef  // fmlalb z15.s, z7.h, z1.h[1]\n"
      ".inst 0x64a248f3  // fmlalb z19.s, z7.h, z2.h[1]\n"
      ".inst 0x64a348f7  // fmlalb z23.s, z7.h, z3.h[1]\n"
      ".inst 0x64a448fb  // fmlalb z27.s, z7.h, z4.h[1]\n"
      "ble 68f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a840c8  // fmlalb z8.s, z6.h, z0.h[2]\n"
      ".inst 0x64a940cc  // fmlalb z12.s, z6.h, z1.h[2]\n"
      ".inst 0x64aa40d0  // fmlalb z16.s, z6.h, z2.h[2]\n"
      ".inst 0x64ab40d4  // fmlalb z20.s, z6.h, z3.h[2]\n"
      ".inst 0x64ac40d8  // fmlalb z24.s, z6.h, z4.h[2]\n"
      ".inst 0x64a840e9  // fmlalb z9.s, z7.h, z0.h[2]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a940ed  // fmlalb z13.s, z7.h, z1.h[2]\n"
      ".inst 0x64aa40f1  // fmlalb z17.s, z7.h, z2.h[2]\n"
      ".inst 0x64ab40f5  // fmlalb z21.s, z7.h, z3.h[2]\n"
      ".inst 0x64ac40f9  // fmlalb z25.s, z7.h, z4.h[2]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a840ca  // fmlalb z10.s, z6.h, z0.h[2]\n"
      ".inst 0x64a940ce  // fmlalb z14.s, z6.h, z1.h[2]\n"
      ".inst 0x64aa40d2  // fmlalb z18.s, z6.h, z2.h[2]\n"
      ".inst 0x64ab40d6  // fmlalb z22.s, z6.h, z3.h[2]\n"
      ".inst 0x64ac40da  // fmlalb z26.s, z6.h, z4.h[2]\n"
      ".inst 0x64a840eb  // fmlalb z11.s, z7.h, z0.h[2]\n"
      ".inst 0x64a940ef  // fmlalb z15.s, z7.h, z1.h[2]\n"
      ".inst 0x64aa40f3  // fmlalb z19.s, z7.h, z2.h[2]\n"
      ".inst 0x64ab40f7  // fmlalb z23.s, z7.h, z3.h[2]\n"
      ".inst 0x64ac40fb  // fmlalb z27.s, z7.h, z4.h[2]\n"
      "ble 68f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a848c8  // fmlalb z8.s, z6.h, z0.h[3]\n"
      ".inst 0x64a948cc  // fmlalb z12.s, z6.h, z1.h[3]\n"
      ".inst 0x64aa48d0  // fmlalb z16.s, z6.h, z2.h[3]\n"
      ".inst 0x64ab48d4  // fmlalb z20.s, z6.h, z3.h[3]\n"
      ".inst 0x64ac48d8  // fmlalb z24.s, z6.h, z4.h[3]\n"
      ".inst 0x64a848e9  // fmlalb z9.s, z7.h, z0.h[3]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a948ed  // fmlalb z13.s, z7.h, z1.h[3]\n"
      ".inst 0x64aa48f1  // fmlalb z17.s, z7.h, z2.h[3]\n"
      ".inst 0x64ab48f5  // fmlalb z21.s, z7.h, z3.h[3]\n"
      ".inst 0x64ac48f9  // fmlalb z25.s, z7.h, z4.h[3]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a848ca  // fmlalb z10.s, z6.h, z0.h[3]\n"
      ".inst 0x64a948ce  // fmlalb z14.s, z6.h, z1.h[3]\n"
      ".inst 0x64aa48d2  // fmlalb z18.s, z6.h, z2.h[3]\n"
      ".inst 0x64ab48d6  // fmlalb z22.s, z6.h, z3.h[3]\n"
      ".inst 0x64ac48da  // fmlalb z26.s, z6.h, z4.h[3]\n"
      ".inst 0x64a848eb  // fmlalb z11.s, z7.h, z0.h[3]\n"
      ".inst 0x64a948ef  // fmlalb z15.s, z7.h, z1.h[3]\n"
      ".inst 0x64aa48f3  // fmlalb z19.s, z7.h, z2.h[3]\n"
      ".inst 0x64ab48f7  // fmlalb z23.s, z7.h, z3.h[3]\n"
      ".inst 0x64ac48fb  // fmlalb z27.s, z7.h, z4.h[3]\n"
      "ble 68f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b040c8  // fmlalb z8.s, z6.h, z0.h[4]\n"
      ".inst 0x64b140cc  // fmlalb z12.s, z6.h, z1.h[4]\n"
      ".inst 0x64b240d0  // fmlalb z16.s, z6.h, z2.h[4]\n"
      ".inst 0x64b340d4  // fmlalb z20.s, z6.h, z3.h[4]\n"
      ".inst 0x64b440d8  // fmlalb z24.s, z6.h, z4.h[4]\n"
      ".inst 0x64b040e9  // fmlalb z9.s, z7.h, z0.h[4]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b140ed  // fmlalb z13.s, z7.h, z1.h[4]\n"
      ".inst 0x64b240f1  // fmlalb z17.s, z7.h, z2.h[4]\n"
      ".inst 0x64b340f5  // fmlalb z21.s, z7.h, z3.h[4]\n"
      ".inst 0x64b440f9  // fmlalb z25.s, z7.h, z4.h[4]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b040ca  // fmlalb z10.s, z6.h, z0.h[4]\n"
      ".inst 0x64b140ce  // fmlalb z14.s, z6.h, z1.h[4]\n"
      ".inst 0x64b240d2  // fmlalb z18.s, z6.h, z2.h[4]\n"
      ".inst 0x64b340d6  // fmlalb z22.s, z6.h, z3.h[4]\n"
      ".inst 0x64b440da  // fmlalb z26.s, z6.h, z4.h[4]\n"
      ".inst 0x64b040eb  // fmlalb z11.s, z7.h, z0.h[4]\n"
      ".inst 0x64b140ef  // fmlalb z15.s, z7.h, z1.h[4]\n"
      ".inst 0x64b240f3  // fmlalb z19.s, z7.h, z2.h[4]\n"
      ".inst 0x64b340f7  // fmlalb z23.s, z7.h, z3.h[4]\n"
      ".inst 0x64b440fb  // fmlalb z27.s, z7.h, z4.h[4]\n"
      "ble 68f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b048c8  // fmlalb z8.s, z6.h, z0.h[5]\n"
      ".inst 0x64b148cc  // fmlalb z12.s, z6.h, z1.h[5]\n"
      ".inst 0x64b248d0  // fmlalb z16.s, z6.h, z2.h[5]\n"
      ".inst 0x64b348d4  // fmlalb z20.s, z6.h, z3.h[5]\n"
      ".inst 0x64b448d8  // fmlalb z24.s, z6.h, z4.h[5]\n"
      ".inst 0x64b048e9  // fmlalb z9.s, z7.h, z0.h[5]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b148ed  // fmlalb z13.s, z7.h, z1.h[5]\n"
      ".inst 0x64b248f1  // fmlalb z17.s, z7.h, z2.h[5]\n"
      ".inst 0x64b348f5  // fmlalb z21.s, z7.h, z3.h[5]\n"
      ".inst 0x64b448f9  // fmlalb z25.s, z7.h, z4.h[5]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b048ca  // fmlalb z10.s, z6.h, z0.h[5]\n"
      ".inst 0x64b148ce  // fmlalb z14.s, z6.h, z1.h[5]\n"
      ".inst 0x64b248d2  // fmlalb z18.s, z6.h, z2.h[5]\n"
      ".inst 0x64b348d6  // fmlalb z22.s, z6.h, z3.h[5]\n"
      ".inst 0x64b448da  // fmlalb z26.s, z6.h, z4.h[5]\n"
      ".inst 0x64b048eb  // fmlalb z11.s, z7.h, z0.h[5]\n"
      ".inst 0x64b148ef  // fmlalb z15.s, z7.h, z1.h[5]\n"
      ".inst 0x64b248f3  // fmlalb z19.s, z7.h, z2.h[5]\n"
      ".inst 0x64b348f7  // fmlalb z23.s, z7.h, z3.h[5]\n"
      ".inst 0x64b448fb  // fmlalb z27.s, z7.h, z4.h[5]\n"
      "ble 68f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b840c8  // fmlalb z8.s, z6.h, z0.h[6]\n"
      ".inst 0x64b940cc  // fmlalb z12.s, z6.h, z1.h[6]\n"
      ".inst 0x64ba40d0  // fmlalb z16.s, z6.h, z2.h[6]\n"
      ".inst 0x64bb40d4  // fmlalb z20.s, z6.h, z3.h[6]\n"
      ".inst 0x64bc40d8  // fmlalb z24.s, z6.h, z4.h[6]\n"
      ".inst 0x64b840e9  // fmlalb z9.s, z7.h, z0.h[6]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b940ed  // fmlalb z13.s, z7.h, z1.h[6]\n"
      ".inst 0x64ba40f1  // fmlalb z17.s, z7.h, z2.h[6]\n"
      ".inst 0x64bb40f5  // fmlalb z21.s, z7.h, z3.h[6]\n"
      ".inst 0x64bc40f9  // fmlalb z25.s, z7.h, z4.h[6]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b840ca  // fmlalb z10.s, z6.h, z0.h[6]\n"
      ".inst 0x64b940ce  // fmlalb z14.s, z6.h, z1.h[6]\n"
      ".inst 0x64ba40d2  // fmlalb z18.s, z6.h, z2.h[6]\n"
      ".inst 0x64bb40d6  // fmlalb z22.s, z6.h, z3.h[6]\n"
      ".inst 0x64bc40da  // fmlalb z26.s, z6.h, z4.h[6]\n"
      ".inst 0x64b840eb  // fmlalb z11.s, z7.h, z0.h[6]\n"
      ".inst 0x64b940ef  // fmlalb z15.s, z7.h, z1.h[6]\n"
      ".inst 0x64ba40f3  // fmlalb z19.s, z7.h, z2.h[6]\n"
      ".inst 0x64bb40f7  // fmlalb z23.s, z7.h, z3.h[6]\n"
      ".inst 0x64bc40fb  // fmlalb z27.s, z7.h, z4.h[6]\n"
      "ble 68f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b848c8  // fmlalb z8.s, z6.h, z0.h[7]\n"
      ".inst 0x64b948cc  // fmlalb z12.s, z6.h, z1.h[7]\n"
      ".inst 0x64ba48d0  // fmlalb z16.s, z6.h, z2.h[7]\n"
      ".inst 0x64bb48d4  // fmlalb z20.s, z6.h, z3.h[7]\n"
      ".inst 0x64bc48d8  // fmlalb z24.s, z6.h, z4.h[7]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b848e9  // fmlalb z9.s, z7.h, z0.h[7]\n"
      ".inst 0x64b948ed  // fmlalb z13.s, z7.h, z1.h[7]\n"
      ".inst 0x64ba48f1  // fmlalb z17.s, z7.h, z2.h[7]\n"
      ".inst 0x64bb48f5  // fmlalb z21.s, z7.h, z3.h[7]\n"
      ".inst 0x64bc48f9  // fmlalb z25.s, z7.h, z4.h[7]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      ".inst 0x64b848ca  // fmlalb z10.s, z6.h, z0.h[7]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b948ce  // fmlalb z14.s, z6.h, z1.h[7]\n"
      ".inst 0x64ba48d2  // fmlalb z18.s, z6.h, z2.h[7]\n"
      ".inst 0x64bb48d6  // fmlalb z22.s, z6.h, z3.h[7]\n"
      ".inst 0x64bc48da  // fmlalb z26.s, z6.h, z4.h[7]\n"
      ".inst 0x64b848eb  // fmlalb z11.s, z7.h, z0.h[7]\n"
      ".inst 0x64b948ef  // fmlalb z15.s, z7.h, z1.h[7]\n"
      ".inst 0x64ba48f3  // fmlalb z19.s, z7.h, z2.h[7]\n"
      ".inst 0x64bb48f7  // fmlalb z23.s, z7.h, z3.h[7]\n"
      ".inst 0x64bc48fb  // fmlalb z27.s, z7.h, z4.h[7]\n"
      "68:"  // Height 5: Multiply loop: multiply skip
      "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
      "add x28, x28, #0x1\n"
      "cmp x28, x20\n"
      "bne 63b\n"
      "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
      "add x26, x11, x20, LSL #1\n"
      "add x25, x26, x20, LSL #1\n"
      "add x24, x25, x20, LSL #1\n"
      "add x23, x24, x20, LSL #1\n"
      "tbz %x[flags], #1, 69f\n"
      "add x21, %x[args_ptr], %[offsetof_maxval]\n"
      "add x20, %x[args_ptr], %[offsetof_minval]\n"
      "ld1rw { z29.s }, p5/Z, [x21]\n"
      "ld1rw { z28.s }, p5/Z, [x20]\n"
      "fmin z8.s, p5/M, z8.s, z29.s\n"
      "fmin z9.s, p5/M, z9.s, z29.s\n"
      "fmin z10.s, p5/M, z10.s, z29.s\n"
      "fmin z11.s, p5/M, z11.s, z29.s\n"
      "fmin z12.s, p5/M, z12.s, z29.s\n"
      "fmin z13.s, p5/M, z13.s, z29.s\n"
      "fmin z14.s, p5/M, z14.s, z29.s\n"
      "fmin z15.s, p5/M, z15.s, z29.s\n"
      "fmin z16.s, p5/M, z16.s, z29.s\n"
      "fmin z17.s, p5/M, z17.s, z29.s\n"
      "fmin z18.s, p5/M, z18.s, z29.s\n"
      "fmin z19.s, p5/M, z19.s, z29.s\n"
      "fmin z20.s, p5/M, z20.s, z29.s\n"
      "fmin z21.s, p5/M, z21.s, z29.s\n"
      "fmin z22.s, p5/M, z22.s, z29.s\n"
      "fmin z23.s, p5/M, z23.s, z29.s\n"
      "fmin z24.s, p5/M, z24.s, z29.s\n"
      "fmin z25.s, p5/M, z25.s, z29.s\n"
      "fmin z26.s, p5/M, z26.s, z29.s\n"
      "fmin z27.s, p5/M, z27.s, z29.s\n"
      "fmax z8.s, p5/M, z8.s, z28.s\n"
      "fmax z9.s, p5/M, z9.s, z28.s\n"
      "fmax z10.s, p5/M, z10.s, z28.s\n"
      "fmax z11.s, p5/M, z11.s, z28.s\n"
      "fmax z12.s, p5/M, z12.s, z28.s\n"
      "fmax z13.s, p5/M, z13.s, z28.s\n"
      "fmax z14.s, p5/M, z14.s, z28.s\n"
      "fmax z15.s, p5/M, z15.s, z28.s\n"
      "fmax z16.s, p5/M, z16.s, z28.s\n"
      "fmax z17.s, p5/M, z17.s, z28.s\n"
      "fmax z18.s, p5/M, z18.s, z28.s\n"
      "fmax z19.s, p5/M, z19.s, z28.s\n"
      "fmax z20.s, p5/M, z20.s, z28.s\n"
      "fmax z21.s, p5/M, z21.s, z28.s\n"
      "fmax z22.s, p5/M, z22.s, z28.s\n"
      "fmax z23.s, p5/M, z23.s, z28.s\n"
      "fmax z24.s, p5/M, z24.s, z28.s\n"
      "fmax z25.s, p5/M, z25.s, z28.s\n"
      "fmax z26.s, p5/M, z26.s, z28.s\n"
      "fmax z27.s, p5/M, z27.s, z28.s\n"
      "69:"  // Height 5: No activation
      "fcvt z8.h, p5/m, z8.s\n"
      "fcvt z9.h, p5/m, z9.s\n"
      "fcvt z10.h, p5/m, z10.s\n"
      "fcvt z11.h, p5/m, z11.s\n"
      "fcvt z12.h, p5/m, z12.s\n"
      "fcvt z13.h, p5/m, z13.s\n"
      "fcvt z14.h, p5/m, z14.s\n"
      "fcvt z15.h, p5/m, z15.s\n"
      "st1h { z8.s }, p4, [x11]\n"
      "fcvt z16.h, p5/m, z16.s\n"
      "fcvt z17.h, p5/m, z17.s\n"
      "st1h { z9.s }, p3, [x11, #1, MUL VL]\n"
      "fcvt z18.h, p5/m, z18.s\n"
      "fcvt z19.h, p5/m, z19.s\n"
      "st1h { z10.s }, p2, [x11, #2, MUL VL]\n"
      "fcvt z20.h, p5/m, z20.s\n"
      "fcvt z21.h, p5/m, z21.s\n"
      "st1h { z11.s }, p1, [x11, #3, MUL VL]\n"
      "addvl x11, x11, #2\n"
      "fcvt z22.h, p5/m, z22.s\n"
      "fcvt z23.h, p5/m, z23.s\n"
      "st1h { z12.s }, p4, [x26]\n"
      "fcvt z24.h, p5/m, z24.s\n"
      "fcvt z25.h, p5/m, z25.s\n"
      "st1h { z13.s }, p3, [x26, #1, MUL VL]\n"
      "fcvt z26.h, p5/m, z26.s\n"
      "fcvt z27.h, p5/m, z27.s\n"
      "st1h { z14.s }, p2, [x26, #2, MUL VL]\n"
      "st1h { z15.s }, p1, [x26, #3, MUL VL]\n"
      "st1h { z16.s }, p4, [x25]\n"
      "st1h { z17.s }, p3, [x25, #1, MUL VL]\n"
      "st1h { z18.s }, p2, [x25, #2, MUL VL]\n"
      "st1h { z19.s }, p1, [x25, #3, MUL VL]\n"
      "st1h { z20.s }, p4, [x24]\n"
      "st1h { z21.s }, p3, [x24, #1, MUL VL]\n"
      "st1h { z22.s }, p2, [x24, #2, MUL VL]\n"
      "st1h { z23.s }, p1, [x24, #3, MUL VL]\n"
      "st1h { z24.s }, p4, [x23]\n"
      "st1h { z25.s }, p3, [x23, #1, MUL VL]\n"
      "st1h { z26.s }, p2, [x23, #2, MUL VL]\n"
      "st1h { z27.s }, p1, [x23, #3, MUL VL]\n"
      "decw x13, ALL, MUL #4\n"
      "cmp x13, XZR\n"
      "bgt 58b\n"
      "b 86f\n"
      "71:"  // Height 6
      "ldr x20, [%x[args_ptr], %[offsetof_B_ptr]]\n"
      "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
      "mov x21, #0xc\n"
      "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
      "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
      "ldr x11, [%x[args_ptr], %[offsetof_output_ptr]]\n"
      "ldr x12, [%x[args_ptr], %[offsetof_bias]]\n"
      "madd x21, x20, x21, x11\n"
      "str x21, [%x[args_ptr], %[offsetof_output_ptr]]\n"
      "72:"  // Height 6: Column loop
      "ldr x10, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
      "ldr x20, [%x[args_ptr], %[offsetof_B_stride]]\n"
      "cnth x21\n"
      "add x9, x10, x20, LSL #1\n"
      "add x20, x9, x20, LSL #1\n"
      "str x20, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
      "cmp x13, x21\n"
      "bgt 73f\n"
      "mov x9, x10\n"
      "73:"  // Height 6: B setup done
      "mov x20, #0\n"
      "whilelt p4.s, x20, x13\n"
      "incw x20\n"
      "whilelt p3.s, x20, x13\n"
      "incw x20\n"
      "whilelt p2.s, x20, x13\n"
      "incw x20\n"
      "whilelt p1.s, x20, x13\n"
      "cbz x12, 74f\n"
      "ld1h { z8.s }, p4/Z, [x12]\n"
      "ld1h { z9.s }, p3/Z, [x12, #1, MUL VL]\n"
      "ld1h { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
      "ld1h { z11.s }, p1/Z, [x12, #3, MUL VL]\n"
      "addvl x12, x12, #2\n"
      "fcvt z8.s, p5/m, z8.h\n"
      "fcvt z9.s, p5/m, z9.h\n"
      "fcvt z10.s, p5/m, z10.h\n"
      "fcvt z11.s, p5/m, z11.h\n"
      "mov z12.d, z8.d\n"
      "mov z16.d, z8.d\n"
      "mov z20.d, z8.d\n"
      "mov z13.d, z9.d\n"
      "mov z14.d, z10.d\n"
      "mov z15.d, z11.d\n"
      "mov z17.d, z9.d\n"
      "mov z18.d, z10.d\n"
      "mov z19.d, z11.d\n"
      "mov z21.d, z9.d\n"
      "mov z22.d, z10.d\n"
      "mov z23.d, z11.d\n"
      "mov z24.d, z8.d\n"
      "mov z25.d, z9.d\n"
      "mov z26.d, z10.d\n"
      "mov z27.d, z11.d\n"
      "mov z28.d, z8.d\n"
      "mov z29.d, z9.d\n"
      "mov z30.d, z10.d\n"
      "mov z31.d, z11.d\n"
      "b 76f\n"
      "74:"  // Height 6: no bias
      "tbz %x[flags], #0, 75f\n"
      "ldr x24, [%x[args_ptr], %[offsetof_output_offset]]\n"
      "ld1h { z8.s }, p4/Z, [x11]\n"
      "ld1h { z9.s }, p3/Z, [x11, #1, MUL VL]\n"
      "ld1h { z10.s }, p2/Z, [x11, #2, MUL VL]\n"
      "ld1h { z11.s }, p1/Z, [x11, #3, MUL VL]\n"
      "add x20, x11, x24, LSL #1\n"
      "add x23, x20, x24, LSL #1\n"
      "add x22, x23, x24, LSL #1\n"
      "ld1h { z12.s }, p4/Z, [x20]\n"
      "ld1h { z13.s }, p3/Z, [x20, #1, MUL VL]\n"
      "fcvt z8.s, p5/m, z8.h\n"
      "add x21, x22, x24, LSL #1\n"
      "ld1h { z14.s }, p2/Z, [x20, #2, MUL VL]\n"
      "ld1h { z15.s }, p1/Z, [x20, #3, MUL VL]\n"
      "fcvt z9.s, p5/m, z9.h\n"
      "add x20, x21, x24, LSL #1\n"
      "ld1h { z16.s }, p4/Z, [x23]\n"
      "ld1h { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
      "fcvt z10.s, p5/m, z10.h\n"
      "ld1h { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
      "ld1h { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
      "fcvt z11.s, p5/m, z11.h\n"
      "fcvt z12.s, p5/m, z12.h\n"
      "ld1h { z20.s }, p4/Z, [x22]\n"
      "ld1h { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
      "fcvt z13.s, p5/m, z13.h\n"
      "fcvt z14.s, p5/m, z14.h\n"
      "ld1h { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
      "ld1h { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
      "fcvt z15.s, p5/m, z15.h\n"
      "fcvt z16.s, p5/m, z16.h\n"
      "ld1h { z24.s }, p4/Z, [x21]\n"
      "ld1h { z25.s }, p3/Z, [x21, #1, MUL VL]\n"
      "fcvt z17.s, p5/m, z17.h\n"
      "fcvt z18.s, p5/m, z18.h\n"
      "ld1h { z26.s }, p2/Z, [x21, #2, MUL VL]\n"
      "ld1h { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
      "fcvt z19.s, p5/m, z19.h\n"
      "fcvt z20.s, p5/m, z20.h\n"
      "ld1h { z28.s }, p4/Z, [x20]\n"
      "ld1h { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
      "fcvt z21.s, p5/m, z21.h\n"
      "fcvt z22.s, p5/m, z22.h\n"
      "ld1h { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
      "ld1h { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
      "fcvt z23.s, p5/m, z23.h\n"
      "fcvt z24.s, p5/m, z24.h\n"
      "fcvt z25.s, p5/m, z25.h\n"
      "fcvt z26.s, p5/m, z26.h\n"
      "fcvt z27.s, p5/m, z27.h\n"
      "fcvt z28.s, p5/m, z28.h\n"
      "fcvt z29.s, p5/m, z29.h\n"
      "fcvt z30.s, p5/m, z30.h\n"
      "fcvt z31.s, p5/m, z31.h\n"
      "b 76f\n"
      "75:"  // Height 6: no accumulate
      "mov z8.b, #0\n"
      "mov z9.b, #0\n"
      "mov z10.b, #0\n"
      "mov z11.b, #0\n"
      "mov z12.b, #0\n"
      "mov z13.b, #0\n"
      "mov z14.b, #0\n"
      "mov z15.b, #0\n"
      "mov z16.b, #0\n"
      "mov z17.b, #0\n"
      "mov z18.b, #0\n"
      "mov z19.b, #0\n"
      "mov z20.b, #0\n"
      "mov z21.b, #0\n"
      "mov z22.b, #0\n"
      "mov z23.b, #0\n"
      "mov z24.b, #0\n"
      "mov z25.b, #0\n"
      "mov z26.b, #0\n"
      "mov z27.b, #0\n"
      "mov z28.b, #0\n"
      "mov z29.b, #0\n"
      "mov z30.b, #0\n"
      "mov z31.b, #0\n"
      "76:"  // Height 6: setup done
      "mov x28, #0\n"
      "77:"  // Height 6: String loop
      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
      "ldr w27, [x20, x28, LSL #0x2]\n"
      "tbz %x[flags], #3, 78f\n"
      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
      "add x20, x20, x21, LSL #3\n"
      "ldr x26, [x20, #0]\n"
      "ldr x25, [x20, #0x8]\n"
      "ldr x24, [x20, #0x10]\n"
      "ldr x23, [x20, #0x18]\n"
      "ldr x22, [x20, #0x20]\n"
      "ldr x21, [x20, #0x28]\n"
      "cbnz x28, 79f\n"
      "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
      "add x26, x26, x20, LSL #1\n"
      "add x25, x25, x20, LSL #1\n"
      "add x24, x24, x20, LSL #1\n"
      "add x23, x23, x20, LSL #1\n"
      "add x22, x22, x20, LSL #1\n"
      "add x21, x21, x20, LSL #1\n"
      "b 79f\n"
      "78:"  // Height 6: setup direct input
      "mov x26, %x[input_ptr]\n"
      "add x25, x26, x21, LSL #1\n"
      "add x24, x25, x21, LSL #1\n"
      "add x23, x24, x21, LSL #1\n"
      "add x22, x23, x21, LSL #1\n"
      "add x21, x22, x21, LSL #1\n"
      "79:"  // Height 6: input setup done
      "cmp x27, #0x8\n"
      "ble 81f\n"
      "80:"  // Height 6: Multiply loop: Main loop head
      "whilelt p0.h, XZR, x27\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      "sub x27, x27, #0x8\n"
      "cmp x27, #0x8\n"
      "ld1rqh { z0.h }, p0/Z, [x26]\n"
      "add x26, x26, #0x10\n"
      "ld1rqh { z1.h }, p0/Z, [x25]\n"
      "add x25, x25, #0x10\n"
      "ld1rqh { z2.h }, p0/Z, [x24]\n"
      "ld1rqh { z3.h }, p0/Z, [x23]\n"
      "ld1rqh { z4.h }, p0/Z, [x22]\n"
      "add x24, x24, #0x10\n"
      "add x23, x23, #0x10\n"
      "ld1rqh { z5.h }, p0/Z, [x21]\n"
      ".inst 0x64a040c8  // fmlalb z8.s, z6.h, z0.h[0]\n"
      ".inst 0x64a040e9  // fmlalb z9.s, z7.h, z0.h[0]\n"
      "add x22, x22, #0x10\n"
      ".inst 0x64a140cc  // fmlalb z12.s, z6.h, z1.h[0]\n"
      ".inst 0x64a240d0  // fmlalb z16.s, z6.h, z2.h[0]\n"
      "add x21, x21, #0x10\n"
      ".inst 0x64a340d4  // fmlalb z20.s, z6.h, z3.h[0]\n"
      ".inst 0x64a440d8  // fmlalb z24.s, z6.h, z4.h[0]\n"
      ".inst 0x64a540dc  // fmlalb z28.s, z6.h, z5.h[0]\n"
      ".inst 0x64a140ed  // fmlalb z13.s, z7.h, z1.h[0]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a240f1  // fmlalb z17.s, z7.h, z2.h[0]\n"
      ".inst 0x64a340f5  // fmlalb z21.s, z7.h, z3.h[0]\n"
      ".inst 0x64a440f9  // fmlalb z25.s, z7.h, z4.h[0]\n"
      ".inst 0x64a540fd  // fmlalb z29.s, z7.h, z5.h[0]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a040ca  // fmlalb z10.s, z6.h, z0.h[0]\n"
      ".inst 0x64a140ce  // fmlalb z14.s, z6.h, z1.h[0]\n"
      ".inst 0x64a240d2  // fmlalb z18.s, z6.h, z2.h[0]\n"
      ".inst 0x64a340d6  // fmlalb z22.s, z6.h, z3.h[0]\n"
      ".inst 0x64a440da  // fmlalb z26.s, z6.h, z4.h[0]\n"
      ".inst 0x64a540de  // fmlalb z30.s, z6.h, z5.h[0]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64a040eb  // fmlalb z11.s, z7.h, z0.h[0]\n"
      ".inst 0x64a140ef  // fmlalb z15.s, z7.h, z1.h[0]\n"
      ".inst 0x64a240f3  // fmlalb z19.s, z7.h, z2.h[0]\n"
      ".inst 0x64a340f7  // fmlalb z23.s, z7.h, z3.h[0]\n"
      ".inst 0x64a440fb  // fmlalb z27.s, z7.h, z4.h[0]\n"
      ".inst 0x64a540ff  // fmlalb z31.s, z7.h, z5.h[0]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a048c8  // fmlalb z8.s, z6.h, z0.h[1]\n"
      ".inst 0x64a148cc  // fmlalb z12.s, z6.h, z1.h[1]\n"
      ".inst 0x64a248d0  // fmlalb z16.s, z6.h, z2.h[1]\n"
      ".inst 0x64a348d4  // fmlalb z20.s, z6.h, z3.h[1]\n"
      ".inst 0x64a448d8  // fmlalb z24.s, z6.h, z4.h[1]\n"
      ".inst 0x64a548dc  // fmlalb z28.s, z6.h, z5.h[1]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a048e9  // fmlalb z9.s, z7.h, z0.h[1]\n"
      ".inst 0x64a148ed  // fmlalb z13.s, z7.h, z1.h[1]\n"
      ".inst 0x64a248f1  // fmlalb z17.s, z7.h, z2.h[1]\n"
      ".inst 0x64a348f5  // fmlalb z21.s, z7.h, z3.h[1]\n"
      ".inst 0x64a448f9  // fmlalb z25.s, z7.h, z4.h[1]\n"
      ".inst 0x64a548fd  // fmlalb z29.s, z7.h, z5.h[1]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a048ca  // fmlalb z10.s, z6.h, z0.h[1]\n"
      ".inst 0x64a148ce  // fmlalb z14.s, z6.h, z1.h[1]\n"
      ".inst 0x64a248d2  // fmlalb z18.s, z6.h, z2.h[1]\n"
      ".inst 0x64a348d6  // fmlalb z22.s, z6.h, z3.h[1]\n"
      ".inst 0x64a448da  // fmlalb z26.s, z6.h, z4.h[1]\n"
      ".inst 0x64a548de  // fmlalb z30.s, z6.h, z5.h[1]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64a048eb  // fmlalb z11.s, z7.h, z0.h[1]\n"
      ".inst 0x64a148ef  // fmlalb z15.s, z7.h, z1.h[1]\n"
      ".inst 0x64a248f3  // fmlalb z19.s, z7.h, z2.h[1]\n"
      ".inst 0x64a348f7  // fmlalb z23.s, z7.h, z3.h[1]\n"
      ".inst 0x64a448fb  // fmlalb z27.s, z7.h, z4.h[1]\n"
      ".inst 0x64a548ff  // fmlalb z31.s, z7.h, z5.h[1]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a840c8  // fmlalb z8.s, z6.h, z0.h[2]\n"
      ".inst 0x64a940cc  // fmlalb z12.s, z6.h, z1.h[2]\n"
      ".inst 0x64aa40d0  // fmlalb z16.s, z6.h, z2.h[2]\n"
      ".inst 0x64ab40d4  // fmlalb z20.s, z6.h, z3.h[2]\n"
      ".inst 0x64ac40d8  // fmlalb z24.s, z6.h, z4.h[2]\n"
      ".inst 0x64ad40dc  // fmlalb z28.s, z6.h, z5.h[2]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a840e9  // fmlalb z9.s, z7.h, z0.h[2]\n"
      ".inst 0x64a940ed  // fmlalb z13.s, z7.h, z1.h[2]\n"
      ".inst 0x64aa40f1  // fmlalb z17.s, z7.h, z2.h[2]\n"
      ".inst 0x64ab40f5  // fmlalb z21.s, z7.h, z3.h[2]\n"
      ".inst 0x64ac40f9  // fmlalb z25.s, z7.h, z4.h[2]\n"
      ".inst 0x64ad40fd  // fmlalb z29.s, z7.h, z5.h[2]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a840ca  // fmlalb z10.s, z6.h, z0.h[2]\n"
      ".inst 0x64a940ce  // fmlalb z14.s, z6.h, z1.h[2]\n"
      ".inst 0x64aa40d2  // fmlalb z18.s, z6.h, z2.h[2]\n"
      ".inst 0x64ab40d6  // fmlalb z22.s, z6.h, z3.h[2]\n"
      ".inst 0x64ac40da  // fmlalb z26.s, z6.h, z4.h[2]\n"
      ".inst 0x64ad40de  // fmlalb z30.s, z6.h, z5.h[2]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64a840eb  // fmlalb z11.s, z7.h, z0.h[2]\n"
      ".inst 0x64a940ef  // fmlalb z15.s, z7.h, z1.h[2]\n"
      ".inst 0x64aa40f3  // fmlalb z19.s, z7.h, z2.h[2]\n"
      ".inst 0x64ab40f7  // fmlalb z23.s, z7.h, z3.h[2]\n"
      ".inst 0x64ac40fb  // fmlalb z27.s, z7.h, z4.h[2]\n"
      ".inst 0x64ad40ff  // fmlalb z31.s, z7.h, z5.h[2]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a848c8  // fmlalb z8.s, z6.h, z0.h[3]\n"
      ".inst 0x64a948cc  // fmlalb z12.s, z6.h, z1.h[3]\n"
      ".inst 0x64aa48d0  // fmlalb z16.s, z6.h, z2.h[3]\n"
      ".inst 0x64ab48d4  // fmlalb z20.s, z6.h, z3.h[3]\n"
      ".inst 0x64ac48d8  // fmlalb z24.s, z6.h, z4.h[3]\n"
      ".inst 0x64ad48dc  // fmlalb z28.s, z6.h, z5.h[3]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a848e9  // fmlalb z9.s, z7.h, z0.h[3]\n"
      ".inst 0x64a948ed  // fmlalb z13.s, z7.h, z1.h[3]\n"
      ".inst 0x64aa48f1  // fmlalb z17.s, z7.h, z2.h[3]\n"
      ".inst 0x64ab48f5  // fmlalb z21.s, z7.h, z3.h[3]\n"
      ".inst 0x64ac48f9  // fmlalb z25.s, z7.h, z4.h[3]\n"
      ".inst 0x64ad48fd  // fmlalb z29.s, z7.h, z5.h[3]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a848ca  // fmlalb z10.s, z6.h, z0.h[3]\n"
      ".inst 0x64a948ce  // fmlalb z14.s, z6.h, z1.h[3]\n"
      ".inst 0x64aa48d2  // fmlalb z18.s, z6.h, z2.h[3]\n"
      ".inst 0x64ab48d6  // fmlalb z22.s, z6.h, z3.h[3]\n"
      ".inst 0x64ac48da  // fmlalb z26.s, z6.h, z4.h[3]\n"
      ".inst 0x64ad48de  // fmlalb z30.s, z6.h, z5.h[3]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64a848eb  // fmlalb z11.s, z7.h, z0.h[3]\n"
      ".inst 0x64a948ef  // fmlalb z15.s, z7.h, z1.h[3]\n"
      ".inst 0x64aa48f3  // fmlalb z19.s, z7.h, z2.h[3]\n"
      ".inst 0x64ab48f7  // fmlalb z23.s, z7.h, z3.h[3]\n"
      ".inst 0x64ac48fb  // fmlalb z27.s, z7.h, z4.h[3]\n"
      ".inst 0x64ad48ff  // fmlalb z31.s, z7.h, z5.h[3]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b040c8  // fmlalb z8.s, z6.h, z0.h[4]\n"
      ".inst 0x64b140cc  // fmlalb z12.s, z6.h, z1.h[4]\n"
      ".inst 0x64b240d0  // fmlalb z16.s, z6.h, z2.h[4]\n"
      ".inst 0x64b340d4  // fmlalb z20.s, z6.h, z3.h[4]\n"
      ".inst 0x64b440d8  // fmlalb z24.s, z6.h, z4.h[4]\n"
      ".inst 0x64b540dc  // fmlalb z28.s, z6.h, z5.h[4]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b040e9  // fmlalb z9.s, z7.h, z0.h[4]\n"
      ".inst 0x64b140ed  // fmlalb z13.s, z7.h, z1.h[4]\n"
      ".inst 0x64b240f1  // fmlalb z17.s, z7.h, z2.h[4]\n"
      ".inst 0x64b340f5  // fmlalb z21.s, z7.h, z3.h[4]\n"
      ".inst 0x64b440f9  // fmlalb z25.s, z7.h, z4.h[4]\n"
      ".inst 0x64b540fd  // fmlalb z29.s, z7.h, z5.h[4]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b040ca  // fmlalb z10.s, z6.h, z0.h[4]\n"
      ".inst 0x64b140ce  // fmlalb z14.s, z6.h, z1.h[4]\n"
      ".inst 0x64b240d2  // fmlalb z18.s, z6.h, z2.h[4]\n"
      ".inst 0x64b340d6  // fmlalb z22.s, z6.h, z3.h[4]\n"
      ".inst 0x64b440da  // fmlalb z26.s, z6.h, z4.h[4]\n"
      ".inst 0x64b540de  // fmlalb z30.s, z6.h, z5.h[4]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64b040eb  // fmlalb z11.s, z7.h, z0.h[4]\n"
      ".inst 0x64b140ef  // fmlalb z15.s, z7.h, z1.h[4]\n"
      ".inst 0x64b240f3  // fmlalb z19.s, z7.h, z2.h[4]\n"
      ".inst 0x64b340f7  // fmlalb z23.s, z7.h, z3.h[4]\n"
      ".inst 0x64b440fb  // fmlalb z27.s, z7.h, z4.h[4]\n"
      ".inst 0x64b540ff  // fmlalb z31.s, z7.h, z5.h[4]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b048c8  // fmlalb z8.s, z6.h, z0.h[5]\n"
      ".inst 0x64b148cc  // fmlalb z12.s, z6.h, z1.h[5]\n"
      ".inst 0x64b248d0  // fmlalb z16.s, z6.h, z2.h[5]\n"
      ".inst 0x64b348d4  // fmlalb z20.s, z6.h, z3.h[5]\n"
      ".inst 0x64b448d8  // fmlalb z24.s, z6.h, z4.h[5]\n"
      ".inst 0x64b548dc  // fmlalb z28.s, z6.h, z5.h[5]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b048e9  // fmlalb z9.s, z7.h, z0.h[5]\n"
      ".inst 0x64b148ed  // fmlalb z13.s, z7.h, z1.h[5]\n"
      ".inst 0x64b248f1  // fmlalb z17.s, z7.h, z2.h[5]\n"
      ".inst 0x64b348f5  // fmlalb z21.s, z7.h, z3.h[5]\n"
      ".inst 0x64b448f9  // fmlalb z25.s, z7.h, z4.h[5]\n"
      ".inst 0x64b548fd  // fmlalb z29.s, z7.h, z5.h[5]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b048ca  // fmlalb z10.s, z6.h, z0.h[5]\n"
      ".inst 0x64b148ce  // fmlalb z14.s, z6.h, z1.h[5]\n"
      ".inst 0x64b248d2  // fmlalb z18.s, z6.h, z2.h[5]\n"
      ".inst 0x64b348d6  // fmlalb z22.s, z6.h, z3.h[5]\n"
      ".inst 0x64b448da  // fmlalb z26.s, z6.h, z4.h[5]\n"
      ".inst 0x64b548de  // fmlalb z30.s, z6.h, z5.h[5]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64b048eb  // fmlalb z11.s, z7.h, z0.h[5]\n"
      ".inst 0x64b148ef  // fmlalb z15.s, z7.h, z1.h[5]\n"
      ".inst 0x64b248f3  // fmlalb z19.s, z7.h, z2.h[5]\n"
      ".inst 0x64b348f7  // fmlalb z23.s, z7.h, z3.h[5]\n"
      ".inst 0x64b448fb  // fmlalb z27.s, z7.h, z4.h[5]\n"
      ".inst 0x64b548ff  // fmlalb z31.s, z7.h, z5.h[5]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b840c8  // fmlalb z8.s, z6.h, z0.h[6]\n"
      ".inst 0x64b940cc  // fmlalb z12.s, z6.h, z1.h[6]\n"
      ".inst 0x64ba40d0  // fmlalb z16.s, z6.h, z2.h[6]\n"
      ".inst 0x64bb40d4  // fmlalb z20.s, z6.h, z3.h[6]\n"
      ".inst 0x64bc40d8  // fmlalb z24.s, z6.h, z4.h[6]\n"
      ".inst 0x64bd40dc  // fmlalb z28.s, z6.h, z5.h[6]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b840e9  // fmlalb z9.s, z7.h, z0.h[6]\n"
      ".inst 0x64b940ed  // fmlalb z13.s, z7.h, z1.h[6]\n"
      ".inst 0x64ba40f1  // fmlalb z17.s, z7.h, z2.h[6]\n"
      ".inst 0x64bb40f5  // fmlalb z21.s, z7.h, z3.h[6]\n"
      ".inst 0x64bc40f9  // fmlalb z25.s, z7.h, z4.h[6]\n"
      ".inst 0x64bd40fd  // fmlalb z29.s, z7.h, z5.h[6]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b840ca  // fmlalb z10.s, z6.h, z0.h[6]\n"
      ".inst 0x64b940ce  // fmlalb z14.s, z6.h, z1.h[6]\n"
      ".inst 0x64ba40d2  // fmlalb z18.s, z6.h, z2.h[6]\n"
      ".inst 0x64bb40d6  // fmlalb z22.s, z6.h, z3.h[6]\n"
      ".inst 0x64bc40da  // fmlalb z26.s, z6.h, z4.h[6]\n"
      ".inst 0x64bd40de  // fmlalb z30.s, z6.h, z5.h[6]\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      ".inst 0x64b840eb  // fmlalb z11.s, z7.h, z0.h[6]\n"
      ".inst 0x64b940ef  // fmlalb z15.s, z7.h, z1.h[6]\n"
      ".inst 0x64ba40f3  // fmlalb z19.s, z7.h, z2.h[6]\n"
      ".inst 0x64bb40f7  // fmlalb z23.s, z7.h, z3.h[6]\n"
      ".inst 0x64bc40fb  // fmlalb z27.s, z7.h, z4.h[6]\n"
      ".inst 0x64bd40ff  // fmlalb z31.s, z7.h, z5.h[6]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b848c8  // fmlalb z8.s, z6.h, z0.h[7]\n"
      ".inst 0x64b948cc  // fmlalb z12.s, z6.h, z1.h[7]\n"
      ".inst 0x64ba48d0  // fmlalb z16.s, z6.h, z2.h[7]\n"
      ".inst 0x64bb48d4  // fmlalb z20.s, z6.h, z3.h[7]\n"
      ".inst 0x64bc48d8  // fmlalb z24.s, z6.h, z4.h[7]\n"
      ".inst 0x64bd48dc  // fmlalb z28.s, z6.h, z5.h[7]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b848e9  // fmlalb z9.s, z7.h, z0.h[7]\n"
      ".inst 0x64b948ed  // fmlalb z13.s, z7.h, z1.h[7]\n"
      ".inst 0x64ba48f1  // fmlalb z17.s, z7.h, z2.h[7]\n"
      ".inst 0x64bb48f5  // fmlalb z21.s, z7.h, z3.h[7]\n"
      ".inst 0x64bc48f9  // fmlalb z25.s, z7.h, z4.h[7]\n"
      ".inst 0x64bd48fd  // fmlalb z29.s, z7.h, z5.h[7]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b848ca  // fmlalb z10.s, z6.h, z0.h[7]\n"
      ".inst 0x64b948ce  // fmlalb z14.s, z6.h, z1.h[7]\n"
      ".inst 0x64ba48d2  // fmlalb z18.s, z6.h, z2.h[7]\n"
      ".inst 0x64bb48d6  // fmlalb z22.s, z6.h, z3.h[7]\n"
      ".inst 0x64bc48da  // fmlalb z26.s, z6.h, z4.h[7]\n"
      ".inst 0x64bd48de  // fmlalb z30.s, z6.h, z5.h[7]\n"
      ".inst 0x64b848eb  // fmlalb z11.s, z7.h, z0.h[7]\n"
      ".inst 0x64b948ef  // fmlalb z15.s, z7.h, z1.h[7]\n"
      ".inst 0x64ba48f3  // fmlalb z19.s, z7.h, z2.h[7]\n"
      ".inst 0x64bb48f7  // fmlalb z23.s, z7.h, z3.h[7]\n"
      ".inst 0x64bc48fb  // fmlalb z27.s, z7.h, z4.h[7]\n"
      ".inst 0x64bd48ff  // fmlalb z31.s, z7.h, z5.h[7]\n"
      "bgt 80b\n"
      "81:"  // Height 6: Multiply loop: Single iteration only
      "whilelt p0.h, XZR, x27\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      "ld1rqh { z0.h }, p0/Z, [x26]\n"
      "ld1rqh { z1.h }, p0/Z, [x25]\n"
      "ld1rqh { z2.h }, p0/Z, [x24]\n"
      "ld1rqh { z3.h }, p0/Z, [x23]\n"
      "ld1rqh { z4.h }, p0/Z, [x22]\n"
      "ld1rqh { z5.h }, p0/Z, [x21]\n"
      ".inst 0x64a040c8  // fmlalb z8.s, z6.h, z0.h[0]\n"
      ".inst 0x64a140cc  // fmlalb z12.s, z6.h, z1.h[0]\n"
      ".inst 0x64a040e9  // fmlalb z9.s, z7.h, z0.h[0]\n"
      ".inst 0x64a140ed  // fmlalb z13.s, z7.h, z1.h[0]\n"
      ".inst 0x64a240d0  // fmlalb z16.s, z6.h, z2.h[0]\n"
      ".inst 0x64a340d4  // fmlalb z20.s, z6.h, z3.h[0]\n"
      ".inst 0x64a440d8  // fmlalb z24.s, z6.h, z4.h[0]\n"
      ".inst 0x64a540dc  // fmlalb z28.s, z6.h, z5.h[0]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a240f1  // fmlalb z17.s, z7.h, z2.h[0]\n"
      ".inst 0x64a340f5  // fmlalb z21.s, z7.h, z3.h[0]\n"
      ".inst 0x64a440f9  // fmlalb z25.s, z7.h, z4.h[0]\n"
      ".inst 0x64a540fd  // fmlalb z29.s, z7.h, z5.h[0]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a040ca  // fmlalb z10.s, z6.h, z0.h[0]\n"
      ".inst 0x64a140ce  // fmlalb z14.s, z6.h, z1.h[0]\n"
      ".inst 0x64a240d2  // fmlalb z18.s, z6.h, z2.h[0]\n"
      ".inst 0x64a340d6  // fmlalb z22.s, z6.h, z3.h[0]\n"
      ".inst 0x64a440da  // fmlalb z26.s, z6.h, z4.h[0]\n"
      ".inst 0x64a540de  // fmlalb z30.s, z6.h, z5.h[0]\n"
      ".inst 0x64a040eb  // fmlalb z11.s, z7.h, z0.h[0]\n"
      ".inst 0x64a140ef  // fmlalb z15.s, z7.h, z1.h[0]\n"
      ".inst 0x64a240f3  // fmlalb z19.s, z7.h, z2.h[0]\n"
      ".inst 0x64a340f7  // fmlalb z23.s, z7.h, z3.h[0]\n"
      ".inst 0x64a440fb  // fmlalb z27.s, z7.h, z4.h[0]\n"
      ".inst 0x64a540ff  // fmlalb z31.s, z7.h, z5.h[0]\n"
      "ble 82f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a048c8  // fmlalb z8.s, z6.h, z0.h[1]\n"
      ".inst 0x64a148cc  // fmlalb z12.s, z6.h, z1.h[1]\n"
      ".inst 0x64a248d0  // fmlalb z16.s, z6.h, z2.h[1]\n"
      ".inst 0x64a348d4  // fmlalb z20.s, z6.h, z3.h[1]\n"
      ".inst 0x64a448d8  // fmlalb z24.s, z6.h, z4.h[1]\n"
      ".inst 0x64a548dc  // fmlalb z28.s, z6.h, z5.h[1]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a048e9  // fmlalb z9.s, z7.h, z0.h[1]\n"
      ".inst 0x64a148ed  // fmlalb z13.s, z7.h, z1.h[1]\n"
      ".inst 0x64a248f1  // fmlalb z17.s, z7.h, z2.h[1]\n"
      ".inst 0x64a348f5  // fmlalb z21.s, z7.h, z3.h[1]\n"
      ".inst 0x64a448f9  // fmlalb z25.s, z7.h, z4.h[1]\n"
      ".inst 0x64a548fd  // fmlalb z29.s, z7.h, z5.h[1]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a048ca  // fmlalb z10.s, z6.h, z0.h[1]\n"
      ".inst 0x64a148ce  // fmlalb z14.s, z6.h, z1.h[1]\n"
      ".inst 0x64a248d2  // fmlalb z18.s, z6.h, z2.h[1]\n"
      ".inst 0x64a348d6  // fmlalb z22.s, z6.h, z3.h[1]\n"
      ".inst 0x64a448da  // fmlalb z26.s, z6.h, z4.h[1]\n"
      ".inst 0x64a548de  // fmlalb z30.s, z6.h, z5.h[1]\n"
      ".inst 0x64a048eb  // fmlalb z11.s, z7.h, z0.h[1]\n"
      ".inst 0x64a148ef  // fmlalb z15.s, z7.h, z1.h[1]\n"
      ".inst 0x64a248f3  // fmlalb z19.s, z7.h, z2.h[1]\n"
      ".inst 0x64a348f7  // fmlalb z23.s, z7.h, z3.h[1]\n"
      ".inst 0x64a448fb  // fmlalb z27.s, z7.h, z4.h[1]\n"
      ".inst 0x64a548ff  // fmlalb z31.s, z7.h, z5.h[1]\n"
      "ble 82f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a840c8  // fmlalb z8.s, z6.h, z0.h[2]\n"
      ".inst 0x64a940cc  // fmlalb z12.s, z6.h, z1.h[2]\n"
      ".inst 0x64aa40d0  // fmlalb z16.s, z6.h, z2.h[2]\n"
      ".inst 0x64ab40d4  // fmlalb z20.s, z6.h, z3.h[2]\n"
      ".inst 0x64ac40d8  // fmlalb z24.s, z6.h, z4.h[2]\n"
      ".inst 0x64ad40dc  // fmlalb z28.s, z6.h, z5.h[2]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a840e9  // fmlalb z9.s, z7.h, z0.h[2]\n"
      ".inst 0x64a940ed  // fmlalb z13.s, z7.h, z1.h[2]\n"
      ".inst 0x64aa40f1  // fmlalb z17.s, z7.h, z2.h[2]\n"
      ".inst 0x64ab40f5  // fmlalb z21.s, z7.h, z3.h[2]\n"
      ".inst 0x64ac40f9  // fmlalb z25.s, z7.h, z4.h[2]\n"
      ".inst 0x64ad40fd  // fmlalb z29.s, z7.h, z5.h[2]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a840ca  // fmlalb z10.s, z6.h, z0.h[2]\n"
      ".inst 0x64a940ce  // fmlalb z14.s, z6.h, z1.h[2]\n"
      ".inst 0x64aa40d2  // fmlalb z18.s, z6.h, z2.h[2]\n"
      ".inst 0x64ab40d6  // fmlalb z22.s, z6.h, z3.h[2]\n"
      ".inst 0x64ac40da  // fmlalb z26.s, z6.h, z4.h[2]\n"
      ".inst 0x64ad40de  // fmlalb z30.s, z6.h, z5.h[2]\n"
      ".inst 0x64a840eb  // fmlalb z11.s, z7.h, z0.h[2]\n"
      ".inst 0x64a940ef  // fmlalb z15.s, z7.h, z1.h[2]\n"
      ".inst 0x64aa40f3  // fmlalb z19.s, z7.h, z2.h[2]\n"
      ".inst 0x64ab40f7  // fmlalb z23.s, z7.h, z3.h[2]\n"
      ".inst 0x64ac40fb  // fmlalb z27.s, z7.h, z4.h[2]\n"
      ".inst 0x64ad40ff  // fmlalb z31.s, z7.h, z5.h[2]\n"
      "ble 82f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64a848c8  // fmlalb z8.s, z6.h, z0.h[3]\n"
      ".inst 0x64a948cc  // fmlalb z12.s, z6.h, z1.h[3]\n"
      ".inst 0x64aa48d0  // fmlalb z16.s, z6.h, z2.h[3]\n"
      ".inst 0x64ab48d4  // fmlalb z20.s, z6.h, z3.h[3]\n"
      ".inst 0x64ac48d8  // fmlalb z24.s, z6.h, z4.h[3]\n"
      ".inst 0x64ad48dc  // fmlalb z28.s, z6.h, z5.h[3]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64a848e9  // fmlalb z9.s, z7.h, z0.h[3]\n"
      ".inst 0x64a948ed  // fmlalb z13.s, z7.h, z1.h[3]\n"
      ".inst 0x64aa48f1  // fmlalb z17.s, z7.h, z2.h[3]\n"
      ".inst 0x64ab48f5  // fmlalb z21.s, z7.h, z3.h[3]\n"
      ".inst 0x64ac48f9  // fmlalb z25.s, z7.h, z4.h[3]\n"
      ".inst 0x64ad48fd  // fmlalb z29.s, z7.h, z5.h[3]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64a848ca  // fmlalb z10.s, z6.h, z0.h[3]\n"
      ".inst 0x64a948ce  // fmlalb z14.s, z6.h, z1.h[3]\n"
      ".inst 0x64aa48d2  // fmlalb z18.s, z6.h, z2.h[3]\n"
      ".inst 0x64ab48d6  // fmlalb z22.s, z6.h, z3.h[3]\n"
      ".inst 0x64ac48da  // fmlalb z26.s, z6.h, z4.h[3]\n"
      ".inst 0x64ad48de  // fmlalb z30.s, z6.h, z5.h[3]\n"
      ".inst 0x64a848eb  // fmlalb z11.s, z7.h, z0.h[3]\n"
      ".inst 0x64a948ef  // fmlalb z15.s, z7.h, z1.h[3]\n"
      ".inst 0x64aa48f3  // fmlalb z19.s, z7.h, z2.h[3]\n"
      ".inst 0x64ab48f7  // fmlalb z23.s, z7.h, z3.h[3]\n"
      ".inst 0x64ac48fb  // fmlalb z27.s, z7.h, z4.h[3]\n"
      ".inst 0x64ad48ff  // fmlalb z31.s, z7.h, z5.h[3]\n"
      "ble 82f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b040c8  // fmlalb z8.s, z6.h, z0.h[4]\n"
      ".inst 0x64b140cc  // fmlalb z12.s, z6.h, z1.h[4]\n"
      ".inst 0x64b240d0  // fmlalb z16.s, z6.h, z2.h[4]\n"
      ".inst 0x64b340d4  // fmlalb z20.s, z6.h, z3.h[4]\n"
      ".inst 0x64b440d8  // fmlalb z24.s, z6.h, z4.h[4]\n"
      ".inst 0x64b540dc  // fmlalb z28.s, z6.h, z5.h[4]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b040e9  // fmlalb z9.s, z7.h, z0.h[4]\n"
      ".inst 0x64b140ed  // fmlalb z13.s, z7.h, z1.h[4]\n"
      ".inst 0x64b240f1  // fmlalb z17.s, z7.h, z2.h[4]\n"
      ".inst 0x64b340f5  // fmlalb z21.s, z7.h, z3.h[4]\n"
      ".inst 0x64b440f9  // fmlalb z25.s, z7.h, z4.h[4]\n"
      ".inst 0x64b540fd  // fmlalb z29.s, z7.h, z5.h[4]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b040ca  // fmlalb z10.s, z6.h, z0.h[4]\n"
      ".inst 0x64b140ce  // fmlalb z14.s, z6.h, z1.h[4]\n"
      ".inst 0x64b240d2  // fmlalb z18.s, z6.h, z2.h[4]\n"
      ".inst 0x64b340d6  // fmlalb z22.s, z6.h, z3.h[4]\n"
      ".inst 0x64b440da  // fmlalb z26.s, z6.h, z4.h[4]\n"
      ".inst 0x64b540de  // fmlalb z30.s, z6.h, z5.h[4]\n"
      ".inst 0x64b040eb  // fmlalb z11.s, z7.h, z0.h[4]\n"
      ".inst 0x64b140ef  // fmlalb z15.s, z7.h, z1.h[4]\n"
      ".inst 0x64b240f3  // fmlalb z19.s, z7.h, z2.h[4]\n"
      ".inst 0x64b340f7  // fmlalb z23.s, z7.h, z3.h[4]\n"
      ".inst 0x64b440fb  // fmlalb z27.s, z7.h, z4.h[4]\n"
      ".inst 0x64b540ff  // fmlalb z31.s, z7.h, z5.h[4]\n"
      "ble 82f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b048c8  // fmlalb z8.s, z6.h, z0.h[5]\n"
      ".inst 0x64b148cc  // fmlalb z12.s, z6.h, z1.h[5]\n"
      ".inst 0x64b248d0  // fmlalb z16.s, z6.h, z2.h[5]\n"
      ".inst 0x64b348d4  // fmlalb z20.s, z6.h, z3.h[5]\n"
      ".inst 0x64b448d8  // fmlalb z24.s, z6.h, z4.h[5]\n"
      ".inst 0x64b548dc  // fmlalb z28.s, z6.h, z5.h[5]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b048e9  // fmlalb z9.s, z7.h, z0.h[5]\n"
      ".inst 0x64b148ed  // fmlalb z13.s, z7.h, z1.h[5]\n"
      ".inst 0x64b248f1  // fmlalb z17.s, z7.h, z2.h[5]\n"
      ".inst 0x64b348f5  // fmlalb z21.s, z7.h, z3.h[5]\n"
      ".inst 0x64b448f9  // fmlalb z25.s, z7.h, z4.h[5]\n"
      ".inst 0x64b548fd  // fmlalb z29.s, z7.h, z5.h[5]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b048ca  // fmlalb z10.s, z6.h, z0.h[5]\n"
      ".inst 0x64b148ce  // fmlalb z14.s, z6.h, z1.h[5]\n"
      ".inst 0x64b248d2  // fmlalb z18.s, z6.h, z2.h[5]\n"
      ".inst 0x64b348d6  // fmlalb z22.s, z6.h, z3.h[5]\n"
      ".inst 0x64b448da  // fmlalb z26.s, z6.h, z4.h[5]\n"
      ".inst 0x64b548de  // fmlalb z30.s, z6.h, z5.h[5]\n"
      ".inst 0x64b048eb  // fmlalb z11.s, z7.h, z0.h[5]\n"
      ".inst 0x64b148ef  // fmlalb z15.s, z7.h, z1.h[5]\n"
      ".inst 0x64b248f3  // fmlalb z19.s, z7.h, z2.h[5]\n"
      ".inst 0x64b348f7  // fmlalb z23.s, z7.h, z3.h[5]\n"
      ".inst 0x64b448fb  // fmlalb z27.s, z7.h, z4.h[5]\n"
      ".inst 0x64b548ff  // fmlalb z31.s, z7.h, z5.h[5]\n"
      "ble 82f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "subs x27, x27, #0x1\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b840c8  // fmlalb z8.s, z6.h, z0.h[6]\n"
      ".inst 0x64b940cc  // fmlalb z12.s, z6.h, z1.h[6]\n"
      ".inst 0x64ba40d0  // fmlalb z16.s, z6.h, z2.h[6]\n"
      ".inst 0x64bb40d4  // fmlalb z20.s, z6.h, z3.h[6]\n"
      ".inst 0x64bc40d8  // fmlalb z24.s, z6.h, z4.h[6]\n"
      ".inst 0x64bd40dc  // fmlalb z28.s, z6.h, z5.h[6]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b840e9  // fmlalb z9.s, z7.h, z0.h[6]\n"
      ".inst 0x64b940ed  // fmlalb z13.s, z7.h, z1.h[6]\n"
      ".inst 0x64ba40f1  // fmlalb z17.s, z7.h, z2.h[6]\n"
      ".inst 0x64bb40f5  // fmlalb z21.s, z7.h, z3.h[6]\n"
      ".inst 0x64bc40f9  // fmlalb z25.s, z7.h, z4.h[6]\n"
      ".inst 0x64bd40fd  // fmlalb z29.s, z7.h, z5.h[6]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b840ca  // fmlalb z10.s, z6.h, z0.h[6]\n"
      ".inst 0x64b940ce  // fmlalb z14.s, z6.h, z1.h[6]\n"
      ".inst 0x64ba40d2  // fmlalb z18.s, z6.h, z2.h[6]\n"
      ".inst 0x64bb40d6  // fmlalb z22.s, z6.h, z3.h[6]\n"
      ".inst 0x64bc40da  // fmlalb z26.s, z6.h, z4.h[6]\n"
      ".inst 0x64bd40de  // fmlalb z30.s, z6.h, z5.h[6]\n"
      ".inst 0x64b840eb  // fmlalb z11.s, z7.h, z0.h[6]\n"
      ".inst 0x64b940ef  // fmlalb z15.s, z7.h, z1.h[6]\n"
      ".inst 0x64ba40f3  // fmlalb z19.s, z7.h, z2.h[6]\n"
      ".inst 0x64bb40f7  // fmlalb z23.s, z7.h, z3.h[6]\n"
      ".inst 0x64bc40fb  // fmlalb z27.s, z7.h, z4.h[6]\n"
      ".inst 0x64bd40ff  // fmlalb z31.s, z7.h, z5.h[6]\n"
      "ble 82f\n"
      "ld1h { z6.s }, p5/Z, [x10]\n"
      "ld1h { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
      "addvl x10, x10, #1\n"
      ".inst 0x64b848c8  // fmlalb z8.s, z6.h, z0.h[7]\n"
      ".inst 0x64b948cc  // fmlalb z12.s, z6.h, z1.h[7]\n"
      ".inst 0x64ba48d0  // fmlalb z16.s, z6.h, z2.h[7]\n"
      ".inst 0x64bb48d4  // fmlalb z20.s, z6.h, z3.h[7]\n"
      ".inst 0x64bc48d8  // fmlalb z24.s, z6.h, z4.h[7]\n"
      ".inst 0x64bd48dc  // fmlalb z28.s, z6.h, z5.h[7]\n"
      ".inst 0x64b848e9  // fmlalb z9.s, z7.h, z0.h[7]\n"
      "ld1h { z6.s }, p5/Z, [x9]\n"
      ".inst 0x64b948ed  // fmlalb z13.s, z7.h, z1.h[7]\n"
      ".inst 0x64ba48f1  // fmlalb z17.s, z7.h, z2.h[7]\n"
      ".inst 0x64bb48f5  // fmlalb z21.s, z7.h, z3.h[7]\n"
      ".inst 0x64bc48f9  // fmlalb z25.s, z7.h, z4.h[7]\n"
      ".inst 0x64bd48fd  // fmlalb z29.s, z7.h, z5.h[7]\n"
      "ld1h { z7.s }, p5/Z, [x9, #1, MUL VL]\n"
      ".inst 0x64b848ca  // fmlalb z10.s, z6.h, z0.h[7]\n"
      "addvl x9, x9, #1\n"
      ".inst 0x64b948ce  // fmlalb z14.s, z6.h, z1.h[7]\n"
      ".inst 0x64ba48d2  // fmlalb z18.s, z6.h, z2.h[7]\n"
      ".inst 0x64bb48d6  // fmlalb z22.s, z6.h, z3.h[7]\n"
      ".inst 0x64bc48da  // fmlalb z26.s, z6.h, z4.h[7]\n"
      ".inst 0x64bd48de  // fmlalb z30.s, z6.h, z5.h[7]\n"
      ".inst 0x64b848eb  // fmlalb z11.s, z7.h, z0.h[7]\n"
      ".inst 0x64b948ef  // fmlalb z15.s, z7.h, z1.h[7]\n"
      ".inst 0x64ba48f3  // fmlalb z19.s, z7.h, z2.h[7]\n"
      ".inst 0x64bb48f7  // fmlalb z23.s, z7.h, z3.h[7]\n"
      ".inst 0x64bc48fb  // fmlalb z27.s, z7.h, z4.h[7]\n"
      ".inst 0x64bd48ff  // fmlalb z31.s, z7.h, z5.h[7]\n"
      "82:"  // Height 6: Multiply loop: multiply skip
      "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
      "add x28, x28, #0x1\n"
      "cmp x28, x20\n"
      "bne 77b\n"
      "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
      "add x26, x11, x20, LSL #1\n"
      "add x25, x26, x20, LSL #1\n"
      "add x24, x25, x20, LSL #1\n"
      "add x23, x24, x20, LSL #1\n"
      "add x22, x23, x20, LSL #1\n"
      "tbz %x[flags], #1, 83f\n"
      "add x21, %x[args_ptr], %[offsetof_maxval]\n"
      "add x20, %x[args_ptr], %[offsetof_minval]\n"
      "ld1rw { z1.s }, p5/Z, [x21]\n"
      "ld1rw { z0.s }, p5/Z, [x20]\n"
      "fmin z8.s, p5/M, z8.s, z1.s\n"
      "fmin z9.s, p5/M, z9.s, z1.s\n"
      "fmin z10.s, p5/M, z10.s, z1.s\n"
      "fmin z11.s, p5/M, z11.s, z1.s\n"
      "fmin z12.s, p5/M, z12.s, z1.s\n"
      "fmin z13.s, p5/M, z13.s, z1.s\n"
      "fmin z14.s, p5/M, z14.s, z1.s\n"
      "fmin z15.s, p5/M, z15.s, z1.s\n"
      "fmin z16.s, p5/M, z16.s, z1.s\n"
      "fmin z17.s, p5/M, z17.s, z1.s\n"
      "fmin z18.s, p5/M, z18.s, z1.s\n"
      "fmin z19.s, p5/M, z19.s, z1.s\n"
      "fmin z20.s, p5/M, z20.s, z1.s\n"
      "fmin z21.s, p5/M, z21.s, z1.s\n"
      "fmin z22.s, p5/M, z22.s, z1.s\n"
      "fmin z23.s, p5/M, z23.s, z1.s\n"
      "fmin z24.s, p5/M, z24.s, z1.s\n"
      "fmin z25.s, p5/M, z25.s, z1.s\n"
      "fmin z26.s, p5/M, z26.s, z1.s\n"
      "fmin z27.s, p5/M, z27.s, z1.s\n"
      "fmin z28.s, p5/M, z28.s, z1.s\n"
      "fmin z29.s, p5/M, z29.s, z1.s\n"
      "fmin z30.s, p5/M, z30.s, z1.s\n"
      "fmin z31.s, p5/M, z31.s, z1.s\n"
      "fmax z8.s, p5/M, z8.s, z0.s\n"
      "fmax z9.s, p5/M, z9.s, z0.s\n"
      "fmax z10.s, p5/M, z10.s, z0.s\n"
      "fmax z11.s, p5/M, z11.s, z0.s\n"
      "fmax z12.s, p5/M, z12.s, z0.s\n"
      "fmax z13.s, p5/M, z13.s, z0.s\n"
      "fmax z14.s, p5/M, z14.s, z0.s\n"
      "fmax z15.s, p5/M, z15.s, z0.s\n"
      "fmax z16.s, p5/M, z16.s, z0.s\n"
      "fmax z17.s, p5/M, z17.s, z0.s\n"
      "fmax z18.s, p5/M, z18.s, z0.s\n"
      "fmax z19.s, p5/M, z19.s, z0.s\n"
      "fmax z20.s, p5/M, z20.s, z0.s\n"
      "fmax z21.s, p5/M, z21.s, z0.s\n"
      "fmax z22.s, p5/M, z22.s, z0.s\n"
      "fmax z23.s, p5/M, z23.s, z0.s\n"
      "fmax z24.s, p5/M, z24.s, z0.s\n"
      "fmax z25.s, p5/M, z25.s, z0.s\n"
      "fmax z26.s, p5/M, z26.s, z0.s\n"
      "fmax z27.s, p5/M, z27.s, z0.s\n"
      "fmax z28.s, p5/M, z28.s, z0.s\n"
      "fmax z29.s, p5/M, z29.s, z0.s\n"
      "fmax z30.s, p5/M, z30.s, z0.s\n"
      "fmax z31.s, p5/M, z31.s, z0.s\n"
      "83:"  // Height 6: No activation
      "fcvt z8.h, p5/m, z8.s\n"
      "fcvt z9.h, p5/m, z9.s\n"
      "fcvt z10.h, p5/m, z10.s\n"
      "fcvt z11.h, p5/m, z11.s\n"
      "fcvt z12.h, p5/m, z12.s\n"
      "fcvt z13.h, p5/m, z13.s\n"
      "fcvt z14.h, p5/m, z14.s\n"
      "fcvt z15.h, p5/m, z15.s\n"
      "st1h { z8.s }, p4, [x11]\n"
      "fcvt z16.h, p5/m, z16.s\n"
      "fcvt z17.h, p5/m, z17.s\n"
      "st1h { z9.s }, p3, [x11, #1, MUL VL]\n"
      "fcvt z18.h, p5/m, z18.s\n"
      "fcvt z19.h, p5/m, z19.s\n"
      "st1h { z10.s }, p2, [x11, #2, MUL VL]\n"
      "fcvt z20.h, p5/m, z20.s\n"
      "fcvt z21.h, p5/m, z21.s\n"
      "st1h { z11.s }, p1, [x11, #3, MUL VL]\n"
      "addvl x11, x11, #2\n"
      "fcvt z22.h, p5/m, z22.s\n"
      "fcvt z23.h, p5/m, z23.s\n"
      "st1h { z12.s }, p4, [x26]\n"
      "fcvt z24.h, p5/m, z24.s\n"
      "fcvt z25.h, p5/m, z25.s\n"
      "st1h { z13.s }, p3, [x26, #1, MUL VL]\n"
      "fcvt z26.h, p5/m, z26.s\n"
      "fcvt z27.h, p5/m, z27.s\n"
      "st1h { z14.s }, p2, [x26, #2, MUL VL]\n"
      "fcvt z28.h, p5/m, z28.s\n"
      "fcvt z29.h, p5/m, z29.s\n"
      "st1h { z15.s }, p1, [x26, #3, MUL VL]\n"
      "fcvt z30.h, p5/m, z30.s\n"
      "fcvt z31.h, p5/m, z31.s\n"
      "st1h { z16.s }, p4, [x25]\n"
      "st1h { z17.s }, p3, [x25, #1, MUL VL]\n"
      "st1h { z18.s }, p2, [x25, #2, MUL VL]\n"
      "st1h { z19.s }, p1, [x25, #3, MUL VL]\n"
      "st1h { z20.s }, p4, [x24]\n"
      "st1h { z21.s }, p3, [x24, #1, MUL VL]\n"
      "st1h { z22.s }, p2, [x24, #2, MUL VL]\n"
      "st1h { z23.s }, p1, [x24, #3, MUL VL]\n"
      "st1h { z24.s }, p4, [x23]\n"
      "st1h { z25.s }, p3, [x23, #1, MUL VL]\n"
      "st1h { z26.s }, p2, [x23, #2, MUL VL]\n"
      "st1h { z27.s }, p1, [x23, #3, MUL VL]\n"
      "st1h { z28.s }, p4, [x22]\n"
      "st1h { z29.s }, p3, [x22, #1, MUL VL]\n"
      "st1h { z30.s }, p2, [x22, #2, MUL VL]\n"
      "st1h { z31.s }, p1, [x22, #3, MUL VL]\n"
      "decw x13, ALL, MUL #4\n"
      "cmp x13, XZR\n"
      "bgt 72b\n"
      "subs %x[M], %x[M], #0x6\n"
      "beq 86f\n"
      "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
      "tbz %x[flags], #3, 85f\n"
      "add x21, x21, #0x6\n"
      "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
      "b 1b\n"
      "85:"  // Update direct input
      "mov x20, #0xc\n"
      "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
      "b 1b\n"
      "86:"  // Exit
      : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr)
      : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_maxval] "I" (offsetof(KernelArgs, maxval)), [offsetof_minval] "I" (offsetof(KernelArgs, minval)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_output_ptr] "I" (offsetof(KernelArgs, output_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
      : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
    );
}

} // namespace arm_gemm

#endif // (defined(ENABLE_FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)) && defined(ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__aarch64__)

