/*
 * Copyright (c) 2021-2024, 2026 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#if defined(__aarch64__)

#include "arm_gemm/arm_gemm.hpp"
#include <cstdint>

namespace arm_conv {
namespace depthwise {

void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const uint8_t *const *const inptrs, const uint8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, uint8_t *const *const outptrs)
{
  __asm__ __volatile__(
    "mov x17, #0x1\n"
    "lsr x16, %x[n_channels], #0x4\n"
    "ldp x15, x14, [%x[inptrs], #0x0]\n"
    "ldp x27, x26, [%x[inptrs], #0x10]\n"
    "add x21, %x[qp], %[offsetof_Requantize32_minval]\n"
    "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
    "ldp x25, x24, [%x[inptrs], #0x20]\n"
    "ldp x23, x22, [%x[inptrs], #0x30]\n"
    "ld1r { v7.4s }, [x21]\n"
    "ld1r { v11.4s }, [x20]\n"
    "add x21, %x[qp], %[offsetof_Requantize32_b_offset]\n"
    "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
    "ld1r { v24.4s }, [x21]\n"
    "ld1r { v12.4s }, [x20]\n"
    "orr x17, x17, #0x100\n"
    "mov x13, #0x0\n"
    "mov x12, #0x0\n"
    "ldp x11, x10, [%x[outptrs], #0x0]\n"
    "ldp x9, x28, [%x[outptrs], #0x10]\n"
    "orr x17, x17, #0x10000\n"
    "dup v15.4s, w17\n"
    "cbz x16, 3f\n"
    "ldr q13, [x15, x13]\n"
    "ldr q5, [x14, x13]\n"
    "subs x16, x16, #0x1\n"
    "ldr q27, [x27, x13]\n"
    "ldr q9, [x26, x13]\n"
    "ldr q1, [x25, x13]\n"
    "ldr q28, [x24, x13]\n"
    "ldr q26, [x23, x13]\n"
    "ldr q4, [x22, x13]\n"
    "ldr q30, [%x[params], #0x10]\n"
    "ldr q8, [%x[params], #0x20]\n"
    "zip2 v19.16b, v13.16b, v27.16b\n"
    "zip1 v13.16b, v13.16b, v27.16b\n"
    "ldr q17, [%x[params], #0x30]\n"
    "ldp x27, x26, [%x[inptrs], #0x40]\n"
    "zip1 v3.16b, v5.16b, v9.16b\n"
    "zip2 v9.16b, v5.16b, v9.16b\n"
    "ldp x25, x24, [%x[inptrs], #0x50]\n"
    "ldp x23, x22, [%x[inptrs], #0x60]\n"
    "zip2 v18.16b, v1.16b, v26.16b\n"
    "zip1 v1.16b, v1.16b, v26.16b\n"
    "ldp x21, x20, [%x[inptrs], #0x70]\n"
    "zip1 v16.16b, v28.16b, v4.16b\n"
    "zip2 v4.16b, v28.16b, v4.16b\n"
    "ldr q10, [x27, x13]\n"
    "ldr q14, [x26, x13]\n"
    "zip2 v2.16b, v13.16b, v3.16b\n"
    "zip1 v13.16b, v13.16b, v3.16b\n"
    "ldp x15, x14, [%x[inptrs], #0x0]\n"
    "ldr q3, [x25, x13]\n"
    "ldr q6, [x24, x13]\n"
    "zip1 v0.16b, v19.16b, v9.16b\n"
    "zip2 v9.16b, v19.16b, v9.16b\n"
    "ldr q5, [x23, x13]\n"
    "ldr q20, [x22, x13]\n"
    "zip2 v21.16b, v1.16b, v16.16b\n"
    "zip1 v1.16b, v1.16b, v16.16b\n"
    "ldr q16, [x21, x13]\n"
    "ldr q25, [x20, x13]\n"
    "zip1 v28.16b, v18.16b, v4.16b\n"
    "zip2 v4.16b, v18.16b, v4.16b\n"
    "ldr q31, [%x[params], #0x0]\n"
    "zip2 v19.16b, v10.16b, v3.16b\n"
    "zip1 v10.16b, v10.16b, v3.16b\n"
    "ldp x27, x26, [%x[inptrs], #0x10]\n"
    "zip1 v18.16b, v14.16b, v6.16b\n"
    "zip2 v6.16b, v14.16b, v6.16b\n"
    "ldp x25, x24, [%x[inptrs], #0x20]\n"
    "ldp x23, x22, [%x[inptrs], #0x30]\n"
    "zip2 v23.16b, v5.16b, v16.16b\n"
    "zip1 v5.16b, v5.16b, v16.16b\n"
    "add %x[params], %x[params], #0x40\n"
    "zip1 v16.16b, v20.16b, v25.16b\n"
    "zip2 v25.16b, v20.16b, v25.16b\n"
    "zip2 v29.16b, v10.16b, v18.16b\n"
    "zip1 v10.16b, v10.16b, v18.16b\n"
    "zip1 v27.16b, v19.16b, v6.16b\n"
    "zip2 v6.16b, v19.16b, v6.16b\n"
    "zip2 v18.16b, v5.16b, v16.16b\n"
    "zip1 v5.16b, v5.16b, v16.16b\n"
    "zip1 v14.16b, v23.16b, v25.16b\n"
    "zip2 v25.16b, v23.16b, v25.16b\n"
    "mov v26.16b, v31.16b\n"
    "mov v3.16b, v31.16b\n"
    "mov v23.16b, v31.16b\n"
    "beq 2f\n"
    "1:"  // Loop
    "movi v19.4s, #0x0\n"
    ".inst 0x6e8d97df  // udot v31.4s, v30.16b, v13.16b\n"
    ".inst 0x6e8197c3  // udot v3.4s, v30.16b, v1.16b\n"
    "add x13, x13, #0x10\n"
    "movi v22.4s, #0x0\n"
    "subs x16, x16, #0x1\n"
    ".inst 0x6e8195f3  // udot v19.4s, v15.16b, v1.16b\n"
    ".inst 0x6e81951f  // udot v31.4s, v8.16b, v1.16b\n"
    "ext v1.16b, v1.16b, v1.16b, #0x1\n"
    ".inst 0x6e8a9503  // udot v3.4s, v8.16b, v10.16b\n"
    ".inst 0x6e8a95f3  // udot v19.4s, v15.16b, v10.16b\n"
    ".inst 0x6e8195f6  // udot v22.4s, v15.16b, v1.16b\n"
    ".inst 0x6e8a963f  // udot v31.4s, v17.16b, v10.16b\n"
    "ext v10.16b, v10.16b, v10.16b, #0x1\n"
    ".inst 0x6e8197d7  // udot v23.4s, v30.16b, v1.16b\n"
    "mov v16.16b, v19.16b\n .inst 0x6e8595f0  // udot v16.4s, v15.16b, v5.16b\n"
    ".inst 0x6e8d95f3  // udot v19.4s, v15.16b, v13.16b\n"
    "ext v13.16b, v13.16b, v13.16b, #0x1\n"
    ".inst 0x6e859623  // udot v3.4s, v17.16b, v5.16b\n"
    "ext v5.16b, v5.16b, v5.16b, #0x1\n"
    ".inst 0x6e8a95f6  // udot v22.4s, v15.16b, v10.16b\n"
    ".inst 0x6e8d97da  // udot v26.4s, v30.16b, v13.16b\n"
    ".inst 0x6e8a9517  // udot v23.4s, v8.16b, v10.16b\n"
    "mls v31.4s, v19.4s, v24.4s\n"
    "movi v19.4s, #0x0\n"
    "mls v3.4s, v16.4s, v24.4s\n"
    ".inst 0x6e81951a  // udot v26.4s, v8.16b, v1.16b\n"
    "ldr q8, [%x[params], #0x10]\n"
    "mov v16.16b, v22.16b\n .inst 0x6e8595f0  // udot v16.4s, v15.16b, v5.16b\n"
    ".inst 0x6e8d95f6  // udot v22.4s, v15.16b, v13.16b\n"
    "ldr q1, [%x[params], #0x0]\n"
    ".inst 0x6e9595f3  // udot v19.4s, v15.16b, v21.16b\n"
    ".inst 0x6e859637  // udot v23.4s, v17.16b, v5.16b\n"
    ".inst 0x6e8a963a  // udot v26.4s, v17.16b, v10.16b\n"
    "sqrdmulh v31.4s, v31.4s, v1.4s\n"
    "sqrdmulh v3.4s, v3.4s, v1.4s\n"
    "mls v23.4s, v16.4s, v24.4s\n"
    "and v16.16b, v31.16b, v8.16b\n"
    ".inst 0x6e9d95f3  // udot v19.4s, v15.16b, v29.16b\n"
    "mls v26.4s, v22.4s, v24.4s\n"
    "movi v20.4s, #0x0\n"
    "sqrdmulh v23.4s, v23.4s, v1.4s\n"
    "and v30.16b, v3.16b, v8.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqrdmulh v26.4s, v26.4s, v1.4s\n"
    "ldr q10, [%x[params], #0x60]\n"
    "mov v22.16b, v19.16b\n .inst 0x6e9295f6  // udot v22.4s, v15.16b, v18.16b\n"
    ".inst 0x6e8295f3  // udot v19.4s, v15.16b, v2.16b\n"
    "sshr v30.4s, v30.4s, #0x1f\n"
    "sqadd v31.4s, v31.4s, v16.4s\n"
    "and v17.16b, v23.16b, v8.16b\n"
    "and v16.16b, v26.16b, v8.16b\n"
    "sqadd v3.4s, v3.4s, v30.4s\n"
    "ldr q5, [%x[params], #0x50]\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "srshl v31.4s, v31.4s, v8.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "srshl v3.4s, v3.4s, v8.4s\n"
    "sqadd v23.4s, v23.4s, v17.4s\n"
    "ldr q17, [%x[params], #0x30]\n"
    "add v31.4s, v31.4s, v12.4s\n"
    "sqadd v26.4s, v26.4s, v16.4s\n"
    "ldr q30, [%x[params], #0x40]\n"
    "add v3.4s, v3.4s, v12.4s\n"
    "srshl v23.4s, v23.4s, v8.4s\n"
    "smax v31.4s, v31.4s, v7.4s\n"
    "srshl v26.4s, v26.4s, v8.4s\n"
    "ldr q1, [%x[params], #0x70]\n"
    "smax v3.4s, v3.4s, v7.4s\n"
    "add v23.4s, v23.4s, v12.4s\n"
    "smin v31.4s, v31.4s, v11.4s\n"
    "add v26.4s, v26.4s, v12.4s\n"
    "smin v3.4s, v3.4s, v11.4s\n"
    "smax v23.4s, v23.4s, v7.4s\n"
    "uzp1 v31.16b, v31.16b, v31.16b\n"
    "smax v26.4s, v26.4s, v7.4s\n"
    "uzp1 v3.16b, v3.16b, v3.16b\n"
    "smin v23.4s, v23.4s, v11.4s\n"
    "uzp1 v31.16b, v31.16b, v31.16b\n"
    "smin v26.4s, v26.4s, v11.4s\n"
    "uzp1 v3.16b, v3.16b, v3.16b\n"
    "uzp1 v23.16b, v23.16b, v23.16b\n"
    "str s31, [x11, x12]\n"
    "ldr q31, [%x[params], #0x20]\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "str s3, [x9, x12]\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "uzp1 v23.16b, v23.16b, v23.16b\n"
    "mov v8.16b, v31.16b\n"
    "str s26, [x10, x12]\n"
    "mov v16.16b, v31.16b\n"
    "str s23, [x28, x12]\n"
    "mov v26.16b, v31.16b\n"
    ".inst 0x6e82963f  // udot v31.4s, v17.16b, v2.16b\n"
    "add x12, x12, #0x4\n"
    ".inst 0x6e959628  // udot v8.4s, v17.16b, v21.16b\n"
    "ext v2.16b, v2.16b, v2.16b, #0x1\n"
    ".inst 0x6e9597df  // udot v31.4s, v30.16b, v21.16b\n"
    "ext v21.16b, v21.16b, v21.16b, #0x1\n"
    ".inst 0x6e829630  // udot v16.4s, v17.16b, v2.16b\n"
    ".inst 0x6e95963a  // udot v26.4s, v17.16b, v21.16b\n"
    ".inst 0x6e9595f4  // udot v20.4s, v15.16b, v21.16b\n"
    ".inst 0x6e9d97c8  // udot v8.4s, v30.16b, v29.16b\n"
    ".inst 0x6e9d94bf  // udot v31.4s, v5.16b, v29.16b\n"
    "ext v29.16b, v29.16b, v29.16b, #0x1\n"
    ".inst 0x6e9597d0  // udot v16.4s, v30.16b, v21.16b\n"
    "ldr q3, [x24, x13]\n"
    ".inst 0x6e9d97da  // udot v26.4s, v30.16b, v29.16b\n"
    ".inst 0x6e9d95f4  // udot v20.4s, v15.16b, v29.16b\n"
    ".inst 0x6e9294a8  // udot v8.4s, v5.16b, v18.16b\n"
    "ext v18.16b, v18.16b, v18.16b, #0x1\n"
    "mls v31.4s, v19.4s, v24.4s\n"
    "movi v23.4s, #0x0\n"
    ".inst 0x6e9d94b0  // udot v16.4s, v5.16b, v29.16b\n"
    ".inst 0x6e9294ba  // udot v26.4s, v5.16b, v18.16b\n"
    "mov v17.16b, v20.16b\n .inst 0x6e9295f1  // udot v17.4s, v15.16b, v18.16b\n"
    ".inst 0x6e8295f4  // udot v20.4s, v15.16b, v2.16b\n"
    "ldr q2, [x14, x13]\n"
    ".inst 0x6e9c95f7  // udot v23.4s, v15.16b, v28.16b\n"
    "sqrdmulh v31.4s, v31.4s, v10.4s\n"
    "mls v8.4s, v22.4s, v24.4s\n"
    "mls v26.4s, v17.4s, v24.4s\n"
    "and v18.16b, v31.16b, v1.16b\n"
    "mls v16.4s, v20.4s, v24.4s\n"
    "movi v21.4s, #0x0\n"
    "sqrdmulh v8.4s, v8.4s, v10.4s\n"
    "sqrdmulh v26.4s, v26.4s, v10.4s\n"
    ".inst 0x6e9b95f7  // udot v23.4s, v15.16b, v27.16b\n"
    "sshr v18.4s, v18.4s, #0x1f\n"
    "sqrdmulh v16.4s, v16.4s, v10.4s\n"
    "ldr q13, [%x[params], #0xc0]\n"
    "and v17.16b, v8.16b, v1.16b\n"
    "sqadd v31.4s, v31.4s, v18.4s\n"
    "and v20.16b, v26.16b, v1.16b\n"
    "and v10.16b, v16.16b, v1.16b\n"
    "mov v19.16b, v23.16b\n .inst 0x6e8e95f3  // udot v19.4s, v15.16b, v14.16b\n"
    ".inst 0x6e8095f7  // udot v23.4s, v15.16b, v0.16b\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "srshl v31.4s, v31.4s, v1.4s\n"
    "sshr v10.4s, v10.4s, #0x1f\n"
    "sshr v20.4s, v20.4s, #0x1f\n"
    "sqadd v8.4s, v8.4s, v17.4s\n"
    "ldr q30, [%x[params], #0xb0]\n"
    "sqadd v16.4s, v16.4s, v10.4s\n"
    "ldr q17, [%x[params], #0xa0]\n"
    "add v31.4s, v31.4s, v12.4s\n"
    "sqadd v26.4s, v26.4s, v20.4s\n"
    "ldr q20, [%x[params], #0x90]\n"
    "srshl v8.4s, v8.4s, v1.4s\n"
    "srshl v16.4s, v16.4s, v1.4s\n"
    "smax v31.4s, v31.4s, v7.4s\n"
    "srshl v26.4s, v26.4s, v1.4s\n"
    "ldr q22, [%x[params], #0xd0]\n"
    "add v8.4s, v8.4s, v12.4s\n"
    "add v16.4s, v16.4s, v12.4s\n"
    "smin v31.4s, v31.4s, v11.4s\n"
    "add v26.4s, v26.4s, v12.4s\n"
    "smax v8.4s, v8.4s, v7.4s\n"
    "smax v16.4s, v16.4s, v7.4s\n"
    "uzp1 v31.16b, v31.16b, v31.16b\n"
    "smax v26.4s, v26.4s, v7.4s\n"
    "smin v8.4s, v8.4s, v11.4s\n"
    "smin v16.4s, v16.4s, v11.4s\n"
    "uzp1 v31.16b, v31.16b, v31.16b\n"
    "smin v26.4s, v26.4s, v11.4s\n"
    "uzp1 v8.16b, v8.16b, v8.16b\n"
    "uzp1 v16.16b, v16.16b, v16.16b\n"
    "str s31, [x11, x12]\n"
    "ldr q10, [%x[params], #0x80]\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "uzp1 v8.16b, v8.16b, v8.16b\n"
    "uzp1 v16.16b, v16.16b, v16.16b\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "str s16, [x10, x12]\n"
    "mov v18.16b, v10.16b\n"
    "str s8, [x9, x12]\n"
    "mov v8.16b, v10.16b\n"
    "str s26, [x28, x12]\n"
    "mov v26.16b, v10.16b\n"
    ".inst 0x6e80968a  // udot v10.4s, v20.16b, v0.16b\n"
    "add x12, x12, #0x4\n"
    ".inst 0x6e9c9688  // udot v8.4s, v20.16b, v28.16b\n"
    "ext v0.16b, v0.16b, v0.16b, #0x1\n"
    ".inst 0x6e9c962a  // udot v10.4s, v17.16b, v28.16b\n"
    "ext v28.16b, v28.16b, v28.16b, #0x1\n"
    ".inst 0x6e809692  // udot v18.4s, v20.16b, v0.16b\n"
    ".inst 0x6e9c969a  // udot v26.4s, v20.16b, v28.16b\n"
    ".inst 0x6e9c95f5  // udot v21.4s, v15.16b, v28.16b\n"
    ".inst 0x6e9b9628  // udot v8.4s, v17.16b, v27.16b\n"
    ".inst 0x6e9b97ca  // udot v10.4s, v30.16b, v27.16b\n"
    "ext v27.16b, v27.16b, v27.16b, #0x1\n"
    ".inst 0x6e9c9632  // udot v18.4s, v17.16b, v28.16b\n"
    "ldr q28, [x23, x13]\n"
    ".inst 0x6e9b963a  // udot v26.4s, v17.16b, v27.16b\n"
    ".inst 0x6e9b95f5  // udot v21.4s, v15.16b, v27.16b\n"
    ".inst 0x6e8e97c8  // udot v8.4s, v30.16b, v14.16b\n"
    "ext v14.16b, v14.16b, v14.16b, #0x1\n"
    "mls v10.4s, v23.4s, v24.4s\n"
    "movi v1.4s, #0x0\n"
    ".inst 0x6e9b97d2  // udot v18.4s, v30.16b, v27.16b\n"
    ".inst 0x6e8e97da  // udot v26.4s, v30.16b, v14.16b\n"
    "mov v16.16b, v21.16b\n .inst 0x6e8e95f0  // udot v16.4s, v15.16b, v14.16b\n"
    ".inst 0x6e8095f5  // udot v21.4s, v15.16b, v0.16b\n"
    "ldr q29, [x27, x13]\n"
    ".inst 0x6e8495e1  // udot v1.4s, v15.16b, v4.16b\n"
    "sqrdmulh v10.4s, v10.4s, v13.4s\n"
    "mls v8.4s, v19.4s, v24.4s\n"
    "mls v26.4s, v16.4s, v24.4s\n"
    "and v16.16b, v10.16b, v22.16b\n"
    "mls v18.4s, v21.4s, v24.4s\n"
    "movi v5.4s, #0x0\n"
    "sqrdmulh v8.4s, v8.4s, v13.4s\n"
    "sqrdmulh v26.4s, v26.4s, v13.4s\n"
    ".inst 0x6e8695e1  // udot v1.4s, v15.16b, v6.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqrdmulh v18.4s, v18.4s, v13.4s\n"
    "ldr q30, [%x[params], #0x120]\n"
    "and v17.16b, v8.16b, v22.16b\n"
    "sqadd v10.4s, v10.4s, v16.4s\n"
    "and v20.16b, v26.16b, v22.16b\n"
    "and v16.16b, v18.16b, v22.16b\n"
    "mov v19.16b, v1.16b\n .inst 0x6e9995f3  // udot v19.4s, v15.16b, v25.16b\n"
    ".inst 0x6e8995e1  // udot v1.4s, v15.16b, v9.16b\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "srshl v10.4s, v10.4s, v22.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sshr v20.4s, v20.4s, #0x1f\n"
    "sqadd v8.4s, v8.4s, v17.4s\n"
    "ldr q27, [%x[params], #0x110]\n"
    "sqadd v18.4s, v18.4s, v16.4s\n"
    "ldr q17, [%x[params], #0x100]\n"
    "add v10.4s, v10.4s, v12.4s\n"
    "sqadd v26.4s, v26.4s, v20.4s\n"
    "ldr q16, [%x[params], #0xf0]\n"
    "srshl v8.4s, v8.4s, v22.4s\n"
    "srshl v18.4s, v18.4s, v22.4s\n"
    "smax v10.4s, v10.4s, v7.4s\n"
    "srshl v26.4s, v26.4s, v22.4s\n"
    "ldr q31, [%x[params], #0x130]\n"
    "add v8.4s, v8.4s, v12.4s\n"
    "add v18.4s, v18.4s, v12.4s\n"
    "smin v10.4s, v10.4s, v11.4s\n"
    "add v26.4s, v26.4s, v12.4s\n"
    "smax v8.4s, v8.4s, v7.4s\n"
    "smax v18.4s, v18.4s, v7.4s\n"
    "uzp1 v10.16b, v10.16b, v10.16b\n"
    "smax v26.4s, v26.4s, v7.4s\n"
    "smin v8.4s, v8.4s, v11.4s\n"
    "smin v18.4s, v18.4s, v11.4s\n"
    "uzp1 v10.16b, v10.16b, v10.16b\n"
    "smin v26.4s, v26.4s, v11.4s\n"
    "uzp1 v8.16b, v8.16b, v8.16b\n"
    "uzp1 v18.16b, v18.16b, v18.16b\n"
    "str s10, [x11, x12]\n"
    "ldr q0, [%x[params], #0xe0]\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "uzp1 v8.16b, v8.16b, v8.16b\n"
    "uzp1 v18.16b, v18.16b, v18.16b\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "str s18, [x10, x12]\n"
    "mov v22.16b, v0.16b\n"
    "str s8, [x9, x12]\n"
    "mov v23.16b, v0.16b\n"
    "str s26, [x28, x12]\n"
    "mov v14.16b, v0.16b\n"
    ".inst 0x6e899600  // udot v0.4s, v16.16b, v9.16b\n"
    "add x12, x12, #0x4\n"
    ".inst 0x6e849617  // udot v23.4s, v16.16b, v4.16b\n"
    "ext v9.16b, v9.16b, v9.16b, #0x1\n"
    ".inst 0x6e849620  // udot v0.4s, v17.16b, v4.16b\n"
    "ext v4.16b, v4.16b, v4.16b, #0x1\n"
    ".inst 0x6e899616  // udot v22.4s, v16.16b, v9.16b\n"
    ".inst 0x6e84960e  // udot v14.4s, v16.16b, v4.16b\n"
    ".inst 0x6e8495e5  // udot v5.4s, v15.16b, v4.16b\n"
    ".inst 0x6e869637  // udot v23.4s, v17.16b, v6.16b\n"
    ".inst 0x6e869760  // udot v0.4s, v27.16b, v6.16b\n"
    "ext v6.16b, v6.16b, v6.16b, #0x1\n"
    ".inst 0x6e849636  // udot v22.4s, v17.16b, v4.16b\n"
    "ldr q4, [x22, x13]\n"
    ".inst 0x6e86962e  // udot v14.4s, v17.16b, v6.16b\n"
    ".inst 0x6e8695e5  // udot v5.4s, v15.16b, v6.16b\n"
    ".inst 0x6e999777  // udot v23.4s, v27.16b, v25.16b\n"
    "ext v25.16b, v25.16b, v25.16b, #0x1\n"
    "mls v0.4s, v1.4s, v24.4s\n"
    ".inst 0x6e869776  // udot v22.4s, v27.16b, v6.16b\n"
    ".inst 0x6e99976e  // udot v14.4s, v27.16b, v25.16b\n"
    "mov v17.16b, v5.16b\n .inst 0x6e9995f1  // udot v17.4s, v15.16b, v25.16b\n"
    ".inst 0x6e8995e5  // udot v5.4s, v15.16b, v9.16b\n"
    "ldr q9, [x26, x13]\n"
    "sqrdmulh v0.4s, v0.4s, v30.4s\n"
    "mls v23.4s, v19.4s, v24.4s\n"
    "and v16.16b, v0.16b, v31.16b\n"
    "mls v22.4s, v5.4s, v24.4s\n"
    "mls v14.4s, v17.4s, v24.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqrdmulh v23.4s, v23.4s, v30.4s\n"
    "sqrdmulh v22.4s, v22.4s, v30.4s\n"
    "sqrdmulh v14.4s, v14.4s, v30.4s\n"
    "ldr q13, [x15, x13]\n"
    "ldp x23, x22, [%x[inptrs], #0x40]\n"
    "ldp x21, x20, [%x[inptrs], #0x50]\n"
    "sqadd v0.4s, v0.4s, v16.4s\n"
    "and v19.16b, v23.16b, v31.16b\n"
    "ldr q10, [x23, x13]\n"
    "ldr q26, [x22, x13]\n"
    "and v21.16b, v22.16b, v31.16b\n"
    "and v16.16b, v14.16b, v31.16b\n"
    "ldr q20, [x21, x13]\n"
    "ldr q6, [x20, x13]\n"
    "sshr v19.4s, v19.4s, #0x1f\n"
    "srshl v0.4s, v0.4s, v31.4s\n"
    "sshr v21.4s, v21.4s, #0x1f\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqadd v23.4s, v23.4s, v19.4s\n"
    "ldr q17, [%x[params], #0x170]\n"
    "add v0.4s, v0.4s, v12.4s\n"
    "sqadd v22.4s, v22.4s, v21.4s\n"
    "ldr q8, [%x[params], #0x160]\n"
    "sqadd v14.4s, v14.4s, v16.4s\n"
    "ldr q30, [%x[params], #0x150]\n"
    "srshl v23.4s, v23.4s, v31.4s\n"
    "smax v0.4s, v0.4s, v7.4s\n"
    "srshl v22.4s, v22.4s, v31.4s\n"
    "srshl v14.4s, v14.4s, v31.4s\n"
    "ldr q1, [x25, x13]\n"
    "ldp x23, x22, [%x[inptrs], #0x60]\n"
    "ldp x21, x20, [%x[inptrs], #0x70]\n"
    "ldp x15, x14, [%x[inptrs], #0x0]\n"
    "add v23.4s, v23.4s, v12.4s\n"
    "smin v0.4s, v0.4s, v11.4s\n"
    "ldp x27, x26, [%x[inptrs], #0x10]\n"
    "ldr q5, [x23, x13]\n"
    "ldr q27, [x22, x13]\n"
    "add v22.4s, v22.4s, v12.4s\n"
    "add v14.4s, v14.4s, v12.4s\n"
    "ldp x25, x24, [%x[inptrs], #0x20]\n"
    "ldr q16, [x21, x13]\n"
    "ldr q25, [x20, x13]\n"
    "smax v23.4s, v23.4s, v7.4s\n"
    "uzp1 v0.16b, v0.16b, v0.16b\n"
    "ldp x23, x22, [%x[inptrs], #0x30]\n"
    "smax v22.4s, v22.4s, v7.4s\n"
    "smax v14.4s, v14.4s, v7.4s\n"
    "smin v23.4s, v23.4s, v11.4s\n"
    "uzp1 v0.16b, v0.16b, v0.16b\n"
    "smin v22.4s, v22.4s, v11.4s\n"
    "smin v14.4s, v14.4s, v11.4s\n"
    "uzp1 v23.16b, v23.16b, v23.16b\n"
    "str s0, [x11, x12]\n"
    "zip2 v18.16b, v13.16b, v29.16b\n"
    "zip1 v13.16b, v13.16b, v29.16b\n"
    "zip1 v0.16b, v2.16b, v9.16b\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "uzp1 v14.16b, v14.16b, v14.16b\n"
    "zip2 v9.16b, v2.16b, v9.16b\n"
    "uzp1 v23.16b, v23.16b, v23.16b\n"
    "zip2 v2.16b, v13.16b, v0.16b\n"
    "zip1 v13.16b, v13.16b, v0.16b\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "uzp1 v14.16b, v14.16b, v14.16b\n"
    "str s23, [x9, x12]\n"
    "zip1 v0.16b, v18.16b, v9.16b\n"
    "zip2 v9.16b, v18.16b, v9.16b\n"
    "ldr q31, [%x[params], #0x140]\n"
    "add %x[params], %x[params], #0x180\n"
    "zip2 v23.16b, v10.16b, v20.16b\n"
    "zip1 v10.16b, v10.16b, v20.16b\n"
    "str s22, [x10, x12]\n"
    "str s14, [x28, x12]\n"
    "zip2 v22.16b, v1.16b, v28.16b\n"
    "zip1 v1.16b, v1.16b, v28.16b\n"
    "add x12, x12, #0x4\n"
    "zip1 v20.16b, v3.16b, v4.16b\n"
    "zip2 v4.16b, v3.16b, v4.16b\n"
    "zip1 v14.16b, v26.16b, v6.16b\n"
    "zip2 v6.16b, v26.16b, v6.16b\n"
    "zip2 v19.16b, v5.16b, v16.16b\n"
    "zip1 v5.16b, v5.16b, v16.16b\n"
    "zip1 v16.16b, v27.16b, v25.16b\n"
    "zip2 v25.16b, v27.16b, v25.16b\n"
    "zip2 v21.16b, v1.16b, v20.16b\n"
    "zip1 v1.16b, v1.16b, v20.16b\n"
    "zip1 v28.16b, v22.16b, v4.16b\n"
    "zip2 v4.16b, v22.16b, v4.16b\n"
    "zip2 v29.16b, v10.16b, v14.16b\n"
    "zip1 v10.16b, v10.16b, v14.16b\n"
    "zip1 v27.16b, v23.16b, v6.16b\n"
    "zip2 v6.16b, v23.16b, v6.16b\n"
    "zip2 v18.16b, v5.16b, v16.16b\n"
    "zip1 v5.16b, v5.16b, v16.16b\n"
    "zip1 v14.16b, v19.16b, v25.16b\n"
    "zip2 v25.16b, v19.16b, v25.16b\n"
    "mov v26.16b, v31.16b\n"
    "mov v3.16b, v31.16b\n"
    "mov v23.16b, v31.16b\n"
    "bgt 1b\n"
    "2:"  // Detached iteration
    "movi v19.4s, #0x0\n"
    ".inst 0x6e8d97df  // udot v31.4s, v30.16b, v13.16b\n"
    ".inst 0x6e8197c3  // udot v3.4s, v30.16b, v1.16b\n"
    "tst %x[n_channels], #0xf\n"
    "movi v20.4s, #0x0\n"
    "add x13, x13, #0x10\n"
    ".inst 0x6e8195f3  // udot v19.4s, v15.16b, v1.16b\n"
    ".inst 0x6e81951f  // udot v31.4s, v8.16b, v1.16b\n"
    "ext v1.16b, v1.16b, v1.16b, #0x1\n"
    ".inst 0x6e8a9503  // udot v3.4s, v8.16b, v10.16b\n"
    ".inst 0x6e8a95f3  // udot v19.4s, v15.16b, v10.16b\n"
    ".inst 0x6e8195f4  // udot v20.4s, v15.16b, v1.16b\n"
    ".inst 0x6e8a963f  // udot v31.4s, v17.16b, v10.16b\n"
    "ext v10.16b, v10.16b, v10.16b, #0x1\n"
    ".inst 0x6e8197d7  // udot v23.4s, v30.16b, v1.16b\n"
    "mov v16.16b, v19.16b\n .inst 0x6e8595f0  // udot v16.4s, v15.16b, v5.16b\n"
    ".inst 0x6e8d95f3  // udot v19.4s, v15.16b, v13.16b\n"
    "ext v13.16b, v13.16b, v13.16b, #0x1\n"
    ".inst 0x6e859623  // udot v3.4s, v17.16b, v5.16b\n"
    "ext v5.16b, v5.16b, v5.16b, #0x1\n"
    ".inst 0x6e8a95f4  // udot v20.4s, v15.16b, v10.16b\n"
    ".inst 0x6e8d97da  // udot v26.4s, v30.16b, v13.16b\n"
    ".inst 0x6e8a9517  // udot v23.4s, v8.16b, v10.16b\n"
    "mls v31.4s, v19.4s, v24.4s\n"
    "movi v30.4s, #0x0\n"
    "mls v3.4s, v16.4s, v24.4s\n"
    ".inst 0x6e81951a  // udot v26.4s, v8.16b, v1.16b\n"
    "ldr q1, [%x[params], #0x10]\n"
    "mov v16.16b, v20.16b\n .inst 0x6e8595f0  // udot v16.4s, v15.16b, v5.16b\n"
    ".inst 0x6e8d95f4  // udot v20.4s, v15.16b, v13.16b\n"
    "ldr q8, [%x[params], #0x0]\n"
    ".inst 0x6e9595fe  // udot v30.4s, v15.16b, v21.16b\n"
    ".inst 0x6e859637  // udot v23.4s, v17.16b, v5.16b\n"
    ".inst 0x6e8a963a  // udot v26.4s, v17.16b, v10.16b\n"
    "sqrdmulh v31.4s, v31.4s, v8.4s\n"
    "sqrdmulh v3.4s, v3.4s, v8.4s\n"
    "mls v23.4s, v16.4s, v24.4s\n"
    "and v16.16b, v31.16b, v1.16b\n"
    ".inst 0x6e9d95fe  // udot v30.4s, v15.16b, v29.16b\n"
    "mls v26.4s, v20.4s, v24.4s\n"
    "movi v5.4s, #0x0\n"
    "sqrdmulh v23.4s, v23.4s, v8.4s\n"
    "and v22.16b, v3.16b, v1.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqrdmulh v26.4s, v26.4s, v8.4s\n"
    "ldr q20, [%x[params], #0x60]\n"
    "mov v19.16b, v30.16b\n .inst 0x6e9295f3  // udot v19.4s, v15.16b, v18.16b\n"
    ".inst 0x6e8295fe  // udot v30.4s, v15.16b, v2.16b\n"
    "sshr v22.4s, v22.4s, #0x1f\n"
    "sqadd v31.4s, v31.4s, v16.4s\n"
    "and v17.16b, v23.16b, v1.16b\n"
    "and v16.16b, v26.16b, v1.16b\n"
    "sqadd v3.4s, v3.4s, v22.4s\n"
    "ldr q8, [%x[params], #0x50]\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "srshl v31.4s, v31.4s, v1.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "srshl v3.4s, v3.4s, v1.4s\n"
    "sqadd v23.4s, v23.4s, v17.4s\n"
    "ldr q17, [%x[params], #0x30]\n"
    "add v31.4s, v31.4s, v12.4s\n"
    "sqadd v26.4s, v26.4s, v16.4s\n"
    "ldr q16, [%x[params], #0x40]\n"
    "add v3.4s, v3.4s, v12.4s\n"
    "srshl v23.4s, v23.4s, v1.4s\n"
    "smax v31.4s, v31.4s, v7.4s\n"
    "srshl v26.4s, v26.4s, v1.4s\n"
    "ldr q22, [%x[params], #0x70]\n"
    "smax v3.4s, v3.4s, v7.4s\n"
    "add v23.4s, v23.4s, v12.4s\n"
    "smin v31.4s, v31.4s, v11.4s\n"
    "add v26.4s, v26.4s, v12.4s\n"
    "smin v3.4s, v3.4s, v11.4s\n"
    "smax v23.4s, v23.4s, v7.4s\n"
    "uzp1 v31.16b, v31.16b, v31.16b\n"
    "smax v26.4s, v26.4s, v7.4s\n"
    "uzp1 v3.16b, v3.16b, v3.16b\n"
    "smin v23.4s, v23.4s, v11.4s\n"
    "uzp1 v31.16b, v31.16b, v31.16b\n"
    "smin v26.4s, v26.4s, v11.4s\n"
    "uzp1 v3.16b, v3.16b, v3.16b\n"
    "uzp1 v23.16b, v23.16b, v23.16b\n"
    "str s31, [x11, x12]\n"
    "ldr q31, [%x[params], #0x20]\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "str s3, [x9, x12]\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "uzp1 v23.16b, v23.16b, v23.16b\n"
    "mov v10.16b, v31.16b\n"
    "str s26, [x10, x12]\n"
    "mov v1.16b, v31.16b\n"
    "str s23, [x28, x12]\n"
    "mov v26.16b, v31.16b\n"
    ".inst 0x6e82963f  // udot v31.4s, v17.16b, v2.16b\n"
    "add x12, x12, #0x4\n"
    ".inst 0x6e95962a  // udot v10.4s, v17.16b, v21.16b\n"
    "ext v2.16b, v2.16b, v2.16b, #0x1\n"
    ".inst 0x6e95961f  // udot v31.4s, v16.16b, v21.16b\n"
    "ext v21.16b, v21.16b, v21.16b, #0x1\n"
    ".inst 0x6e829621  // udot v1.4s, v17.16b, v2.16b\n"
    ".inst 0x6e95963a  // udot v26.4s, v17.16b, v21.16b\n"
    ".inst 0x6e9595e5  // udot v5.4s, v15.16b, v21.16b\n"
    ".inst 0x6e9d960a  // udot v10.4s, v16.16b, v29.16b\n"
    ".inst 0x6e9d951f  // udot v31.4s, v8.16b, v29.16b\n"
    "ext v29.16b, v29.16b, v29.16b, #0x1\n"
    ".inst 0x6e959601  // udot v1.4s, v16.16b, v21.16b\n"
    ".inst 0x6e9d961a  // udot v26.4s, v16.16b, v29.16b\n"
    ".inst 0x6e9d95e5  // udot v5.4s, v15.16b, v29.16b\n"
    ".inst 0x6e92950a  // udot v10.4s, v8.16b, v18.16b\n"
    "ext v18.16b, v18.16b, v18.16b, #0x1\n"
    "mls v31.4s, v30.4s, v24.4s\n"
    "movi v3.4s, #0x0\n"
    ".inst 0x6e9d9501  // udot v1.4s, v8.16b, v29.16b\n"
    ".inst 0x6e92951a  // udot v26.4s, v8.16b, v18.16b\n"
    "mov v16.16b, v5.16b\n .inst 0x6e9295f0  // udot v16.4s, v15.16b, v18.16b\n"
    ".inst 0x6e8295e5  // udot v5.4s, v15.16b, v2.16b\n"
    ".inst 0x6e9c95e3  // udot v3.4s, v15.16b, v28.16b\n"
    "sqrdmulh v31.4s, v31.4s, v20.4s\n"
    "mls v10.4s, v19.4s, v24.4s\n"
    "mls v26.4s, v16.4s, v24.4s\n"
    "and v16.16b, v31.16b, v22.16b\n"
    "mls v1.4s, v5.4s, v24.4s\n"
    "movi v2.4s, #0x0\n"
    "sqrdmulh v10.4s, v10.4s, v20.4s\n"
    "sqrdmulh v26.4s, v26.4s, v20.4s\n"
    ".inst 0x6e9b95e3  // udot v3.4s, v15.16b, v27.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqrdmulh v1.4s, v1.4s, v20.4s\n"
    "ldr q23, [%x[params], #0xc0]\n"
    "and v17.16b, v10.16b, v22.16b\n"
    "sqadd v31.4s, v31.4s, v16.4s\n"
    "and v20.16b, v26.16b, v22.16b\n"
    "and v16.16b, v1.16b, v22.16b\n"
    "mov v19.16b, v3.16b\n .inst 0x6e8e95f3  // udot v19.4s, v15.16b, v14.16b\n"
    ".inst 0x6e8095e3  // udot v3.4s, v15.16b, v0.16b\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "srshl v31.4s, v31.4s, v22.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sshr v20.4s, v20.4s, #0x1f\n"
    "sqadd v10.4s, v10.4s, v17.4s\n"
    "ldr q18, [%x[params], #0xb0]\n"
    "sqadd v1.4s, v1.4s, v16.4s\n"
    "ldr q17, [%x[params], #0xa0]\n"
    "add v31.4s, v31.4s, v12.4s\n"
    "sqadd v26.4s, v26.4s, v20.4s\n"
    "ldr q16, [%x[params], #0x90]\n"
    "srshl v10.4s, v10.4s, v22.4s\n"
    "srshl v1.4s, v1.4s, v22.4s\n"
    "smax v31.4s, v31.4s, v7.4s\n"
    "srshl v26.4s, v26.4s, v22.4s\n"
    "ldr q22, [%x[params], #0xd0]\n"
    "add v10.4s, v10.4s, v12.4s\n"
    "add v1.4s, v1.4s, v12.4s\n"
    "smin v31.4s, v31.4s, v11.4s\n"
    "add v26.4s, v26.4s, v12.4s\n"
    "smax v10.4s, v10.4s, v7.4s\n"
    "smax v1.4s, v1.4s, v7.4s\n"
    "uzp1 v31.16b, v31.16b, v31.16b\n"
    "smax v26.4s, v26.4s, v7.4s\n"
    "smin v10.4s, v10.4s, v11.4s\n"
    "smin v1.4s, v1.4s, v11.4s\n"
    "uzp1 v31.16b, v31.16b, v31.16b\n"
    "smin v26.4s, v26.4s, v11.4s\n"
    "uzp1 v10.16b, v10.16b, v10.16b\n"
    "uzp1 v1.16b, v1.16b, v1.16b\n"
    "str s31, [x11, x12]\n"
    "ldr q21, [%x[params], #0x80]\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "uzp1 v10.16b, v10.16b, v10.16b\n"
    "uzp1 v1.16b, v1.16b, v1.16b\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "str s1, [x10, x12]\n"
    "mov v30.16b, v21.16b\n"
    "str s10, [x9, x12]\n"
    "mov v20.16b, v21.16b\n"
    "str s26, [x28, x12]\n"
    "mov v29.16b, v21.16b\n"
    ".inst 0x6e809615  // udot v21.4s, v16.16b, v0.16b\n"
    "add x12, x12, #0x4\n"
    ".inst 0x6e9c9614  // udot v20.4s, v16.16b, v28.16b\n"
    "ext v0.16b, v0.16b, v0.16b, #0x1\n"
    ".inst 0x6e9c9635  // udot v21.4s, v17.16b, v28.16b\n"
    "ext v28.16b, v28.16b, v28.16b, #0x1\n"
    ".inst 0x6e80961e  // udot v30.4s, v16.16b, v0.16b\n"
    ".inst 0x6e9c961d  // udot v29.4s, v16.16b, v28.16b\n"
    ".inst 0x6e9c95e2  // udot v2.4s, v15.16b, v28.16b\n"
    ".inst 0x6e9b9634  // udot v20.4s, v17.16b, v27.16b\n"
    ".inst 0x6e9b9655  // udot v21.4s, v18.16b, v27.16b\n"
    "ext v27.16b, v27.16b, v27.16b, #0x1\n"
    ".inst 0x6e9c963e  // udot v30.4s, v17.16b, v28.16b\n"
    ".inst 0x6e9b963d  // udot v29.4s, v17.16b, v27.16b\n"
    ".inst 0x6e9b95e2  // udot v2.4s, v15.16b, v27.16b\n"
    ".inst 0x6e8e9654  // udot v20.4s, v18.16b, v14.16b\n"
    "ext v14.16b, v14.16b, v14.16b, #0x1\n"
    "mls v21.4s, v3.4s, v24.4s\n"
    "movi v5.4s, #0x0\n"
    ".inst 0x6e9b965e  // udot v30.4s, v18.16b, v27.16b\n"
    ".inst 0x6e8e965d  // udot v29.4s, v18.16b, v14.16b\n"
    "mov v16.16b, v2.16b\n .inst 0x6e8e95f0  // udot v16.4s, v15.16b, v14.16b\n"
    ".inst 0x6e8095e2  // udot v2.4s, v15.16b, v0.16b\n"
    ".inst 0x6e8495e5  // udot v5.4s, v15.16b, v4.16b\n"
    "sqrdmulh v21.4s, v21.4s, v23.4s\n"
    "mls v20.4s, v19.4s, v24.4s\n"
    "mls v29.4s, v16.4s, v24.4s\n"
    "and v16.16b, v21.16b, v22.16b\n"
    "mls v30.4s, v2.4s, v24.4s\n"
    "movi v27.4s, #0x0\n"
    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
    "sqrdmulh v29.4s, v29.4s, v23.4s\n"
    ".inst 0x6e8695e5  // udot v5.4s, v15.16b, v6.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
    "ldr q26, [%x[params], #0x120]\n"
    "and v17.16b, v20.16b, v22.16b\n"
    "sqadd v21.4s, v21.4s, v16.4s\n"
    "and v19.16b, v29.16b, v22.16b\n"
    "and v16.16b, v30.16b, v22.16b\n"
    "mov v14.16b, v5.16b\n .inst 0x6e9995ee  // udot v14.4s, v15.16b, v25.16b\n"
    ".inst 0x6e8995e5  // udot v5.4s, v15.16b, v9.16b\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "srshl v21.4s, v21.4s, v22.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sshr v19.4s, v19.4s, #0x1f\n"
    "sqadd v20.4s, v20.4s, v17.4s\n"
    "ldr q18, [%x[params], #0x110]\n"
    "sqadd v30.4s, v30.4s, v16.4s\n"
    "ldr q17, [%x[params], #0x100]\n"
    "add v21.4s, v21.4s, v12.4s\n"
    "sqadd v29.4s, v29.4s, v19.4s\n"
    "ldr q16, [%x[params], #0xf0]\n"
    "srshl v20.4s, v20.4s, v22.4s\n"
    "srshl v30.4s, v30.4s, v22.4s\n"
    "smax v21.4s, v21.4s, v7.4s\n"
    "srshl v29.4s, v29.4s, v22.4s\n"
    "ldr q23, [%x[params], #0x130]\n"
    "add v20.4s, v20.4s, v12.4s\n"
    "add v30.4s, v30.4s, v12.4s\n"
    "smin v21.4s, v21.4s, v11.4s\n"
    "add v29.4s, v29.4s, v12.4s\n"
    "smax v20.4s, v20.4s, v7.4s\n"
    "smax v30.4s, v30.4s, v7.4s\n"
    "uzp1 v21.16b, v21.16b, v21.16b\n"
    "smax v29.4s, v29.4s, v7.4s\n"
    "smin v20.4s, v20.4s, v11.4s\n"
    "smin v30.4s, v30.4s, v11.4s\n"
    "uzp1 v21.16b, v21.16b, v21.16b\n"
    "smin v29.4s, v29.4s, v11.4s\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "str s21, [x11, x12]\n"
    "ldr q22, [%x[params], #0xe0]\n"
    "add %x[params], %x[params], #0x140\n"
    "uzp1 v29.16b, v29.16b, v29.16b\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "uzp1 v29.16b, v29.16b, v29.16b\n"
    "str s20, [x9, x12]\n"
    "mov v21.16b, v22.16b\n"
    "str s30, [x10, x12]\n"
    "mov v20.16b, v22.16b\n"
    "str s29, [x28, x12]\n"
    "mov v19.16b, v22.16b\n"
    ".inst 0x6e899616  // udot v22.4s, v16.16b, v9.16b\n"
    "add x12, x12, #0x4\n"
    ".inst 0x6e849615  // udot v21.4s, v16.16b, v4.16b\n"
    "ext v9.16b, v9.16b, v9.16b, #0x1\n"
    ".inst 0x6e849636  // udot v22.4s, v17.16b, v4.16b\n"
    "ext v4.16b, v4.16b, v4.16b, #0x1\n"
    ".inst 0x6e899614  // udot v20.4s, v16.16b, v9.16b\n"
    ".inst 0x6e849613  // udot v19.4s, v16.16b, v4.16b\n"
    ".inst 0x6e8495fb  // udot v27.4s, v15.16b, v4.16b\n"
    ".inst 0x6e869635  // udot v21.4s, v17.16b, v6.16b\n"
    ".inst 0x6e869656  // udot v22.4s, v18.16b, v6.16b\n"
    "ext v6.16b, v6.16b, v6.16b, #0x1\n"
    ".inst 0x6e849634  // udot v20.4s, v17.16b, v4.16b\n"
    ".inst 0x6e869633  // udot v19.4s, v17.16b, v6.16b\n"
    ".inst 0x6e8695fb  // udot v27.4s, v15.16b, v6.16b\n"
    ".inst 0x6e999655  // udot v21.4s, v18.16b, v25.16b\n"
    "ext v25.16b, v25.16b, v25.16b, #0x1\n"
    "mls v22.4s, v5.4s, v24.4s\n"
    ".inst 0x6e869654  // udot v20.4s, v18.16b, v6.16b\n"
    ".inst 0x6e999653  // udot v19.4s, v18.16b, v25.16b\n"
    "mov v17.16b, v27.16b\n .inst 0x6e9995f1  // udot v17.4s, v15.16b, v25.16b\n"
    ".inst 0x6e8995fb  // udot v27.4s, v15.16b, v9.16b\n"
    "sqrdmulh v22.4s, v22.4s, v26.4s\n"
    "mls v21.4s, v14.4s, v24.4s\n"
    "and v16.16b, v22.16b, v23.16b\n"
    "mls v20.4s, v27.4s, v24.4s\n"
    "mls v19.4s, v17.4s, v24.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqrdmulh v21.4s, v21.4s, v26.4s\n"
    "sqrdmulh v20.4s, v20.4s, v26.4s\n"
    "sqrdmulh v19.4s, v19.4s, v26.4s\n"
    "sqadd v22.4s, v22.4s, v16.4s\n"
    "and v18.16b, v21.16b, v23.16b\n"
    "and v17.16b, v20.16b, v23.16b\n"
    "and v16.16b, v19.16b, v23.16b\n"
    "sshr v18.4s, v18.4s, #0x1f\n"
    "srshl v22.4s, v22.4s, v23.4s\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqadd v21.4s, v21.4s, v18.4s\n"
    "add v22.4s, v22.4s, v12.4s\n"
    "sqadd v20.4s, v20.4s, v17.4s\n"
    "sqadd v19.4s, v19.4s, v16.4s\n"
    "srshl v21.4s, v21.4s, v23.4s\n"
    "smax v22.4s, v22.4s, v7.4s\n"
    "srshl v20.4s, v20.4s, v23.4s\n"
    "srshl v19.4s, v19.4s, v23.4s\n"
    "add v21.4s, v21.4s, v12.4s\n"
    "smin v22.4s, v22.4s, v11.4s\n"
    "add v20.4s, v20.4s, v12.4s\n"
    "add v19.4s, v19.4s, v12.4s\n"
    "smax v21.4s, v21.4s, v7.4s\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "smax v20.4s, v20.4s, v7.4s\n"
    "smax v19.4s, v19.4s, v7.4s\n"
    "smin v21.4s, v21.4s, v11.4s\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "smin v20.4s, v20.4s, v11.4s\n"
    "smin v19.4s, v19.4s, v11.4s\n"
    "uzp1 v21.16b, v21.16b, v21.16b\n"
    "str s22, [x11, x12]\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "uzp1 v21.16b, v21.16b, v21.16b\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    "str s20, [x10, x12]\n"
    "str s21, [x9, x12]\n"
    "str s19, [x28, x12]\n"
    "add x12, x12, #0x4\n"
    "beq 35f\n"
    "3:"  // Oddments
    "and x20, %x[n_channels], #0xf\n"
    "add x15, x15, x13\n"
    "add x14, x14, x13\n"
    "add x27, x27, x13\n"
    "add x26, x26, x13\n"
    "add x25, x25, x13\n"
    "add x24, x24, x13\n"
    "add x23, x23, x13\n"
    "add x22, x22, x13\n"
    "tbz %x[n_channels], #3, 7f\n"
    "ldr d13, [x15], #0x8\n"
    "ldr d2, [x14], #0x8\n"
    "ldr d0, [x27], #0x8\n"
    "ldr d9, [x26], #0x8\n"
    "ldr d1, [x25], #0x8\n"
    "ldr d21, [x24], #0x8\n"
    "ldr d28, [x23], #0x8\n"
    "ldr d4, [x22], #0x8\n"
    "tbz %x[n_channels], #2, 5f\n"
    "ld1 { v13.s }[2], [x15], #0x4\n"
    "ld1 { v2.s }[2], [x14], #0x4\n"
    "ld1 { v0.s }[2], [x27], #0x4\n"
    "ld1 { v9.s }[2], [x26], #0x4\n"
    "ld1 { v1.s }[2], [x25], #0x4\n"
    "ld1 { v21.s }[2], [x24], #0x4\n"
    "ld1 { v28.s }[2], [x23], #0x4\n"
    "ld1 { v4.s }[2], [x22], #0x4\n"
    "tbz %x[n_channels], #1, 4f\n"
    "ld1 { v13.h }[6], [x15], #0x2\n"
    "ld1 { v2.h }[6], [x14], #0x2\n"
    "ld1 { v0.h }[6], [x27], #0x2\n"
    "ld1 { v9.h }[6], [x26], #0x2\n"
    "ld1 { v1.h }[6], [x25], #0x2\n"
    "ld1 { v21.h }[6], [x24], #0x2\n"
    "ld1 { v28.h }[6], [x23], #0x2\n"
    "ld1 { v4.h }[6], [x22], #0x2\n"
    "tbz %x[n_channels], #0, 11f\n"
    "ld1 { v13.b }[14], [x15], #0x1\n"
    "ld1 { v2.b }[14], [x14], #0x1\n"
    "ld1 { v0.b }[14], [x27], #0x1\n"
    "ld1 { v9.b }[14], [x26], #0x1\n"
    "ld1 { v1.b }[14], [x25], #0x1\n"
    "ld1 { v21.b }[14], [x24], #0x1\n"
    "ld1 { v28.b }[14], [x23], #0x1\n"
    "ld1 { v4.b }[14], [x22], #0x1\n"
    "b 11f\n"
    "4:"  // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
    "tbz %x[n_channels], #0, 11f\n"
    "ld1 { v13.b }[12], [x15], #0x1\n"
    "ld1 { v2.b }[12], [x14], #0x1\n"
    "ld1 { v0.b }[12], [x27], #0x1\n"
    "ld1 { v9.b }[12], [x26], #0x1\n"
    "ld1 { v1.b }[12], [x25], #0x1\n"
    "ld1 { v21.b }[12], [x24], #0x1\n"
    "ld1 { v28.b }[12], [x23], #0x1\n"
    "ld1 { v4.b }[12], [x22], #0x1\n"
    "b 11f\n"
    "5:"  // Oddments: Load (A): Bit 3: Bit 2: Unset
    "tbz %x[n_channels], #1, 6f\n"
    "ld1 { v13.h }[4], [x15], #0x2\n"
    "ld1 { v2.h }[4], [x14], #0x2\n"
    "ld1 { v0.h }[4], [x27], #0x2\n"
    "ld1 { v9.h }[4], [x26], #0x2\n"
    "ld1 { v1.h }[4], [x25], #0x2\n"
    "ld1 { v21.h }[4], [x24], #0x2\n"
    "ld1 { v28.h }[4], [x23], #0x2\n"
    "ld1 { v4.h }[4], [x22], #0x2\n"
    "tbz %x[n_channels], #0, 11f\n"
    "ld1 { v13.b }[10], [x15], #0x1\n"
    "ld1 { v2.b }[10], [x14], #0x1\n"
    "ld1 { v0.b }[10], [x27], #0x1\n"
    "ld1 { v9.b }[10], [x26], #0x1\n"
    "ld1 { v1.b }[10], [x25], #0x1\n"
    "ld1 { v21.b }[10], [x24], #0x1\n"
    "ld1 { v28.b }[10], [x23], #0x1\n"
    "ld1 { v4.b }[10], [x22], #0x1\n"
    "b 11f\n"
    "6:"  // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
    "tbz %x[n_channels], #0, 11f\n"
    "ld1 { v13.b }[8], [x15], #0x1\n"
    "ld1 { v2.b }[8], [x14], #0x1\n"
    "ld1 { v0.b }[8], [x27], #0x1\n"
    "ld1 { v9.b }[8], [x26], #0x1\n"
    "ld1 { v1.b }[8], [x25], #0x1\n"
    "ld1 { v21.b }[8], [x24], #0x1\n"
    "ld1 { v28.b }[8], [x23], #0x1\n"
    "ld1 { v4.b }[8], [x22], #0x1\n"
    "b 11f\n"
    "7:"  // Oddments: Load (A): Bit 3: Unset
    "tbz %x[n_channels], #2, 9f\n"
    "ldr s13, [x15], #0x4\n"
    "ldr s2, [x14], #0x4\n"
    "ldr s0, [x27], #0x4\n"
    "ldr s9, [x26], #0x4\n"
    "ldr s1, [x25], #0x4\n"
    "ldr s21, [x24], #0x4\n"
    "ldr s28, [x23], #0x4\n"
    "ldr s4, [x22], #0x4\n"
    "tbz %x[n_channels], #1, 8f\n"
    "ld1 { v13.h }[2], [x15], #0x2\n"
    "ld1 { v2.h }[2], [x14], #0x2\n"
    "ld1 { v0.h }[2], [x27], #0x2\n"
    "ld1 { v9.h }[2], [x26], #0x2\n"
    "ld1 { v1.h }[2], [x25], #0x2\n"
    "ld1 { v21.h }[2], [x24], #0x2\n"
    "ld1 { v28.h }[2], [x23], #0x2\n"
    "ld1 { v4.h }[2], [x22], #0x2\n"
    "tbz %x[n_channels], #0, 11f\n"
    "ld1 { v13.b }[6], [x15], #0x1\n"
    "ld1 { v2.b }[6], [x14], #0x1\n"
    "ld1 { v0.b }[6], [x27], #0x1\n"
    "ld1 { v9.b }[6], [x26], #0x1\n"
    "ld1 { v1.b }[6], [x25], #0x1\n"
    "ld1 { v21.b }[6], [x24], #0x1\n"
    "ld1 { v28.b }[6], [x23], #0x1\n"
    "ld1 { v4.b }[6], [x22], #0x1\n"
    "b 11f\n"
    "8:"  // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
    "tbz %x[n_channels], #0, 11f\n"
    "ld1 { v13.b }[4], [x15], #0x1\n"
    "ld1 { v2.b }[4], [x14], #0x1\n"
    "ld1 { v0.b }[4], [x27], #0x1\n"
    "ld1 { v9.b }[4], [x26], #0x1\n"
    "ld1 { v1.b }[4], [x25], #0x1\n"
    "ld1 { v21.b }[4], [x24], #0x1\n"
    "ld1 { v28.b }[4], [x23], #0x1\n"
    "ld1 { v4.b }[4], [x22], #0x1\n"
    "b 11f\n"
    "9:"  // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
    "tbz %x[n_channels], #1, 10f\n"
    "ldr h13, [x15], #0x2\n"
    "ldr h2, [x14], #0x2\n"
    "ldr h0, [x27], #0x2\n"
    "ldr h9, [x26], #0x2\n"
    "ldr h1, [x25], #0x2\n"
    "ldr h21, [x24], #0x2\n"
    "ldr h28, [x23], #0x2\n"
    "ldr h4, [x22], #0x2\n"
    "tbz %x[n_channels], #0, 11f\n"
    "ld1 { v13.b }[2], [x15], #0x1\n"
    "ld1 { v2.b }[2], [x14], #0x1\n"
    "ld1 { v0.b }[2], [x27], #0x1\n"
    "ld1 { v9.b }[2], [x26], #0x1\n"
    "ld1 { v1.b }[2], [x25], #0x1\n"
    "ld1 { v21.b }[2], [x24], #0x1\n"
    "ld1 { v28.b }[2], [x23], #0x1\n"
    "ld1 { v4.b }[2], [x22], #0x1\n"
    "b 11f\n"
    "10:"  // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
    "ldr b13, [x15], #0x1\n"
    "ldr b2, [x14], #0x1\n"
    "ldr b0, [x27], #0x1\n"
    "ldr b9, [x26], #0x1\n"
    "ldr b1, [x25], #0x1\n"
    "ldr b21, [x24], #0x1\n"
    "ldr b28, [x23], #0x1\n"
    "ldr b4, [x22], #0x1\n"
    "11:"  // Oddments: Load (A): Bit 3: End
    "ldp x15, x14, [%x[inptrs], #0x40]\n"
    "ldp x27, x26, [%x[inptrs], #0x50]\n"
    "ldp x25, x24, [%x[inptrs], #0x60]\n"
    "ldp x23, x22, [%x[inptrs], #0x70]\n"
    "add x15, x15, x13\n"
    "add x14, x14, x13\n"
    "add x27, x27, x13\n"
    "add x26, x26, x13\n"
    "add x25, x25, x13\n"
    "add x24, x24, x13\n"
    "add x23, x23, x13\n"
    "add x22, x22, x13\n"
    "tbz %x[n_channels], #3, 15f\n"
    "ldr d10, [x15], #0x8\n"
    "ldr d29, [x14], #0x8\n"
    "ldr d27, [x27], #0x8\n"
    "ldr d6, [x26], #0x8\n"
    "ldr d5, [x25], #0x8\n"
    "ldr d18, [x24], #0x8\n"
    "ldr d14, [x23], #0x8\n"
    "ldr d25, [x22], #0x8\n"
    "tbz %x[n_channels], #2, 13f\n"
    "ld1 { v10.s }[2], [x15], #0x4\n"
    "ld1 { v29.s }[2], [x14], #0x4\n"
    "ld1 { v27.s }[2], [x27], #0x4\n"
    "ld1 { v6.s }[2], [x26], #0x4\n"
    "ld1 { v5.s }[2], [x25], #0x4\n"
    "ld1 { v18.s }[2], [x24], #0x4\n"
    "ld1 { v14.s }[2], [x23], #0x4\n"
    "ld1 { v25.s }[2], [x22], #0x4\n"
    "tbz %x[n_channels], #1, 12f\n"
    "ld1 { v10.h }[6], [x15], #0x2\n"
    "ld1 { v29.h }[6], [x14], #0x2\n"
    "ld1 { v27.h }[6], [x27], #0x2\n"
    "ld1 { v6.h }[6], [x26], #0x2\n"
    "ld1 { v5.h }[6], [x25], #0x2\n"
    "ld1 { v18.h }[6], [x24], #0x2\n"
    "ld1 { v14.h }[6], [x23], #0x2\n"
    "ld1 { v25.h }[6], [x22], #0x2\n"
    "tbz %x[n_channels], #0, 19f\n"
    "ld1 { v10.b }[14], [x15], #0x1\n"
    "ld1 { v29.b }[14], [x14], #0x1\n"
    "ld1 { v27.b }[14], [x27], #0x1\n"
    "ld1 { v6.b }[14], [x26], #0x1\n"
    "ld1 { v5.b }[14], [x25], #0x1\n"
    "ld1 { v18.b }[14], [x24], #0x1\n"
    "ld1 { v14.b }[14], [x23], #0x1\n"
    "ld1 { v25.b }[14], [x22], #0x1\n"
    "b 19f\n"
    "12:"  // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
    "tbz %x[n_channels], #0, 19f\n"
    "ld1 { v10.b }[12], [x15], #0x1\n"
    "ld1 { v29.b }[12], [x14], #0x1\n"
    "ld1 { v27.b }[12], [x27], #0x1\n"
    "ld1 { v6.b }[12], [x26], #0x1\n"
    "ld1 { v5.b }[12], [x25], #0x1\n"
    "ld1 { v18.b }[12], [x24], #0x1\n"
    "ld1 { v14.b }[12], [x23], #0x1\n"
    "ld1 { v25.b }[12], [x22], #0x1\n"
    "b 19f\n"
    "13:"  // Oddments: Load (B): Bit 3: Bit 2: Unset
    "tbz %x[n_channels], #1, 14f\n"
    "ld1 { v10.h }[4], [x15], #0x2\n"
    "ld1 { v29.h }[4], [x14], #0x2\n"
    "ld1 { v27.h }[4], [x27], #0x2\n"
    "ld1 { v6.h }[4], [x26], #0x2\n"
    "ld1 { v5.h }[4], [x25], #0x2\n"
    "ld1 { v18.h }[4], [x24], #0x2\n"
    "ld1 { v14.h }[4], [x23], #0x2\n"
    "ld1 { v25.h }[4], [x22], #0x2\n"
    "tbz %x[n_channels], #0, 19f\n"
    "ld1 { v10.b }[10], [x15], #0x1\n"
    "ld1 { v29.b }[10], [x14], #0x1\n"
    "ld1 { v27.b }[10], [x27], #0x1\n"
    "ld1 { v6.b }[10], [x26], #0x1\n"
    "ld1 { v5.b }[10], [x25], #0x1\n"
    "ld1 { v18.b }[10], [x24], #0x1\n"
    "ld1 { v14.b }[10], [x23], #0x1\n"
    "ld1 { v25.b }[10], [x22], #0x1\n"
    "b 19f\n"
    "14:"  // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
    "tbz %x[n_channels], #0, 19f\n"
    "ld1 { v10.b }[8], [x15], #0x1\n"
    "ld1 { v29.b }[8], [x14], #0x1\n"
    "ld1 { v27.b }[8], [x27], #0x1\n"
    "ld1 { v6.b }[8], [x26], #0x1\n"
    "ld1 { v5.b }[8], [x25], #0x1\n"
    "ld1 { v18.b }[8], [x24], #0x1\n"
    "ld1 { v14.b }[8], [x23], #0x1\n"
    "ld1 { v25.b }[8], [x22], #0x1\n"
    "b 19f\n"
    "15:"  // Oddments: Load (B): Bit 3: Unset
    "tbz %x[n_channels], #2, 17f\n"
    "ldr s10, [x15], #0x4\n"
    "ldr s29, [x14], #0x4\n"
    "ldr s27, [x27], #0x4\n"
    "ldr s6, [x26], #0x4\n"
    "ldr s5, [x25], #0x4\n"
    "ldr s18, [x24], #0x4\n"
    "ldr s14, [x23], #0x4\n"
    "ldr s25, [x22], #0x4\n"
    "tbz %x[n_channels], #1, 16f\n"
    "ld1 { v10.h }[2], [x15], #0x2\n"
    "ld1 { v29.h }[2], [x14], #0x2\n"
    "ld1 { v27.h }[2], [x27], #0x2\n"
    "ld1 { v6.h }[2], [x26], #0x2\n"
    "ld1 { v5.h }[2], [x25], #0x2\n"
    "ld1 { v18.h }[2], [x24], #0x2\n"
    "ld1 { v14.h }[2], [x23], #0x2\n"
    "ld1 { v25.h }[2], [x22], #0x2\n"
    "tbz %x[n_channels], #0, 19f\n"
    "ld1 { v10.b }[6], [x15], #0x1\n"
    "ld1 { v29.b }[6], [x14], #0x1\n"
    "ld1 { v27.b }[6], [x27], #0x1\n"
    "ld1 { v6.b }[6], [x26], #0x1\n"
    "ld1 { v5.b }[6], [x25], #0x1\n"
    "ld1 { v18.b }[6], [x24], #0x1\n"
    "ld1 { v14.b }[6], [x23], #0x1\n"
    "ld1 { v25.b }[6], [x22], #0x1\n"
    "b 19f\n"
    "16:"  // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
    "tbz %x[n_channels], #0, 19f\n"
    "ld1 { v10.b }[4], [x15], #0x1\n"
    "ld1 { v29.b }[4], [x14], #0x1\n"
    "ld1 { v27.b }[4], [x27], #0x1\n"
    "ld1 { v6.b }[4], [x26], #0x1\n"
    "ld1 { v5.b }[4], [x25], #0x1\n"
    "ld1 { v18.b }[4], [x24], #0x1\n"
    "ld1 { v14.b }[4], [x23], #0x1\n"
    "ld1 { v25.b }[4], [x22], #0x1\n"
    "b 19f\n"
    "17:"  // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
    "tbz %x[n_channels], #1, 18f\n"
    "ldr h10, [x15], #0x2\n"
    "ldr h29, [x14], #0x2\n"
    "ldr h27, [x27], #0x2\n"
    "ldr h6, [x26], #0x2\n"
    "ldr h5, [x25], #0x2\n"
    "ldr h18, [x24], #0x2\n"
    "ldr h14, [x23], #0x2\n"
    "ldr h25, [x22], #0x2\n"
    "tbz %x[n_channels], #0, 19f\n"
    "ld1 { v10.b }[2], [x15], #0x1\n"
    "ld1 { v29.b }[2], [x14], #0x1\n"
    "ld1 { v27.b }[2], [x27], #0x1\n"
    "ld1 { v6.b }[2], [x26], #0x1\n"
    "ld1 { v5.b }[2], [x25], #0x1\n"
    "ld1 { v18.b }[2], [x24], #0x1\n"
    "ld1 { v14.b }[2], [x23], #0x1\n"
    "ld1 { v25.b }[2], [x22], #0x1\n"
    "b 19f\n"
    "18:"  // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
    "ldr b10, [x15], #0x1\n"
    "ldr b29, [x14], #0x1\n"
    "ldr b27, [x27], #0x1\n"
    "ldr b6, [x26], #0x1\n"
    "ldr b5, [x25], #0x1\n"
    "ldr b18, [x24], #0x1\n"
    "ldr b14, [x23], #0x1\n"
    "ldr b25, [x22], #0x1\n"
    "19:"  // Oddments: Load (B): Bit 3: End
    "ldr q20, [%x[params], #0x10]\n"
    "ldr q17, [%x[params], #0x20]\n"
    "zip2 v26.16b, v1.16b, v28.16b\n"
    "zip1 v1.16b, v1.16b, v28.16b\n"
    "ldr q30, [%x[params], #0x30]\n"
    "zip1 v19.16b, v21.16b, v4.16b\n"
    "zip2 v23.16b, v13.16b, v0.16b\n"
    "cmp x20, #0x4\n"
    "zip1 v13.16b, v13.16b, v0.16b\n"
    "zip1 v22.16b, v2.16b, v9.16b\n"
    "zip2 v9.16b, v2.16b, v9.16b\n"
    "zip2 v4.16b, v21.16b, v4.16b\n"
    "zip2 v21.16b, v1.16b, v19.16b\n"
    "zip1 v1.16b, v1.16b, v19.16b\n"
    "zip2 v16.16b, v10.16b, v27.16b\n"
    "zip1 v10.16b, v10.16b, v27.16b\n"
    "zip1 v19.16b, v29.16b, v6.16b\n"
    "movi v8.4s, #0x0\n"
    "zip2 v2.16b, v13.16b, v22.16b\n"
    "zip1 v13.16b, v13.16b, v22.16b\n"
    "zip1 v0.16b, v23.16b, v9.16b\n"
    "zip2 v9.16b, v23.16b, v9.16b\n"
    "ldr q31, [%x[params], #0x0]\n"
    ".inst 0x6e8195e8  // udot v8.4s, v15.16b, v1.16b\n"
    "zip2 v6.16b, v29.16b, v6.16b\n"
    "zip2 v22.16b, v5.16b, v14.16b\n"
    "zip1 v5.16b, v5.16b, v14.16b\n"
    "zip1 v3.16b, v18.16b, v25.16b\n"
    "zip2 v25.16b, v18.16b, v25.16b\n"
    "zip2 v29.16b, v10.16b, v19.16b\n"
    "zip1 v10.16b, v10.16b, v19.16b\n"
    "zip1 v28.16b, v26.16b, v4.16b\n"
    "zip2 v4.16b, v26.16b, v4.16b\n"
    "zip1 v27.16b, v16.16b, v6.16b\n"
    "zip2 v6.16b, v16.16b, v6.16b\n"
    "zip2 v18.16b, v5.16b, v3.16b\n"
    "zip1 v5.16b, v5.16b, v3.16b\n"
    "zip1 v14.16b, v22.16b, v25.16b\n"
    ".inst 0x6e8a95e8  // udot v8.4s, v15.16b, v10.16b\n"
    "zip2 v25.16b, v22.16b, v25.16b\n"
    "mov v26.16b, v31.16b\n"
    "mov v3.16b, v31.16b\n"
    "mov v23.16b, v31.16b\n"
    ".inst 0x6e8d969f  // udot v31.4s, v20.16b, v13.16b\n"
    "movi v22.4s, #0x0\n"
    ".inst 0x6e819683  // udot v3.4s, v20.16b, v1.16b\n"
    "mov v16.16b, v8.16b\n .inst 0x6e8595f0  // udot v16.4s, v15.16b, v5.16b\n"
    ".inst 0x6e8d95e8  // udot v8.4s, v15.16b, v13.16b\n"
    "ext v13.16b, v13.16b, v13.16b, #0x1\n"
    ".inst 0x6e81963f  // udot v31.4s, v17.16b, v1.16b\n"
    "ext v1.16b, v1.16b, v1.16b, #0x1\n"
    ".inst 0x6e8a9623  // udot v3.4s, v17.16b, v10.16b\n"
    ".inst 0x6e8d969a  // udot v26.4s, v20.16b, v13.16b\n"
    ".inst 0x6e8195f6  // udot v22.4s, v15.16b, v1.16b\n"
    ".inst 0x6e8a97df  // udot v31.4s, v30.16b, v10.16b\n"
    "ext v10.16b, v10.16b, v10.16b, #0x1\n"
    ".inst 0x6e819697  // udot v23.4s, v20.16b, v1.16b\n"
    ".inst 0x6e8597c3  // udot v3.4s, v30.16b, v5.16b\n"
    "ext v5.16b, v5.16b, v5.16b, #0x1\n"
    ".inst 0x6e81963a  // udot v26.4s, v17.16b, v1.16b\n"
    "ldr q20, [%x[params], #0x50]\n"
    ".inst 0x6e8a95f6  // udot v22.4s, v15.16b, v10.16b\n"
    "mls v31.4s, v8.4s, v24.4s\n"
    ".inst 0x6e8a9637  // udot v23.4s, v17.16b, v10.16b\n"
    "mls v3.4s, v16.4s, v24.4s\n"
    "mov v19.16b, v22.16b\n .inst 0x6e8595f3  // udot v19.4s, v15.16b, v5.16b\n"
    ".inst 0x6e8d95f6  // udot v22.4s, v15.16b, v13.16b\n"
    "ldr q17, [%x[params], #0x40]\n"
    "add %x[params], %x[params], #0x60\n"
    ".inst 0x6e8a97da  // udot v26.4s, v30.16b, v10.16b\n"
    ".inst 0x6e8597d7  // udot v23.4s, v30.16b, v5.16b\n"
    "sqrdmulh v31.4s, v31.4s, v17.4s\n"
    "sqrdmulh v3.4s, v3.4s, v17.4s\n"
    "mls v26.4s, v22.4s, v24.4s\n"
    "and v16.16b, v31.16b, v20.16b\n"
    "mls v23.4s, v19.4s, v24.4s\n"
    "sqrdmulh v26.4s, v26.4s, v17.4s\n"
    "and v19.16b, v3.16b, v20.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqrdmulh v23.4s, v23.4s, v17.4s\n"
    "and v17.16b, v26.16b, v20.16b\n"
    "sshr v19.4s, v19.4s, #0x1f\n"
    "sqadd v31.4s, v31.4s, v16.4s\n"
    "and v16.16b, v23.16b, v20.16b\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "sqadd v3.4s, v3.4s, v19.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "srshl v31.4s, v31.4s, v20.4s\n"
    "sqadd v26.4s, v26.4s, v17.4s\n"
    "srshl v3.4s, v3.4s, v20.4s\n"
    "sqadd v23.4s, v23.4s, v16.4s\n"
    "add v31.4s, v31.4s, v12.4s\n"
    "srshl v26.4s, v26.4s, v20.4s\n"
    "add v3.4s, v3.4s, v12.4s\n"
    "srshl v23.4s, v23.4s, v20.4s\n"
    "smax v31.4s, v31.4s, v7.4s\n"
    "add v26.4s, v26.4s, v12.4s\n"
    "smax v3.4s, v3.4s, v7.4s\n"
    "add v23.4s, v23.4s, v12.4s\n"
    "smin v31.4s, v31.4s, v11.4s\n"
    "smax v26.4s, v26.4s, v7.4s\n"
    "smin v3.4s, v3.4s, v11.4s\n"
    "smax v23.4s, v23.4s, v7.4s\n"
    "uzp1 v31.16b, v31.16b, v31.16b\n"
    "smin v26.4s, v26.4s, v11.4s\n"
    "uzp1 v3.16b, v3.16b, v3.16b\n"
    "smin v23.4s, v23.4s, v11.4s\n"
    "uzp1 v31.16b, v31.16b, v31.16b\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "uzp1 v3.16b, v3.16b, v3.16b\n"
    "uzp1 v23.16b, v23.16b, v23.16b\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "uzp1 v23.16b, v23.16b, v23.16b\n"
    "blt 20f\n"
    "str s31, [x11, x12]\n"
    "str s26, [x10, x12]\n"
    "str s3, [x9, x12]\n"
    "str s23, [x28, x12]\n"
    "b 23f\n"
    "20:"  // Oddments: Unroll 0: Oddment store
    "add x11, x11, x12\n"
    "add x10, x10, x12\n"
    "add x9, x9, x12\n"
    "add x28, x28, x12\n"
    "tbz x20, #1, 21f\n"
    "st1 { v31.h }[0], [x11], #0x2\n"
    "st1 { v26.h }[0], [x10], #0x2\n"
    "st1 { v3.h }[0], [x9], #0x2\n"
    "st1 { v23.h }[0], [x28], #0x2\n"
    "tbz x20, #0, 22f\n"
    "st1 { v31.b }[2], [x11], #0x1\n"
    "st1 { v26.b }[2], [x10], #0x1\n"
    "st1 { v3.b }[2], [x9], #0x1\n"
    "st1 { v23.b }[2], [x28], #0x1\n"
    "b 22f\n"
    "21:"  // Oddments: Unroll 0: Oddment store: Bit 1: Unset
    "st1 { v31.b }[0], [x11], #0x1\n"
    "st1 { v26.b }[0], [x10], #0x1\n"
    "st1 { v3.b }[0], [x9], #0x1\n"
    "st1 { v23.b }[0], [x28], #0x1\n"
    "22:"  // Oddments: Unroll 0: Oddment store: Bit 1: End
    "23:"  // Oddments: Unroll 0: After oddment store
    "subs x20, x20, #0x4\n"
    "add x12, x12, #0x4\n"
    "ble 35f\n"
    "ldr q31, [%x[params], #0x0]\n"
    "ldr q5, [%x[params], #0x10]\n"
    "movi v8.4s, #0x0\n"
    "movi v30.4s, #0x0\n"
    "ldr q22, [%x[params], #0x20]\n"
    "ldr q20, [%x[params], #0x30]\n"
    "cmp x20, #0x4\n"
    "ldr q17, [%x[params], #0x40]\n"
    "ldr q19, [%x[params], #0x50]\n"
    ".inst 0x6e9595e8  // udot v8.4s, v15.16b, v21.16b\n"
    "add %x[params], %x[params], #0x60\n"
    "mov v26.16b, v31.16b\n"
    "mov v3.16b, v31.16b\n"
    "mov v23.16b, v31.16b\n"
    ".inst 0x6e8294bf  // udot v31.4s, v5.16b, v2.16b\n"
    ".inst 0x6e9594a3  // udot v3.4s, v5.16b, v21.16b\n"
    ".inst 0x6e9d95e8  // udot v8.4s, v15.16b, v29.16b\n"
    ".inst 0x6e9596df  // udot v31.4s, v22.16b, v21.16b\n"
    "ext v21.16b, v21.16b, v21.16b, #0x1\n"
    ".inst 0x6e9594b7  // udot v23.4s, v5.16b, v21.16b\n"
    ".inst 0x6e9595fe  // udot v30.4s, v15.16b, v21.16b\n"
    ".inst 0x6e9d96c3  // udot v3.4s, v22.16b, v29.16b\n"
    "mov v16.16b, v8.16b\n .inst 0x6e9295f0  // udot v16.4s, v15.16b, v18.16b\n"
    ".inst 0x6e8295e8  // udot v8.4s, v15.16b, v2.16b\n"
    "ext v2.16b, v2.16b, v2.16b, #0x1\n"
    ".inst 0x6e9d969f  // udot v31.4s, v20.16b, v29.16b\n"
    "ext v29.16b, v29.16b, v29.16b, #0x1\n"
    ".inst 0x6e8294ba  // udot v26.4s, v5.16b, v2.16b\n"
    ".inst 0x6e929683  // udot v3.4s, v20.16b, v18.16b\n"
    "ext v18.16b, v18.16b, v18.16b, #0x1\n"
    ".inst 0x6e9d96d7  // udot v23.4s, v22.16b, v29.16b\n"
    ".inst 0x6e9d95fe  // udot v30.4s, v15.16b, v29.16b\n"
    "mls v31.4s, v8.4s, v24.4s\n"
    ".inst 0x6e9596da  // udot v26.4s, v22.16b, v21.16b\n"
    "mls v3.4s, v16.4s, v24.4s\n"
    ".inst 0x6e929697  // udot v23.4s, v20.16b, v18.16b\n"
    "mov v16.16b, v30.16b\n .inst 0x6e9295f0  // udot v16.4s, v15.16b, v18.16b\n"
    ".inst 0x6e8295fe  // udot v30.4s, v15.16b, v2.16b\n"
    "sqrdmulh v31.4s, v31.4s, v17.4s\n"
    ".inst 0x6e9d969a  // udot v26.4s, v20.16b, v29.16b\n"
    "sqrdmulh v3.4s, v3.4s, v17.4s\n"
    "mls v23.4s, v16.4s, v24.4s\n"
    "and v16.16b, v31.16b, v19.16b\n"
    "mls v26.4s, v30.4s, v24.4s\n"
    "sqrdmulh v23.4s, v23.4s, v17.4s\n"
    "and v18.16b, v3.16b, v19.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqrdmulh v26.4s, v26.4s, v17.4s\n"
    "sshr v18.4s, v18.4s, #0x1f\n"
    "sqadd v31.4s, v31.4s, v16.4s\n"
    "and v17.16b, v23.16b, v19.16b\n"
    "and v16.16b, v26.16b, v19.16b\n"
    "sqadd v3.4s, v3.4s, v18.4s\n"
    "srshl v31.4s, v31.4s, v19.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "srshl v3.4s, v3.4s, v19.4s\n"
    "add v31.4s, v31.4s, v12.4s\n"
    "sqadd v26.4s, v26.4s, v16.4s\n"
    "sqadd v23.4s, v23.4s, v17.4s\n"
    "add v3.4s, v3.4s, v12.4s\n"
    "smax v31.4s, v31.4s, v7.4s\n"
    "srshl v26.4s, v26.4s, v19.4s\n"
    "srshl v23.4s, v23.4s, v19.4s\n"
    "smax v3.4s, v3.4s, v7.4s\n"
    "smin v31.4s, v31.4s, v11.4s\n"
    "add v26.4s, v26.4s, v12.4s\n"
    "add v23.4s, v23.4s, v12.4s\n"
    "smin v3.4s, v3.4s, v11.4s\n"
    "uzp1 v31.16b, v31.16b, v31.16b\n"
    "smax v26.4s, v26.4s, v7.4s\n"
    "smax v23.4s, v23.4s, v7.4s\n"
    "uzp1 v3.16b, v3.16b, v3.16b\n"
    "uzp1 v31.16b, v31.16b, v31.16b\n"
    "smin v26.4s, v26.4s, v11.4s\n"
    "smin v23.4s, v23.4s, v11.4s\n"
    "uzp1 v3.16b, v3.16b, v3.16b\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "uzp1 v23.16b, v23.16b, v23.16b\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "uzp1 v23.16b, v23.16b, v23.16b\n"
    "blt 24f\n"
    "str s31, [x11, x12]\n"
    "str s26, [x10, x12]\n"
    "str s3, [x9, x12]\n"
    "str s23, [x28, x12]\n"
    "b 27f\n"
    "24:"  // Oddments: Unroll 1: Oddment store
    "add x11, x11, x12\n"
    "add x10, x10, x12\n"
    "add x9, x9, x12\n"
    "add x28, x28, x12\n"
    "tbz x20, #1, 25f\n"
    "st1 { v31.h }[0], [x11], #0x2\n"
    "st1 { v26.h }[0], [x10], #0x2\n"
    "st1 { v3.h }[0], [x9], #0x2\n"
    "st1 { v23.h }[0], [x28], #0x2\n"
    "tbz x20, #0, 26f\n"
    "st1 { v31.b }[2], [x11], #0x1\n"
    "st1 { v26.b }[2], [x10], #0x1\n"
    "st1 { v3.b }[2], [x9], #0x1\n"
    "st1 { v23.b }[2], [x28], #0x1\n"
    "b 26f\n"
    "25:"  // Oddments: Unroll 1: Oddment store: Bit 1: Unset
    "st1 { v31.b }[0], [x11], #0x1\n"
    "st1 { v26.b }[0], [x10], #0x1\n"
    "st1 { v3.b }[0], [x9], #0x1\n"
    "st1 { v23.b }[0], [x28], #0x1\n"
    "26:"  // Oddments: Unroll 1: Oddment store: Bit 1: End
    "27:"  // Oddments: Unroll 1: After oddment store
    "subs x20, x20, #0x4\n"
    "add x12, x12, #0x4\n"
    "ble 35f\n"
    "ldr q31, [%x[params], #0x0]\n"
    "ldr q29, [%x[params], #0x10]\n"
    "movi v22.4s, #0x0\n"
    "movi v21.4s, #0x0\n"
    "ldr q20, [%x[params], #0x20]\n"
    "ldr q18, [%x[params], #0x30]\n"
    "cmp x20, #0x4\n"
    "ldr q17, [%x[params], #0x40]\n"
    "ldr q19, [%x[params], #0x50]\n"
    ".inst 0x6e9c95f6  // udot v22.4s, v15.16b, v28.16b\n"
    "add %x[params], %x[params], #0x60\n"
    "mov v26.16b, v31.16b\n"
    "mov v3.16b, v31.16b\n"
    "mov v23.16b, v31.16b\n"
    ".inst 0x6e8097bf  // udot v31.4s, v29.16b, v0.16b\n"
    ".inst 0x6e9c97a3  // udot v3.4s, v29.16b, v28.16b\n"
    ".inst 0x6e9b95f6  // udot v22.4s, v15.16b, v27.16b\n"
    ".inst 0x6e9c969f  // udot v31.4s, v20.16b, v28.16b\n"
    "ext v28.16b, v28.16b, v28.16b, #0x1\n"
    ".inst 0x6e9c97b7  // udot v23.4s, v29.16b, v28.16b\n"
    ".inst 0x6e9c95f5  // udot v21.4s, v15.16b, v28.16b\n"
    ".inst 0x6e9b9683  // udot v3.4s, v20.16b, v27.16b\n"
    "mov v16.16b, v22.16b\n .inst 0x6e8e95f0  // udot v16.4s, v15.16b, v14.16b\n"
    ".inst 0x6e8095f6  // udot v22.4s, v15.16b, v0.16b\n"
    "ext v0.16b, v0.16b, v0.16b, #0x1\n"
    ".inst 0x6e9b965f  // udot v31.4s, v18.16b, v27.16b\n"
    "ext v27.16b, v27.16b, v27.16b, #0x1\n"
    ".inst 0x6e8097ba  // udot v26.4s, v29.16b, v0.16b\n"
    ".inst 0x6e8e9643  // udot v3.4s, v18.16b, v14.16b\n"
    "ext v14.16b, v14.16b, v14.16b, #0x1\n"
    ".inst 0x6e9b9697  // udot v23.4s, v20.16b, v27.16b\n"
    ".inst 0x6e9b95f5  // udot v21.4s, v15.16b, v27.16b\n"
    "mls v31.4s, v22.4s, v24.4s\n"
    ".inst 0x6e9c969a  // udot v26.4s, v20.16b, v28.16b\n"
    "mls v3.4s, v16.4s, v24.4s\n"
    ".inst 0x6e8e9657  // udot v23.4s, v18.16b, v14.16b\n"
    "mov v16.16b, v21.16b\n .inst 0x6e8e95f0  // udot v16.4s, v15.16b, v14.16b\n"
    ".inst 0x6e8095f5  // udot v21.4s, v15.16b, v0.16b\n"
    "sqrdmulh v31.4s, v31.4s, v17.4s\n"
    ".inst 0x6e9b965a  // udot v26.4s, v18.16b, v27.16b\n"
    "sqrdmulh v3.4s, v3.4s, v17.4s\n"
    "mls v23.4s, v16.4s, v24.4s\n"
    "and v16.16b, v31.16b, v19.16b\n"
    "mls v26.4s, v21.4s, v24.4s\n"
    "sqrdmulh v23.4s, v23.4s, v17.4s\n"
    "and v18.16b, v3.16b, v19.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqrdmulh v26.4s, v26.4s, v17.4s\n"
    "sshr v18.4s, v18.4s, #0x1f\n"
    "sqadd v31.4s, v31.4s, v16.4s\n"
    "and v17.16b, v23.16b, v19.16b\n"
    "and v16.16b, v26.16b, v19.16b\n"
    "sqadd v3.4s, v3.4s, v18.4s\n"
    "srshl v31.4s, v31.4s, v19.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "srshl v3.4s, v3.4s, v19.4s\n"
    "add v31.4s, v31.4s, v12.4s\n"
    "sqadd v26.4s, v26.4s, v16.4s\n"
    "sqadd v23.4s, v23.4s, v17.4s\n"
    "add v3.4s, v3.4s, v12.4s\n"
    "smax v31.4s, v31.4s, v7.4s\n"
    "srshl v26.4s, v26.4s, v19.4s\n"
    "srshl v23.4s, v23.4s, v19.4s\n"
    "smax v3.4s, v3.4s, v7.4s\n"
    "smin v31.4s, v31.4s, v11.4s\n"
    "add v26.4s, v26.4s, v12.4s\n"
    "add v23.4s, v23.4s, v12.4s\n"
    "smin v3.4s, v3.4s, v11.4s\n"
    "uzp1 v31.16b, v31.16b, v31.16b\n"
    "smax v26.4s, v26.4s, v7.4s\n"
    "smax v23.4s, v23.4s, v7.4s\n"
    "uzp1 v3.16b, v3.16b, v3.16b\n"
    "uzp1 v31.16b, v31.16b, v31.16b\n"
    "smin v26.4s, v26.4s, v11.4s\n"
    "smin v23.4s, v23.4s, v11.4s\n"
    "uzp1 v3.16b, v3.16b, v3.16b\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "uzp1 v23.16b, v23.16b, v23.16b\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "uzp1 v23.16b, v23.16b, v23.16b\n"
    "blt 28f\n"
    "str s31, [x11, x12]\n"
    "str s26, [x10, x12]\n"
    "str s3, [x9, x12]\n"
    "str s23, [x28, x12]\n"
    "b 31f\n"
    "28:"  // Oddments: Unroll 2: Oddment store
    "add x11, x11, x12\n"
    "add x10, x10, x12\n"
    "add x9, x9, x12\n"
    "add x28, x28, x12\n"
    "tbz x20, #1, 29f\n"
    "st1 { v31.h }[0], [x11], #0x2\n"
    "st1 { v26.h }[0], [x10], #0x2\n"
    "st1 { v3.h }[0], [x9], #0x2\n"
    "st1 { v23.h }[0], [x28], #0x2\n"
    "tbz x20, #0, 30f\n"
    "st1 { v31.b }[2], [x11], #0x1\n"
    "st1 { v26.b }[2], [x10], #0x1\n"
    "st1 { v3.b }[2], [x9], #0x1\n"
    "st1 { v23.b }[2], [x28], #0x1\n"
    "b 30f\n"
    "29:"  // Oddments: Unroll 2: Oddment store: Bit 1: Unset
    "st1 { v31.b }[0], [x11], #0x1\n"
    "st1 { v26.b }[0], [x10], #0x1\n"
    "st1 { v3.b }[0], [x9], #0x1\n"
    "st1 { v23.b }[0], [x28], #0x1\n"
    "30:"  // Oddments: Unroll 2: Oddment store: Bit 1: End
    "31:"  // Oddments: Unroll 2: After oddment store
    "subs x20, x20, #0x4\n"
    "add x12, x12, #0x4\n"
    "ble 35f\n"
    "ldr q31, [%x[params], #0x0]\n"
    "ldr q1, [%x[params], #0x10]\n"
    "movi v22.4s, #0x0\n"
    "movi v21.4s, #0x0\n"
    "ldr q20, [%x[params], #0x20]\n"
    "ldr q18, [%x[params], #0x30]\n"
    "ldr q17, [%x[params], #0x40]\n"
    "ldr q19, [%x[params], #0x50]\n"
    ".inst 0x6e8495f6  // udot v22.4s, v15.16b, v4.16b\n"
    "add %x[params], %x[params], #0x60\n"
    "mov v26.16b, v31.16b\n"
    "mov v3.16b, v31.16b\n"
    "mov v23.16b, v31.16b\n"
    ".inst 0x6e89943f  // udot v31.4s, v1.16b, v9.16b\n"
    ".inst 0x6e849423  // udot v3.4s, v1.16b, v4.16b\n"
    ".inst 0x6e8695f6  // udot v22.4s, v15.16b, v6.16b\n"
    ".inst 0x6e84969f  // udot v31.4s, v20.16b, v4.16b\n"
    "ext v4.16b, v4.16b, v4.16b, #0x1\n"
    ".inst 0x6e849437  // udot v23.4s, v1.16b, v4.16b\n"
    ".inst 0x6e8495f5  // udot v21.4s, v15.16b, v4.16b\n"
    ".inst 0x6e869683  // udot v3.4s, v20.16b, v6.16b\n"
    "mov v16.16b, v22.16b\n .inst 0x6e9995f0  // udot v16.4s, v15.16b, v25.16b\n"
    ".inst 0x6e8995f6  // udot v22.4s, v15.16b, v9.16b\n"
    "ext v9.16b, v9.16b, v9.16b, #0x1\n"
    ".inst 0x6e86965f  // udot v31.4s, v18.16b, v6.16b\n"
    "ext v6.16b, v6.16b, v6.16b, #0x1\n"
    ".inst 0x6e89943a  // udot v26.4s, v1.16b, v9.16b\n"
    ".inst 0x6e999643  // udot v3.4s, v18.16b, v25.16b\n"
    "ext v25.16b, v25.16b, v25.16b, #0x1\n"
    ".inst 0x6e869697  // udot v23.4s, v20.16b, v6.16b\n"
    ".inst 0x6e8695f5  // udot v21.4s, v15.16b, v6.16b\n"
    "mls v31.4s, v22.4s, v24.4s\n"
    ".inst 0x6e84969a  // udot v26.4s, v20.16b, v4.16b\n"
    "mls v3.4s, v16.4s, v24.4s\n"
    ".inst 0x6e999657  // udot v23.4s, v18.16b, v25.16b\n"
    "mov v16.16b, v21.16b\n .inst 0x6e9995f0  // udot v16.4s, v15.16b, v25.16b\n"
    ".inst 0x6e8995f5  // udot v21.4s, v15.16b, v9.16b\n"
    "sqrdmulh v31.4s, v31.4s, v17.4s\n"
    ".inst 0x6e86965a  // udot v26.4s, v18.16b, v6.16b\n"
    "sqrdmulh v3.4s, v3.4s, v17.4s\n"
    "mls v23.4s, v16.4s, v24.4s\n"
    "and v16.16b, v31.16b, v19.16b\n"
    "mls v26.4s, v21.4s, v24.4s\n"
    "sqrdmulh v23.4s, v23.4s, v17.4s\n"
    "and v18.16b, v3.16b, v19.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqrdmulh v26.4s, v26.4s, v17.4s\n"
    "sshr v18.4s, v18.4s, #0x1f\n"
    "sqadd v31.4s, v31.4s, v16.4s\n"
    "and v17.16b, v23.16b, v19.16b\n"
    "and v16.16b, v26.16b, v19.16b\n"
    "sqadd v3.4s, v3.4s, v18.4s\n"
    "srshl v31.4s, v31.4s, v19.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "srshl v3.4s, v3.4s, v19.4s\n"
    "add v31.4s, v31.4s, v12.4s\n"
    "sqadd v26.4s, v26.4s, v16.4s\n"
    "sqadd v23.4s, v23.4s, v17.4s\n"
    "add v3.4s, v3.4s, v12.4s\n"
    "smax v31.4s, v31.4s, v7.4s\n"
    "srshl v26.4s, v26.4s, v19.4s\n"
    "srshl v23.4s, v23.4s, v19.4s\n"
    "smax v3.4s, v3.4s, v7.4s\n"
    "smin v31.4s, v31.4s, v11.4s\n"
    "add v26.4s, v26.4s, v12.4s\n"
    "add v23.4s, v23.4s, v12.4s\n"
    "smin v3.4s, v3.4s, v11.4s\n"
    "uzp1 v31.16b, v31.16b, v31.16b\n"
    "smax v26.4s, v26.4s, v7.4s\n"
    "smax v23.4s, v23.4s, v7.4s\n"
    "uzp1 v3.16b, v3.16b, v3.16b\n"
    "uzp1 v31.16b, v31.16b, v31.16b\n"
    "smin v26.4s, v26.4s, v11.4s\n"
    "smin v23.4s, v23.4s, v11.4s\n"
    "uzp1 v3.16b, v3.16b, v3.16b\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "uzp1 v23.16b, v23.16b, v23.16b\n"
    "uzp1 v26.16b, v26.16b, v26.16b\n"
    "uzp1 v23.16b, v23.16b, v23.16b\n"
    "32:"  // Oddments: Unroll 3: Oddment store
    "add x11, x11, x12\n"
    "add x10, x10, x12\n"
    "add x9, x9, x12\n"
    "add x28, x28, x12\n"
    "tbz x20, #1, 33f\n"
    "st1 { v31.h }[0], [x11], #0x2\n"
    "st1 { v26.h }[0], [x10], #0x2\n"
    "st1 { v3.h }[0], [x9], #0x2\n"
    "st1 { v23.h }[0], [x28], #0x2\n"
    "tbz x20, #0, 34f\n"
    "st1 { v31.b }[2], [x11], #0x1\n"
    "st1 { v26.b }[2], [x10], #0x1\n"
    "st1 { v3.b }[2], [x9], #0x1\n"
    "st1 { v23.b }[2], [x28], #0x1\n"
    "b 34f\n"
    "33:"  // Oddments: Unroll 3: Oddment store: Bit 1: Unset
    "st1 { v31.b }[0], [x11], #0x1\n"
    "st1 { v26.b }[0], [x10], #0x1\n"
    "st1 { v3.b }[0], [x9], #0x1\n"
    "st1 { v23.b }[0], [x28], #0x1\n"
    "34:"  // Oddments: Unroll 3: Oddment store: Bit 1: End
    "35:"  // End
    : [params] "+&r" (params)
    : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
  );
}

}  // namespace depthwise
}  // namespace arm_conv

#endif  // defined(__aarch64__)
