/*
 * Copyright (c) 2025-2026 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#pragma once

#if (defined(ENABLE_FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)) && defined(__aarch64__)
template<>
void MergeResults<12, 8, false>(
    __fp16 *out_ptr,
    const float * in_ptr,
    const int ldout,
    const int y0, const int ymax,
    const int x0, const int xmax,
    const __fp16 *bias,
    Activation act,
    bool accumulate)
{
    float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
    float minval = - static_cast<float>(std::numeric_limits<float>::infinity());

    switch(act.type) {
        default:
        case Activation::Type::None:
            break;
        case Activation::Type::BoundedReLU:
            maxval = static_cast<float>(act.param1);
            /* fall through */
        case Activation::Type::ReLU:
            minval = 0;
            break;
    }

    size_t rows = ymax-y0;
    size_t cols = xmax-x0;

    out_ptr += (y0 * ldout) + x0;
    bias = (bias == nullptr) ? nullptr : bias + x0;

    __asm__ __volatile__(
      "cbz %x[cols], 108f\n"
      "cbz %x[rows], 108f\n"
      "mov x11, #0x20\n"
      "dup v13.4s, %w[maxval]\n"
      "dup v12.4s, %w[minval]\n"
      "mul x11, %x[ldout], x11\n"
      "cbnz %x[accumulate], 66f\n"
      "1:"  // Initial: Row loop
      "cmp %x[rows], #0x7\n"
      "bgt 58f\n"
      "beq 50f\n"
      "cmp %x[rows], #0x5\n"
      "bgt 42f\n"
      "beq 34f\n"
      "cmp %x[rows], #0x3\n"
      "bgt 26f\n"
      "beq 18f\n"
      "cmp %x[rows], #0x1\n"
      "bgt 10f\n"
      "mov x10, %x[cols]\n"
      "mov x9, %x[out_ptr]\n"
      "mov x28, %x[bias]\n"
      "cmp x10, #0xc\n"
      "blt 6f\n"
      "3:"  // Initial: Height 1: Block loop
      "cbnz %x[bias], 4f\n"
      "movi v21.16b, #0\n"
      "movi v20.16b, #0\n"
      "movi v19.16b, #0\n"
      "b 5f\n"
      "4:"  // Initial: Height 1: Width 3: bias
      "ldr d18, [x28, #0]\n"
      "ldr d17, [x28, #0x8]\n"
      "ldr d16, [x28, #0x10]\n"
      "fcvtl v21.4s, v18.4h\n"
      "fcvtl v20.4s, v17.4h\n"
      "fcvtl v19.4s, v16.4h\n"
      "5:"  // Initial: Height 1: Width 3: init done
      "ldr q18, [%x[in_ptr], #0]\n"
      "ldr q17, [%x[in_ptr], #0x10]\n"
      "sub x10, x10, #0xc\n"
      "add x28, x28, #0x18\n"
      "ldr q16, [%x[in_ptr], #0x20]\n"
      "cmp x10, #0xc\n"
      "add %x[in_ptr], %x[in_ptr], #0x180\n"
      "fadd v18.4s, v18.4s, v21.4s\n"
      "fadd v17.4s, v17.4s, v20.4s\n"
      "fadd v16.4s, v16.4s, v19.4s\n"
      "fmin v18.4s, v18.4s, v13.4s\n"
      "fmin v17.4s, v17.4s, v13.4s\n"
      "fmin v16.4s, v16.4s, v13.4s\n"
      "fmax v18.4s, v18.4s, v12.4s\n"
      "fmax v17.4s, v17.4s, v12.4s\n"
      "fmax v16.4s, v16.4s, v12.4s\n"
      "fcvtn v18.4h, v18.4s\n"
      "fcvtn v17.4h, v17.4s\n"
      "fcvtn v16.4h, v16.4s\n"
      "str d18, [x9, #0]\n"
      "str d17, [x9, #0x8]\n"
      "str d16, [x9, #0x10]\n"
      "add x9, x9, #0x18\n"
      "bge 3b\n"
      "6:"  // Initial: Height 1: no full blocks
      "cbz x10, 9f\n"
      "mov x20, %x[in_ptr]\n"
      "7:"  // Initial: Height 1: Single loop
      "movi v17.16b, #0\n"
      "cbz %x[bias], 8f\n"
      "ldr h16, [x28, #0]\n"
      "fcvtl v17.4s, v16.4h\n"
      "8:"  // Initial: Height 1: Scalar: no bias
      "ldr s16, [%x[in_ptr], #0]\n"
      "subs x10, x10, #0x1\n"
      "add x28, x28, #0x2\n"
      "add %x[in_ptr], %x[in_ptr], #0x4\n"
      "fadd v16.4s, v16.4s, v17.4s\n"
      "fmin v16.4s, v16.4s, v13.4s\n"
      "fmax v16.4s, v16.4s, v12.4s\n"
      "fcvtn v16.4h, v16.4s\n"
      "str h16, [x9, #0]\n"
      "add x9, x9, #0x2\n"
      "bne 7b\n"
      "add %x[in_ptr], x20, #0x180\n"
      "9:"  // Initial: Height 1: no oddments
      "b 108f\n"
      "10:"  // Initial: Height 2
      "mov x10, %x[cols]\n"
      "mov x9, %x[out_ptr]\n"
      "mov x28, %x[bias]\n"
      "cmp x10, #0xc\n"
      "add x27, x9, %x[ldout], LSL #1\n"
      "blt 14f\n"
      "11:"  // Initial: Height 2: Block loop
      "cbnz %x[bias], 12f\n"
      "movi v24.16b, #0\n"
      "movi v23.16b, #0\n"
      "movi v22.16b, #0\n"
      "b 13f\n"
      "12:"  // Initial: Height 2: Width 3: bias
      "ldr d18, [x28, #0]\n"
      "ldr d17, [x28, #0x8]\n"
      "ldr d16, [x28, #0x10]\n"
      "fcvtl v24.4s, v18.4h\n"
      "fcvtl v23.4s, v17.4h\n"
      "fcvtl v22.4s, v16.4h\n"
      "13:"  // Initial: Height 2: Width 3: init done
      "ldr q16, [%x[in_ptr], #0]\n"
      "ldr q20, [%x[in_ptr], #0x10]\n"
      "sub x10, x10, #0xc\n"
      "add x28, x28, #0x18\n"
      "ldr q19, [%x[in_ptr], #0x20]\n"
      "ldr q18, [%x[in_ptr], #0x30]\n"
      "cmp x10, #0xc\n"
      "ldr q17, [%x[in_ptr], #0x40]\n"
      "ldr q21, [%x[in_ptr], #0x50]\n"
      "add %x[in_ptr], %x[in_ptr], #0x180\n"
      "fadd v16.4s, v16.4s, v24.4s\n"
      "fadd v20.4s, v20.4s, v23.4s\n"
      "fadd v19.4s, v19.4s, v22.4s\n"
      "fadd v18.4s, v18.4s, v24.4s\n"
      "fadd v17.4s, v17.4s, v23.4s\n"
      "fadd v21.4s, v21.4s, v22.4s\n"
      "fmin v16.4s, v16.4s, v13.4s\n"
      "fmin v20.4s, v20.4s, v13.4s\n"
      "fmin v19.4s, v19.4s, v13.4s\n"
      "fmin v18.4s, v18.4s, v13.4s\n"
      "fmin v17.4s, v17.4s, v13.4s\n"
      "fmin v21.4s, v21.4s, v13.4s\n"
      "fmax v16.4s, v16.4s, v12.4s\n"
      "fmax v20.4s, v20.4s, v12.4s\n"
      "fmax v19.4s, v19.4s, v12.4s\n"
      "fmax v18.4s, v18.4s, v12.4s\n"
      "fmax v17.4s, v17.4s, v12.4s\n"
      "fmax v21.4s, v21.4s, v12.4s\n"
      "fcvtn v16.4h, v16.4s\n"
      "fcvtn v20.4h, v20.4s\n"
      "fcvtn v19.4h, v19.4s\n"
      "fcvtn v18.4h, v18.4s\n"
      "fcvtn v17.4h, v17.4s\n"
      "str d16, [x9, #0]\n"
      "fcvtn v16.4h, v21.4s\n"
      "str d20, [x9, #0x8]\n"
      "str d19, [x9, #0x10]\n"
      "add x9, x9, #0x18\n"
      "str d18, [x27, #0]\n"
      "str d17, [x27, #0x8]\n"
      "str d16, [x27, #0x10]\n"
      "add x27, x27, #0x18\n"
      "bge 11b\n"
      "14:"  // Initial: Height 2: no full blocks
      "cbz x10, 17f\n"
      "mov x20, %x[in_ptr]\n"
      "15:"  // Initial: Height 2: Single loop
      "movi v18.16b, #0\n"
      "cbz %x[bias], 16f\n"
      "ldr h16, [x28, #0]\n"
      "fcvtl v18.4s, v16.4h\n"
      "16:"  // Initial: Height 2: Scalar: no bias
      "ldr s17, [%x[in_ptr], #0]\n"
      "ldr s16, [%x[in_ptr], #0x30]\n"
      "subs x10, x10, #0x1\n"
      "add x28, x28, #0x2\n"
      "add %x[in_ptr], %x[in_ptr], #0x4\n"
      "fadd v17.4s, v17.4s, v18.4s\n"
      "fadd v16.4s, v16.4s, v18.4s\n"
      "fmin v17.4s, v17.4s, v13.4s\n"
      "fmin v16.4s, v16.4s, v13.4s\n"
      "fmax v17.4s, v17.4s, v12.4s\n"
      "fmax v16.4s, v16.4s, v12.4s\n"
      "fcvtn v17.4h, v17.4s\n"
      "fcvtn v16.4h, v16.4s\n"
      "str h17, [x9, #0]\n"
      "add x9, x9, #0x2\n"
      "str h16, [x27, #0]\n"
      "add x27, x27, #0x2\n"
      "bne 15b\n"
      "add %x[in_ptr], x20, #0x180\n"
      "17:"  // Initial: Height 2: no oddments
      "b 108f\n"
      "18:"  // Initial: Height 3
      "mov x10, %x[cols]\n"
      "mov x9, %x[out_ptr]\n"
      "mov x28, %x[bias]\n"
      "add x27, x9, %x[ldout], LSL #1\n"
      "add x26, x27, %x[ldout], LSL #1\n"
      "cmp x10, #0xc\n"
      "blt 22f\n"
      "19:"  // Initial: Height 3: Block loop
      "cbnz %x[bias], 20f\n"
      "movi v27.16b, #0\n"
      "movi v26.16b, #0\n"
      "movi v25.16b, #0\n"
      "b 21f\n"
      "20:"  // Initial: Height 3: Width 3: bias
      "ldr d18, [x28, #0]\n"
      "ldr d17, [x28, #0x8]\n"
      "ldr d16, [x28, #0x10]\n"
      "fcvtl v27.4s, v18.4h\n"
      "fcvtl v26.4s, v17.4h\n"
      "fcvtl v25.4s, v16.4h\n"
      "21:"  // Initial: Height 3: Width 3: init done
      "ldr q18, [%x[in_ptr], #0]\n"
      "ldr q17, [%x[in_ptr], #0x10]\n"
      "sub x10, x10, #0xc\n"
      "add x28, x28, #0x18\n"
      "ldr q16, [%x[in_ptr], #0x20]\n"
      "ldr q21, [%x[in_ptr], #0x30]\n"
      "cmp x10, #0xc\n"
      "ldr q20, [%x[in_ptr], #0x40]\n"
      "ldr q19, [%x[in_ptr], #0x50]\n"
      "ldr q24, [%x[in_ptr], #0x60]\n"
      "ldr q23, [%x[in_ptr], #0x70]\n"
      "fadd v18.4s, v18.4s, v27.4s\n"
      "fadd v17.4s, v17.4s, v26.4s\n"
      "ldr q22, [%x[in_ptr], #0x80]\n"
      "fadd v16.4s, v16.4s, v25.4s\n"
      "fadd v21.4s, v21.4s, v27.4s\n"
      "add %x[in_ptr], %x[in_ptr], #0x180\n"
      "fadd v20.4s, v20.4s, v26.4s\n"
      "fadd v19.4s, v19.4s, v25.4s\n"
      "fadd v24.4s, v24.4s, v27.4s\n"
      "fadd v23.4s, v23.4s, v26.4s\n"
      "fadd v22.4s, v22.4s, v25.4s\n"
      "fmin v18.4s, v18.4s, v13.4s\n"
      "fmin v17.4s, v17.4s, v13.4s\n"
      "fmin v16.4s, v16.4s, v13.4s\n"
      "fmin v21.4s, v21.4s, v13.4s\n"
      "fmin v20.4s, v20.4s, v13.4s\n"
      "fmin v19.4s, v19.4s, v13.4s\n"
      "fmin v24.4s, v24.4s, v13.4s\n"
      "fmin v23.4s, v23.4s, v13.4s\n"
      "fmin v22.4s, v22.4s, v13.4s\n"
      "fmax v18.4s, v18.4s, v12.4s\n"
      "fmax v17.4s, v17.4s, v12.4s\n"
      "fmax v16.4s, v16.4s, v12.4s\n"
      "fmax v21.4s, v21.4s, v12.4s\n"
      "fmax v20.4s, v20.4s, v12.4s\n"
      "fmax v19.4s, v19.4s, v12.4s\n"
      "fmax v24.4s, v24.4s, v12.4s\n"
      "fmax v23.4s, v23.4s, v12.4s\n"
      "fmax v22.4s, v22.4s, v12.4s\n"
      "fcvtn v18.4h, v18.4s\n"
      "fcvtn v17.4h, v17.4s\n"
      "fcvtn v16.4h, v16.4s\n"
      "fcvtn v21.4h, v21.4s\n"
      "fcvtn v20.4h, v20.4s\n"
      "str d18, [x9, #0]\n"
      "fcvtn v19.4h, v19.4s\n"
      "fcvtn v18.4h, v24.4s\n"
      "str d17, [x9, #0x8]\n"
      "str d16, [x9, #0x10]\n"
      "fcvtn v17.4h, v23.4s\n"
      "fcvtn v16.4h, v22.4s\n"
      "add x9, x9, #0x18\n"
      "str d21, [x27, #0]\n"
      "str d20, [x27, #0x8]\n"
      "str d19, [x27, #0x10]\n"
      "add x27, x27, #0x18\n"
      "str d18, [x26, #0]\n"
      "str d17, [x26, #0x8]\n"
      "str d16, [x26, #0x10]\n"
      "add x26, x26, #0x18\n"
      "bge 19b\n"
      "22:"  // Initial: Height 3: no full blocks
      "cbz x10, 25f\n"
      "mov x20, %x[in_ptr]\n"
      "23:"  // Initial: Height 3: Single loop
      "movi v19.16b, #0\n"
      "cbz %x[bias], 24f\n"
      "ldr h16, [x28, #0]\n"
      "fcvtl v19.4s, v16.4h\n"
      "24:"  // Initial: Height 3: Scalar: no bias
      "ldr s16, [%x[in_ptr], #0]\n"
      "ldr s17, [%x[in_ptr], #0x30]\n"
      "subs x10, x10, #0x1\n"
      "add x28, x28, #0x2\n"
      "ldr s18, [%x[in_ptr], #0x60]\n"
      "add %x[in_ptr], %x[in_ptr], #0x4\n"
      "fadd v16.4s, v16.4s, v19.4s\n"
      "fadd v17.4s, v17.4s, v19.4s\n"
      "fadd v18.4s, v18.4s, v19.4s\n"
      "fmin v16.4s, v16.4s, v13.4s\n"
      "fmin v17.4s, v17.4s, v13.4s\n"
      "fmin v18.4s, v18.4s, v13.4s\n"
      "fmax v16.4s, v16.4s, v12.4s\n"
      "fmax v17.4s, v17.4s, v12.4s\n"
      "fmax v18.4s, v18.4s, v12.4s\n"
      "fcvtn v16.4h, v16.4s\n"
      "fcvtn v17.4h, v17.4s\n"
      "str h16, [x9, #0]\n"
      "add x9, x9, #0x2\n"
      "fcvtn v16.4h, v18.4s\n"
      "str h17, [x27, #0]\n"
      "add x27, x27, #0x2\n"
      "str h16, [x26, #0]\n"
      "add x26, x26, #0x2\n"
      "bne 23b\n"
      "add %x[in_ptr], x20, #0x180\n"
      "25:"  // Initial: Height 3: no oddments
      "b 108f\n"
      "26:"  // Initial: Height 4
      "mov x9, %x[out_ptr]\n"
      "mov x10, %x[cols]\n"
      "mov x28, %x[bias]\n"
      "add x27, x9, %x[ldout], LSL #1\n"
      "add x26, x27, %x[ldout], LSL #1\n"
      "cmp x10, #0xc\n"
      "add x25, x26, %x[ldout], LSL #1\n"
      "blt 30f\n"
      "27:"  // Initial: Height 4: Block loop
      "cbnz %x[bias], 28f\n"
      "movi v30.16b, #0\n"
      "movi v29.16b, #0\n"
      "movi v28.16b, #0\n"
      "b 29f\n"
      "28:"  // Initial: Height 4: Width 3: bias
      "ldr d18, [x28, #0]\n"
      "ldr d17, [x28, #0x8]\n"
      "ldr d16, [x28, #0x10]\n"
      "fcvtl v30.4s, v18.4h\n"
      "fcvtl v29.4s, v17.4h\n"
      "fcvtl v28.4s, v16.4h\n"
      "29:"  // Initial: Height 4: Width 3: init done
      "ldr q19, [%x[in_ptr], #0]\n"
      "ldr q18, [%x[in_ptr], #0x10]\n"
      "sub x10, x10, #0xc\n"
      "add x28, x28, #0x18\n"
      "ldr q17, [%x[in_ptr], #0x20]\n"
      "ldr q16, [%x[in_ptr], #0x30]\n"
      "cmp x10, #0xc\n"
      "ldr q23, [%x[in_ptr], #0x40]\n"
      "ldr q22, [%x[in_ptr], #0x50]\n"
      "ldr q21, [%x[in_ptr], #0x60]\n"
      "ldr q20, [%x[in_ptr], #0x70]\n"
      "fadd v19.4s, v19.4s, v30.4s\n"
      "fadd v18.4s, v18.4s, v29.4s\n"
      "ldr q27, [%x[in_ptr], #0x80]\n"
      "ldr q26, [%x[in_ptr], #0x90]\n"
      "fadd v17.4s, v17.4s, v28.4s\n"
      "fadd v16.4s, v16.4s, v30.4s\n"
      "ldr q25, [%x[in_ptr], #0xa0]\n"
      "ldr q24, [%x[in_ptr], #0xb0]\n"
      "fadd v23.4s, v23.4s, v29.4s\n"
      "fadd v22.4s, v22.4s, v28.4s\n"
      "fadd v21.4s, v21.4s, v30.4s\n"
      "fadd v20.4s, v20.4s, v29.4s\n"
      "add %x[in_ptr], %x[in_ptr], #0x180\n"
      "fadd v27.4s, v27.4s, v28.4s\n"
      "fadd v26.4s, v26.4s, v30.4s\n"
      "fadd v25.4s, v25.4s, v29.4s\n"
      "fadd v24.4s, v24.4s, v28.4s\n"
      "fmin v19.4s, v19.4s, v13.4s\n"
      "fmin v18.4s, v18.4s, v13.4s\n"
      "fmin v17.4s, v17.4s, v13.4s\n"
      "fmin v16.4s, v16.4s, v13.4s\n"
      "fmin v23.4s, v23.4s, v13.4s\n"
      "fmin v22.4s, v22.4s, v13.4s\n"
      "fmin v21.4s, v21.4s, v13.4s\n"
      "fmin v20.4s, v20.4s, v13.4s\n"
      "fmin v27.4s, v27.4s, v13.4s\n"
      "fmin v26.4s, v26.4s, v13.4s\n"
      "fmin v25.4s, v25.4s, v13.4s\n"
      "fmin v24.4s, v24.4s, v13.4s\n"
      "fmax v19.4s, v19.4s, v12.4s\n"
      "fmax v18.4s, v18.4s, v12.4s\n"
      "fmax v17.4s, v17.4s, v12.4s\n"
      "fmax v16.4s, v16.4s, v12.4s\n"
      "fmax v23.4s, v23.4s, v12.4s\n"
      "fmax v22.4s, v22.4s, v12.4s\n"
      "fmax v21.4s, v21.4s, v12.4s\n"
      "fmax v20.4s, v20.4s, v12.4s\n"
      "fmax v27.4s, v27.4s, v12.4s\n"
      "fmax v26.4s, v26.4s, v12.4s\n"
      "fmax v25.4s, v25.4s, v12.4s\n"
      "fmax v24.4s, v24.4s, v12.4s\n"
      "fcvtn v19.4h, v19.4s\n"
      "fcvtn v18.4h, v18.4s\n"
      "fcvtn v17.4h, v17.4s\n"
      "fcvtn v16.4h, v16.4s\n"
      "fcvtn v23.4h, v23.4s\n"
      "fcvtn v22.4h, v22.4s\n"
      "str d19, [x9, #0]\n"
      "str d18, [x9, #0x8]\n"
      "fcvtn v21.4h, v21.4s\n"
      "fcvtn v20.4h, v20.4s\n"
      "str d17, [x9, #0x10]\n"
      "fcvtn v19.4h, v27.4s\n"
      "fcvtn v18.4h, v26.4s\n"
      "add x9, x9, #0x18\n"
      "str d16, [x27, #0]\n"
      "fcvtn v17.4h, v25.4s\n"
      "fcvtn v16.4h, v24.4s\n"
      "str d23, [x27, #0x8]\n"
      "str d22, [x27, #0x10]\n"
      "add x27, x27, #0x18\n"
      "str d21, [x26, #0]\n"
      "str d20, [x26, #0x8]\n"
      "str d19, [x26, #0x10]\n"
      "add x26, x26, #0x18\n"
      "str d18, [x25, #0]\n"
      "str d17, [x25, #0x8]\n"
      "str d16, [x25, #0x10]\n"
      "add x25, x25, #0x18\n"
      "bge 27b\n"
      "30:"  // Initial: Height 4: no full blocks
      "cbz x10, 33f\n"
      "mov x20, %x[in_ptr]\n"
      "31:"  // Initial: Height 4: Single loop
      "movi v20.16b, #0\n"
      "cbz %x[bias], 32f\n"
      "ldr h16, [x28, #0]\n"
      "fcvtl v20.4s, v16.4h\n"
      "32:"  // Initial: Height 4: Scalar: no bias
      "ldr s16, [%x[in_ptr], #0]\n"
      "ldr s18, [%x[in_ptr], #0x30]\n"
      "subs x10, x10, #0x1\n"
      "add x28, x28, #0x2\n"
      "ldr s17, [%x[in_ptr], #0x60]\n"
      "ldr s19, [%x[in_ptr], #0x90]\n"
      "add %x[in_ptr], %x[in_ptr], #0x4\n"
      "fadd v16.4s, v16.4s, v20.4s\n"
      "fadd v18.4s, v18.4s, v20.4s\n"
      "fadd v17.4s, v17.4s, v20.4s\n"
      "fadd v19.4s, v19.4s, v20.4s\n"
      "fmin v16.4s, v16.4s, v13.4s\n"
      "fmin v18.4s, v18.4s, v13.4s\n"
      "fmin v17.4s, v17.4s, v13.4s\n"
      "fmin v19.4s, v19.4s, v13.4s\n"
      "fmax v16.4s, v16.4s, v12.4s\n"
      "fmax v18.4s, v18.4s, v12.4s\n"
      "fmax v17.4s, v17.4s, v12.4s\n"
      "fmax v19.4s, v19.4s, v12.4s\n"
      "fcvtn v16.4h, v16.4s\n"
      "fcvtn v18.4h, v18.4s\n"
      "fcvtn v17.4h, v17.4s\n"
      "str h16, [x9, #0]\n"
      "add x9, x9, #0x2\n"
      "fcvtn v16.4h, v19.4s\n"
      "str h18, [x27, #0]\n"
      "add x27, x27, #0x2\n"
      "str h17, [x26, #0]\n"
      "add x26, x26, #0x2\n"
      "str h16, [x25, #0]\n"
      "add x25, x25, #0x2\n"
      "bne 31b\n"
      "add %x[in_ptr], x20, #0x180\n"
      "33:"  // Initial: Height 4: no oddments
      "b 108f\n"
      "34:"  // Initial: Height 5
      "mov x9, %x[out_ptr]\n"
      "mov x10, %x[cols]\n"
      "mov x28, %x[bias]\n"
      "add x27, x9, %x[ldout], LSL #1\n"
      "add x26, x27, %x[ldout], LSL #1\n"
      "add x25, x26, %x[ldout], LSL #1\n"
      "cmp x10, #0xc\n"
      "add x24, x25, %x[ldout], LSL #1\n"
      "blt 38f\n"
      "35:"  // Initial: Height 5: Block loop
      "cbnz %x[bias], 36f\n"
      "movi v1.16b, #0\n"
      "movi v0.16b, #0\n"
      "movi v31.16b, #0\n"
      "b 37f\n"
      "36:"  // Initial: Height 5: Width 3: bias
      "ldr d18, [x28, #0]\n"
      "ldr d17, [x28, #0x8]\n"
      "ldr d16, [x28, #0x10]\n"
      "fcvtl v1.4s, v18.4h\n"
      "fcvtl v0.4s, v17.4h\n"
      "fcvtl v31.4s, v16.4h\n"
      "37:"  // Initial: Height 5: Width 3: init done
      "ldr q16, [%x[in_ptr], #0]\n"
      "ldr q20, [%x[in_ptr], #0x10]\n"
      "sub x10, x10, #0xc\n"
      "add x28, x28, #0x18\n"
      "ldr q19, [%x[in_ptr], #0x20]\n"
      "ldr q18, [%x[in_ptr], #0x30]\n"
      "cmp x10, #0xc\n"
      "ldr q17, [%x[in_ptr], #0x40]\n"
      "ldr q30, [%x[in_ptr], #0x50]\n"
      "ldr q24, [%x[in_ptr], #0x60]\n"
      "ldr q23, [%x[in_ptr], #0x70]\n"
      "fadd v16.4s, v16.4s, v1.4s\n"
      "fadd v20.4s, v20.4s, v0.4s\n"
      "ldr q22, [%x[in_ptr], #0x80]\n"
      "ldr q21, [%x[in_ptr], #0x90]\n"
      "fadd v19.4s, v19.4s, v31.4s\n"
      "fadd v18.4s, v18.4s, v1.4s\n"
      "ldr q29, [%x[in_ptr], #0xa0]\n"
      "ldr q28, [%x[in_ptr], #0xb0]\n"
      "fadd v17.4s, v17.4s, v0.4s\n"
      "fadd v30.4s, v30.4s, v31.4s\n"
      "ldr q27, [%x[in_ptr], #0xc0]\n"
      "ldr q26, [%x[in_ptr], #0xd0]\n"
      "fadd v24.4s, v24.4s, v1.4s\n"
      "fadd v23.4s, v23.4s, v0.4s\n"
      "ldr q25, [%x[in_ptr], #0xe0]\n"
      "fadd v22.4s, v22.4s, v31.4s\n"
      "fadd v21.4s, v21.4s, v1.4s\n"
      "add %x[in_ptr], %x[in_ptr], #0x180\n"
      "fadd v29.4s, v29.4s, v0.4s\n"
      "fadd v28.4s, v28.4s, v31.4s\n"
      "fadd v27.4s, v27.4s, v1.4s\n"
      "fadd v26.4s, v26.4s, v0.4s\n"
      "fadd v25.4s, v25.4s, v31.4s\n"
      "fmin v16.4s, v16.4s, v13.4s\n"
      "fmin v20.4s, v20.4s, v13.4s\n"
      "fmin v19.4s, v19.4s, v13.4s\n"
      "fmin v18.4s, v18.4s, v13.4s\n"
      "fmin v17.4s, v17.4s, v13.4s\n"
      "fmin v30.4s, v30.4s, v13.4s\n"
      "fmin v24.4s, v24.4s, v13.4s\n"
      "fmin v23.4s, v23.4s, v13.4s\n"
      "fmin v22.4s, v22.4s, v13.4s\n"
      "fmin v21.4s, v21.4s, v13.4s\n"
      "fmin v29.4s, v29.4s, v13.4s\n"
      "fmin v28.4s, v28.4s, v13.4s\n"
      "fmin v27.4s, v27.4s, v13.4s\n"
      "fmin v26.4s, v26.4s, v13.4s\n"
      "fmin v25.4s, v25.4s, v13.4s\n"
      "fmax v16.4s, v16.4s, v12.4s\n"
      "fmax v20.4s, v20.4s, v12.4s\n"
      "fmax v19.4s, v19.4s, v12.4s\n"
      "fmax v18.4s, v18.4s, v12.4s\n"
      "fmax v17.4s, v17.4s, v12.4s\n"
      "fmax v30.4s, v30.4s, v12.4s\n"
      "fmax v24.4s, v24.4s, v12.4s\n"
      "fmax v23.4s, v23.4s, v12.4s\n"
      "fmax v22.4s, v22.4s, v12.4s\n"
      "fmax v21.4s, v21.4s, v12.4s\n"
      "fmax v29.4s, v29.4s, v12.4s\n"
      "fmax v28.4s, v28.4s, v12.4s\n"
      "fmax v27.4s, v27.4s, v12.4s\n"
      "fmax v26.4s, v26.4s, v12.4s\n"
      "fmax v25.4s, v25.4s, v12.4s\n"
      "fcvtn v16.4h, v16.4s\n"
      "fcvtn v20.4h, v20.4s\n"
      "fcvtn v19.4h, v19.4s\n"
      "fcvtn v18.4h, v18.4s\n"
      "fcvtn v17.4h, v17.4s\n"
      "str d16, [x9, #0]\n"
      "fcvtn v16.4h, v30.4s\n"
      "fcvtn v24.4h, v24.4s\n"
      "str d20, [x9, #0x8]\n"
      "str d19, [x9, #0x10]\n"
      "fcvtn v23.4h, v23.4s\n"
      "fcvtn v22.4h, v22.4s\n"
      "add x9, x9, #0x18\n"
      "str d18, [x27, #0]\n"
      "fcvtn v21.4h, v21.4s\n"
      "fcvtn v20.4h, v29.4s\n"
      "str d17, [x27, #0x8]\n"
      "fcvtn v19.4h, v28.4s\n"
      "fcvtn v18.4h, v27.4s\n"
      "str d16, [x27, #0x10]\n"
      "fcvtn v17.4h, v26.4s\n"
      "fcvtn v16.4h, v25.4s\n"
      "add x27, x27, #0x18\n"
      "str d24, [x26, #0]\n"
      "str d23, [x26, #0x8]\n"
      "str d22, [x26, #0x10]\n"
      "add x26, x26, #0x18\n"
      "str d21, [x25, #0]\n"
      "str d20, [x25, #0x8]\n"
      "str d19, [x25, #0x10]\n"
      "add x25, x25, #0x18\n"
      "str d18, [x24, #0]\n"
      "str d17, [x24, #0x8]\n"
      "str d16, [x24, #0x10]\n"
      "add x24, x24, #0x18\n"
      "bge 35b\n"
      "38:"  // Initial: Height 5: no full blocks
      "cbz x10, 41f\n"
      "mov x20, %x[in_ptr]\n"
      "39:"  // Initial: Height 5: Single loop
      "movi v21.16b, #0\n"
      "cbz %x[bias], 40f\n"
      "ldr h16, [x28, #0]\n"
      "fcvtl v21.4s, v16.4h\n"
      "40:"  // Initial: Height 5: Scalar: no bias
      "ldr s16, [%x[in_ptr], #0]\n"
      "ldr s19, [%x[in_ptr], #0x30]\n"
      "subs x10, x10, #0x1\n"
      "add x28, x28, #0x2\n"
      "ldr s18, [%x[in_ptr], #0x60]\n"
      "ldr s17, [%x[in_ptr], #0x90]\n"
      "ldr s20, [%x[in_ptr], #0xc0]\n"
      "add %x[in_ptr], %x[in_ptr], #0x4\n"
      "fadd v16.4s, v16.4s, v21.4s\n"
      "fadd v19.4s, v19.4s, v21.4s\n"
      "fadd v18.4s, v18.4s, v21.4s\n"
      "fadd v17.4s, v17.4s, v21.4s\n"
      "fadd v20.4s, v20.4s, v21.4s\n"
      "fmin v16.4s, v16.4s, v13.4s\n"
      "fmin v19.4s, v19.4s, v13.4s\n"
      "fmin v18.4s, v18.4s, v13.4s\n"
      "fmin v17.4s, v17.4s, v13.4s\n"
      "fmin v20.4s, v20.4s, v13.4s\n"
      "fmax v16.4s, v16.4s, v12.4s\n"
      "fmax v19.4s, v19.4s, v12.4s\n"
      "fmax v18.4s, v18.4s, v12.4s\n"
      "fmax v17.4s, v17.4s, v12.4s\n"
      "fmax v20.4s, v20.4s, v12.4s\n"
      "fcvtn v16.4h, v16.4s\n"
      "fcvtn v19.4h, v19.4s\n"
      "fcvtn v18.4h, v18.4s\n"
      "fcvtn v17.4h, v17.4s\n"
      "str h16, [x9, #0]\n"
      "add x9, x9, #0x2\n"
      "fcvtn v16.4h, v20.4s\n"
      "str h19, [x27, #0]\n"
      "add x27, x27, #0x2\n"
      "str h18, [x26, #0]\n"
      "add x26, x26, #0x2\n"
      "str h17, [x25, #0]\n"
      "add x25, x25, #0x2\n"
      "str h16, [x24, #0]\n"
      "add x24, x24, #0x2\n"
      "bne 39b\n"
      "add %x[in_ptr], x20, #0x180\n"
      "41:"  // Initial: Height 5: no oddments
      "b 108f\n"
      "42:"  // Initial: Height 6
      "mov x9, %x[out_ptr]\n"
      "mov x10, %x[cols]\n"
      "mov x28, %x[bias]\n"
      "add x27, x9, %x[ldout], LSL #1\n"
      "add x26, x27, %x[ldout], LSL #1\n"
      "add x25, x26, %x[ldout], LSL #1\n"
      "add x24, x25, %x[ldout], LSL #1\n"
      "cmp x10, #0xc\n"
      "add x23, x24, %x[ldout], LSL #1\n"
      "blt 46f\n"
      "43:"  // Initial: Height 6: Block loop
      "cbnz %x[bias], 44f\n"
      "movi v4.16b, #0\n"
      "movi v3.16b, #0\n"
      "movi v2.16b, #0\n"
      "b 45f\n"
      "44:"  // Initial: Height 6: Width 3: bias
      "ldr d18, [x28, #0]\n"
      "ldr d17, [x28, #0x8]\n"
      "ldr d16, [x28, #0x10]\n"
      "fcvtl v4.4s, v18.4h\n"
      "fcvtl v3.4s, v17.4h\n"
      "fcvtl v2.4s, v16.4h\n"
      "45:"  // Initial: Height 6: Width 3: init done
      "ldr q21, [%x[in_ptr], #0]\n"
      "ldr q16, [%x[in_ptr], #0x10]\n"
      "sub x10, x10, #0xc\n"
      "add x28, x28, #0x18\n"
      "ldr q20, [%x[in_ptr], #0x20]\n"
      "ldr q19, [%x[in_ptr], #0x30]\n"
      "cmp x10, #0xc\n"
      "ldr q18, [%x[in_ptr], #0x40]\n"
      "ldr q17, [%x[in_ptr], #0x50]\n"
      "ldr q1, [%x[in_ptr], #0x60]\n"
      "ldr q26, [%x[in_ptr], #0x70]\n"
      "fadd v21.4s, v21.4s, v4.4s\n"
      "fadd v16.4s, v16.4s, v3.4s\n"
      "ldr q25, [%x[in_ptr], #0x80]\n"
      "ldr q24, [%x[in_ptr], #0x90]\n"
      "fadd v20.4s, v20.4s, v2.4s\n"
      "fadd v19.4s, v19.4s, v4.4s\n"
      "ldr q23, [%x[in_ptr], #0xa0]\n"
      "ldr q22, [%x[in_ptr], #0xb0]\n"
      "fadd v18.4s, v18.4s, v3.4s\n"
      "fadd v17.4s, v17.4s, v2.4s\n"
      "ldr q0, [%x[in_ptr], #0xc0]\n"
      "ldr q31, [%x[in_ptr], #0xd0]\n"
      "fadd v1.4s, v1.4s, v4.4s\n"
      "fadd v26.4s, v26.4s, v3.4s\n"
      "ldr q30, [%x[in_ptr], #0xe0]\n"
      "ldr q29, [%x[in_ptr], #0xf0]\n"
      "fadd v25.4s, v25.4s, v2.4s\n"
      "fadd v24.4s, v24.4s, v4.4s\n"
      "ldr q28, [%x[in_ptr], #0x100]\n"
      "ldr q27, [%x[in_ptr], #0x110]\n"
      "fadd v23.4s, v23.4s, v3.4s\n"
      "fadd v22.4s, v22.4s, v2.4s\n"
      "fadd v0.4s, v0.4s, v4.4s\n"
      "fadd v31.4s, v31.4s, v3.4s\n"
      "add %x[in_ptr], %x[in_ptr], #0x180\n"
      "fadd v30.4s, v30.4s, v2.4s\n"
      "fadd v29.4s, v29.4s, v4.4s\n"
      "fadd v28.4s, v28.4s, v3.4s\n"
      "fadd v27.4s, v27.4s, v2.4s\n"
      "fmin v21.4s, v21.4s, v13.4s\n"
      "fmin v16.4s, v16.4s, v13.4s\n"
      "fmin v20.4s, v20.4s, v13.4s\n"
      "fmin v19.4s, v19.4s, v13.4s\n"
      "fmin v18.4s, v18.4s, v13.4s\n"
      "fmin v17.4s, v17.4s, v13.4s\n"
      "fmin v1.4s, v1.4s, v13.4s\n"
      "fmin v26.4s, v26.4s, v13.4s\n"
      "fmin v25.4s, v25.4s, v13.4s\n"
      "fmin v24.4s, v24.4s, v13.4s\n"
      "fmin v23.4s, v23.4s, v13.4s\n"
      "fmin v22.4s, v22.4s, v13.4s\n"
      "fmin v0.4s, v0.4s, v13.4s\n"
      "fmin v31.4s, v31.4s, v13.4s\n"
      "fmin v30.4s, v30.4s, v13.4s\n"
      "fmin v29.4s, v29.4s, v13.4s\n"
      "fmin v28.4s, v28.4s, v13.4s\n"
      "fmin v27.4s, v27.4s, v13.4s\n"
      "fmax v21.4s, v21.4s, v12.4s\n"
      "fmax v16.4s, v16.4s, v12.4s\n"
      "fmax v20.4s, v20.4s, v12.4s\n"
      "fmax v19.4s, v19.4s, v12.4s\n"
      "fmax v18.4s, v18.4s, v12.4s\n"
      "fmax v17.4s, v17.4s, v12.4s\n"
      "fmax v1.4s, v1.4s, v12.4s\n"
      "fmax v26.4s, v26.4s, v12.4s\n"
      "fmax v25.4s, v25.4s, v12.4s\n"
      "fmax v24.4s, v24.4s, v12.4s\n"
      "fmax v23.4s, v23.4s, v12.4s\n"
      "fmax v22.4s, v22.4s, v12.4s\n"
      "fmax v0.4s, v0.4s, v12.4s\n"
      "fmax v31.4s, v31.4s, v12.4s\n"
      "fmax v30.4s, v30.4s, v12.4s\n"
      "fmax v29.4s, v29.4s, v12.4s\n"
      "fmax v28.4s, v28.4s, v12.4s\n"
      "fmax v27.4s, v27.4s, v12.4s\n"
      "fcvtn v21.4h, v21.4s\n"
      "fcvtn v16.4h, v16.4s\n"
      "fcvtn v20.4h, v20.4s\n"
      "fcvtn v19.4h, v19.4s\n"
      "fcvtn v18.4h, v18.4s\n"
      "fcvtn v17.4h, v17.4s\n"
      "str d21, [x9, #0]\n"
      "str d16, [x9, #0x8]\n"
      "fcvtn v16.4h, v1.4s\n"
      "fcvtn v26.4h, v26.4s\n"
      "str d20, [x9, #0x10]\n"
      "fcvtn v25.4h, v25.4s\n"
      "fcvtn v24.4h, v24.4s\n"
      "add x9, x9, #0x18\n"
      "str d19, [x27, #0]\n"
      "fcvtn v23.4h, v23.4s\n"
      "fcvtn v22.4h, v22.4s\n"
      "str d18, [x27, #0x8]\n"
      "fcvtn v21.4h, v0.4s\n"
      "fcvtn v20.4h, v31.4s\n"
      "str d17, [x27, #0x10]\n"
      "fcvtn v19.4h, v30.4s\n"
      "fcvtn v18.4h, v29.4s\n"
      "add x27, x27, #0x18\n"
      "str d16, [x26, #0]\n"
      "fcvtn v17.4h, v28.4s\n"
      "fcvtn v16.4h, v27.4s\n"
      "str d26, [x26, #0x8]\n"
      "str d25, [x26, #0x10]\n"
      "add x26, x26, #0x18\n"
      "str d24, [x25, #0]\n"
      "str d23, [x25, #0x8]\n"
      "str d22, [x25, #0x10]\n"
      "add x25, x25, #0x18\n"
      "str d21, [x24, #0]\n"
      "str d20, [x24, #0x8]\n"
      "str d19, [x24, #0x10]\n"
      "add x24, x24, #0x18\n"
      "str d18, [x23, #0]\n"
      "str d17, [x23, #0x8]\n"
      "str d16, [x23, #0x10]\n"
      "add x23, x23, #0x18\n"
      "bge 43b\n"
      "46:"  // Initial: Height 6: no full blocks
      "cbz x10, 49f\n"
      "mov x20, %x[in_ptr]\n"
      "47:"  // Initial: Height 6: Single loop
      "movi v22.16b, #0\n"
      "cbz %x[bias], 48f\n"
      "ldr h16, [x28, #0]\n"
      "fcvtl v22.4s, v16.4h\n"
      "48:"  // Initial: Height 6: Scalar: no bias
      "ldr s16, [%x[in_ptr], #0]\n"
      "ldr s20, [%x[in_ptr], #0x30]\n"
      "subs x10, x10, #0x1\n"
      "add x28, x28, #0x2\n"
      "ldr s19, [%x[in_ptr], #0x60]\n"
      "ldr s18, [%x[in_ptr], #0x90]\n"
      "ldr s17, [%x[in_ptr], #0xc0]\n"
      "ldr s21, [%x[in_ptr], #0xf0]\n"
      "add %x[in_ptr], %x[in_ptr], #0x4\n"
      "fadd v16.4s, v16.4s, v22.4s\n"
      "fadd v20.4s, v20.4s, v22.4s\n"
      "fadd v19.4s, v19.4s, v22.4s\n"
      "fadd v18.4s, v18.4s, v22.4s\n"
      "fadd v17.4s, v17.4s, v22.4s\n"
      "fadd v21.4s, v21.4s, v22.4s\n"
      "fmin v16.4s, v16.4s, v13.4s\n"
      "fmin v20.4s, v20.4s, v13.4s\n"
      "fmin v19.4s, v19.4s, v13.4s\n"
      "fmin v18.4s, v18.4s, v13.4s\n"
      "fmin v17.4s, v17.4s, v13.4s\n"
      "fmin v21.4s, v21.4s, v13.4s\n"
      "fmax v16.4s, v16.4s, v12.4s\n"
      "fmax v20.4s, v20.4s, v12.4s\n"
      "fmax v19.4s, v19.4s, v12.4s\n"
      "fmax v18.4s, v18.4s, v12.4s\n"
      "fmax v17.4s, v17.4s, v12.4s\n"
      "fmax v21.4s, v21.4s, v12.4s\n"
      "fcvtn v16.4h, v16.4s\n"
      "fcvtn v20.4h, v20.4s\n"
      "fcvtn v19.4h, v19.4s\n"
      "fcvtn v18.4h, v18.4s\n"
      "fcvtn v17.4h, v17.4s\n"
      "str h16, [x9, #0]\n"
      "add x9, x9, #0x2\n"
      "fcvtn v16.4h, v21.4s\n"
      "str h20, [x27, #0]\n"
      "add x27, x27, #0x2\n"
      "str h19, [x26, #0]\n"
      "add x26, x26, #0x2\n"
      "str h18, [x25, #0]\n"
      "add x25, x25, #0x2\n"
      "str h17, [x24, #0]\n"
      "add x24, x24, #0x2\n"
      "str h16, [x23, #0]\n"
      "add x23, x23, #0x2\n"
      "bne 47b\n"
      "add %x[in_ptr], x20, #0x180\n"
      "49:"  // Initial: Height 6: no oddments
      "b 108f\n"
      "50:"  // Initial: Height 7
      "mov x9, %x[out_ptr]\n"
      "mov x10, %x[cols]\n"
      "mov x28, %x[bias]\n"
      "add x27, x9, %x[ldout], LSL #1\n"
      "add x26, x27, %x[ldout], LSL #1\n"
      "add x25, x26, %x[ldout], LSL #1\n"
      "add x24, x25, %x[ldout], LSL #1\n"
      "cmp x10, #0xc\n"
      "add x23, x24, %x[ldout], LSL #1\n"
      "add x22, x23, %x[ldout], LSL #1\n"
      "blt 54f\n"
      "51:"  // Initial: Height 7: Block loop
      "cbnz %x[bias], 52f\n"
      "movi v7.16b, #0\n"
      "movi v6.16b, #0\n"
      "movi v5.16b, #0\n"
      "b 53f\n"
      "52:"  // Initial: Height 7: Width 3: bias
      "ldr d18, [x28, #0]\n"
      "ldr d17, [x28, #0x8]\n"
      "ldr d16, [x28, #0x10]\n"
      "fcvtl v7.4s, v18.4h\n"
      "fcvtl v6.4s, v17.4h\n"
      "fcvtl v5.4s, v16.4h\n"
      "53:"  // Initial: Height 7: Width 3: init done
      "ldr q18, [%x[in_ptr], #0]\n"
      "ldr q17, [%x[in_ptr], #0x10]\n"
      "sub x10, x10, #0xc\n"
      "add x28, x28, #0x18\n"
      "ldr q16, [%x[in_ptr], #0x20]\n"
      "ldr q21, [%x[in_ptr], #0x30]\n"
      "cmp x10, #0xc\n"
      "ldr q20, [%x[in_ptr], #0x40]\n"
      "ldr q19, [%x[in_ptr], #0x50]\n"
      "ldr q4, [%x[in_ptr], #0x60]\n"
      "ldr q3, [%x[in_ptr], #0x70]\n"
      "fadd v18.4s, v18.4s, v7.4s\n"
      "fadd v17.4s, v17.4s, v6.4s\n"
      "ldr q2, [%x[in_ptr], #0x80]\n"
      "ldr q27, [%x[in_ptr], #0x90]\n"
      "fadd v16.4s, v16.4s, v5.4s\n"
      "fadd v21.4s, v21.4s, v7.4s\n"
      "ldr q26, [%x[in_ptr], #0xa0]\n"
      "ldr q25, [%x[in_ptr], #0xb0]\n"
      "fadd v20.4s, v20.4s, v6.4s\n"
      "fadd v19.4s, v19.4s, v5.4s\n"
      "ldr q24, [%x[in_ptr], #0xc0]\n"
      "ldr q23, [%x[in_ptr], #0xd0]\n"
      "fadd v4.4s, v4.4s, v7.4s\n"
      "fadd v3.4s, v3.4s, v6.4s\n"
      "ldr q22, [%x[in_ptr], #0xe0]\n"
      "ldr q1, [%x[in_ptr], #0xf0]\n"
      "fadd v2.4s, v2.4s, v5.4s\n"
      "fadd v27.4s, v27.4s, v7.4s\n"
      "ldr q0, [%x[in_ptr], #0x100]\n"
      "ldr q31, [%x[in_ptr], #0x110]\n"
      "fadd v26.4s, v26.4s, v6.4s\n"
      "fadd v25.4s, v25.4s, v5.4s\n"
      "ldr q30, [%x[in_ptr], #0x120]\n"
      "ldr q29, [%x[in_ptr], #0x130]\n"
      "fadd v24.4s, v24.4s, v7.4s\n"
      "fadd v23.4s, v23.4s, v6.4s\n"
      "ldr q28, [%x[in_ptr], #0x140]\n"
      "fadd v22.4s, v22.4s, v5.4s\n"
      "fadd v1.4s, v1.4s, v7.4s\n"
      "add %x[in_ptr], %x[in_ptr], #0x180\n"
      "fadd v0.4s, v0.4s, v6.4s\n"
      "fadd v31.4s, v31.4s, v5.4s\n"
      "fadd v30.4s, v30.4s, v7.4s\n"
      "fadd v29.4s, v29.4s, v6.4s\n"
      "fadd v28.4s, v28.4s, v5.4s\n"
      "fmin v18.4s, v18.4s, v13.4s\n"
      "fmin v17.4s, v17.4s, v13.4s\n"
      "fmin v16.4s, v16.4s, v13.4s\n"
      "fmin v21.4s, v21.4s, v13.4s\n"
      "fmin v20.4s, v20.4s, v13.4s\n"
      "fmin v19.4s, v19.4s, v13.4s\n"
      "fmin v4.4s, v4.4s, v13.4s\n"
      "fmin v3.4s, v3.4s, v13.4s\n"
      "fmin v2.4s, v2.4s, v13.4s\n"
      "fmin v27.4s, v27.4s, v13.4s\n"
      "fmin v26.4s, v26.4s, v13.4s\n"
      "fmin v25.4s, v25.4s, v13.4s\n"
      "fmin v24.4s, v24.4s, v13.4s\n"
      "fmin v23.4s, v23.4s, v13.4s\n"
      "fmin v22.4s, v22.4s, v13.4s\n"
      "fmin v1.4s, v1.4s, v13.4s\n"
      "fmin v0.4s, v0.4s, v13.4s\n"
      "fmin v31.4s, v31.4s, v13.4s\n"
      "fmin v30.4s, v30.4s, v13.4s\n"
      "fmin v29.4s, v29.4s, v13.4s\n"
      "fmin v28.4s, v28.4s, v13.4s\n"
      "fmax v18.4s, v18.4s, v12.4s\n"
      "fmax v17.4s, v17.4s, v12.4s\n"
      "fmax v16.4s, v16.4s, v12.4s\n"
      "fmax v21.4s, v21.4s, v12.4s\n"
      "fmax v20.4s, v20.4s, v12.4s\n"
      "fmax v19.4s, v19.4s, v12.4s\n"
      "fmax v4.4s, v4.4s, v12.4s\n"
      "fmax v3.4s, v3.4s, v12.4s\n"
      "fmax v2.4s, v2.4s, v12.4s\n"
      "fmax v27.4s, v27.4s, v12.4s\n"
      "fmax v26.4s, v26.4s, v12.4s\n"
      "fmax v25.4s, v25.4s, v12.4s\n"
      "fmax v24.4s, v24.4s, v12.4s\n"
      "fmax v23.4s, v23.4s, v12.4s\n"
      "fmax v22.4s, v22.4s, v12.4s\n"
      "fmax v1.4s, v1.4s, v12.4s\n"
      "fmax v0.4s, v0.4s, v12.4s\n"
      "fmax v31.4s, v31.4s, v12.4s\n"
      "fmax v30.4s, v30.4s, v12.4s\n"
      "fmax v29.4s, v29.4s, v12.4s\n"
      "fmax v28.4s, v28.4s, v12.4s\n"
      "fcvtn v18.4h, v18.4s\n"
      "fcvtn v17.4h, v17.4s\n"
      "fcvtn v16.4h, v16.4s\n"
      "fcvtn v21.4h, v21.4s\n"
      "fcvtn v20.4h, v20.4s\n"
      "str d18, [x9, #0]\n"
      "fcvtn v19.4h, v19.4s\n"
      "fcvtn v18.4h, v4.4s\n"
      "str d17, [x9, #0x8]\n"
      "str d16, [x9, #0x10]\n"
      "fcvtn v17.4h, v3.4s\n"
      "fcvtn v16.4h, v2.4s\n"
      "add x9, x9, #0x18\n"
      "str d21, [x27, #0]\n"
      "fcvtn v27.4h, v27.4s\n"
      "fcvtn v26.4h, v26.4s\n"
      "str d20, [x27, #0x8]\n"
      "fcvtn v25.4h, v25.4s\n"
      "fcvtn v24.4h, v24.4s\n"
      "str d19, [x27, #0x10]\n"
      "fcvtn v23.4h, v23.4s\n"
      "fcvtn v22.4h, v22.4s\n"
      "add x27, x27, #0x18\n"
      "str d18, [x26, #0]\n"
      "fcvtn v21.4h, v1.4s\n"
      "fcvtn v20.4h, v0.4s\n"
      "str d17, [x26, #0x8]\n"
      "fcvtn v19.4h, v31.4s\n"
      "fcvtn v18.4h, v30.4s\n"
      "str d16, [x26, #0x10]\n"
      "fcvtn v17.4h, v29.4s\n"
      "fcvtn v16.4h, v28.4s\n"
      "add x26, x26, #0x18\n"
      "str d27, [x25, #0]\n"
      "str d26, [x25, #0x8]\n"
      "str d25, [x25, #0x10]\n"
      "add x25, x25, #0x18\n"
      "str d24, [x24, #0]\n"
      "str d23, [x24, #0x8]\n"
      "str d22, [x24, #0x10]\n"
      "add x24, x24, #0x18\n"
      "str d21, [x23, #0]\n"
      "str d20, [x23, #0x8]\n"
      "str d19, [x23, #0x10]\n"
      "add x23, x23, #0x18\n"
      "str d18, [x22, #0]\n"
      "str d17, [x22, #0x8]\n"
      "str d16, [x22, #0x10]\n"
      "add x22, x22, #0x18\n"
      "bge 51b\n"
      "54:"  // Initial: Height 7: no full blocks
      "cbz x10, 57f\n"
      "mov x20, %x[in_ptr]\n"
      "55:"  // Initial: Height 7: Single loop
      "movi v23.16b, #0\n"
      "cbz %x[bias], 56f\n"
      "ldr h16, [x28, #0]\n"
      "fcvtl v23.4s, v16.4h\n"
      "56:"  // Initial: Height 7: Scalar: no bias
      "ldr s16, [%x[in_ptr], #0]\n"
      "ldr s21, [%x[in_ptr], #0x30]\n"
      "subs x10, x10, #0x1\n"
      "add x28, x28, #0x2\n"
      "ldr s20, [%x[in_ptr], #0x60]\n"
      "ldr s19, [%x[in_ptr], #0x90]\n"
      "ldr s18, [%x[in_ptr], #0xc0]\n"
      "ldr s17, [%x[in_ptr], #0xf0]\n"
      "ldr s22, [%x[in_ptr], #0x120]\n"
      "fadd v16.4s, v16.4s, v23.4s\n"
      "fadd v21.4s, v21.4s, v23.4s\n"
      "add %x[in_ptr], %x[in_ptr], #0x4\n"
      "fadd v20.4s, v20.4s, v23.4s\n"
      "fadd v19.4s, v19.4s, v23.4s\n"
      "fadd v18.4s, v18.4s, v23.4s\n"
      "fadd v17.4s, v17.4s, v23.4s\n"
      "fadd v22.4s, v22.4s, v23.4s\n"
      "fmin v16.4s, v16.4s, v13.4s\n"
      "fmin v21.4s, v21.4s, v13.4s\n"
      "fmin v20.4s, v20.4s, v13.4s\n"
      "fmin v19.4s, v19.4s, v13.4s\n"
      "fmin v18.4s, v18.4s, v13.4s\n"
      "fmin v17.4s, v17.4s, v13.4s\n"
      "fmin v22.4s, v22.4s, v13.4s\n"
      "fmax v16.4s, v16.4s, v12.4s\n"
      "fmax v21.4s, v21.4s, v12.4s\n"
      "fmax v20.4s, v20.4s, v12.4s\n"
      "fmax v19.4s, v19.4s, v12.4s\n"
      "fmax v18.4s, v18.4s, v12.4s\n"
      "fmax v17.4s, v17.4s, v12.4s\n"
      "fmax v22.4s, v22.4s, v12.4s\n"
      "fcvtn v16.4h, v16.4s\n"
      "fcvtn v21.4h, v21.4s\n"
      "fcvtn v20.4h, v20.4s\n"
      "fcvtn v19.4h, v19.4s\n"
      "fcvtn v18.4h, v18.4s\n"
      "str h16, [x9, #0]\n"
      "add x9, x9, #0x2\n"
      "fcvtn v17.4h, v17.4s\n"
      "fcvtn v16.4h, v22.4s\n"
      "str h21, [x27, #0]\n"
      "add x27, x27, #0x2\n"
      "str h20, [x26, #0]\n"
      "add x26, x26, #0x2\n"
      "str h19, [x25, #0]\n"
      "add x25, x25, #0x2\n"
      "str h18, [x24, #0]\n"
      "add x24, x24, #0x2\n"
      "str h17, [x23, #0]\n"
      "add x23, x23, #0x2\n"
      "str h16, [x22, #0]\n"
      "add x22, x22, #0x2\n"
      "bne 55b\n"
      "add %x[in_ptr], x20, #0x180\n"
      "57:"  // Initial: Height 7: no oddments
      "b 108f\n"
      "58:"  // Initial: Height 8
      "mov x9, %x[out_ptr]\n"
      "mov x10, %x[cols]\n"
      "mov x28, %x[bias]\n"
      "add x27, x9, %x[ldout], LSL #1\n"
      "add x26, x27, %x[ldout], LSL #1\n"
      "add x25, x26, %x[ldout], LSL #1\n"
      "add x24, x25, %x[ldout], LSL #1\n"
      "cmp x10, #0xc\n"
      "add x23, x24, %x[ldout], LSL #1\n"
      "add x22, x23, %x[ldout], LSL #1\n"
      "add x21, x22, %x[ldout], LSL #1\n"
      "blt 62f\n"
      "59:"  // Initial: Height 8: Block loop
      "cbnz %x[bias], 60f\n"
      "movi v10.16b, #0\n"
      "movi v9.16b, #0\n"
      "movi v8.16b, #0\n"
      "b 61f\n"
      "60:"  // Initial: Height 8: Width 3: bias
      "ldr d18, [x28, #0]\n"
      "ldr d17, [x28, #0x8]\n"
      "ldr d16, [x28, #0x10]\n"
      "fcvtl v10.4s, v18.4h\n"
      "fcvtl v9.4s, v17.4h\n"
      "fcvtl v8.4s, v16.4h\n"
      "61:"  // Initial: Height 8: Width 3: init done
      "ldr q18, [%x[in_ptr], #0]\n"
      "ldr q17, [%x[in_ptr], #0x10]\n"
      "sub x10, x10, #0xc\n"
      "add x28, x28, #0x18\n"
      "ldr q16, [%x[in_ptr], #0x20]\n"
      "ldr q22, [%x[in_ptr], #0x30]\n"
      "cmp x10, #0xc\n"
      "ldr q21, [%x[in_ptr], #0x40]\n"
      "ldr q20, [%x[in_ptr], #0x50]\n"
      "ldr q19, [%x[in_ptr], #0x60]\n"
      "ldr q7, [%x[in_ptr], #0x70]\n"
      "fadd v18.4s, v18.4s, v10.4s\n"
      "fadd v17.4s, v17.4s, v9.4s\n"
      "ldr q6, [%x[in_ptr], #0x80]\n"
      "ldr q5, [%x[in_ptr], #0x90]\n"
      "fadd v16.4s, v16.4s, v8.4s\n"
      "fadd v22.4s, v22.4s, v10.4s\n"
      "ldr q29, [%x[in_ptr], #0xa0]\n"
      "ldr q28, [%x[in_ptr], #0xb0]\n"
      "fadd v21.4s, v21.4s, v9.4s\n"
      "fadd v20.4s, v20.4s, v8.4s\n"
      "ldr q27, [%x[in_ptr], #0xc0]\n"
      "ldr q26, [%x[in_ptr], #0xd0]\n"
      "fadd v19.4s, v19.4s, v10.4s\n"
      "fadd v7.4s, v7.4s, v9.4s\n"
      "ldr q25, [%x[in_ptr], #0xe0]\n"
      "ldr q24, [%x[in_ptr], #0xf0]\n"
      "fadd v6.4s, v6.4s, v8.4s\n"
      "fadd v5.4s, v5.4s, v10.4s\n"
      "ldr q23, [%x[in_ptr], #0x100]\n"
      "ldr q4, [%x[in_ptr], #0x110]\n"
      "fadd v29.4s, v29.4s, v9.4s\n"
      "fadd v28.4s, v28.4s, v8.4s\n"
      "ldr q3, [%x[in_ptr], #0x120]\n"
      "ldr q2, [%x[in_ptr], #0x130]\n"
      "fadd v27.4s, v27.4s, v10.4s\n"
      "fadd v26.4s, v26.4s, v9.4s\n"
      "ldr q1, [%x[in_ptr], #0x140]\n"
      "ldr q0, [%x[in_ptr], #0x150]\n"
      "fadd v25.4s, v25.4s, v8.4s\n"
      "fadd v24.4s, v24.4s, v10.4s\n"
      "ldr q31, [%x[in_ptr], #0x160]\n"
      "ldr q30, [%x[in_ptr], #0x170]\n"
      "fadd v23.4s, v23.4s, v9.4s\n"
      "fadd v4.4s, v4.4s, v8.4s\n"
      "fadd v3.4s, v3.4s, v10.4s\n"
      "fadd v2.4s, v2.4s, v9.4s\n"
      "add %x[in_ptr], %x[in_ptr], #0x180\n"
      "fadd v1.4s, v1.4s, v8.4s\n"
      "fadd v0.4s, v0.4s, v10.4s\n"
      "fadd v31.4s, v31.4s, v9.4s\n"
      "fadd v30.4s, v30.4s, v8.4s\n"
      "fmin v18.4s, v18.4s, v13.4s\n"
      "fmin v17.4s, v17.4s, v13.4s\n"
      "fmin v16.4s, v16.4s, v13.4s\n"
      "fmin v22.4s, v22.4s, v13.4s\n"
      "fmin v21.4s, v21.4s, v13.4s\n"
      "fmin v20.4s, v20.4s, v13.4s\n"
      "fmin v19.4s, v19.4s, v13.4s\n"
      "fmin v7.4s, v7.4s, v13.4s\n"
      "fmin v6.4s, v6.4s, v13.4s\n"
      "fmin v5.4s, v5.4s, v13.4s\n"
      "fmin v29.4s, v29.4s, v13.4s\n"
      "fmin v28.4s, v28.4s, v13.4s\n"
      "fmin v27.4s, v27.4s, v13.4s\n"
      "fmin v26.4s, v26.4s, v13.4s\n"
      "fmin v25.4s, v25.4s, v13.4s\n"
      "fmin v24.4s, v24.4s, v13.4s\n"
      "fmin v23.4s, v23.4s, v13.4s\n"
      "fmin v4.4s, v4.4s, v13.4s\n"
      "fmin v3.4s, v3.4s, v13.4s\n"
      "fmin v2.4s, v2.4s, v13.4s\n"
      "fmin v1.4s, v1.4s, v13.4s\n"
      "fmin v0.4s, v0.4s, v13.4s\n"
      "fmin v31.4s, v31.4s, v13.4s\n"
      "fmin v30.4s, v30.4s, v13.4s\n"
      "fmax v18.4s, v18.4s, v12.4s\n"
      "fmax v17.4s, v17.4s, v12.4s\n"
      "fmax v16.4s, v16.4s, v12.4s\n"
      "fmax v22.4s, v22.4s, v12.4s\n"
      "fmax v21.4s, v21.4s, v12.4s\n"
      "fmax v20.4s, v20.4s, v12.4s\n"
      "fmax v19.4s, v19.4s, v12.4s\n"
      "fmax v7.4s, v7.4s, v12.4s\n"
      "fmax v6.4s, v6.4s, v12.4s\n"
      "fmax v5.4s, v5.4s, v12.4s\n"
      "fmax v29.4s, v29.4s, v12.4s\n"
      "fmax v28.4s, v28.4s, v12.4s\n"
      "fmax v27.4s, v27.4s, v12.4s\n"
      "fmax v26.4s, v26.4s, v12.4s\n"
      "fmax v25.4s, v25.4s, v12.4s\n"
      "fmax v24.4s, v24.4s, v12.4s\n"
      "fmax v23.4s, v23.4s, v12.4s\n"
      "fmax v4.4s, v4.4s, v12.4s\n"
      "fmax v3.4s, v3.4s, v12.4s\n"
      "fmax v2.4s, v2.4s, v12.4s\n"
      "fmax v1.4s, v1.4s, v12.4s\n"
      "fmax v0.4s, v0.4s, v12.4s\n"
      "fmax v31.4s, v31.4s, v12.4s\n"
      "fmax v30.4s, v30.4s, v12.4s\n"
      "fcvtn v18.4h, v18.4s\n"
      "fcvtn v17.4h, v17.4s\n"
      "fcvtn v16.4h, v16.4s\n"
      "fcvtn v22.4h, v22.4s\n"
      "fcvtn v21.4h, v21.4s\n"
      "fcvtn v20.4h, v20.4s\n"
      "str d18, [x9, #0]\n"
      "str d17, [x9, #0x8]\n"
      "fcvtn v19.4h, v19.4s\n"
      "fcvtn v18.4h, v7.4s\n"
      "str d16, [x9, #0x10]\n"
      "fcvtn v17.4h, v6.4s\n"
      "fcvtn v16.4h, v5.4s\n"
      "add x9, x9, #0x18\n"
      "str d22, [x27, #0]\n"
      "fcvtn v29.4h, v29.4s\n"
      "fcvtn v28.4h, v28.4s\n"
      "str d21, [x27, #0x8]\n"
      "fcvtn v27.4h, v27.4s\n"
      "fcvtn v26.4h, v26.4s\n"
      "str d20, [x27, #0x10]\n"
      "fcvtn v25.4h, v25.4s\n"
      "fcvtn v24.4h, v24.4s\n"
      "add x27, x27, #0x18\n"
      "str d19, [x26, #0]\n"
      "fcvtn v23.4h, v23.4s\n"
      "fcvtn v22.4h, v4.4s\n"
      "str d18, [x26, #0x8]\n"
      "fcvtn v21.4h, v3.4s\n"
      "fcvtn v20.4h, v2.4s\n"
      "str d17, [x26, #0x10]\n"
      "fcvtn v19.4h, v1.4s\n"
      "fcvtn v18.4h, v0.4s\n"
      "add x26, x26, #0x18\n"
      "str d16, [x25, #0]\n"
      "fcvtn v17.4h, v31.4s\n"
      "fcvtn v16.4h, v30.4s\n"
      "str d29, [x25, #0x8]\n"
      "str d28, [x25, #0x10]\n"
      "add x25, x25, #0x18\n"
      "str d27, [x24, #0]\n"
      "str d26, [x24, #0x8]\n"
      "str d25, [x24, #0x10]\n"
      "add x24, x24, #0x18\n"
      "str d24, [x23, #0]\n"
      "str d23, [x23, #0x8]\n"
      "str d22, [x23, #0x10]\n"
      "add x23, x23, #0x18\n"
      "str d21, [x22, #0]\n"
      "str d20, [x22, #0x8]\n"
      "str d19, [x22, #0x10]\n"
      "add x22, x22, #0x18\n"
      "str d18, [x21, #0]\n"
      "str d17, [x21, #0x8]\n"
      "str d16, [x21, #0x10]\n"
      "add x21, x21, #0x18\n"
      "bge 59b\n"
      "62:"  // Initial: Height 8: no full blocks
      "cbz x10, 65f\n"
      "mov x20, %x[in_ptr]\n"
      "63:"  // Initial: Height 8: Single loop
      "movi v24.16b, #0\n"
      "cbz %x[bias], 64f\n"
      "ldr h16, [x28, #0]\n"
      "fcvtl v24.4s, v16.4h\n"
      "64:"  // Initial: Height 8: Scalar: no bias
      "ldr s17, [%x[in_ptr], #0]\n"
      "ldr s16, [%x[in_ptr], #0x30]\n"
      "subs x10, x10, #0x1\n"
      "add x28, x28, #0x2\n"
      "ldr s21, [%x[in_ptr], #0x60]\n"
      "ldr s20, [%x[in_ptr], #0x90]\n"
      "ldr s19, [%x[in_ptr], #0xc0]\n"
      "ldr s18, [%x[in_ptr], #0xf0]\n"
      "ldr s23, [%x[in_ptr], #0x120]\n"
      "ldr s22, [%x[in_ptr], #0x150]\n"
      "fadd v17.4s, v17.4s, v24.4s\n"
      "fadd v16.4s, v16.4s, v24.4s\n"
      "fadd v21.4s, v21.4s, v24.4s\n"
      "fadd v20.4s, v20.4s, v24.4s\n"
      "add %x[in_ptr], %x[in_ptr], #0x4\n"
      "fadd v19.4s, v19.4s, v24.4s\n"
      "fadd v18.4s, v18.4s, v24.4s\n"
      "fadd v23.4s, v23.4s, v24.4s\n"
      "fadd v22.4s, v22.4s, v24.4s\n"
      "fmin v17.4s, v17.4s, v13.4s\n"
      "fmin v16.4s, v16.4s, v13.4s\n"
      "fmin v21.4s, v21.4s, v13.4s\n"
      "fmin v20.4s, v20.4s, v13.4s\n"
      "fmin v19.4s, v19.4s, v13.4s\n"
      "fmin v18.4s, v18.4s, v13.4s\n"
      "fmin v23.4s, v23.4s, v13.4s\n"
      "fmin v22.4s, v22.4s, v13.4s\n"
      "fmax v17.4s, v17.4s, v12.4s\n"
      "fmax v16.4s, v16.4s, v12.4s\n"
      "fmax v21.4s, v21.4s, v12.4s\n"
      "fmax v20.4s, v20.4s, v12.4s\n"
      "fmax v19.4s, v19.4s, v12.4s\n"
      "fmax v18.4s, v18.4s, v12.4s\n"
      "fmax v23.4s, v23.4s, v12.4s\n"
      "fmax v22.4s, v22.4s, v12.4s\n"
      "fcvtn v17.4h, v17.4s\n"
      "fcvtn v16.4h, v16.4s\n"
      "fcvtn v21.4h, v21.4s\n"
      "fcvtn v20.4h, v20.4s\n"
      "fcvtn v19.4h, v19.4s\n"
      "fcvtn v18.4h, v18.4s\n"
      "str h17, [x9, #0]\n"
      "add x9, x9, #0x2\n"
      "str h16, [x27, #0]\n"
      "fcvtn v17.4h, v23.4s\n"
      "fcvtn v16.4h, v22.4s\n"
      "add x27, x27, #0x2\n"
      "str h21, [x26, #0]\n"
      "add x26, x26, #0x2\n"
      "str h20, [x25, #0]\n"
      "add x25, x25, #0x2\n"
      "str h19, [x24, #0]\n"
      "add x24, x24, #0x2\n"
      "str h18, [x23, #0]\n"
      "add x23, x23, #0x2\n"
      "str h17, [x22, #0]\n"
      "add x22, x22, #0x2\n"
      "str h16, [x21, #0]\n"
      "add x21, x21, #0x2\n"
      "bne 63b\n"
      "add %x[in_ptr], x20, #0x180\n"
      "65:"  // Initial: Height 8: no oddments
      "subs %x[rows], %x[rows], #0x8\n"
      "add %x[out_ptr], %x[out_ptr], x11\n"
      "bgt 1b\n"
      "b 108f\n"
      "66:"  // Accumulate
      "67:"  // Accumulate: Row loop
      "cmp %x[rows], #0x7\n"
      "bgt 103f\n"
      "beq 98f\n"
      "cmp %x[rows], #0x5\n"
      "bgt 93f\n"
      "beq 88f\n"
      "cmp %x[rows], #0x3\n"
      "bgt 83f\n"
      "beq 78f\n"
      "cmp %x[rows], #0x1\n"
      "bgt 73f\n"
      "mov x10, %x[cols]\n"
      "mov x9, %x[out_ptr]\n"
      "cmp x10, #0xc\n"
      "blt 70f\n"
      "69:"  // Accumulate: Height 1: Block loop
      "ldr d16, [x9, #0]\n"
      "ldr q19, [%x[in_ptr], #0]\n"
      "sub x10, x10, #0xc\n"
      "ldr q18, [%x[in_ptr], #0x10]\n"
      "ldr q17, [%x[in_ptr], #0x20]\n"
      "cmp x10, #0xc\n"
      "add %x[in_ptr], %x[in_ptr], #0x180\n"
      "fcvtl v16.4s, v16.4h\n"
      "fadd v19.4s, v19.4s, v16.4s\n"
      "fmin v19.4s, v19.4s, v13.4s\n"
      "fmax v19.4s, v19.4s, v12.4s\n"
      "fcvtn v16.4h, v19.4s\n"
      "str d16, [x9, #0]\n"
      "ldr d16, [x9, #0x8]\n"
      "fcvtl v16.4s, v16.4h\n"
      "fadd v18.4s, v18.4s, v16.4s\n"
      "fmin v18.4s, v18.4s, v13.4s\n"
      "fmax v18.4s, v18.4s, v12.4s\n"
      "fcvtn v16.4h, v18.4s\n"
      "str d16, [x9, #0x8]\n"
      "ldr d16, [x9, #0x10]\n"
      "fcvtl v16.4s, v16.4h\n"
      "fadd v17.4s, v17.4s, v16.4s\n"
      "fmin v17.4s, v17.4s, v13.4s\n"
      "fmax v17.4s, v17.4s, v12.4s\n"
      "fcvtn v16.4h, v17.4s\n"
      "str d16, [x9, #0x10]\n"
      "add x9, x9, #0x18\n"
      "bge 69b\n"
      "70:"  // Accumulate: Height 1: no full blocks
      "cbz x10, 72f\n"
      "mov x20, %x[in_ptr]\n"
      "71:"  // Accumulate: Height 1: Single loop
      "ldr h16, [x9, #0]\n"
      "ldr s17, [%x[in_ptr], #0]\n"
      "subs x10, x10, #0x1\n"
      "add %x[in_ptr], %x[in_ptr], #0x4\n"
      "fcvtl v16.4s, v16.4h\n"
      "fadd v17.4s, v17.4s, v16.4s\n"
      "fmin v17.4s, v17.4s, v13.4s\n"
      "fmax v17.4s, v17.4s, v12.4s\n"
      "fcvtn v16.4h, v17.4s\n"
      "str h16, [x9, #0]\n"
      "add x9, x9, #0x2\n"
      "bne 71b\n"
      "add %x[in_ptr], x20, #0x180\n"
      "72:"  // Accumulate: Height 1: no oddments
      "b 108f\n"
      "73:"  // Accumulate: Height 2
      "mov x10, %x[cols]\n"
      "mov x9, %x[out_ptr]\n"
      "cmp x10, #0xc\n"
      "add x27, x9, %x[ldout], LSL #1\n"
      "blt 75f\n"
      "74:"  // Accumulate: Height 2: Block loop
      "ldr d17, [x9, #0]\n"
      "ldr d16, [x27, #0]\n"
      "sub x10, x10, #0xc\n"
      "ldr q23, [%x[in_ptr], #0]\n"
      "ldr q22, [%x[in_ptr], #0x30]\n"
      "cmp x10, #0xc\n"
      "ldr q21, [%x[in_ptr], #0x10]\n"
      "ldr q20, [%x[in_ptr], #0x40]\n"
      "ldr q19, [%x[in_ptr], #0x20]\n"
      "ldr q18, [%x[in_ptr], #0x50]\n"
      "fcvtl v17.4s, v17.4h\n"
      "fcvtl v16.4s, v16.4h\n"
      "add %x[in_ptr], %x[in_ptr], #0x180\n"
      "fadd v23.4s, v23.4s, v17.4s\n"
      "fadd v22.4s, v22.4s, v16.4s\n"
      "fmin v23.4s, v23.4s, v13.4s\n"
      "fmin v22.4s, v22.4s, v13.4s\n"
      "fmax v23.4s, v23.4s, v12.4s\n"
      "fmax v22.4s, v22.4s, v12.4s\n"
      "fcvtn v16.4h, v23.4s\n"
      "fcvtn v17.4h, v22.4s\n"
      "str d16, [x9, #0]\n"
      "ldr d16, [x9, #0x8]\n"
      "str d17, [x27, #0]\n"
      "fcvtl v17.4s, v16.4h\n"
      "ldr d16, [x27, #0x8]\n"
      "fcvtl v16.4s, v16.4h\n"
      "fadd v21.4s, v21.4s, v17.4s\n"
      "fadd v20.4s, v20.4s, v16.4s\n"
      "fmin v21.4s, v21.4s, v13.4s\n"
      "fmin v20.4s, v20.4s, v13.4s\n"
      "fmax v21.4s, v21.4s, v12.4s\n"
      "fmax v20.4s, v20.4s, v12.4s\n"
      "fcvtn v16.4h, v21.4s\n"
      "fcvtn v17.4h, v20.4s\n"
      "str d16, [x9, #0x8]\n"
      "ldr d16, [x9, #0x10]\n"
      "str d17, [x27, #0x8]\n"
      "fcvtl v17.4s, v16.4h\n"
      "ldr d16, [x27, #0x10]\n"
      "fcvtl v16.4s, v16.4h\n"
      "fadd v19.4s, v19.4s, v17.4s\n"
      "fadd v18.4s, v18.4s, v16.4s\n"
      "fmin v19.4s, v19.4s, v13.4s\n"
      "fmin v18.4s, v18.4s, v13.4s\n"
      "fmax v19.4s, v19.4s, v12.4s\n"
      "fmax v18.4s, v18.4s, v12.4s\n"
      "fcvtn v17.4h, v19.4s\n"
      "fcvtn v16.4h, v18.4s\n"
      "str d17, [x9, #0x10]\n"
      "add x9, x9, #0x18\n"
      "str d16, [x27, #0x10]\n"
      "add x27, x27, #0x18\n"
      "bge 74b\n"
      "75:"  // Accumulate: Height 2: no full blocks
      "cbz x10, 77f\n"
      "mov x20, %x[in_ptr]\n"
      "76:"  // Accumulate: Height 2: Single loop
      "ldr h17, [x9, #0]\n"
      "ldr h16, [x27, #0]\n"
      "subs x10, x10, #0x1\n"
      "ldr s19, [%x[in_ptr], #0]\n"
      "ldr s18, [%x[in_ptr], #0x30]\n"
      "add %x[in_ptr], %x[in_ptr], #0x4\n"
      "fcvtl v17.4s, v17.4h\n"
      "fcvtl v16.4s, v16.4h\n"
      "fadd v19.4s, v19.4s, v17.4s\n"
      "fadd v18.4s, v18.4s, v16.4s\n"
      "fmin v19.4s, v19.4s, v13.4s\n"
      "fmin v18.4s, v18.4s, v13.4s\n"
      "fmax v19.4s, v19.4s, v12.4s\n"
      "fmax v18.4s, v18.4s, v12.4s\n"
      "fcvtn v16.4h, v19.4s\n"
      "str h16, [x9, #0]\n"
      "add x9, x9, #0x2\n"
      "fcvtn v16.4h, v18.4s\n"
      "str h16, [x27, #0]\n"
      "add x27, x27, #0x2\n"
      "bne 76b\n"
      "add %x[in_ptr], x20, #0x180\n"
      "77:"  // Accumulate: Height 2: no oddments
      "b 108f\n"
      "78:"  // Accumulate: Height 3
      "mov x10, %x[cols]\n"
      "mov x9, %x[out_ptr]\n"
      "add x27, x9, %x[ldout], LSL #1\n"
      "add x26, x27, %x[ldout], LSL #1\n"
      "cmp x10, #0xc\n"
      "blt 80f\n"
      "79:"  // Accumulate: Height 3: Block loop
      "ldr d18, [x9, #0]\n"
      "ldr d17, [x27, #0]\n"
      "sub x10, x10, #0xc\n"
      "ldr d16, [x26, #0]\n"
      "ldr q27, [%x[in_ptr], #0]\n"
      "cmp x10, #0xc\n"
      "ldr q26, [%x[in_ptr], #0x30]\n"
      "ldr q25, [%x[in_ptr], #0x60]\n"
      "ldr q24, [%x[in_ptr], #0x10]\n"
      "ldr q23, [%x[in_ptr], #0x40]\n"
      "fcvtl v18.4s, v18.4h\n"
      "fcvtl v17.4s, v17.4h\n"
      "ldr q22, [%x[in_ptr], #0x70]\n"
      "ldr q21, [%x[in_ptr], #0x20]\n"
      "fcvtl v16.4s, v16.4h\n"
      "ldr q20, [%x[in_ptr], #0x50]\n"
      "ldr q19, [%x[in_ptr], #0x80]\n"
      "add %x[in_ptr], %x[in_ptr], #0x180\n"
      "fadd v27.4s, v27.4s, v18.4s\n"
      "fadd v26.4s, v26.4s, v17.4s\n"
      "fadd v25.4s, v25.4s, v16.4s\n"
      "fmin v27.4s, v27.4s, v13.4s\n"
      "fmin v26.4s, v26.4s, v13.4s\n"
      "fmin v25.4s, v25.4s, v13.4s\n"
      "fmax v27.4s, v27.4s, v12.4s\n"
      "fmax v26.4s, v26.4s, v12.4s\n"
      "fmax v25.4s, v25.4s, v12.4s\n"
      "fcvtn v18.4h, v27.4s\n"
      "fcvtn v16.4h, v26.4s\n"
      "fcvtn v17.4h, v25.4s\n"
      "str d18, [x9, #0]\n"
      "str d16, [x27, #0]\n"
      "ldr d16, [x9, #0x8]\n"
      "str d17, [x26, #0]\n"
      "ldr d17, [x27, #0x8]\n"
      "fcvtl v18.4s, v16.4h\n"
      "ldr d16, [x26, #0x8]\n"
      "fcvtl v17.4s, v17.4h\n"
      "fcvtl v16.4s, v16.4h\n"
      "fadd v24.4s, v24.4s, v18.4s\n"
      "fadd v23.4s, v23.4s, v17.4s\n"
      "fadd v22.4s, v22.4s, v16.4s\n"
      "fmin v24.4s, v24.4s, v13.4s\n"
      "fmin v23.4s, v23.4s, v13.4s\n"
      "fmin v22.4s, v22.4s, v13.4s\n"
      "fmax v24.4s, v24.4s, v12.4s\n"
      "fmax v23.4s, v23.4s, v12.4s\n"
      "fmax v22.4s, v22.4s, v12.4s\n"
      "fcvtn v16.4h, v24.4s\n"
      "fcvtn v18.4h, v23.4s\n"
      "str d16, [x9, #0x8]\n"
      "fcvtn v17.4h, v22.4s\n"
      "ldr d16, [x9, #0x10]\n"
      "str d18, [x27, #0x8]\n"
      "str d17, [x26, #0x8]\n"
      "fcvtl v18.4s, v16.4h\n"
      "ldr d17, [x27, #0x10]\n"
      "ldr d16, [x26, #0x10]\n"
      "fcvtl v17.4s, v17.4h\n"
      "fcvtl v16.4s, v16.4h\n"
      "fadd v21.4s, v21.4s, v18.4s\n"
      "fadd v20.4s, v20.4s, v17.4s\n"
      "fadd v19.4s, v19.4s, v16.4s\n"
      "fmin v21.4s, v21.4s, v13.4s\n"
      "fmin v20.4s, v20.4s, v13.4s\n"
      "fmin v19.4s, v19.4s, v13.4s\n"
      "fmax v21.4s, v21.4s, v12.4s\n"
      "fmax v20.4s, v20.4s, v12.4s\n"
      "fmax v19.4s, v19.4s, v12.4s\n"
      "fcvtn v16.4h, v21.4s\n"
      "fcvtn v17.4h, v20.4s\n"
      "str d16, [x9, #0x10]\n"
      "add x9, x9, #0x18\n"
      "fcvtn v16.4h, v19.4s\n"
      "str d17, [x27, #0x10]\n"
      "add x27, x27, #0x18\n"
      "str d16, [x26, #0x10]\n"
      "add x26, x26, #0x18\n"
      "bge 79b\n"
      "80:"  // Accumulate: Height 3: no full blocks
      "cbz x10, 82f\n"
      "mov x20, %x[in_ptr]\n"
      "81:"  // Accumulate: Height 3: Single loop
      "ldr h18, [x9, #0]\n"
      "ldr h17, [x27, #0]\n"
      "subs x10, x10, #0x1\n"
      "ldr h16, [x26, #0]\n"
      "ldr s21, [%x[in_ptr], #0]\n"
      "ldr s20, [%x[in_ptr], #0x30]\n"
      "ldr s19, [%x[in_ptr], #0x60]\n"
      "add %x[in_ptr], %x[in_ptr], #0x4\n"
      "fcvtl v18.4s, v18.4h\n"
      "fcvtl v17.4s, v17.4h\n"
      "fcvtl v16.4s, v16.4h\n"
      "fadd v21.4s, v21.4s, v18.4s\n"
      "fadd v20.4s, v20.4s, v17.4s\n"
      "fadd v19.4s, v19.4s, v16.4s\n"
      "fmin v21.4s, v21.4s, v13.4s\n"
      "fmin v20.4s, v20.4s, v13.4s\n"
      "fmin v19.4s, v19.4s, v13.4s\n"
      "fmax v21.4s, v21.4s, v12.4s\n"
      "fmax v20.4s, v20.4s, v12.4s\n"
      "fmax v19.4s, v19.4s, v12.4s\n"
      "fcvtn v16.4h, v21.4s\n"
      "fcvtn v17.4h, v20.4s\n"
      "str h16, [x9, #0]\n"
      "add x9, x9, #0x2\n"
      "fcvtn v16.4h, v19.4s\n"
      "str h17, [x27, #0]\n"
      "add x27, x27, #0x2\n"
      "str h16, [x26, #0]\n"
      "add x26, x26, #0x2\n"
      "bne 81b\n"
      "add %x[in_ptr], x20, #0x180\n"
      "82:"  // Accumulate: Height 3: no oddments
      "b 108f\n"
      "83:"  // Accumulate: Height 4
      "mov x9, %x[out_ptr]\n"
      "mov x10, %x[cols]\n"
      "add x27, x9, %x[ldout], LSL #1\n"
      "add x26, x27, %x[ldout], LSL #1\n"
      "add x25, x26, %x[ldout], LSL #1\n"
      "cmp x10, #0xc\n"
      "blt 85f\n"
      "84:"  // Accumulate: Height 4: Block loop
      "ldr d19, [x9, #0]\n"
      "ldr d18, [x27, #0]\n"
      "sub x10, x10, #0xc\n"
      "ldr d17, [x26, #0]\n"
      "ldr d16, [x25, #0]\n"
      "cmp x10, #0xc\n"
      "ldr q31, [%x[in_ptr], #0]\n"
      "ldr q30, [%x[in_ptr], #0x30]\n"
      "ldr q29, [%x[in_ptr], #0x60]\n"
      "ldr q28, [%x[in_ptr], #0x90]\n"
      "fcvtl v19.4s, v19.4h\n"
      "fcvtl v18.4s, v18.4h\n"
      "ldr q27, [%x[in_ptr], #0x10]\n"
      "ldr q26, [%x[in_ptr], #0x40]\n"
      "fcvtl v17.4s, v17.4h\n"
      "fcvtl v16.4s, v16.4h\n"
      "ldr q25, [%x[in_ptr], #0x70]\n"
      "ldr q24, [%x[in_ptr], #0xa0]\n"
      "ldr q23, [%x[in_ptr], #0x20]\n"
      "ldr q22, [%x[in_ptr], #0x50]\n"
      "fadd v31.4s, v31.4s, v19.4s\n"
      "fadd v30.4s, v30.4s, v18.4s\n"
      "ldr q21, [%x[in_ptr], #0x80]\n"
      "ldr q20, [%x[in_ptr], #0xb0]\n"
      "fadd v29.4s, v29.4s, v17.4s\n"
      "fadd v28.4s, v28.4s, v16.4s\n"
      "add %x[in_ptr], %x[in_ptr], #0x180\n"
      "fmin v31.4s, v31.4s, v13.4s\n"
      "fmin v30.4s, v30.4s, v13.4s\n"
      "fmin v29.4s, v29.4s, v13.4s\n"
      "fmin v28.4s, v28.4s, v13.4s\n"
      "fmax v31.4s, v31.4s, v12.4s\n"
      "fmax v30.4s, v30.4s, v12.4s\n"
      "fmax v29.4s, v29.4s, v12.4s\n"
      "fmax v28.4s, v28.4s, v12.4s\n"
      "fcvtn v19.4h, v31.4s\n"
      "fcvtn v16.4h, v30.4s\n"
      "fcvtn v18.4h, v29.4s\n"
      "fcvtn v17.4h, v28.4s\n"
      "str d19, [x9, #0]\n"
      "str d16, [x27, #0]\n"
      "ldr d16, [x9, #0x8]\n"
      "str d18, [x26, #0]\n"
      "str d17, [x25, #0]\n"
      "ldr d18, [x27, #0x8]\n"
      "fcvtl v19.4s, v16.4h\n"
      "ldr d17, [x26, #0x8]\n"
      "ldr d16, [x25, #0x8]\n"
      "fcvtl v18.4s, v18.4h\n"
      "fcvtl v17.4s, v17.4h\n"
      "fadd v27.4s, v27.4s, v19.4s\n"
      "fcvtl v16.4s, v16.4h\n"
      "fadd v26.4s, v26.4s, v18.4s\n"
      "fadd v25.4s, v25.4s, v17.4s\n"
      "fadd v24.4s, v24.4s, v16.4s\n"
      "fmin v27.4s, v27.4s, v13.4s\n"
      "fmin v26.4s, v26.4s, v13.4s\n"
      "fmin v25.4s, v25.4s, v13.4s\n"
      "fmin v24.4s, v24.4s, v13.4s\n"
      "fmax v27.4s, v27.4s, v12.4s\n"
      "fmax v26.4s, v26.4s, v12.4s\n"
      "fmax v25.4s, v25.4s, v12.4s\n"
      "fmax v24.4s, v24.4s, v12.4s\n"
      "fcvtn v17.4h, v27.4s\n"
      "fcvtn v19.4h, v26.4s\n"
      "fcvtn v16.4h, v25.4s\n"
      "str d17, [x9, #0x8]\n"
      "fcvtn v18.4h, v24.4s\n"
      "ldr d17, [x9, #0x10]\n"
      "str d19, [x27, #0x8]\n"
      "str d16, [x26, #0x8]\n"
      "ldr d16, [x27, #0x10]\n"
      "str d18, [x25, #0x8]\n"
      "fcvtl v19.4s, v17.4h\n"
      "ldr d17, [x26, #0x10]\n"
      "fcvtl v18.4s, v16.4h\n"
      "ldr d16, [x25, #0x10]\n"
      "fcvtl v17.4s, v17.4h\n"
      "fadd v23.4s, v23.4s, v19.4s\n"
      "fadd v22.4s, v22.4s, v18.4s\n"
      "fcvtl v16.4s, v16.4h\n"
      "fadd v21.4s, v21.4s, v17.4s\n"
      "fmin v23.4s, v23.4s, v13.4s\n"
      "fadd v20.4s, v20.4s, v16.4s\n"
      "fmin v22.4s, v22.4s, v13.4s\n"
      "fmin v21.4s, v21.4s, v13.4s\n"
      "fmax v23.4s, v23.4s, v12.4s\n"
      "fmin v20.4s, v20.4s, v13.4s\n"
      "fmax v22.4s, v22.4s, v12.4s\n"
      "fmax v21.4s, v21.4s, v12.4s\n"
      "fcvtn v17.4h, v23.4s\n"
      "fmax v20.4s, v20.4s, v12.4s\n"
      "fcvtn v16.4h, v22.4s\n"
      "str d17, [x9, #0x10]\n"
      "add x9, x9, #0x18\n"
      "fcvtn v17.4h, v21.4s\n"
      "str d16, [x27, #0x10]\n"
      "add x27, x27, #0x18\n"
      "fcvtn v16.4h, v20.4s\n"
      "str d17, [x26, #0x10]\n"
      "add x26, x26, #0x18\n"
      "str d16, [x25, #0x10]\n"
      "add x25, x25, #0x18\n"
      "bge 84b\n"
      "85:"  // Accumulate: Height 4: no full blocks
      "cbz x10, 87f\n"
      "mov x20, %x[in_ptr]\n"
      "86:"  // Accumulate: Height 4: Single loop
      "ldr h19, [x9, #0]\n"
      "ldr h18, [x27, #0]\n"
      "subs x10, x10, #0x1\n"
      "ldr h17, [x26, #0]\n"
      "ldr h16, [x25, #0]\n"
      "ldr s23, [%x[in_ptr], #0]\n"
      "ldr s22, [%x[in_ptr], #0x30]\n"
      "ldr s21, [%x[in_ptr], #0x60]\n"
      "ldr s20, [%x[in_ptr], #0x90]\n"
      "fcvtl v19.4s, v19.4h\n"
      "fcvtl v18.4s, v18.4h\n"
      "fcvtl v17.4s, v17.4h\n"
      "fcvtl v16.4s, v16.4h\n"
      "add %x[in_ptr], %x[in_ptr], #0x4\n"
      "fadd v23.4s, v23.4s, v19.4s\n"
      "fadd v22.4s, v22.4s, v18.4s\n"
      "fadd v21.4s, v21.4s, v17.4s\n"
      "fadd v20.4s, v20.4s, v16.4s\n"
      "fmin v23.4s, v23.4s, v13.4s\n"
      "fmin v22.4s, v22.4s, v13.4s\n"
      "fmin v21.4s, v21.4s, v13.4s\n"
      "fmin v20.4s, v20.4s, v13.4s\n"
      "fmax v23.4s, v23.4s, v12.4s\n"
      "fmax v22.4s, v22.4s, v12.4s\n"
      "fmax v21.4s, v21.4s, v12.4s\n"
      "fmax v20.4s, v20.4s, v12.4s\n"
      "fcvtn v19.4h, v23.4s\n"
      "fcvtn v18.4h, v22.4s\n"
      "fcvtn v17.4h, v21.4s\n"
      "fcvtn v16.4h, v20.4s\n"
      "str h19, [x9, #0]\n"
      "add x9, x9, #0x2\n"
      "str h18, [x27, #0]\n"
      "add x27, x27, #0x2\n"
      "str h17, [x26, #0]\n"
      "add x26, x26, #0x2\n"
      "str h16, [x25, #0]\n"
      "add x25, x25, #0x2\n"
      "bne 86b\n"
      "add %x[in_ptr], x20, #0x180\n"
      "87:"  // Accumulate: Height 4: no oddments
      "b 108f\n"
      "88:"  // Accumulate: Height 5
      "mov x9, %x[out_ptr]\n"
      "mov x10, %x[cols]\n"
      "add x27, x9, %x[ldout], LSL #1\n"
      "add x26, x27, %x[ldout], LSL #1\n"
      "add x25, x26, %x[ldout], LSL #1\n"
      "cmp x10, #0xc\n"
      "add x24, x25, %x[ldout], LSL #1\n"
      "blt 90f\n"
      "89:"  // Accumulate: Height 5: Block loop
      "ldr d20, [x9, #0]\n"
      "ldr d19, [x27, #0]\n"
      "sub x10, x10, #0xc\n"
      "ldr d18, [x26, #0]\n"
      "ldr d17, [x25, #0]\n"
      "cmp x10, #0xc\n"
      "ldr d16, [x24, #0]\n"
      "ldr q3, [%x[in_ptr], #0]\n"
      "ldr q2, [%x[in_ptr], #0x30]\n"
      "ldr q1, [%x[in_ptr], #0x60]\n"
      "fcvtl v20.4s, v20.4h\n"
      "fcvtl v19.4s, v19.4h\n"
      "ldr q0, [%x[in_ptr], #0x90]\n"
      "ldr q31, [%x[in_ptr], #0xc0]\n"
      "fcvtl v18.4s, v18.4h\n"
      "fcvtl v17.4s, v17.4h\n"
      "ldr q30, [%x[in_ptr], #0x10]\n"
      "ldr q29, [%x[in_ptr], #0x40]\n"
      "fcvtl v16.4s, v16.4h\n"
      "ldr q28, [%x[in_ptr], #0x70]\n"
      "ldr q27, [%x[in_ptr], #0xa0]\n"
      "fadd v3.4s, v3.4s, v20.4s\n"
      "fadd v2.4s, v2.4s, v19.4s\n"
      "ldr q26, [%x[in_ptr], #0xd0]\n"
      "ldr q25, [%x[in_ptr], #0x20]\n"
      "fadd v1.4s, v1.4s, v18.4s\n"
      "fadd v0.4s, v0.4s, v17.4s\n"
      "ldr q24, [%x[in_ptr], #0x50]\n"
      "ldr q23, [%x[in_ptr], #0x80]\n"
      "fadd v31.4s, v31.4s, v16.4s\n"
      "ldr q22, [%x[in_ptr], #0xb0]\n"
      "ldr q21, [%x[in_ptr], #0xe0]\n"
      "fmin v3.4s, v3.4s, v13.4s\n"
      "fmin v2.4s, v2.4s, v13.4s\n"
      "fmin v1.4s, v1.4s, v13.4s\n"
      "fmin v0.4s, v0.4s, v13.4s\n"
      "add %x[in_ptr], %x[in_ptr], #0x180\n"
      "fmin v31.4s, v31.4s, v13.4s\n"
      "fmax v3.4s, v3.4s, v12.4s\n"
      "fmax v2.4s, v2.4s, v12.4s\n"
      "fmax v1.4s, v1.4s, v12.4s\n"
      "fmax v0.4s, v0.4s, v12.4s\n"
      "fmax v31.4s, v31.4s, v12.4s\n"
      "fcvtn v20.4h, v3.4s\n"
      "fcvtn v19.4h, v2.4s\n"
      "fcvtn v17.4h, v1.4s\n"
      "fcvtn v16.4h, v0.4s\n"
      "fcvtn v18.4h, v31.4s\n"
      "str d20, [x9, #0]\n"
      "str d19, [x27, #0]\n"
      "str d17, [x26, #0]\n"
      "ldr d17, [x9, #0x8]\n"
      "str d16, [x25, #0]\n"
      "ldr d16, [x27, #0x8]\n"
      "str d18, [x24, #0]\n"
      "ldr d18, [x26, #0x8]\n"
      "fcvtl v20.4s, v17.4h\n"
      "ldr d17, [x25, #0x8]\n"
      "fcvtl v19.4s, v16.4h\n"
      "ldr d16, [x24, #0x8]\n"
      "fcvtl v18.4s, v18.4h\n"
      "fcvtl v17.4s, v17.4h\n"
      "fadd v30.4s, v30.4s, v20.4s\n"
      "fcvtl v16.4s, v16.4h\n"
      "fadd v29.4s, v29.4s, v19.4s\n"
      "fadd v28.4s, v28.4s, v18.4s\n"
      "fadd v27.4s, v27.4s, v17.4s\n"
      "fmin v30.4s, v30.4s, v13.4s\n"
      "fadd v26.4s, v26.4s, v16.4s\n"
      "fmin v29.4s, v29.4s, v13.4s\n"
      "fmin v28.4s, v28.4s, v13.4s\n"
      "fmin v27.4s, v27.4s, v13.4s\n"
      "fmin v26.4s, v26.4s, v13.4s\n"
      "fmax v30.4s, v30.4s, v12.4s\n"
      "fmax v29.4s, v29.4s, v12.4s\n"
      "fmax v28.4s, v28.4s, v12.4s\n"
      "fmax v27.4s, v27.4s, v12.4s\n"
      "fmax v26.4s, v26.4s, v12.4s\n"
      "fcvtn v18.4h, v30.4s\n"
      "fcvtn v19.4h, v29.4s\n"
      "fcvtn v17.4h, v28.4s\n"
      "fcvtn v16.4h, v27.4s\n"
      "str d18, [x9, #0x8]\n"
      "fcvtn v18.4h, v26.4s\n"
      "str d19, [x27, #0x8]\n"
      "str d17, [x26, #0x8]\n"
      "ldr d17, [x9, #0x10]\n"
      "str d16, [x25, #0x8]\n"
      "ldr d16, [x27, #0x10]\n"
      "str d18, [x24, #0x8]\n"
      "ldr d18, [x26, #0x10]\n"
      "fcvtl v20.4s, v17.4h\n"
      "ldr d17, [x25, #0x10]\n"
      "fcvtl v19.4s, v16.4h\n"
      "ldr d16, [x24, #0x10]\n"
      "fcvtl v18.4s, v18.4h\n"
      "fcvtl v17.4s, v17.4h\n"
      "fadd v25.4s, v25.4s, v20.4s\n"
      "fcvtl v16.4s, v16.4h\n"
      "fadd v24.4s, v24.4s, v19.4s\n"
      "fadd v23.4s, v23.4s, v18.4s\n"
      "fadd v22.4s, v22.4s, v17.4s\n"
      "fmin v25.4s, v25.4s, v13.4s\n"
      "fadd v21.4s, v21.4s, v16.4s\n"
      "fmin v24.4s, v24.4s, v13.4s\n"
      "fmin v23.4s, v23.4s, v13.4s\n"
      "fmin v22.4s, v22.4s, v13.4s\n"
      "fmin v21.4s, v21.4s, v13.4s\n"
      "fmax v25.4s, v25.4s, v12.4s\n"
      "fmax v24.4s, v24.4s, v12.4s\n"
      "fmax v23.4s, v23.4s, v12.4s\n"
      "fmax v22.4s, v22.4s, v12.4s\n"
      "fmax v21.4s, v21.4s, v12.4s\n"
      "fcvtn v16.4h, v25.4s\n"
      "fcvtn v19.4h, v24.4s\n"
      "fcvtn v18.4h, v23.4s\n"
      "fcvtn v17.4h, v22.4s\n"
      "str d16, [x9, #0x10]\n"
      "add x9, x9, #0x18\n"
      "fcvtn v16.4h, v21.4s\n"
      "str d19, [x27, #0x10]\n"
      "add x27, x27, #0x18\n"
      "str d18, [x26, #0x10]\n"
      "add x26, x26, #0x18\n"
      "str d17, [x25, #0x10]\n"
      "add x25, x25, #0x18\n"
      "str d16, [x24, #0x10]\n"
      "add x24, x24, #0x18\n"
      "bge 89b\n"
      "90:"  // Accumulate: Height 5: no full blocks
      "cbz x10, 92f\n"
      "mov x20, %x[in_ptr]\n"
      "91:"  // Accumulate: Height 5: Single loop
      "ldr h20, [x9, #0]\n"
      "ldr h19, [x27, #0]\n"
      "subs x10, x10, #0x1\n"
      "ldr h18, [x26, #0]\n"
      "ldr h17, [x25, #0]\n"
      "ldr h16, [x24, #0]\n"
      "ldr s25, [%x[in_ptr], #0]\n"
      "ldr s24, [%x[in_ptr], #0x30]\n"
      "ldr s23, [%x[in_ptr], #0x60]\n"
      "fcvtl v20.4s, v20.4h\n"
      "fcvtl v19.4s, v19.4h\n"
      "ldr s22, [%x[in_ptr], #0x90]\n"
      "ldr s21, [%x[in_ptr], #0xc0]\n"
      "fcvtl v18.4s, v18.4h\n"
      "fcvtl v17.4s, v17.4h\n"
      "fcvtl v16.4s, v16.4h\n"
      "add %x[in_ptr], %x[in_ptr], #0x4\n"
      "fadd v25.4s, v25.4s, v20.4s\n"
      "fadd v24.4s, v24.4s, v19.4s\n"
      "fadd v23.4s, v23.4s, v18.4s\n"
      "fadd v22.4s, v22.4s, v17.4s\n"
      "fadd v21.4s, v21.4s, v16.4s\n"
      "fmin v25.4s, v25.4s, v13.4s\n"
      "fmin v24.4s, v24.4s, v13.4s\n"
      "fmin v23.4s, v23.4s, v13.4s\n"
      "fmin v22.4s, v22.4s, v13.4s\n"
      "fmin v21.4s, v21.4s, v13.4s\n"
      "fmax v25.4s, v25.4s, v12.4s\n"
      "fmax v24.4s, v24.4s, v12.4s\n"
      "fmax v23.4s, v23.4s, v12.4s\n"
      "fmax v22.4s, v22.4s, v12.4s\n"
      "fmax v21.4s, v21.4s, v12.4s\n"
      "fcvtn v20.4h, v25.4s\n"
      "fcvtn v19.4h, v24.4s\n"
      "fcvtn v18.4h, v23.4s\n"
      "fcvtn v17.4h, v22.4s\n"
      "fcvtn v16.4h, v21.4s\n"
      "str h20, [x9, #0]\n"
      "add x9, x9, #0x2\n"
      "str h19, [x27, #0]\n"
      "add x27, x27, #0x2\n"
      "str h18, [x26, #0]\n"
      "add x26, x26, #0x2\n"
      "str h17, [x25, #0]\n"
      "add x25, x25, #0x2\n"
      "str h16, [x24, #0]\n"
      "add x24, x24, #0x2\n"
      "bne 91b\n"
      "add %x[in_ptr], x20, #0x180\n"
      "92:"  // Accumulate: Height 5: no oddments
      "b 108f\n"
      "93:"  // Accumulate: Height 6
      "mov x9, %x[out_ptr]\n"
      "mov x10, %x[cols]\n"
      "add x27, x9, %x[ldout], LSL #1\n"
      "add x26, x27, %x[ldout], LSL #1\n"
      "add x25, x26, %x[ldout], LSL #1\n"
      "add x24, x25, %x[ldout], LSL #1\n"
      "cmp x10, #0xc\n"
      "add x23, x24, %x[ldout], LSL #1\n"
      "blt 95f\n"
      "94:"  // Accumulate: Height 6: Block loop
      "ldr d21, [x9, #0]\n"
      "ldr d20, [x27, #0]\n"
      "sub x10, x10, #0xc\n"
      "ldr d19, [x26, #0]\n"
      "ldr d18, [x25, #0]\n"
      "cmp x10, #0xc\n"
      "ldr d17, [x24, #0]\n"
      "ldr d16, [x23, #0]\n"
      "ldr q6, [%x[in_ptr], #0]\n"
      "ldr q5, [%x[in_ptr], #0x30]\n"
      "fcvtl v22.4s, v21.4h\n"
      "fcvtl v21.4s, v20.4h\n"
      "ldr q4, [%x[in_ptr], #0x60]\n"
      "ldr q3, [%x[in_ptr], #0x90]\n"
      "fcvtl v20.4s, v19.4h\n"
      "fcvtl v18.4s, v18.4h\n"
      "ldr q2, [%x[in_ptr], #0xc0]\n"
      "ldr q19, [%x[in_ptr], #0xf0]\n"
      "fcvtl v17.4s, v17.4h\n"
      "fcvtl v16.4s, v16.4h\n"
      "ldr q1, [%x[in_ptr], #0x10]\n"
      "ldr q0, [%x[in_ptr], #0x40]\n"
      "fadd v6.4s, v6.4s, v22.4s\n"
      "fadd v5.4s, v5.4s, v21.4s\n"
      "ldr q31, [%x[in_ptr], #0x70]\n"
      "ldr q30, [%x[in_ptr], #0xa0]\n"
      "fadd v4.4s, v4.4s, v20.4s\n"
      "fadd v3.4s, v3.4s, v18.4s\n"
      "ldr q29, [%x[in_ptr], #0xd0]\n"
      "ldr q28, [%x[in_ptr], #0x100]\n"
      "fadd v2.4s, v2.4s, v17.4s\n"
      "fadd v19.4s, v19.4s, v16.4s\n"
      "ldr q27, [%x[in_ptr], #0x20]\n"
      "ldr q26, [%x[in_ptr], #0x50]\n"
      "fmin v6.4s, v6.4s, v13.4s\n"
      "fmin v5.4s, v5.4s, v13.4s\n"
      "ldr q25, [%x[in_ptr], #0x80]\n"
      "ldr q24, [%x[in_ptr], #0xb0]\n"
      "fmin v4.4s, v4.4s, v13.4s\n"
      "fmin v3.4s, v3.4s, v13.4s\n"
      "ldr q23, [%x[in_ptr], #0xe0]\n"
      "ldr q22, [%x[in_ptr], #0x110]\n"
      "fmin v2.4s, v2.4s, v13.4s\n"
      "fmin v19.4s, v19.4s, v13.4s\n"
      "fmax v6.4s, v6.4s, v12.4s\n"
      "fmax v5.4s, v5.4s, v12.4s\n"
      "add %x[in_ptr], %x[in_ptr], #0x180\n"
      "fmax v4.4s, v4.4s, v12.4s\n"
      "fmax v3.4s, v3.4s, v12.4s\n"
      "fmax v2.4s, v2.4s, v12.4s\n"
      "fmax v19.4s, v19.4s, v12.4s\n"
      "fcvtn v21.4h, v6.4s\n"
      "fcvtn v20.4h, v5.4s\n"
      "fcvtn v18.4h, v4.4s\n"
      "fcvtn v17.4h, v3.4s\n"
      "fcvtn v16.4h, v2.4s\n"
      "fcvtn v19.4h, v19.4s\n"
      "str d21, [x9, #0]\n"
      "str d20, [x27, #0]\n"
      "str d18, [x26, #0]\n"
      "ldr d18, [x9, #0x8]\n"
      "str d17, [x25, #0]\n"
      "ldr d17, [x27, #0x8]\n"
      "str d16, [x24, #0]\n"
      "ldr d16, [x26, #0x8]\n"
      "str d19, [x23, #0]\n"
      "fcvtl v21.4s, v18.4h\n"
      "ldr d18, [x25, #0x8]\n"
      "fcvtl v20.4s, v17.4h\n"
      "ldr d17, [x24, #0x8]\n"
      "fcvtl v19.4s, v16.4h\n"
      "ldr d16, [x23, #0x8]\n"
      "fcvtl v18.4s, v18.4h\n"
      "fadd v1.4s, v1.4s, v21.4s\n"
      "fadd v0.4s, v0.4s, v20.4s\n"
      "fcvtl v17.4s, v17.4h\n"
      "fcvtl v16.4s, v16.4h\n"
      "fadd v31.4s, v31.4s, v19.4s\n"
      "fadd v30.4s, v30.4s, v18.4s\n"
      "fmin v1.4s, v1.4s, v13.4s\n"
      "fmin v0.4s, v0.4s, v13.4s\n"
      "fadd v29.4s, v29.4s, v17.4s\n"
      "fadd v28.4s, v28.4s, v16.4s\n"
      "fmin v31.4s, v31.4s, v13.4s\n"
      "fmin v30.4s, v30.4s, v13.4s\n"
      "fmax v1.4s, v1.4s, v12.4s\n"
      "fmax v0.4s, v0.4s, v12.4s\n"
      "fmin v29.4s, v29.4s, v13.4s\n"
      "fmin v28.4s, v28.4s, v13.4s\n"
      "fmax v31.4s, v31.4s, v12.4s\n"
      "fmax v30.4s, v30.4s, v12.4s\n"
      "fcvtn v18.4h, v1.4s\n"
      "fcvtn v16.4h, v0.4s\n"
      "fmax v29.4s, v29.4s, v12.4s\n"
      "fmax v28.4s, v28.4s, v12.4s\n"
      "fcvtn v20.4h, v31.4s\n"
      "fcvtn v17.4h, v30.4s\n"
      "str d18, [x9, #0x8]\n"
      "str d16, [x27, #0x8]\n"
      "fcvtn v19.4h, v29.4s\n"
      "fcvtn v18.4h, v28.4s\n"
      "ldr d16, [x9, #0x10]\n"
      "str d20, [x26, #0x8]\n"
      "str d17, [x25, #0x8]\n"
      "ldr d17, [x27, #0x10]\n"
      "str d19, [x24, #0x8]\n"
      "fcvtl v21.4s, v16.4h\n"
      "ldr d16, [x26, #0x10]\n"
      "str d18, [x23, #0x8]\n"
      "ldr d18, [x25, #0x10]\n"
      "fcvtl v20.4s, v17.4h\n"
      "ldr d17, [x24, #0x10]\n"
      "fcvtl v19.4s, v16.4h\n"
      "fadd v27.4s, v27.4s, v21.4s\n"
      "ldr d16, [x23, #0x10]\n"
      "fcvtl v18.4s, v18.4h\n"
      "fcvtl v17.4s, v17.4h\n"
      "fadd v26.4s, v26.4s, v20.4s\n"
      "fcvtl v16.4s, v16.4h\n"
      "fadd v25.4s, v25.4s, v19.4s\n"
      "fmin v27.4s, v27.4s, v13.4s\n"
      "fadd v24.4s, v24.4s, v18.4s\n"
      "fadd v23.4s, v23.4s, v17.4s\n"
      "fadd v22.4s, v22.4s, v16.4s\n"
      "fmin v26.4s, v26.4s, v13.4s\n"
      "fmin v25.4s, v25.4s, v13.4s\n"
      "fmax v27.4s, v27.4s, v12.4s\n"
      "fmin v24.4s, v24.4s, v13.4s\n"
      "fmin v23.4s, v23.4s, v13.4s\n"
      "fmin v22.4s, v22.4s, v13.4s\n"
      "fmax v26.4s, v26.4s, v12.4s\n"
      "fmax v25.4s, v25.4s, v12.4s\n"
      "fcvtn v17.4h, v27.4s\n"
      "fmax v24.4s, v24.4s, v12.4s\n"
      "fmax v23.4s, v23.4s, v12.4s\n"
      "fmax v22.4s, v22.4s, v12.4s\n"
      "fcvtn v16.4h, v26.4s\n"
      "str d17, [x9, #0x10]\n"
      "add x9, x9, #0x18\n"
      "fcvtn v19.4h, v25.4s\n"
      "fcvtn v18.4h, v24.4s\n"
      "fcvtn v17.4h, v23.4s\n"
      "str d16, [x27, #0x10]\n"
      "add x27, x27, #0x18\n"
      "fcvtn v16.4h, v22.4s\n"
      "str d19, [x26, #0x10]\n"
      "add x26, x26, #0x18\n"
      "str d18, [x25, #0x10]\n"
      "add x25, x25, #0x18\n"
      "str d17, [x24, #0x10]\n"
      "add x24, x24, #0x18\n"
      "str d16, [x23, #0x10]\n"
      "add x23, x23, #0x18\n"
      "bge 94b\n"
      "95:"  // Accumulate: Height 6: no full blocks
      "cbz x10, 97f\n"
      "mov x20, %x[in_ptr]\n"
      "96:"  // Accumulate: Height 6: Single loop
      "ldr h21, [x9, #0]\n"
      "ldr h20, [x27, #0]\n"
      "subs x10, x10, #0x1\n"
      "ldr h19, [x26, #0]\n"
      "ldr h18, [x25, #0]\n"
      "ldr h17, [x24, #0]\n"
      "ldr h16, [x23, #0]\n"
      "ldr s27, [%x[in_ptr], #0]\n"
      "ldr s26, [%x[in_ptr], #0x30]\n"
      "fcvtl v21.4s, v21.4h\n"
      "fcvtl v20.4s, v20.4h\n"
      "ldr s25, [%x[in_ptr], #0x60]\n"
      "ldr s24, [%x[in_ptr], #0x90]\n"
      "fcvtl v19.4s, v19.4h\n"
      "fcvtl v18.4s, v18.4h\n"
      "ldr s23, [%x[in_ptr], #0xc0]\n"
      "ldr s22, [%x[in_ptr], #0xf0]\n"
      "fcvtl v17.4s, v17.4h\n"
      "fcvtl v16.4s, v16.4h\n"
      "fadd v27.4s, v27.4s, v21.4s\n"
      "fadd v26.4s, v26.4s, v20.4s\n"
      "add %x[in_ptr], %x[in_ptr], #0x4\n"
      "fadd v25.4s, v25.4s, v19.4s\n"
      "fadd v24.4s, v24.4s, v18.4s\n"
      "fadd v23.4s, v23.4s, v17.4s\n"
      "fadd v22.4s, v22.4s, v16.4s\n"
      "fmin v27.4s, v27.4s, v13.4s\n"
      "fmin v26.4s, v26.4s, v13.4s\n"
      "fmin v25.4s, v25.4s, v13.4s\n"
      "fmin v24.4s, v24.4s, v13.4s\n"
      "fmin v23.4s, v23.4s, v13.4s\n"
      "fmin v22.4s, v22.4s, v13.4s\n"
      "fmax v27.4s, v27.4s, v12.4s\n"
      "fmax v26.4s, v26.4s, v12.4s\n"
      "fmax v25.4s, v25.4s, v12.4s\n"
      "fmax v24.4s, v24.4s, v12.4s\n"
      "fmax v23.4s, v23.4s, v12.4s\n"
      "fmax v22.4s, v22.4s, v12.4s\n"
      "fcvtn v21.4h, v27.4s\n"
      "fcvtn v20.4h, v26.4s\n"
      "fcvtn v19.4h, v25.4s\n"
      "fcvtn v18.4h, v24.4s\n"
      "fcvtn v17.4h, v23.4s\n"
      "fcvtn v16.4h, v22.4s\n"
      "str h21, [x9, #0]\n"
      "add x9, x9, #0x2\n"
      "str h20, [x27, #0]\n"
      "add x27, x27, #0x2\n"
      "str h19, [x26, #0]\n"
      "add x26, x26, #0x2\n"
      "str h18, [x25, #0]\n"
      "add x25, x25, #0x2\n"
      "str h17, [x24, #0]\n"
      "add x24, x24, #0x2\n"
      "str h16, [x23, #0]\n"
      "add x23, x23, #0x2\n"
      "bne 96b\n"
      "add %x[in_ptr], x20, #0x180\n"
      "97:"  // Accumulate: Height 6: no oddments
      "b 108f\n"
      "98:"  // Accumulate: Height 7
      "mov x9, %x[out_ptr]\n"
      "mov x10, %x[cols]\n"
      "add x27, x9, %x[ldout], LSL #1\n"
      "add x26, x27, %x[ldout], LSL #1\n"
      "add x25, x26, %x[ldout], LSL #1\n"
      "add x24, x25, %x[ldout], LSL #1\n"
      "add x23, x24, %x[ldout], LSL #1\n"
      "cmp x10, #0xc\n"
      "add x22, x23, %x[ldout], LSL #1\n"
      "blt 100f\n"
      "99:"  // Accumulate: Height 7: Block loop
      "ldr d22, [x9, #0]\n"
      "ldr d21, [x27, #0]\n"
      "sub x10, x10, #0xc\n"
      "ldr d20, [x26, #0]\n"
      "ldr d19, [x25, #0]\n"
      "cmp x10, #0xc\n"
      "ldr d18, [x24, #0]\n"
      "ldr d17, [x23, #0]\n"
      "ldr d16, [x22, #0]\n"
      "ldr q9, [%x[in_ptr], #0]\n"
      "fcvtl v24.4s, v22.4h\n"
      "fcvtl v23.4s, v21.4h\n"
      "ldr q8, [%x[in_ptr], #0x30]\n"
      "ldr q7, [%x[in_ptr], #0x60]\n"
      "fcvtl v21.4s, v20.4h\n"
      "fcvtl v19.4s, v19.4h\n"
      "ldr q6, [%x[in_ptr], #0x90]\n"
      "ldr q5, [%x[in_ptr], #0xc0]\n"
      "fcvtl v18.4s, v18.4h\n"
      "fcvtl v17.4s, v17.4h\n"
      "ldr q20, [%x[in_ptr], #0xf0]\n"
      "ldr q22, [%x[in_ptr], #0x120]\n"
      "fcvtl v16.4s, v16.4h\n"
      "fadd v9.4s, v9.4s, v24.4s\n"
      "ldr q4, [%x[in_ptr], #0x10]\n"
      "ldr q3, [%x[in_ptr], #0x40]\n"
      "fadd v8.4s, v8.4s, v23.4s\n"
      "fadd v7.4s, v7.4s, v21.4s\n"
      "ldr q2, [%x[in_ptr], #0x70]\n"
      "ldr q1, [%x[in_ptr], #0xa0]\n"
      "fadd v6.4s, v6.4s, v19.4s\n"
      "fadd v5.4s, v5.4s, v18.4s\n"
      "ldr q0, [%x[in_ptr], #0xd0]\n"
      "ldr q31, [%x[in_ptr], #0x100]\n"
      "fadd v20.4s, v20.4s, v17.4s\n"
      "fadd v22.4s, v22.4s, v16.4s\n"
      "ldr q30, [%x[in_ptr], #0x130]\n"
      "ldr q29, [%x[in_ptr], #0x20]\n"
      "fmin v9.4s, v9.4s, v13.4s\n"
      "fmin v8.4s, v8.4s, v13.4s\n"
      "ldr q28, [%x[in_ptr], #0x50]\n"
      "ldr q27, [%x[in_ptr], #0x80]\n"
      "fmin v7.4s, v7.4s, v13.4s\n"
      "fmin v6.4s, v6.4s, v13.4s\n"
      "ldr q26, [%x[in_ptr], #0xb0]\n"
      "ldr q25, [%x[in_ptr], #0xe0]\n"
      "fmin v5.4s, v5.4s, v13.4s\n"
      "fmin v20.4s, v20.4s, v13.4s\n"
      "ldr q24, [%x[in_ptr], #0x110]\n"
      "ldr q23, [%x[in_ptr], #0x140]\n"
      "fmin v22.4s, v22.4s, v13.4s\n"
      "fmax v9.4s, v9.4s, v12.4s\n"
      "fmax v8.4s, v8.4s, v12.4s\n"
      "fmax v7.4s, v7.4s, v12.4s\n"
      "add %x[in_ptr], %x[in_ptr], #0x180\n"
      "fmax v6.4s, v6.4s, v12.4s\n"
      "fmax v5.4s, v5.4s, v12.4s\n"
      "fmax v20.4s, v20.4s, v12.4s\n"
      "fmax v22.4s, v22.4s, v12.4s\n"
      "fcvtn v21.4h, v9.4s\n"
      "fcvtn v19.4h, v8.4s\n"
      "fcvtn v16.4h, v7.4s\n"
      "fcvtn v18.4h, v6.4s\n"
      "fcvtn v17.4h, v5.4s\n"
      "fcvtn v20.4h, v20.4s\n"
      "str d21, [x9, #0]\n"
      "str d19, [x27, #0]\n"
      "fcvtn v19.4h, v22.4s\n"
      "str d16, [x26, #0]\n"
      "ldr d16, [x9, #0x8]\n"
      "str d18, [x25, #0]\n"
      "ldr d18, [x27, #0x8]\n"
      "str d17, [x24, #0]\n"
      "ldr d17, [x26, #0x8]\n"
      "str d20, [x23, #0]\n"
      "fcvtl v22.4s, v16.4h\n"
      "ldr d16, [x25, #0x8]\n"
      "str d19, [x22, #0]\n"
      "fcvtl v21.4s, v18.4h\n"
      "ldr d18, [x24, #0x8]\n"
      "fcvtl v20.4s, v17.4h\n"
      "ldr d17, [x23, #0x8]\n"
      "fcvtl v19.4s, v16.4h\n"
      "fadd v4.4s, v4.4s, v22.4s\n"
      "ldr d16, [x22, #0x8]\n"
      "fcvtl v18.4s, v18.4h\n"
      "fadd v3.4s, v3.4s, v21.4s\n"
      "fadd v2.4s, v2.4s, v20.4s\n"
      "fcvtl v17.4s, v17.4h\n"
      "fcvtl v16.4s, v16.4h\n"
      "fadd v1.4s, v1.4s, v19.4s\n"
      "fadd v0.4s, v0.4s, v18.4s\n"
      "fmin v4.4s, v4.4s, v13.4s\n"
      "fadd v31.4s, v31.4s, v17.4s\n"
      "fmin v3.4s, v3.4s, v13.4s\n"
      "fadd v30.4s, v30.4s, v16.4s\n"
      "fmin v2.4s, v2.4s, v13.4s\n"
      "fmin v1.4s, v1.4s, v13.4s\n"
      "fmin v0.4s, v0.4s, v13.4s\n"
      "fmin v31.4s, v31.4s, v13.4s\n"
      "fmax v4.4s, v4.4s, v12.4s\n"
      "fmin v30.4s, v30.4s, v13.4s\n"
      "fmax v3.4s, v3.4s, v12.4s\n"
      "fmax v2.4s, v2.4s, v12.4s\n"
      "fmax v1.4s, v1.4s, v12.4s\n"
      "fmax v0.4s, v0.4s, v12.4s\n"
      "fmax v31.4s, v31.4s, v12.4s\n"
      "fmax v30.4s, v30.4s, v12.4s\n"
      "fcvtn v19.4h, v4.4s\n"
      "fcvtn v21.4h, v3.4s\n"
      "fcvtn v16.4h, v2.4s\n"
      "fcvtn v18.4h, v1.4s\n"
      "fcvtn v17.4h, v0.4s\n"
      "str d19, [x9, #0x8]\n"
      "fcvtn v20.4h, v31.4s\n"
      "fcvtn v19.4h, v30.4s\n"
      "str d21, [x27, #0x8]\n"
      "str d16, [x26, #0x8]\n"
      "ldr d16, [x9, #0x10]\n"
      "str d18, [x25, #0x8]\n"
      "ldr d18, [x27, #0x10]\n"
      "str d17, [x24, #0x8]\n"
      "ldr d17, [x26, #0x10]\n"
      "str d20, [x23, #0x8]\n"
      "fcvtl v22.4s, v16.4h\n"
      "ldr d16, [x25, #0x10]\n"
      "str d19, [x22, #0x8]\n"
      "fcvtl v21.4s, v18.4h\n"
      "ldr d18, [x24, #0x10]\n"
      "fcvtl v20.4s, v17.4h\n"
      "ldr d17, [x23, #0x10]\n"
      "fcvtl v19.4s, v16.4h\n"
      "fadd v29.4s, v29.4s, v22.4s\n"
      "ldr d16, [x22, #0x10]\n"
      "fcvtl v18.4s, v18.4h\n"
      "fadd v28.4s, v28.4s, v21.4s\n"
      "fadd v27.4s, v27.4s, v20.4s\n"
      "fcvtl v17.4s, v17.4h\n"
      "fcvtl v16.4s, v16.4h\n"
      "fadd v26.4s, v26.4s, v19.4s\n"
      "fadd v25.4s, v25.4s, v18.4s\n"
      "fmin v29.4s, v29.4s, v13.4s\n"
      "fadd v24.4s, v24.4s, v17.4s\n"
      "fmin v28.4s, v28.4s, v13.4s\n"
      "fadd v23.4s, v23.4s, v16.4s\n"
      "fmin v27.4s, v27.4s, v13.4s\n"
      "fmin v26.4s, v26.4s, v13.4s\n"
      "fmin v25.4s, v25.4s, v13.4s\n"
      "fmin v24.4s, v24.4s, v13.4s\n"
      "fmax v29.4s, v29.4s, v12.4s\n"
      "fmin v23.4s, v23.4s, v13.4s\n"
      "fmax v28.4s, v28.4s, v12.4s\n"
      "fmax v27.4s, v27.4s, v12.4s\n"
      "fmax v26.4s, v26.4s, v12.4s\n"
      "fmax v25.4s, v25.4s, v12.4s\n"
      "fmax v24.4s, v24.4s, v12.4s\n"
      "fmax v23.4s, v23.4s, v12.4s\n"
      "fcvtn v16.4h, v29.4s\n"
      "fcvtn v21.4h, v28.4s\n"
      "fcvtn v20.4h, v27.4s\n"
      "fcvtn v19.4h, v26.4s\n"
      "fcvtn v18.4h, v25.4s\n"
      "str d16, [x9, #0x10]\n"
      "add x9, x9, #0x18\n"
      "fcvtn v17.4h, v24.4s\n"
      "fcvtn v16.4h, v23.4s\n"
      "str d21, [x27, #0x10]\n"
      "add x27, x27, #0x18\n"
      "str d20, [x26, #0x10]\n"
      "add x26, x26, #0x18\n"
      "str d19, [x25, #0x10]\n"
      "add x25, x25, #0x18\n"
      "str d18, [x24, #0x10]\n"
      "add x24, x24, #0x18\n"
      "str d17, [x23, #0x10]\n"
      "add x23, x23, #0x18\n"
      "str d16, [x22, #0x10]\n"
      "add x22, x22, #0x18\n"
      "bge 99b\n"
      "100:"  // Accumulate: Height 7: no full blocks
      "cbz x10, 102f\n"
      "mov x20, %x[in_ptr]\n"
      "101:"  // Accumulate: Height 7: Single loop
      "ldr h22, [x9, #0]\n"
      "ldr h21, [x27, #0]\n"
      "subs x10, x10, #0x1\n"
      "ldr h20, [x26, #0]\n"
      "ldr h19, [x25, #0]\n"
      "ldr h18, [x24, #0]\n"
      "ldr h17, [x23, #0]\n"
      "ldr h16, [x22, #0]\n"
      "ldr s29, [%x[in_ptr], #0]\n"
      "fcvtl v28.4s, v22.4h\n"
      "fcvtl v27.4s, v21.4h\n"
      "ldr s26, [%x[in_ptr], #0x30]\n"
      "ldr s25, [%x[in_ptr], #0x60]\n"
      "fcvtl v21.4s, v20.4h\n"
      "fcvtl v20.4s, v19.4h\n"
      "ldr s24, [%x[in_ptr], #0x90]\n"
      "ldr s23, [%x[in_ptr], #0xc0]\n"
      "fcvtl v19.4s, v18.4h\n"
      "fcvtl v18.4s, v17.4h\n"
      "ldr s17, [%x[in_ptr], #0xf0]\n"
      "ldr s22, [%x[in_ptr], #0x120]\n"
      "fcvtl v16.4s, v16.4h\n"
      "fadd v29.4s, v29.4s, v28.4s\n"
      "fadd v26.4s, v26.4s, v27.4s\n"
      "fadd v25.4s, v25.4s, v21.4s\n"
      "add %x[in_ptr], %x[in_ptr], #0x4\n"
      "fadd v24.4s, v24.4s, v20.4s\n"
      "fadd v23.4s, v23.4s, v19.4s\n"
      "fadd v17.4s, v17.4s, v18.4s\n"
      "fadd v22.4s, v22.4s, v16.4s\n"
      "fmin v29.4s, v29.4s, v13.4s\n"
      "fmin v26.4s, v26.4s, v13.4s\n"
      "fmin v25.4s, v25.4s, v13.4s\n"
      "fmin v24.4s, v24.4s, v13.4s\n"
      "fmin v23.4s, v23.4s, v13.4s\n"
      "fmin v17.4s, v17.4s, v13.4s\n"
      "fmin v22.4s, v22.4s, v13.4s\n"
      "fmax v29.4s, v29.4s, v12.4s\n"
      "fmax v26.4s, v26.4s, v12.4s\n"
      "fmax v25.4s, v25.4s, v12.4s\n"
      "fmax v24.4s, v24.4s, v12.4s\n"
      "fmax v23.4s, v23.4s, v12.4s\n"
      "fmax v17.4s, v17.4s, v12.4s\n"
      "fmax v22.4s, v22.4s, v12.4s\n"
      "fcvtn v21.4h, v29.4s\n"
      "fcvtn v16.4h, v26.4s\n"
      "fcvtn v20.4h, v25.4s\n"
      "fcvtn v19.4h, v24.4s\n"
      "fcvtn v18.4h, v23.4s\n"
      "fcvtn v17.4h, v17.4s\n"
      "str h21, [x9, #0]\n"
      "add x9, x9, #0x2\n"
      "str h16, [x27, #0]\n"
      "fcvtn v16.4h, v22.4s\n"
      "add x27, x27, #0x2\n"
      "str h20, [x26, #0]\n"
      "add x26, x26, #0x2\n"
      "str h19, [x25, #0]\n"
      "add x25, x25, #0x2\n"
      "str h18, [x24, #0]\n"
      "add x24, x24, #0x2\n"
      "str h17, [x23, #0]\n"
      "add x23, x23, #0x2\n"
      "str h16, [x22, #0]\n"
      "add x22, x22, #0x2\n"
      "bne 101b\n"
      "add %x[in_ptr], x20, #0x180\n"
      "102:"  // Accumulate: Height 7: no oddments
      "b 108f\n"
      "103:"  // Accumulate: Height 8
      "mov x9, %x[out_ptr]\n"
      "mov x10, %x[cols]\n"
      "add x27, x9, %x[ldout], LSL #1\n"
      "add x26, x27, %x[ldout], LSL #1\n"
      "add x25, x26, %x[ldout], LSL #1\n"
      "add x24, x25, %x[ldout], LSL #1\n"
      "add x23, x24, %x[ldout], LSL #1\n"
      "cmp x10, #0xc\n"
      "add x22, x23, %x[ldout], LSL #1\n"
      "add x21, x22, %x[ldout], LSL #1\n"
      "blt 105f\n"
      "104:"  // Accumulate: Height 8: Block loop
      "ldr d23, [x9, #0]\n"
      "ldr d22, [x27, #0]\n"
      "sub x10, x10, #0xc\n"
      "ldr d21, [x26, #0]\n"
      "ldr d20, [x25, #0]\n"
      "cmp x10, #0xc\n"
      "ldr d19, [x24, #0]\n"
      "ldr d18, [x23, #0]\n"
      "ldr d17, [x22, #0]\n"
      "ldr d16, [x21, #0]\n"
      "fcvtl v26.4s, v23.4h\n"
      "fcvtl v25.4s, v22.4h\n"
      "ldr q11, [%x[in_ptr], #0]\n"
      "ldr q10, [%x[in_ptr], #0x30]\n"
      "fcvtl v24.4s, v21.4h\n"
      "fcvtl v23.4s, v20.4h\n"
      "ldr q9, [%x[in_ptr], #0x60]\n"
      "ldr q8, [%x[in_ptr], #0x90]\n"
      "fcvtl v21.4s, v19.4h\n"
      "fcvtl v20.4s, v18.4h\n"
      "ldr q18, [%x[in_ptr], #0xc0]\n"
      "ldr q19, [%x[in_ptr], #0xf0]\n"
      "fcvtl v17.4s, v17.4h\n"
      "fcvtl v16.4s, v16.4h\n"
      "ldr q7, [%x[in_ptr], #0x120]\n"
      "ldr q22, [%x[in_ptr], #0x150]\n"
      "fadd v11.4s, v11.4s, v26.4s\n"
      "fadd v10.4s, v10.4s, v25.4s\n"
      "ldr q6, [%x[in_ptr], #0x10]\n"
      "ldr q5, [%x[in_ptr], #0x40]\n"
      "fadd v9.4s, v9.4s, v24.4s\n"
      "fadd v8.4s, v8.4s, v23.4s\n"
      "ldr q4, [%x[in_ptr], #0x70]\n"
      "ldr q3, [%x[in_ptr], #0xa0]\n"
      "fadd v18.4s, v18.4s, v21.4s\n"
      "fadd v19.4s, v19.4s, v20.4s\n"
      "ldr q2, [%x[in_ptr], #0xd0]\n"
      "ldr q1, [%x[in_ptr], #0x100]\n"
      "fadd v7.4s, v7.4s, v17.4s\n"
      "fadd v22.4s, v22.4s, v16.4s\n"
      "ldr q0, [%x[in_ptr], #0x130]\n"
      "ldr q31, [%x[in_ptr], #0x160]\n"
      "fmin v11.4s, v11.4s, v13.4s\n"
      "fmin v10.4s, v10.4s, v13.4s\n"
      "ldr q30, [%x[in_ptr], #0x20]\n"
      "ldr q29, [%x[in_ptr], #0x50]\n"
      "fmin v9.4s, v9.4s, v13.4s\n"
      "fmin v8.4s, v8.4s, v13.4s\n"
      "ldr q28, [%x[in_ptr], #0x80]\n"
      "ldr q27, [%x[in_ptr], #0xb0]\n"
      "fmin v18.4s, v18.4s, v13.4s\n"
      "fmin v19.4s, v19.4s, v13.4s\n"
      "ldr q26, [%x[in_ptr], #0xe0]\n"
      "ldr q25, [%x[in_ptr], #0x110]\n"
      "fmin v7.4s, v7.4s, v13.4s\n"
      "fmin v22.4s, v22.4s, v13.4s\n"
      "ldr q24, [%x[in_ptr], #0x140]\n"
      "ldr q23, [%x[in_ptr], #0x170]\n"
      "fmax v11.4s, v11.4s, v12.4s\n"
      "fmax v10.4s, v10.4s, v12.4s\n"
      "fmax v9.4s, v9.4s, v12.4s\n"
      "fmax v8.4s, v8.4s, v12.4s\n"
      "add %x[in_ptr], %x[in_ptr], #0x180\n"
      "fmax v18.4s, v18.4s, v12.4s\n"
      "fmax v19.4s, v19.4s, v12.4s\n"
      "fmax v7.4s, v7.4s, v12.4s\n"
      "fmax v22.4s, v22.4s, v12.4s\n"
      "fcvtn v21.4h, v11.4s\n"
      "fcvtn v20.4h, v10.4s\n"
      "fcvtn v17.4h, v9.4s\n"
      "fcvtn v16.4h, v8.4s\n"
      "fcvtn v18.4h, v18.4s\n"
      "fcvtn v19.4h, v19.4s\n"
      "str d21, [x9, #0]\n"
      "str d20, [x27, #0]\n"
      "fcvtn v21.4h, v7.4s\n"
      "fcvtn v20.4h, v22.4s\n"
      "str d17, [x26, #0]\n"
      "ldr d17, [x9, #0x8]\n"
      "str d16, [x25, #0]\n"
      "ldr d16, [x27, #0x8]\n"
      "str d18, [x24, #0]\n"
      "ldr d18, [x26, #0x8]\n"
      "str d19, [x23, #0]\n"
      "fcvtl v19.4s, v17.4h\n"
      "ldr d17, [x25, #0x8]\n"
      "str d21, [x22, #0]\n"
      "fcvtl v22.4s, v16.4h\n"
      "ldr d16, [x24, #0x8]\n"
      "str d20, [x21, #0]\n"
      "fcvtl v21.4s, v18.4h\n"
      "ldr d18, [x23, #0x8]\n"
      "fcvtl v20.4s, v17.4h\n"
      "fadd v6.4s, v6.4s, v19.4s\n"
      "ldr d17, [x22, #0x8]\n"
      "fcvtl v19.4s, v16.4h\n"
      "fadd v5.4s, v5.4s, v22.4s\n"
      "ldr d16, [x21, #0x8]\n"
      "fcvtl v18.4s, v18.4h\n"
      "fadd v4.4s, v4.4s, v21.4s\n"
      "fadd v3.4s, v3.4s, v20.4s\n"
      "fcvtl v17.4s, v17.4h\n"
      "fadd v2.4s, v2.4s, v19.4s\n"
      "fcvtl v16.4s, v16.4h\n"
      "fadd v1.4s, v1.4s, v18.4s\n"
      "fmin v6.4s, v6.4s, v13.4s\n"
      "fmin v5.4s, v5.4s, v13.4s\n"
      "fadd v0.4s, v0.4s, v17.4s\n"
      "fmin v4.4s, v4.4s, v13.4s\n"
      "fadd v31.4s, v31.4s, v16.4s\n"
      "fmin v3.4s, v3.4s, v13.4s\n"
      "fmin v2.4s, v2.4s, v13.4s\n"
      "fmin v1.4s, v1.4s, v13.4s\n"
      "fmin v0.4s, v0.4s, v13.4s\n"
      "fmax v6.4s, v6.4s, v12.4s\n"
      "fmin v31.4s, v31.4s, v13.4s\n"
      "fmax v5.4s, v5.4s, v12.4s\n"
      "fmax v4.4s, v4.4s, v12.4s\n"
      "fmax v3.4s, v3.4s, v12.4s\n"
      "fmax v2.4s, v2.4s, v12.4s\n"
      "fmax v1.4s, v1.4s, v12.4s\n"
      "fmax v0.4s, v0.4s, v12.4s\n"
      "fmax v31.4s, v31.4s, v12.4s\n"
      "fcvtn v21.4h, v6.4s\n"
      "fcvtn v20.4h, v5.4s\n"
      "fcvtn v17.4h, v4.4s\n"
      "fcvtn v16.4h, v3.4s\n"
      "fcvtn v18.4h, v2.4s\n"
      "fcvtn v19.4h, v1.4s\n"
      "str d21, [x9, #0x8]\n"
      "str d20, [x27, #0x8]\n"
      "fcvtn v21.4h, v0.4s\n"
      "fcvtn v20.4h, v31.4s\n"
      "str d17, [x26, #0x8]\n"
      "ldr d17, [x9, #0x10]\n"
      "str d16, [x25, #0x8]\n"
      "ldr d16, [x27, #0x10]\n"
      "str d18, [x24, #0x8]\n"
      "ldr d18, [x26, #0x10]\n"
      "str d19, [x23, #0x8]\n"
      "fcvtl v19.4s, v17.4h\n"
      "ldr d17, [x25, #0x10]\n"
      "str d21, [x22, #0x8]\n"
      "fcvtl v22.4s, v16.4h\n"
      "ldr d16, [x24, #0x10]\n"
      "str d20, [x21, #0x8]\n"
      "fcvtl v21.4s, v18.4h\n"
      "ldr d18, [x23, #0x10]\n"
      "fcvtl v20.4s, v17.4h\n"
      "fadd v30.4s, v30.4s, v19.4s\n"
      "ldr d17, [x22, #0x10]\n"
      "fcvtl v19.4s, v16.4h\n"
      "fadd v29.4s, v29.4s, v22.4s\n"
      "ldr d16, [x21, #0x10]\n"
      "fcvtl v18.4s, v18.4h\n"
      "fadd v28.4s, v28.4s, v21.4s\n"
      "fadd v27.4s, v27.4s, v20.4s\n"
      "fcvtl v17.4s, v17.4h\n"
      "fadd v26.4s, v26.4s, v19.4s\n"
      "fcvtl v16.4s, v16.4h\n"
      "fadd v25.4s, v25.4s, v18.4s\n"
      "fmin v30.4s, v30.4s, v13.4s\n"
      "fmin v29.4s, v29.4s, v13.4s\n"
      "fadd v24.4s, v24.4s, v17.4s\n"
      "fmin v28.4s, v28.4s, v13.4s\n"
      "fadd v23.4s, v23.4s, v16.4s\n"
      "fmin v27.4s, v27.4s, v13.4s\n"
      "fmin v26.4s, v26.4s, v13.4s\n"
      "fmin v25.4s, v25.4s, v13.4s\n"
      "fmin v24.4s, v24.4s, v13.4s\n"
      "fmax v30.4s, v30.4s, v12.4s\n"
      "fmin v23.4s, v23.4s, v13.4s\n"
      "fmax v29.4s, v29.4s, v12.4s\n"
      "fmax v28.4s, v28.4s, v12.4s\n"
      "fmax v27.4s, v27.4s, v12.4s\n"
      "fmax v26.4s, v26.4s, v12.4s\n"
      "fmax v25.4s, v25.4s, v12.4s\n"
      "fmax v24.4s, v24.4s, v12.4s\n"
      "fmax v23.4s, v23.4s, v12.4s\n"
      "fcvtn v17.4h, v30.4s\n"
      "fcvtn v16.4h, v29.4s\n"
      "fcvtn v21.4h, v28.4s\n"
      "fcvtn v20.4h, v27.4s\n"
      "fcvtn v19.4h, v26.4s\n"
      "fcvtn v18.4h, v25.4s\n"
      "str d17, [x9, #0x10]\n"
      "add x9, x9, #0x18\n"
      "str d16, [x27, #0x10]\n"
      "fcvtn v17.4h, v24.4s\n"
      "fcvtn v16.4h, v23.4s\n"
      "add x27, x27, #0x18\n"
      "str d21, [x26, #0x10]\n"
      "add x26, x26, #0x18\n"
      "str d20, [x25, #0x10]\n"
      "add x25, x25, #0x18\n"
      "str d19, [x24, #0x10]\n"
      "add x24, x24, #0x18\n"
      "str d18, [x23, #0x10]\n"
      "add x23, x23, #0x18\n"
      "str d17, [x22, #0x10]\n"
      "add x22, x22, #0x18\n"
      "str d16, [x21, #0x10]\n"
      "add x21, x21, #0x18\n"
      "bge 104b\n"
      "105:"  // Accumulate: Height 8: no full blocks
      "cbz x10, 107f\n"
      "mov x20, %x[in_ptr]\n"
      "106:"  // Accumulate: Height 8: Single loop
      "ldr h23, [x9, #0]\n"
      "ldr h22, [x27, #0]\n"
      "subs x10, x10, #0x1\n"
      "ldr h21, [x26, #0]\n"
      "ldr h20, [x25, #0]\n"
      "ldr h19, [x24, #0]\n"
      "ldr h18, [x23, #0]\n"
      "ldr h17, [x22, #0]\n"
      "ldr h16, [x21, #0]\n"
      "fcvtl v31.4s, v23.4h\n"
      "fcvtl v30.4s, v22.4h\n"
      "ldr s29, [%x[in_ptr], #0]\n"
      "ldr s28, [%x[in_ptr], #0x30]\n"
      "fcvtl v27.4s, v21.4h\n"
      "fcvtl v26.4s, v20.4h\n"
      "ldr s25, [%x[in_ptr], #0x60]\n"
      "ldr s24, [%x[in_ptr], #0x90]\n"
      "fcvtl v21.4s, v19.4h\n"
      "fcvtl v20.4s, v18.4h\n"
      "ldr s19, [%x[in_ptr], #0xc0]\n"
      "ldr s18, [%x[in_ptr], #0xf0]\n"
      "fcvtl v17.4s, v17.4h\n"
      "fcvtl v16.4s, v16.4h\n"
      "ldr s23, [%x[in_ptr], #0x120]\n"
      "ldr s22, [%x[in_ptr], #0x150]\n"
      "fadd v29.4s, v29.4s, v31.4s\n"
      "fadd v28.4s, v28.4s, v30.4s\n"
      "fadd v25.4s, v25.4s, v27.4s\n"
      "fadd v24.4s, v24.4s, v26.4s\n"
      "add %x[in_ptr], %x[in_ptr], #0x4\n"
      "fadd v19.4s, v19.4s, v21.4s\n"
      "fadd v18.4s, v18.4s, v20.4s\n"
      "fadd v23.4s, v23.4s, v17.4s\n"
      "fadd v22.4s, v22.4s, v16.4s\n"
      "fmin v29.4s, v29.4s, v13.4s\n"
      "fmin v28.4s, v28.4s, v13.4s\n"
      "fmin v25.4s, v25.4s, v13.4s\n"
      "fmin v24.4s, v24.4s, v13.4s\n"
      "fmin v19.4s, v19.4s, v13.4s\n"
      "fmin v18.4s, v18.4s, v13.4s\n"
      "fmin v23.4s, v23.4s, v13.4s\n"
      "fmin v22.4s, v22.4s, v13.4s\n"
      "fmax v29.4s, v29.4s, v12.4s\n"
      "fmax v28.4s, v28.4s, v12.4s\n"
      "fmax v25.4s, v25.4s, v12.4s\n"
      "fmax v24.4s, v24.4s, v12.4s\n"
      "fmax v19.4s, v19.4s, v12.4s\n"
      "fmax v18.4s, v18.4s, v12.4s\n"
      "fmax v23.4s, v23.4s, v12.4s\n"
      "fmax v22.4s, v22.4s, v12.4s\n"
      "fcvtn v17.4h, v29.4s\n"
      "fcvtn v16.4h, v28.4s\n"
      "fcvtn v21.4h, v25.4s\n"
      "fcvtn v20.4h, v24.4s\n"
      "fcvtn v19.4h, v19.4s\n"
      "fcvtn v18.4h, v18.4s\n"
      "str h17, [x9, #0]\n"
      "add x9, x9, #0x2\n"
      "str h16, [x27, #0]\n"
      "fcvtn v17.4h, v23.4s\n"
      "fcvtn v16.4h, v22.4s\n"
      "add x27, x27, #0x2\n"
      "str h21, [x26, #0]\n"
      "add x26, x26, #0x2\n"
      "str h20, [x25, #0]\n"
      "add x25, x25, #0x2\n"
      "str h19, [x24, #0]\n"
      "add x24, x24, #0x2\n"
      "str h18, [x23, #0]\n"
      "add x23, x23, #0x2\n"
      "str h17, [x22, #0]\n"
      "add x22, x22, #0x2\n"
      "str h16, [x21, #0]\n"
      "add x21, x21, #0x2\n"
      "bne 106b\n"
      "add %x[in_ptr], x20, #0x180\n"
      "107:"  // Accumulate: Height 8: no oddments
      "subs %x[rows], %x[rows], #0x8\n"
      "add %x[out_ptr], %x[out_ptr], x11\n"
      "bgt 67b\n"
      "108:"  // Exit
      : [in_ptr] "+&r" (in_ptr), [out_ptr] "+&r" (out_ptr), [rows] "+&r" (rows)
      : [accumulate] "r" (accumulate), [bias] "r" (bias), [cols] "r" (cols), [ldout] "r" (ldout), [maxval] "r" (maxval), [minval] "r" (minval)
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
    );
}

#endif // (defined(ENABLE_FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)) && defined(__aarch64__)

