// Copyright (c) 2022 Arm Limited
// Copyright (c) 2022 Hanno Becker
// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer
// Copyright (c) 2024 The mlkem-native project authors
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT

// ----------------------------------------------------------------------------
// Inverse number-theoretic transform from ML-KEM
// Input a[256], z_01234[80], z_56[384] (all signed 16-bit words); output a[256] (signed 16-bit words).
//
// The transform is in-place with input and output a[256], with the input in
// bitreversed order and the output mapped into the Montgomery domain via
// x |-> (2^16 * x) mod 3329. The two other parameters are expected to point to
// tables of constants whose definitions can be found in the mlkem-native
// repo (mlkem/native/aarch64/src/aarch64_zetas.c) or our "tests/test.c".
//
// extern void mlkem_intt(int16_t a[static 256],const int16_t z_01234[static 80],
//                        const int16_t z_56[static 384]);
//
// Standard ARM ABI: X0 = a, X1 = z_01234, X2 = z_56
// ----------------------------------------------------------------------------
#include "_internal_s2n_bignum_arm.h"

        S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_intt)
        S2N_BN_FUNCTION_TYPE_DIRECTIVE(mlkem_intt)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_intt)
        .text
        .balign 4

S2N_BN_SYMBOL(mlkem_intt):
        CFI_START

// This implementation is generated by SLOTHY, set up to optimize for
// the Neoverse N1 microarchitecture, starting from the clean version
//
//   https://github.com/pq-code-package/mlkem-native/blob/main/mlkem/native/aarch64/intt_clean.S
//
// in the mlkem-native repository.
//
// Register usage in this function:
//
//   x0       = a (input/output array); advanced by 64 bytes in total during
//              the final pass, so it is not preserved
//   x1       = z_01234 read pointer (post-incremented as the table is consumed)
//   x2       = z_56 read pointer (post-incremented as the table is consumed)
//   x3       = working pointer into a[] for the first two passes
//   x4       = loop counter
//   w5       = scratch for materializing constants
//   v7.h[0]  = 3329 (0xd01), the modulus
//   v7.h[1]  = 20159 (0x4ebf) = round(2^26 / 3329), used for Barrett-style
//              reduction of sums
//   other v  = data, twiddles and temporaries; v8-v15 are written, which is
//              why d8-d15 are saved and restored around the body
//
// Two arithmetic patterns recur throughout:
//
//   Multiply x by a constant c modulo 3329 ("mul / sqrdmulh / mls" triple):
//       t  = sqrdmulh(x, c')         c' is a precomputed companion of c
//       r  = mul(x, c)               low 16 bits of the product
//       r -= t * 3329                (mls by v7.h[0])
//   The result is congruent to x * c modulo 3329. The companion c' must be
//   approximately c * 2^15 / 3329 for the result to stay small; for the one
//   constant pair visible here, (512, 5040), 5040 = round(512 * 2^15 / 3329).
//
//   Reduce x modulo 3329 ("sqdmulh / srshr #11 / mls" triple):
//       t  = srshr(sqdmulh(x, 20159), 11)   approximately x / 3329
//       x -= t * 3329
//
// All three loops after the scaling loop are software-pipelined: the start
// of the first iteration is issued before the loop label, the counter is
// decremented once up front, and the tail of the last iteration follows the
// loop. Instruction order inside each loop therefore mixes the end of one
// iteration with the start of the next.

// Reserve 64 bytes of stack and save d8-d15 there (macros come from
// _internal_s2n_bignum_arm.h; described here per their names and arguments).
        CFI_DEC_SP(64)
        CFI_STACKSAVE2(d8,d9,0)
        CFI_STACKSAVE2(d10,d11,16)
        CFI_STACKSAVE2(d12,d13,32)
        CFI_STACKSAVE2(d14,d15,48)
// Set up constants: v7.h[0] = 3329, v7.h[1] = 20159, v29 = 512 in every
// lane, v30 = 5040 in every lane.
// Note 512 = 2^16 / 128; presumably this folds the Montgomery factor 2^16
// mentioned in the header together with the 1/128 normalization of the
// inverse transform - confirm against the mlkem-native clean source.
        mov     w5, #0xd01
        mov     v7.h[0], w5
        mov     w5, #0x4ebf
        mov     v7.h[1], w5
        mov     w5, #0x200
        dup     v29.8h, w5
        mov     w5, #0x13b0
        dup     v30.8h, w5
        mov     x3, x0
        mov     x4, #0x8

// Scaling pass: 8 iterations x 64 bytes = all 256 coefficients.
// Each of the four vectors loaded per iteration is multiplied by 512
// modulo 3329 using the (v29, v30) constant pair, then stored back in place.
Lmlkem_intt_scale_start:
        ldr     q8, [x3]
        ldr     q9, [x3, #16]
        ldr     q10, [x3, #32]
        ldr     q11, [x3, #48]
        sqrdmulh        v27.8h, v8.8h, v30.8h
        mul     v8.8h, v8.8h, v29.8h
        mls     v8.8h, v27.8h, v7.h[0]
        sqrdmulh        v27.8h, v9.8h, v30.8h
        mul     v9.8h, v9.8h, v29.8h
        mls     v9.8h, v27.8h, v7.h[0]
        sqrdmulh        v27.8h, v10.8h, v30.8h
        mul     v10.8h, v10.8h, v29.8h
        mls     v10.8h, v27.8h, v7.h[0]
        sqrdmulh        v27.8h, v11.8h, v30.8h
        mul     v11.8h, v11.8h, v29.8h
        mls     v11.8h, v27.8h, v7.h[0]
// The first store post-increments x3 by 64, so the remaining three stores
// use negative offsets to hit the same 64-byte block.
        str     q8, [x3], #64
        stur    q9, [x3, #-48]
        stur    q10, [x3, #-32]
        stur    q11, [x3, #-16]
// NOTE(review): subs also updates the condition flags, but the branch below
// tests x4 directly and nothing visible here reads the flags; the later
// loops use plain sub for the same purpose.
        subs    x4, x4, #0x1
        cbnz    x4, Lmlkem_intt_scale_start

// Second pass (the label names it "layer3456"): 8 blocks of 64 bytes, each
// handled independently. Per block:
//   - load 4 vectors from a[] and 6 vectors (96 bytes) from the x2 table,
//     plus one vector (16 bytes) from the x1 table into v4;
//   - transpose the 4 data vectors (trn1/trn2 on .4s then .2d);
//   - two rounds of sum/difference butterflies, with each difference
//     multiplied by a full-vector twiddle pair from the x2 table;
//   - transpose again (which restores the original element arrangement);
//   - two more butterfly rounds whose twiddles are taken by lane from v4
//     (pairs h[4]/h[5], h[2]/h[3], then h[0]/h[1]), with Barrett-style
//     reduction applied to the sums;
//   - store the 4 result vectors back to the same block.
// Within each twiddle pair the first element feeds mul and the second feeds
// sqrdmulh; presumably the tables store each twiddle next to its
// precomputed companion - see the table definition cited in the header.
//
// Preamble: first block's loads and everything up to the start of its last
// two butterfly rounds.
        mov     x3, x0
        mov     x4, #0x8
        ldr     q1, [x3, #32]
        ldr     q18, [x3, #48]
        ldr     q15, [x3]
        ldr     q21, [x3, #16]
// x2 is advanced by 96 on the first table load; the other five vectors of
// this block's table entry are fetched with negative offsets.
        ldr     q3, [x2], #96
        ldur    q16, [x2, #-48]
        ldr     q4, [x1], #16
        ldur    q30, [x2, #-32]
        trn1    v11.4s, v1.4s, v18.4s
        trn2    v18.4s, v1.4s, v18.4s
        trn1    v20.4s, v15.4s, v21.4s
        trn2    v1.4s, v15.4s, v21.4s
        ldur    q0, [x2, #-16]
        ldur    q22, [x2, #-80]
        trn1    v8.2d, v20.2d, v11.2d
        trn1    v6.2d, v1.2d, v18.2d
        trn2    v1.2d, v1.2d, v18.2d
        trn2    v21.2d, v20.2d, v11.2d
        sub     v11.8h, v8.8h, v6.8h
        add     v20.8h, v8.8h, v6.8h
        add     v14.8h, v21.8h, v1.8h
        sub     v15.8h, v21.8h, v1.8h
        sqrdmulh        v16.8h, v11.8h, v16.8h
        ldur    q6, [x2, #-64]
        sub     v18.8h, v20.8h, v14.8h
        add     v21.8h, v20.8h, v14.8h
        sqrdmulh        v0.8h, v15.8h, v0.8h
        mul     v11.8h, v11.8h, v6.8h
        mul     v1.8h, v15.8h, v30.8h
        mls     v11.8h, v16.8h, v7.h[0]
        mls     v1.8h, v0.8h, v7.h[0]
        sqrdmulh        v0.8h, v18.8h, v22.8h
        mul     v16.8h, v18.8h, v3.8h
        sub     v18.8h, v11.8h, v1.8h
        add     v13.8h, v11.8h, v1.8h
        sqrdmulh        v11.8h, v18.8h, v22.8h
        trn1    v20.4s, v21.4s, v13.4s
        trn2    v1.4s, v21.4s, v13.4s
        mls     v16.8h, v0.8h, v7.h[0]
        mul     v3.8h, v18.8h, v3.8h
        mls     v3.8h, v11.8h, v7.h[0]
        trn2    v11.4s, v16.4s, v3.4s
        trn1    v16.4s, v16.4s, v3.4s
        trn2    v21.2d, v1.2d, v11.2d
        trn2    v0.2d, v20.2d, v16.2d
        trn1    v1.2d, v1.2d, v11.2d
        trn1    v11.2d, v20.2d, v16.2d
        sub     v13.8h, v0.8h, v21.8h
        add     v29.8h, v0.8h, v21.8h
        add     v9.8h, v11.8h, v1.8h
        sub     v23.8h, v11.8h, v1.8h
        sqdmulh v1.8h, v29.8h, v7.h[1]
        sqdmulh v27.8h, v9.8h, v7.h[1]
        sqrdmulh        v16.8h, v13.8h, v4.h[5]
        srshr   v14.8h, v1.8h, #11
// One block is already in flight, so the loop below runs 7 times.
        sub     x4, x4, #0x1

// Steady state: finishes the block at x3 (values carried in v9, v13, v14,
// v16, v23, v27, v29 and twiddles in v4) while loading and starting the
// next block at x3 + 64.
Lmlkem_intt_layer3456_start:
        mls     v29.8h, v14.8h, v7.h[0]
// Next block's data (x3 still points at the current block here).
        ldr     q3, [x3, #96]
        ldr     q30, [x3, #112]
        ldr     q20, [x2, #32]
        mul     v6.8h, v23.8h, v4.h[2]
        ldr     q24, [x2, #16]
        ldr     q0, [x3, #64]
        ldr     q14, [x3, #80]
        srshr   v25.8h, v27.8h, #11
        mul     v15.8h, v13.8h, v4.h[4]
        trn1    v18.4s, v3.4s, v30.4s
        ldr     q28, [x2], #96
        trn2    v5.4s, v3.4s, v30.4s
        sqrdmulh        v19.8h, v23.8h, v4.h[3]
        trn2    v26.4s, v0.4s, v14.4s
        trn1    v11.4s, v0.4s, v14.4s
        mls     v9.8h, v25.8h, v7.h[0]
        trn2    v0.2d, v26.2d, v5.2d
        ldur    q17, [x2, #-16]
        mls     v15.8h, v16.8h, v7.h[0]
        trn2    v23.2d, v11.2d, v18.2d
        trn1    v30.2d, v26.2d, v5.2d
        ldur    q1, [x2, #-32]
        mls     v6.8h, v19.8h, v7.h[0]
        sub     v14.8h, v23.8h, v0.8h
        trn1    v19.2d, v11.2d, v18.2d
        ldur    q10, [x2, #-48]
        add     v31.8h, v23.8h, v0.8h
        sqrdmulh        v27.8h, v14.8h, v17.8h
        add     v18.8h, v19.8h, v30.8h
        mul     v13.8h, v14.8h, v1.8h
        sub     v22.8h, v19.8h, v30.8h
        sub     v26.8h, v18.8h, v31.8h
        sqrdmulh        v10.8h, v22.8h, v10.8h
        sub     v25.8h, v9.8h, v29.8h
        add     v9.8h, v9.8h, v29.8h
        mls     v13.8h, v27.8h, v7.h[0]
        add     v2.8h, v18.8h, v31.8h
// First result vector of the current block; x3 moves on to the next block,
// so the remaining three results are stored at negative offsets below.
        str     q9, [x3], #64
        sub     v23.8h, v6.8h, v15.8h
        add     v21.8h, v6.8h, v15.8h
        mul     v3.8h, v22.8h, v20.8h
        mls     v3.8h, v10.8h, v7.h[0]
        sqrdmulh        v20.8h, v26.8h, v24.8h
        mul     v22.8h, v26.8h, v28.8h
        add     v12.8h, v3.8h, v13.8h
        sub     v15.8h, v3.8h, v13.8h
        sqrdmulh        v13.8h, v23.8h, v4.h[1]
        sqrdmulh        v0.8h, v15.8h, v24.8h
        mul     v27.8h, v15.8h, v28.8h
        mls     v22.8h, v20.8h, v7.h[0]
        mls     v27.8h, v0.8h, v7.h[0]
        sqdmulh v19.8h, v21.8h, v7.h[1]
        trn1    v26.4s, v2.4s, v12.4s
        mul     v10.8h, v23.8h, v4.h[0]
        trn2    v2.4s, v2.4s, v12.4s
        trn2    v12.4s, v22.4s, v27.4s
        trn1    v8.4s, v22.4s, v27.4s
        mul     v31.8h, v25.8h, v4.h[0]
        trn2    v3.2d, v2.2d, v12.2d
        sqrdmulh        v11.8h, v25.8h, v4.h[1]
        trn2    v0.2d, v26.2d, v8.2d
        srshr   v16.8h, v19.8h, #11
// All uses of the current block's v4 lanes are above this point; v4 now
// receives the by-lane twiddles for the next block.
        ldr     q4, [x1], #16
        mls     v10.8h, v13.8h, v7.h[0]
        add     v29.8h, v0.8h, v3.8h
        trn1    v18.2d, v26.2d, v8.2d
        trn1    v20.2d, v2.2d, v12.2d
        sqdmulh v15.8h, v29.8h, v7.h[1]
        sub     v13.8h, v0.8h, v3.8h
        mls     v21.8h, v16.8h, v7.h[0]
        add     v9.8h, v18.8h, v20.8h
        stur    q10, [x3, #-16]
        sub     v23.8h, v18.8h, v20.8h
        mls     v31.8h, v11.8h, v7.h[0]
        srshr   v14.8h, v15.8h, #11
        sqrdmulh        v16.8h, v13.8h, v4.h[5]
        stur    q21, [x3, #-48]
        sqdmulh v27.8h, v9.8h, v7.h[1]
        stur    q31, [x3, #-32]
        sub     x4, x4, #0x1
        cbnz    x4, Lmlkem_intt_layer3456_start

// Postamble: last two butterfly rounds and stores for the final block.
        mls     v29.8h, v14.8h, v7.h[0]
        srshr   v1.8h, v27.8h, #11
        mul     v11.8h, v13.8h, v4.h[4]
        mls     v9.8h, v1.8h, v7.h[0]
        sqrdmulh        v1.8h, v23.8h, v4.h[3]
        mul     v20.8h, v23.8h, v4.h[2]
        sub     v21.8h, v9.8h, v29.8h
        add     v0.8h, v9.8h, v29.8h
        mls     v11.8h, v16.8h, v7.h[0]
        mls     v20.8h, v1.8h, v7.h[0]
        str     q0, [x3], #64
        mul     v1.8h, v21.8h, v4.h[0]
        sqrdmulh        v16.8h, v21.8h, v4.h[1]
        add     v21.8h, v20.8h, v11.8h
        sub     v11.8h, v20.8h, v11.8h
        sqdmulh v20.8h, v21.8h, v7.h[1]
        sqrdmulh        v0.8h, v11.8h, v4.h[1]
        mul     v11.8h, v11.8h, v4.h[0]
        srshr   v20.8h, v20.8h, #11
        mls     v1.8h, v16.8h, v7.h[0]
        mls     v11.8h, v0.8h, v7.h[0]
        mls     v21.8h, v20.8h, v7.h[0]
        stur    q1, [x3, #-32]
        stur    q11, [x3, #-16]
        stur    q21, [x3, #-48]
// Third pass (the label names it "layer102"): three butterfly rounds across
// the 8 vectors found at byte offsets 0, 64, 128, ..., 448 from x0.
// x0 advances by 16 bytes per step, so 4 steps cover every vector of a[].
// The remaining 32 bytes of the x1 table are loaded once into v0 and v1 and
// used by lane for the whole pass, again as (mul, sqrdmulh) pairs:
// v1.h[0..5] in the first round, v0.h[6]/h[7] also in the first round,
// v0.h[2..5] in the second round and v0.h[0]/h[1] in the third.
//
// Preamble: loads for the first step and the start of its first round.
        mov     x4, #0x4
        ldr     q0, [x1], #32
        ldur    q1, [x1, #-16]
        ldr     q6, [x0, #64]
        ldr     q16, [x0]
        ldr     q18, [x0, #192]
        ldr     q27, [x0, #128]
        ldr     q26, [x0, #320]
        ldr     q5, [x0, #256]
        ldr     q4, [x0, #448]
        ldr     q2, [x0, #384]
        add     v12.8h, v16.8h, v6.8h
        sub     v11.8h, v16.8h, v6.8h
        add     v3.8h, v27.8h, v18.8h
        sub     v21.8h, v27.8h, v18.8h
        sub     v18.8h, v5.8h, v26.8h
        mul     v10.8h, v11.8h, v0.h[6]
        add     v24.8h, v5.8h, v26.8h
        sqrdmulh        v27.8h, v18.8h, v1.h[3]
        sub     v19.8h, v12.8h, v3.8h
        add     v29.8h, v12.8h, v3.8h
        mul     v14.8h, v18.8h, v1.h[2]
        sub     v13.8h, v2.8h, v4.8h
        sqrdmulh        v31.8h, v21.8h, v1.h[1]
        sqrdmulh        v26.8h, v11.8h, v0.h[7]
        mul     v21.8h, v21.8h, v1.h[0]
// One step is already in flight, so the loop below runs 3 times.
        sub     x4, x4, #0x1

// Steady state: completes the step at x0 while loading and starting the
// step at x0 + 16.
Lmlkem_intt_layer102_start:
        mls     v14.8h, v27.8h, v7.h[0]
// Loads at offsets 16, 80, 144, ... are the next step's inputs (x0 has not
// been advanced yet at this point).
        ldr     q15, [x0, #16]
        ldr     q9, [x0, #208]
        add     v18.8h, v2.8h, v4.8h
        mul     v17.8h, v13.8h, v1.h[4]
        ldr     q20, [x0, #80]
        ldr     q2, [x0, #400]
        ldr     q5, [x0, #272]
        sub     v11.8h, v24.8h, v18.8h
        sqrdmulh        v8.8h, v13.8h, v1.h[5]
        ldr     q23, [x0, #336]
        sqrdmulh        v16.8h, v11.8h, v0.h[5]
        sub     v12.8h, v15.8h, v20.8h
        ldr     q3, [x0, #144]
        add     v28.8h, v15.8h, v20.8h
        add     v4.8h, v24.8h, v18.8h
        mul     v30.8h, v11.8h, v0.h[4]
        sub     v20.8h, v5.8h, v23.8h
        add     v24.8h, v5.8h, v23.8h
        mls     v17.8h, v8.8h, v7.h[0]
        sub     v11.8h, v29.8h, v4.8h
        mls     v30.8h, v16.8h, v7.h[0]
        sqrdmulh        v27.8h, v20.8h, v1.h[3]
        add     v16.8h, v14.8h, v17.8h
        sub     v13.8h, v14.8h, v17.8h
        sqrdmulh        v23.8h, v19.8h, v0.h[3]
        sub     v25.8h, v3.8h, v9.8h
        add     v5.8h, v3.8h, v9.8h
        mul     v6.8h, v19.8h, v0.h[2]
        mul     v8.8h, v11.8h, v0.h[0]
        mls     v10.8h, v26.8h, v7.h[0]
        sqrdmulh        v26.8h, v12.8h, v0.h[7]
        mul     v14.8h, v20.8h, v1.h[2]
        mul     v22.8h, v13.8h, v0.h[4]
        mls     v21.8h, v31.8h, v7.h[0]
        sqrdmulh        v9.8h, v11.8h, v0.h[1]
        sqrdmulh        v20.8h, v13.8h, v0.h[5]
        sub     v13.8h, v10.8h, v21.8h
        add     v15.8h, v10.8h, v21.8h
        sqrdmulh        v31.8h, v25.8h, v1.h[1]
        sqrdmulh        v21.8h, v13.8h, v0.h[3]
        sub     v18.8h, v15.8h, v16.8h
        add     v3.8h, v15.8h, v16.8h
        add     v4.8h, v29.8h, v4.8h
        mls     v22.8h, v20.8h, v7.h[0]
        sub     v19.8h, v28.8h, v5.8h
        mls     v6.8h, v23.8h, v7.h[0]
// Result for offset 0 of the current step; x0 is post-incremented by 16, so
// every later store in this iteration uses (intended offset - 16), e.g.
// #112 targets offset 128 and #432 targets offset 448 of the current step.
        str     q4, [x0], #16
        mul     v29.8h, v13.8h, v0.h[2]
        mls     v29.8h, v21.8h, v7.h[0]
        add     v11.8h, v6.8h, v30.8h
        mls     v8.8h, v9.8h, v7.h[0]
        str     q11, [x0, #112]
        sub     v11.8h, v6.8h, v30.8h
        sqrdmulh        v21.8h, v18.8h, v0.h[1]
        sqrdmulh        v4.8h, v11.8h, v0.h[1]
        str     q8, [x0, #240]
        sub     v16.8h, v29.8h, v22.8h
        str     q3, [x0, #48]
        mul     v20.8h, v11.8h, v0.h[0]
        sqrdmulh        v11.8h, v16.8h, v0.h[1]
        mls     v20.8h, v4.8h, v7.h[0]
        mul     v23.8h, v16.8h, v0.h[0]
        mls     v23.8h, v11.8h, v7.h[0]
        str     q20, [x0, #368]
        mul     v11.8h, v18.8h, v0.h[0]
        mls     v11.8h, v21.8h, v7.h[0]
        str     q23, [x0, #432]
// x0 now points at the next step, so this is that step's offset-448 input
// (the other seven inputs were loaded before x0 was advanced).
        ldr     q4, [x0, #448]
        mul     v10.8h, v12.8h, v0.h[6]
        add     v12.8h, v29.8h, v22.8h
        add     v29.8h, v28.8h, v5.8h
        mul     v21.8h, v25.8h, v1.h[0]
        str     q12, [x0, #176]
        str     q11, [x0, #304]
        sub     v13.8h, v2.8h, v4.8h
        sub     x4, x4, #0x1
        cbnz    x4, Lmlkem_intt_layer102_start

// Postamble: remaining butterfly rounds and the eight stores for the last
// step.
        mls     v21.8h, v31.8h, v7.h[0]
        add     v22.8h, v2.8h, v4.8h
        sqrdmulh        v15.8h, v13.8h, v1.h[5]
        add     v8.8h, v24.8h, v22.8h
        sub     v17.8h, v29.8h, v8.8h
        mul     v28.8h, v13.8h, v1.h[4]
        add     v29.8h, v29.8h, v8.8h
        sub     v13.8h, v24.8h, v22.8h
        sqrdmulh        v25.8h, v17.8h, v0.h[1]
// As in the loop, x0 is advanced here and the later stores are offset by -16.
        str     q29, [x0], #16
        mls     v28.8h, v15.8h, v7.h[0]
        mls     v10.8h, v26.8h, v7.h[0]
        mul     v29.8h, v17.8h, v0.h[0]
        mls     v29.8h, v25.8h, v7.h[0]
        mls     v14.8h, v27.8h, v7.h[0]
        sqrdmulh        v20.8h, v13.8h, v0.h[5]
        str     q29, [x0, #240]
        mul     v4.8h, v13.8h, v0.h[4]
        add     v12.8h, v10.8h, v21.8h
        add     v22.8h, v14.8h, v28.8h
        sub     v8.8h, v10.8h, v21.8h
        sqrdmulh        v11.8h, v19.8h, v0.h[3]
        add     v6.8h, v12.8h, v22.8h
        sub     v3.8h, v14.8h, v28.8h
        mls     v4.8h, v20.8h, v7.h[0]
        str     q6, [x0, #48]
        sub     v16.8h, v12.8h, v22.8h
        mul     v12.8h, v19.8h, v0.h[2]
        mul     v14.8h, v3.8h, v0.h[4]
        sqrdmulh        v22.8h, v3.8h, v0.h[5]
        mls     v12.8h, v11.8h, v7.h[0]
        mul     v20.8h, v8.8h, v0.h[2]
        mls     v14.8h, v22.8h, v7.h[0]
        add     v5.8h, v12.8h, v4.8h
        sub     v21.8h, v12.8h, v4.8h
        sqrdmulh        v4.8h, v8.8h, v0.h[3]
        str     q5, [x0, #112]
        sqrdmulh        v9.8h, v21.8h, v0.h[1]
        mul     v19.8h, v21.8h, v0.h[0]
        mls     v20.8h, v4.8h, v7.h[0]
        mls     v19.8h, v9.8h, v7.h[0]
        sqrdmulh        v9.8h, v16.8h, v0.h[1]
        sub     v5.8h, v20.8h, v14.8h
        add     v4.8h, v20.8h, v14.8h
        mul     v20.8h, v16.8h, v0.h[0]
        str     q4, [x0, #176]
        sqrdmulh        v18.8h, v5.8h, v0.h[1]
        str     q19, [x0, #368]
        mul     v23.8h, v5.8h, v0.h[0]
        mls     v20.8h, v9.8h, v7.h[0]
        mls     v23.8h, v18.8h, v7.h[0]
        str     q20, [x0, #304]
        str     q23, [x0, #432]
// Epilogue: restore d8-d15 from the same slots used in the prologue,
// release the 64-byte frame and return.
        CFI_STACKLOAD2(d8,d9,0)
        CFI_STACKLOAD2(d10,d11,16)
        CFI_STACKLOAD2(d12,d13,32)
        CFI_STACKLOAD2(d14,d15,48)
        CFI_INC_SP(64)
        CFI_RET

S2N_BN_SIZE_DIRECTIVE(mlkem_intt)

// On Linux/ELF builds, emit an empty .note.GNU-stack section: the GNU
// toolchain convention for declaring that this object does not require an
// executable stack.
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits
#endif
