//
// Syd: rock-solid application kernel
// src/workers/int.rs: `syd_int' interrupter thread
//
// Copyright (c) 2024, 2025 Ali Polatel <alip@chesswob.org>
//
// SPDX-License-Identifier: GPL-3.0

use std::{
    env,
    os::fd::{FromRawFd, OwnedFd, RawFd},
    sync::{
        atomic::{AtomicBool, Ordering},
        Arc,
    },
    thread,
};

use libseccomp::{scmp_cmp, ScmpAction, ScmpFilterContext, ScmpSyscall};
use nix::{
    errno::Errno,
    sched::{unshare, CloneFlags},
    unistd::{getpid, lseek64, write, Gid, Pid, Uid, Whence},
};
use serde::{ser::SerializeMap, Serialize, Serializer};

use crate::{
    alert,
    cache::SysInterrupt,
    config::*,
    confine::{confine_scmp_madvise, confine_scmp_write, scmp_add_setid_rules, ExportMode},
    cookie::{CookieIdx, SYSCOOKIE_POOL},
    err::{err2no, scmp2no, SydJoinHandle, SydResult},
    error,
    fd::closeexcept,
    fs::{seccomp_notify_id_valid, tgkill},
    info,
    proc::{proc_status_open, proc_status_read},
    retry::retry_on_eintr,
    sandbox::Flags,
    sigset::SydSigSet,
    workers::WorkerCache,
};

// Reason why the interrupter wakes up an emulator thread.
// The value is only attached to log entries (see the Serialize impl).
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum InterruptSource {
    // The entry was flagged for interruption (`interrupt.signal` was set).
    Manual,
    // The sandbox thread has pending, unblocked signals; carries that set.
    Signal(SydSigSet),
    // Opening, seeking or reading proc_pid_status(5) failed while the
    // seccomp request was still valid; carries the error.
    ProcessInvalid(Errno),
}

// Serializes the interrupt source as a map:
//
// - `Manual`:         {"name": "manual"}
// - `Signal`:         {"name": "signal", "set": <sigset>}
// - `ProcessInvalid`: {"name": "process", "err": <errno as i32>}
impl Serialize for InterruptSource {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        // The length hint must equal the number of entries actually
        // emitted below: serializers for length-prefixed formats rely
        // on it. `Manual` only carries its name.
        let len = match self {
            Self::Manual => 1,
            Self::Signal(_) | Self::ProcessInvalid(_) => 2,
        };

        let mut map = serializer.serialize_map(Some(len))?;
        match self {
            Self::Manual => {
                map.serialize_entry("name", "manual")?;
            }
            Self::Signal(set) => {
                map.serialize_entry("name", "signal")?;
                map.serialize_entry("set", set)?;
            }
            Self::ProcessInvalid(errno) => {
                // Log the raw errno number rather than its name.
                let err = *errno as i32;
                map.serialize_entry("name", "process")?;
                map.serialize_entry("err", &err)?;
            }
        }
        map.end()
    }
}

/// State of the `syd_int' interrupter thread.
#[derive(Clone)]
pub(crate) struct Interrupter {
    // Seccomp-notify fd: kept open in the thread and used to validate
    // seccomp request IDs.
    seccomp_fd: RawFd,
    // Sandbox flags consulted when building the thread's seccomp filter.
    flags: Flags,

    // UID and GID transition pairs handed to the setid seccomp rules
    // when safe setuid/setgid is enabled.
    transit_uids: Vec<(Uid, Uid)>,
    transit_gids: Vec<(Gid, Gid)>,

    // Checked at the end of every main loop cycle; the loop ends once set.
    should_exit: Arc<AtomicBool>,
    // Shared worker cache; holds the map of blocking system calls and
    // the per-process restarting signal sets.
    cache: Arc<WorkerCache>,
}

impl Interrupter {
    /// Create a new interrupter.
    ///
    /// The UID/GID transition slices are copied into owned vectors;
    /// all other arguments are stored as given.
    pub(crate) fn new(
        seccomp_fd: RawFd,
        flags: Flags,
        transit_uids: &[(Uid, Uid)],
        transit_gids: &[(Gid, Gid)],
        should_exit: Arc<AtomicBool>,
        cache: Arc<WorkerCache>,
    ) -> Self {
        // Take owned copies of the transition tables.
        let transit_uids = Vec::from(transit_uids);
        let transit_gids = Vec::from(transit_gids);

        Self {
            seccomp_fd,
            flags,
            transit_uids,
            transit_gids,
            should_exit,
            cache,
        }
    }

    #[expect(clippy::cognitive_complexity)]
    pub(crate) fn try_spawn(self, notif_pipe: (RawFd, RawFd)) -> Result<SydJoinHandle<()>, Errno> {
        thread::Builder::new()
            .name("syd_int".to_string())
            .stack_size(INT_STACK_SIZE)
            .spawn(move || {
                // SAFETY: We use exit_group(2) here to bail,
                // because this unsharing is a critical safety feature.
                if let Err(errno) = unshare(CloneFlags::CLONE_FS | CloneFlags::CLONE_FILES) {
                    alert!("ctx": "boot", "op": "unshare_interrupt_thread",
                        "msg": format!("failed to unshare(CLONE_FS|CLONE_FILES): {errno}"),
                        "err": errno as i32);
                    std::process::exit(101);
                }

                // SAFETY: notif_pipe points to valid FDs.
                let (pipe_rd, pipe_wr) = unsafe {
                    (
                        OwnedFd::from_raw_fd(notif_pipe.0),
                        OwnedFd::from_raw_fd(notif_pipe.1),
                    )
                };
                drop(pipe_rd);
                let buf = [42u8; 1];
                #[expect(clippy::disallowed_methods)]
                match retry_on_eintr(|| write(&pipe_wr, &buf)).unwrap() {
                    0 => return Err(Errno::EIO.into()), // Syd died before reading.
                    1 => {}
                    n => unreachable!("BUG: invalid pipe write of size {n}!"),
                }

                // Close the notification pipe.
                drop(pipe_wr);

                // SAFETY: The Interrupt thread needs to inherit the following FDs:
                // 1. Seccomp-notify FD.
                // 2. Static FD of procfs(5).
                // 3. Log FD.
                // We have to sort the set as the FDs are randomized.
                #[expect(clippy::cast_sign_loss)]
                let mut set = vec![
                    self.seccomp_fd as libc::c_uint,
                    PROC_FD() as libc::c_uint,
                    crate::log::LOG_FD.load(Ordering::Relaxed) as libc::c_uint,
                ];
                set.sort_unstable();
                if let Err(errno) = closeexcept(&set) {
                    alert!("ctx": "boot", "op": "close_range_interrupt_thread",
                        "msg": format!("failed to close range: {errno}"),
                        "err": errno as i32);
                    std::process::exit(101);
                }
                drop(set);

                // To be used by tgkill when signaling threads.
                let tgid = getpid();

                // Honour dry-run when exporting.
                let dry_run =
                    env::var_os(ENV_SKIP_SCMP).is_some() || ExportMode::from_env().is_some();

                // Confine `syd_int' thread.
                if !dry_run {
                    // SAFETY: We use exit_group(2) here to bail,
                    // because this confinement is a critical safety feature.
                    let ctx = match Self::prepare_confine(
                        self.seccomp_fd,
                        tgid,
                        self.flags,
                        &self.transit_uids,
                        &self.transit_gids,
                        false,
                    ) {
                        Ok(ctx) => ctx,
                        Err(error) => {
                            let errno = error.errno().unwrap_or(Errno::ENOSYS);
                            alert!("ctx": "boot", "op": "confine_int_thread",
                                "msg": format!("failed to confine: {error}"),
                                "err": errno as i32);
                            std::process::exit(101);
                        }
                    };

                    // Load seccomp(2) BPF into the kernel.
                    // SAFETY: We use exit_group(2) here to bail,
                    // because this confinement is a critical safety feature.
                    if let Err(error) = ctx.load() {
                        let errno = scmp2no(&error).unwrap_or(Errno::ENOSYS);
                        alert!("ctx": "boot", "op": "confine_int_thread",
                            "msg": format!("failed to confine: {error}"),
                            "err": errno as i32);
                        std::process::exit(101);
                    }

                    let safe_setid = self
                        .flags
                        .intersects(Flags::FL_ALLOW_SAFE_SETUID | Flags::FL_ALLOW_SAFE_SETGID);
                    info!("ctx": "confine", "op": "confine_int_thread",
                        "msg": format!("interrupt thread confined with{} SROP mitigation",
                            if safe_setid { "out" } else { "" }));
                } else {
                    error!("ctx": "confine", "op": "confine_int_thread",
                        "msg": "interrupt thread is running unconfined in debug mode");
                }

                // Enter main loop.
                self.main(tgid)
            })
            .map_err(|err| err2no(&err))
    }

    fn main(self, tgid: Pid) -> SydResult<()> {
        loop {
            // Wait for one cycle.
            std::thread::sleep(INT_CYCLE_TIME.into());

            // Unblock invalidated blocking system calls.
            {
                let (ref lock, ref cvar) = *self.cache.sysint_map.sys_block;
                let mut map = lock.lock().unwrap_or_else(|err| err.into_inner());

                // As long as the map is empty,
                // we wait for an insert notification.
                map = cvar
                    .wait_while(map, |map| map.is_empty())
                    .unwrap_or_else(|err| err.into_inner());

                // Handle interrupts as necessary.
                map.retain_mut(|interrupt| self.handle_interrupt(tgid, interrupt));

                // Keep memory usage minimal.
                map.shrink_to_fit();
            }

            // Check if it's the time to exit.
            if self.should_exit.load(Ordering::Relaxed) {
                break;
            }
        }

        Ok(())
    }

    // Handles one entry of the blocking system call map.
    //
    // The return value is the `retain_mut` predicate of the main loop:
    // false removes the entry (done, interrupted, or request no longer
    // valid), true keeps it for the next cycle.
    //
    // `tgid` is the thread group id used to signal the emulator thread,
    // `interrupt` is the entry; its `status` field is filled in here to
    // cache the open proc_pid_status(5) fd across cycles.
    fn handle_interrupt(&self, tgid: Pid, interrupt: &mut SysInterrupt) -> bool {
        // Check if syd_emu is already done with the request.
        if interrupt.delete {
            return false;
        }
        // Check if syd_mon requested manual interruption.
        if interrupt.signal {
            Self::interrupt(tgid, interrupt, InterruptSource::Manual);
            return false;
        }

        // Open proc_pid_status(5) if it's not open already,
        // otherwise rewind the cached fd to the beginning.
        //
        // SAFETY:
        // 1. We want to wake the respective syd_emu thread in case the
        //    process is no longer valid otherwise we may end up with a
        //    deadlock: See miniupnpc tests, thx kepstin!
        // 2. To prevent PID reuse vectors we validate the request ID.
        let status_fd = if let Some(fd) = interrupt.status.as_ref() {
            if let Err(errno) = lseek64(fd, 0, Whence::SeekSet) {
                if self.is_valid(interrupt.request.id) {
                    let source = InterruptSource::ProcessInvalid(errno);
                    Self::interrupt(tgid, interrupt, source);
                } // no need to interrupt for invalid seccomp-id.
                return false;
            }
            fd
        } else {
            let fd = match proc_status_open(interrupt.request.pid()) {
                Ok(fd) if self.is_valid(interrupt.request.id) => {
                    // seccomp-id validated, proc_pid_status(5) is valid.
                    fd
                }
                Err(errno) if self.is_valid(interrupt.request.id) => {
                    let source = InterruptSource::ProcessInvalid(errno);
                    Self::interrupt(tgid, interrupt, source);
                    return false;
                }
                // seccomp-id invalid, no need to interrupt.
                _ => return false,
            };
            // Cache the fd in the entry for subsequent cycles.
            interrupt.status = Some(fd);
            // The unwrap cannot fail, status was set on the line above.
            #[expect(clippy::disallowed_methods)]
            interrupt.status.as_ref().unwrap()
        };

        // Parse proc_pid_status(5).
        let status = match proc_status_read(status_fd) {
            Ok(status) => status,
            Err(errno) if self.is_valid(interrupt.request.id) => {
                let source = InterruptSource::ProcessInvalid(errno);
                Self::interrupt(tgid, interrupt, source);
                return false;
            }
            // seccomp-id invalid, no need to interrupt.
            _ => return false,
        };

        // Check pending signals for the thread.
        //
        // 1. Check for per-{thread,process} pending signals:
        //
        // Aggregate pending signals from both the thread-local and
        // process-global queues. In Linux, each thread (LWP) has its
        // own pending set for signals targeted via tgkill/pthread_kill,
        // while the process-wide pending set captures signals delivered
        // to the PID (e.g., via kill). Taking the bitwise OR yields
        // the complete set of signals awaiting delivery that could
        // interrupt this execution context.
        let mut sigset = status.sig_pending_thread | status.sig_pending_process;

        // 2. Exclude per-thread blocked signals:
        //
        // Remove those signals currently masked by this thread's signal
        // mask. Signal masks are maintained per-thread via
        // pthread_sigmask; masked signals remain pending but are not
        // delivered until unblocked. By subtracting the blocked set, we
        // isolate only the pending signals eligible for immediate
        // synchronous or asynchronous handling.
        sigset.del_set(status.sig_blocked);

        if sigset.is_empty() {
            // No interrupt signals received, keep the entry.
            return true;
        }

        // 3. Filter out restarting signals per-process, unless ignore_restart is set.
        // This may be the case e.g. when the socket has a timeout for accept and connect.
        // The restart set is looked up by `status.pid` in the shared
        // `sig_restart` map; nothing is filtered if there is no entry.
        if !interrupt.ignore_restart {
            if let Some(sigset_restart) = self
                .cache
                .sysint_map
                .sig_restart
                .lock()
                .unwrap_or_else(|err| err.into_inner())
                .get(&status.pid)
            {
                sigset.del_set(*sigset_restart);

                if sigset.is_empty() {
                    // Only restarting signals received, keep the entry.
                    return true;
                }
            }
        }

        // Interrupt syd_emu thread and remove entry.
        Self::interrupt(tgid, interrupt, InterruptSource::Signal(sigset));
        false
    }

    // Send SIGALRM to the `syd_emu` thread handling the given entry.
    //
    // Success and ESRCH are both logged as an interruption.
    // Any other error is fatal and terminates the process with code 101.
    #[expect(clippy::cognitive_complexity)]
    fn interrupt(tgid: Pid, interrupt: &SysInterrupt, source: InterruptSource) {
        let result = tgkill(tgid, interrupt.handler, libc::SIGALRM);
        match result {
            Err(errno) if errno != Errno::ESRCH => {
                alert!("ctx": "int", "op": "interrupt_emulator",
                    "msg": format!("failed to interrupt emulator: {errno}"),
                    "err": errno as i32,
                    "src": source, "int": interrupt);
                std::process::exit(101);
            }
            _ => {
                info!("ctx": "int", "op": "interrupt_emulator",
                    "msg": "interrupted emulator thread",
                    "src": source, "int": interrupt);
            }
        }
    }

    // Report whether the seccomp request `id` is still valid
    // according to our seccomp-notify fd.
    #[inline(always)]
    fn is_valid(&self, id: u64) -> bool {
        // Any error counts as invalid, e.g. ENOENT which means
        // the child died mid-way. EAGAIN|EINTR is handled.
        let verdict = seccomp_notify_id_valid(self.seccomp_fd, id);
        verdict.is_ok()
    }

    /// Build the seccomp(2) filter of the Interrupter thread.
    ///
    /// The filter is only prepared here, loading it into the kernel
    /// is left to the caller. The default action is to kill the process.
    ///
    /// - `seccomp_fd`: seccomp-notify fd, the only fd ioctl(2) is allowed on.
    /// - `tgid`: thread group id, the only target tgkill(2) is allowed for.
    /// - `flags`: sandbox flags, consulted for the speculation control
    ///   and for the safe setuid/setgid rules.
    /// - `transit_uids`, `transit_gids`: passed on to the setid rules.
    /// - `_dry_run`: not used by this function.
    ///
    /// Returns the prepared filter context, or the error of the first
    /// filter operation that failed. System calls whose name cannot be
    /// resolved are logged and skipped.
    #[expect(clippy::cognitive_complexity)]
    pub(crate) fn prepare_confine(
        seccomp_fd: RawFd,
        tgid: Pid,
        flags: Flags,
        transit_uids: &[(Uid, Uid)],
        transit_gids: &[(Gid, Gid)],
        _dry_run: bool,
    ) -> SydResult<ScmpFilterContext> {
        // Note, we cannot confine `syd_int` with a per-thread
        // Landlock filter here, because it requires access to
        // proc_pid_status(5) which in turn requires ptrace
        // rights and Landlock unconditionally limits that.

        // Create seccomp filter with default action.
        let mut ctx = ScmpFilterContext::new(ScmpAction::KillProcess)?;

        // Enforce the NO_NEW_PRIVS functionality before
        // loading the seccomp filter into the kernel.
        ctx.set_ctl_nnp(true)?;

        // Disable Speculative Store Bypass mitigations
        // with trace/allow_unsafe_exec_speculative:1
        ctx.set_ctl_ssb(flags.allow_unsafe_exec_speculative())?;

        // DO NOT synchronize filter to all threads.
        // Other threads will self-confine.
        ctx.set_ctl_tsync(false)?;

        // We kill for bad system call and bad arch.
        ctx.set_act_badarch(ScmpAction::KillProcess)?;

        // Use a binary tree sorted by syscall number if possible.
        // Failure to set the optimization level is ignored.
        let _ = ctx.set_ctl_optimize(2);

        // SAFETY: Do NOT add supported architectures to the filter.
        // This ensures Syd can never run a non-native system call,
        // which we do not need at all.
        // seccomp_add_architectures(&mut ctx)?;

        // Allow interrupt handler thread to send the
        // SIGALRM signal to threads in Syd's thread group.
        // Both the thread group id (arg0) and the signal (arg2) are pinned.
        let sysname = "tgkill";
        #[expect(clippy::cast_sign_loss)]
        match ScmpSyscall::from_name(sysname) {
            Ok(syscall) => {
                ctx.add_rule_conditional(
                    ScmpAction::Allow,
                    syscall,
                    &[
                        scmp_cmp!($arg0 == tgid.as_raw() as u64),
                        scmp_cmp!($arg2 == libc::SIGALRM as u64),
                    ],
                )?;
            }
            Err(_) => {
                info!("ctx": "confine", "op": "allow_int_syscall",
                    "msg": format!("invalid or unsupported syscall {sysname}"));
            }
        }

        // Allow interrupt handler thread to
        // validate seccomp request IDs using ioctl(2).
        // Both the fd (arg0) and the request (arg1) are pinned.
        let sysname = "ioctl";
        #[expect(clippy::cast_sign_loss)]
        #[expect(clippy::unnecessary_cast)]
        match ScmpSyscall::from_name(sysname) {
            Ok(syscall) => {
                ctx.add_rule_conditional(
                    ScmpAction::Allow,
                    syscall,
                    &[
                        scmp_cmp!($arg0 == seccomp_fd as u64),
                        scmp_cmp!($arg1 == crate::fs::SECCOMP_IOCTL_NOTIF_ID_VALID as u64),
                    ],
                )?;
            }
            Err(_) => {
                info!("ctx": "confine", "op": "allow_int_syscall",
                    "msg": format!("invalid or unsupported syscall {sysname}"));
            }
        }

        // Allow openat2 with the static procfs(5) fd only.
        // Apply system call argument cookies:
        // arg4 and arg5 must match the values from the cookie pool.
        let sysname = "openat2";
        #[expect(clippy::cast_sign_loss)]
        #[expect(clippy::useless_conversion)]
        match ScmpSyscall::from_name(sysname) {
            Ok(syscall) => {
                ctx.add_rule_conditional(
                    ScmpAction::Allow,
                    syscall,
                    &[
                        scmp_cmp!($arg0 == PROC_FD() as u64),
                        scmp_cmp!($arg4 == SYSCOOKIE_POOL.get(CookieIdx::Openat2Arg4).into()),
                        scmp_cmp!($arg5 == SYSCOOKIE_POOL.get(CookieIdx::Openat2Arg5).into()),
                    ],
                )?;
            }
            Err(_) => {
                info!("ctx": "confine", "op": "allow_int_syscall",
                    "msg": format!("invalid or unsupported syscall {sysname}"));
            }
        }

        // Deny rest of open and stat family with ENOSYS rather than KillProcess.
        // We need this because std::thread::spawn has unwanted
        // side-effects such as opening /sys/devices/system/cpu/online
        // on some architectures.
        //
        // Note, we avoid this when profiling is enabled,
        // as gperf requires it to write profiling data:
        // With the `prof` feature these system calls are allowed instead.
        for sysname in ["open", "openat", "stat", "lstat", "statx", "newfstatat"] {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    let action = if !cfg!(feature = "prof") {
                        ScmpAction::Errno(Errno::ENOSYS as i32)
                    } else {
                        ScmpAction::Allow
                    };
                    ctx.add_rule(action, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_int_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow safe fcntl(2) utility calls:
        // One rule per operation in INT_FCNTL_OPS, matched on arg1.
        for sysname in ["fcntl", "fcntl64"] {
            let syscall = match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => syscall,
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_int_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                    continue;
                }
            };

            for op in INT_FCNTL_OPS {
                ctx.add_rule_conditional(ScmpAction::Allow, syscall, &[scmp_cmp!($arg1 == *op)])?;
            }
        }

        // Allow safe prctl(2) operations:
        // One rule per operation in INT_PRCTL_OPS, matched on arg0.
        let sysname = "prctl";
        if let Ok(syscall) = ScmpSyscall::from_name(sysname) {
            for (_, op) in INT_PRCTL_OPS {
                ctx.add_rule_conditional(ScmpAction::Allow, syscall, &[scmp_cmp!($arg0 == *op)])?;
            }
        } else {
            info!("ctx": "confine", "op": "allow_int_syscall",
                "msg": format!("invalid or unsupported syscall {sysname}"));
        }

        // Prevent executable memory:
        // Memory mapping calls are only allowed when the
        // PROT_EXEC bit is unset in the protection argument (arg2).
        const PROT_EXEC: u64 = libc::PROT_EXEC as u64;
        for sysname in ["mmap", "mmap2", "mprotect"] {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule_conditional(
                        ScmpAction::Allow,
                        syscall,
                        &[scmp_cmp!($arg2 & PROT_EXEC == 0)],
                    )?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_int_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow writes to the log-fd.
        // No proc_pid_mem(5) access required here.
        confine_scmp_write(&mut ctx, None, false)?;

        // Allow safe madvise(2) advice.
        confine_scmp_madvise(&mut ctx)?;

        // Allow safe system calls unconditionally,
        // this covers INT_SYSCALLS and VDSO_SYSCALLS.
        for sysname in INT_SYSCALLS.iter().chain(VDSO_SYSCALLS) {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_int_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow futex system calls.
        for sysname in FUTEX_SYSCALLS {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_int_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow getid system calls.
        for sysname in GET_ID_SYSCALLS {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_int_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow UID/GID changing system calls as necessary,
        // i.e. only if safe setuid or safe setgid is enabled.
        let safe_setuid = flags.allow_safe_setuid();
        let safe_setgid = flags.allow_safe_setgid();
        if safe_setuid || safe_setgid {
            scmp_add_setid_rules(
                "int",
                &mut ctx,
                safe_setuid,
                safe_setgid,
                transit_uids,
                transit_gids,
            )?;
        }

        Ok(ctx)
    }
}
