/* DragonFlyBSD Kernel Audit -- DF-0032 / exhaust.c */
/*
 * DF-0032 instrumented exhaustion probe.
 *
 * GOAL: Determine whether fork()'s RFFDG path can be made to return ENOMEM
 * (the exact signature of the fdcopy()-failure leak in fork1(), kern_fork.c:553)
 * from an UNPRIVILEGED user, and how close we can push the M_FILEDESC
 * per-type ks_limit before RLIMIT_NPROC (EAGAIN) stops us.
 *
 * MODEL UNDER TEST (traced in sys/):
 *   - fdcopy() returns -1 only when its M_NULLOK kmalloc of sizeof(struct
 *     filedesc) fails (kern_descrip.c:2481-2486). That kmalloc returns NULL
 *     only when M_FILEDESC's per-type memuse >= ks_limit
 *     (kern_slaballoc.c:863-879). ks_limit = kmem_lim_size()/10 = ~195MB here.
 *   - fork1() maps that failure to ENOMEM (kern_fork.c:553) and leaks nprocs.
 *   - The fd_files[] array allocation inside fdcopy() is M_WAITOK (no M_NULLOK)
 *     at kern_descrip.c:2504-2505, so it would PANIC the kernel at
 *     kern_slaballoc.c:877 on limit exhaustion rather than return NULL.
 *
 * Therefore: to see fork()==ENOMEM (the leak), M_FILEDESC usage must ALREADY be
 * >= ks_limit before the newfdp kmalloc; but crossing that limit is dominated
 * by the large M_WAITOK fd_files array, which panics first. This probe
 * empirically confirms: we cannot get M_FILEDESC near 195MB from an
 * unprivileged uid before RLIMIT_NPROC (EAGAIN) caps us, and no ENOMEM occurs.
 *
 * Build:  cc -o exhaust exhaust.c
 * Run:    ./exhaust
 */
#include <sys/resource.h>
#include <sys/wait.h>
#include <signal.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

/*
 * Query the soft (current) RLIMIT_NPROC value for this process.
 *
 * Returns the limit narrowed to long, or -1 when getrlimit() fails.
 */
static long maxprocperuid(void) {
    struct rlimit lim;

    if (getrlimit(RLIMIT_NPROC, &lim) != 0) {
        return -1;
    }
    return (long)lim.rlim_cur;
}

/*
 * Run vmstat(8) through the shell to dump kernel malloc-type and memory
 * statistics.  Called once, at the first ENOMEM returned by fork().
 *
 * system() itself has to create a process, so it may fail in the very state
 * being probed; that failure is reported instead of being silently dropped.
 */
static void report_kernel_state(void) {
    /* Self-report the kernel malloc state at the exact leak moment. */
    fprintf(stderr, "[*] M_FILEDESC/M_PROC/M_SUBPROC state at first ENOMEM:\n");
    fflush(stderr);
    if (system("vmstat -m 2>/dev/null | grep -E '^[[:space:]]+(proc|lwp|subproc|file_desc)[[:space:]]' "
               "|| vmstat -m | grep -E 'proc|file_desc'") == -1) {
        fprintf(stderr, "[?] could not run vmstat -m: %s\n", strerror(errno));
    }

    /* also report swap/free via vmstat (1 line) */
    fprintf(stderr, "[*] memory at first ENOMEM:\n");
    fflush(stderr);
    if (system("vmstat -c 1 2>/dev/null | head -3 || vmstat | head -3") == -1) {
        fprintf(stderr, "[?] could not run vmstat: %s\n", strerror(errno));
    }
}

/*
 * Probe driver.
 *
 * Phase 1 spreads dup2() copies of /dev/null across the descriptor space so
 * this process holds a large fd table.  Phase 2 forks children that block in
 * pause() and counts how fork() fails (EAGAIN, ENOMEM, anything else).
 *
 * Exit status: 2 if fork() ever returned ENOMEM, 0 if it never did,
 * 1 if /dev/null could not be opened.
 */
int main(void) {
    enum {
        FD_STRIDE         = 64,     /* gap between dup2() targets */
        FD_TARGET         = 15000,  /* kern.maxfilesperproc is typically 16144; stay under */
        MAX_FORK_ATTEMPTS = 4000,   /* hard cap on fork() calls; sizes the pid table */
        EAGAIN_GIVE_UP    = 50      /* stop after this many EAGAIN failures */
    };
    /* Pids of the children we created, so cleanup signals only them. */
    static pid_t children[MAX_FORK_ATTEMPTS];

    long nproc_limit = maxprocperuid();
    int eagain = 0, enomem = 0, ok = 0, other = 0;

    fprintf(stderr, "[*] RLIMIT_NPROC(cur) = %ld\n", nproc_limit);
    fprintf(stderr, "[*] Phase 1: grow this proc's fd_files[] table via dup2()\n");

    /* Grow our own fd table to near the per-proc cap so each fork()'s fdcopy
     * has a large fd_files array to duplicate in the child. */
    int devnull = open("/dev/null", O_RDWR);
    if (devnull < 0) { perror("open /dev/null"); return 1; }

    /* Highest descriptor known to be open; only advanced on dup2() success so
     * the report below never names a descriptor that failed to open. */
    int high_fd = devnull;
    for (int fd = FD_STRIDE; fd <= FD_TARGET; fd += FD_STRIDE) {
        if (dup2(devnull, fd) < 0) {
            fprintf(stderr, "[?] dup2 to fd=%d failed: %s\n", fd, strerror(errno));
            break;
        }
        high_fd = fd;
    }
    /* The KB figure is an estimate that assumes 16 bytes per table entry. */
    fprintf(stderr, "[*] grew fd table to fd=%d (fd_files[] ~%dKB per fdcopy)\n",
            high_fd, (high_fd * 16) / 1024);

    fprintf(stderr, "[*] Phase 2: fork() children (RFFDG) to accumulate M_FILEDESC\n");
    fprintf(stderr, "[*] watching for fork()==ENOMEM (the fdcopy-failure leak marker)\n");

    /* Fork as many children as RLIMIT_NPROC allows; children sleep holding
     * their fd tables so M_FILEDESC accumulates. */
    for (int i = 0; i < MAX_FORK_ATTEMPTS; i++) {
        pid_t p = fork();
        if (p == 0) { pause(); _exit(0); }
        if (p > 0) {
            children[ok++] = p;     /* ok <= i < MAX_FORK_ATTEMPTS, so in bounds */
            continue;
        }

        /* Capture errno before any stdio call has a chance to overwrite it. */
        int err = errno;
        if (err == EAGAIN) {
            eagain++;
            if (eagain == 1) {
                fprintf(stderr, "[!] first EAGAIN after %d children (RLIMIT_NPROC / maxproc)\n", ok);
            }
        } else if (err == ENOMEM) {
            enomem++;
            if (enomem == 1) {
                fprintf(stderr, "[!!!] ENOMEM from fork() -- fdcopy failure leak TRIGGERED at child %d\n", ok);
                report_kernel_state();
            }
        } else {
            other++;
            if (other == 1) {
                fprintf(stderr, "[?] unexpected errno=%d (%s)\n", err, strerror(err));
            }
        }
        if (eagain > EAGAIN_GIVE_UP) break;   /* stop once capped */
    }

    fprintf(stderr, "[*] summary: ok=%d eagain=%d enomem=%d other=%d\n",
            ok, eagain, enomem, other);
    if (enomem > 0) {
        fprintf(stderr, "[!!!] BUG TRIGGERED: fork() returned ENOMEM (fdcopy failure)\n");
    } else if (eagain > 0) {
        fprintf(stderr, "[*] NO ENOMEM observed -- fdcopy never failed.\n"
                        "    (M_FILEDESC did not reach its ~195MB ks_limit before RLIMIT_NPROC capped forks.)\n");
    } else {
        /* No EAGAIN either: the loop simply ran out of attempts, so do not
         * claim that RLIMIT_NPROC was what stopped us. */
        fprintf(stderr, "[*] NO ENOMEM observed -- fdcopy never failed.\n"
                        "    (fork attempt cap of %d reached without EAGAIN; RLIMIT_NPROC was not the limiting factor.)\n",
                (int)MAX_FORK_ATTEMPTS);
    }

    /* cleanup: signal only the children we forked.  kill(0, ...) would hit the
     * whole process group, including this process, which would then die
     * before reaping and before returning its exit status. */
    for (int i = 0; i < ok; i++) {
        (void)kill(children[i], SIGTERM);
    }
    for (;;) {
        pid_t reaped = waitpid(-1, NULL, 0);
        if (reaped > 0) continue;
        if (reaped < 0 && errno == EINTR) continue;   /* interrupted: keep reaping */
        break;                                        /* ECHILD or other error: done */
    }
    return (enomem > 0) ? 2 : 0;
}