Add macb silent TX stall fix series to kernel build

Three-patch series targeting the BCM2712/RP1 (Raspberry Pi 5) silent TX hang documented at: * https://github.com/cilium/cilium/issues/43198 * https://bugs.launchpad.net/ubuntu/+source/linux-raspi/+bug/2133877 0001: flush PCIe posted write after TSTART doorbell 0002: re-check ISR after IER re-enable in macb_tx_poll 0003: add TX stall watchdog fallback for lost TCOMP New patches live in patches/linux/ and are copied into checkouts/pkgs/kernel/build/patches/ via a new 'patches-linux' Makefile target, wired into the existing 'patches' aggregate. Verified to apply cleanly against raspberrypi/linux @ f2f68e79f16f (the ref pinned by the preceding commit). Author of the patches: Lukasz Raczylo <[email protected]>.
2026-07-21 07:14:17 +00:00 · 2026-04-24 18:47:51 +01:00
parent 511fbc0e97
commit 2ed45176d6
4 changed files with 323 additions and 4 deletions
@@ -63,7 +63,7 @@ checkouts-clean:
 #
 # Patches
 #
-.PHONY: patches-pkgs patches-talos patches-sbc-raspberrypi patches patches
+.PHONY: patches-pkgs patches-talos patches-sbc-raspberrypi patches-linux patches
 patches-pkgs:
 	cd "$(CHECKOUTS_DIRECTORY)/pkgs" && \
 		git am "$(PATCHES_DIRECTORY)/siderolabs/pkgs/0001-Patched-for-Raspberry-Pi-5.patch"
@@ -77,10 +77,18 @@ patches-sbc-raspberrypi:
 	cd "$(CHECKOUTS_DIRECTORY)/sbc-raspberrypi" && \
 		git am "$(PATCHES_DIRECTORY)/siderolabs/sbc-raspberrypi/0001-Patched-for-Raspberry-Pi-5.patch"

-patches: patches-pkgs patches-talos patches-sbc-raspberrypi
+# Drop local kernel .patch files into the pkgs kernel/build patch dir so
+# they are picked up by the patch loop in kernel/build/pkg.yaml. Sorted
+# numeric filenames preserve apply order.
+patches-linux:
+	@if [ -d "$(PATCHES_DIRECTORY)/linux" ] && ls "$(PATCHES_DIRECTORY)/linux"/*.patch >/dev/null 2>&1; then \
+		mkdir -p "$(CHECKOUTS_DIRECTORY)/pkgs/kernel/build/patches" && \
+		cp -v "$(PATCHES_DIRECTORY)/linux"/*.patch "$(CHECKOUTS_DIRECTORY)/pkgs/kernel/build/patches/"; \
+	else \
+		echo "No local kernel patches in $(PATCHES_DIRECTORY)/linux, skipping"; \
+	fi

-# Backwards-compatible alias
-patches: patches
+patches: patches-pkgs patches-talos patches-sbc-raspberrypi patches-linux

 .PHONY: kernel
 kernel:
@@ -0,0 +1,70 @@
+From 0000000000000000000000000000000000000001 Mon Sep 17 00:00:00 2001
+From: Lukasz Raczylo <[email protected]>
+Date: Fri, 24 Apr 2026 00:00:00 +0000
+Subject: [PATCH 1/3] net: macb: flush PCIe posted write after TSTART doorbell
+
+macb_start_xmit() and macb_tx_restart() both kick transmission by
+OR-ing MACB_BIT(TSTART) into NCR.  On PCIe-attached macb instances --
+notably BCM2712 + RP1 PCIe south bridge on Raspberry Pi 5 -- the
+doorbell write is a posted PCIe write that can sit in the fabric's
+write queue until something drains it.  A source-level comment at
+the TSTART site already acknowledges the problem:
+
+	/* TSTART write might get dropped, so make the IRQ retrigger
+	 * a buffer read */
+
+and arms a recovery handshake via queue->tx_pending /
+queue->txubr_pending that is picked up on the next TCOMP interrupt.
+That recovery path only runs if a TCOMP interrupt actually fires;
+if the lost doorbell means no TX starts, there is no TCOMP, and the
+ring stalls silently.
+
+Add a read-back of NCR after the TSTART write.  The read serialises
+the PCIe posted-write queue and ensures the doorbell reaches the MAC
+before macb_start_xmit() / macb_tx_restart() return.  The existing
+'TSTART might get dropped' handshake is preserved as a safety net
+for cases where the fabric genuinely drops the write despite the
+read barrier, but with this barrier it should rarely if ever be
+needed on PCIe-attached parts.
+
+Observed to be the most common trigger for the silent TX stall
+documented in the linked reports.
+
+Link: https://github.com/cilium/cilium/issues/43198
+Link: https://bugs.launchpad.net/ubuntu/+source/linux-raspi/+bug/2133877
+Signed-off-by: Lukasz Raczylo <[email protected]>
+---
+ drivers/net/ethernet/cadence/macb_main.c | 14 ++++++++++++++
+ 1 file changed, 14 insertions(+)
+
+diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
+--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
+@@ -1949,6 +1949,13 @@
+ 
+ 	spin_lock(&bp->lock);
+ 	macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
+	/*
+	 * Flush the PCIe posted-write queue so the TSTART doorbell
+	 * reliably reaches the MAC.  Without this, the write can sit
+	 * in the fabric and the MAC never advances, causing a silent
+	 * TX stall.
+	 */
+	(void)macb_readl(bp, NCR);
+ 	spin_unlock(&bp->lock);
+ 
+ out_tx_ptr_unlock:
+@@ -2630,6 +2637,11 @@
+ 		queue->tx_pending = 1;
+ 
+ 	macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
+	/*
+	 * Flush the PCIe posted-write queue; see the comment in
+	 * macb_tx_restart() for the reasoning.
+	 */
+	(void)macb_readl(bp, NCR);
+ 	spin_unlock(&bp->lock);
+ 
+ 	if (CIRC_SPACE(queue->tx_head, queue->tx_tail, bp->tx_ring_size) < 1)
+--
+2.44.0
@@ -0,0 +1,98 @@
+From 0000000000000000000000000000000000000002 Mon Sep 17 00:00:00 2001
+From: Lukasz Raczylo <[email protected]>
+Date: Fri, 24 Apr 2026 00:00:00 +0000
+Subject: [PATCH 2/3] net: macb: re-check ISR after IER re-enable in
+ macb_tx_poll
+
+macb_tx_poll() runs with TCOMP masked, drains the TX ring, then
+calls napi_complete_done() and re-enables TCOMP via IER.  An
+existing comment in the function notes:
+
+	/* Packet completions only seem to propagate to raise
+	 * interrupts when interrupts are enabled at the time, so if
+	 * packets were sent while interrupts were disabled,
+	 * they will not cause another interrupt to be generated when
+	 * interrupts are re-enabled.
+	 */
+
+and mitigates this by calling macb_tx_complete_pending() to look
+for a completed descriptor whose TX_USED bit the hardware has
+DMA'd but whose completion we processed without ever seeing an
+interrupt for.
+
+macb_tx_complete_pending() only inspects driver-visible ring state
+(descriptor->ctrl, after rmb()).  On PCIe-attached parts (BCM2712 +
+RP1 on Raspberry Pi 5 in particular) the descriptor DMA write that
+sets TX_USED can still be in flight in the PCIe fabric when we
+check.  The read-memory-barrier synchronises the CPU view of earlier
+CPU writes, but does not force the peripheral's in-flight DMA to
+retire.  In that window the check returns false, napi exits, the
+IER re-enable does not re-fire (the quirk above), and the queue
+stalls silently.
+
+Re-check the hardware's own ISR state as well.  Reading a MAC
+register after IER re-enable serves two purposes:
+
+  (1) It drains any in-flight PCIe DMA writes of descriptor state,
+      so a subsequent macb_tx_complete_pending() sees an accurate
+      view of TX_USED.
+
+  (2) It directly observes whether the hardware currently has a
+      pending TCOMP signal, catching the case the existing driver
+      comment describes (completions raised while masked, not
+      re-fired).
+
+If either path indicates pending work, schedule NAPI again.
+
+Combined with the PCIe posted-write flush in patch 1/3, this closes
+the observed silent-TX-stall path on BCM2712/RP1 reported at the
+links below.
+
+Link: https://github.com/cilium/cilium/issues/43198
+Link: https://bugs.launchpad.net/ubuntu/+source/linux-raspi/+bug/2133877
+Signed-off-by: Lukasz Raczylo <[email protected]>
+---
+ drivers/net/ethernet/cadence/macb_main.c | 25 +++++++++++++++++-------
+ 1 file changed, 18 insertions(+), 7 deletions(-)
+
+diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
+--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
+@@ -2000,17 +2000,25 @@
+ 	if (work_done < budget && napi_complete_done(napi, work_done)) {
+ 		queue_writel(queue, IER, MACB_BIT(TCOMP));
+ 
+-		/* Packet completions only seem to propagate to raise
+-		 * interrupts when interrupts are enabled at the time, so if
+-		 * packets were sent while interrupts were disabled,
+-		 * they will not cause another interrupt to be generated when
+-		 * interrupts are re-enabled.
+-		 * Check for this case here to avoid losing a wakeup. This can
+-		 * potentially race with the interrupt handler doing the same
+-		 * actions if an interrupt is raised just after enabling them,
+-		 * but this should be harmless.
+		/*
+		 * TCOMP events that fire while the interrupt is masked do
+		 * not re-fire when IER is re-enabled.  Catch this two ways
+		 * to avoid losing a wakeup:
+		 *
+		 *   (1) Read ISR -- catches completions the hardware flagged
+		 *       but that we did not see as an interrupt.  The MMIO
+		 *       read doubles as a PCIe read barrier, flushing any
+		 *       in-flight descriptor TX_USED DMA writes into memory.
+		 *   (2) macb_tx_complete_pending() inspects the ring after
+		 *       that flush, catching a descriptor whose TX_USED is
+		 *       now visible as a result of the barrier.
+		 *
+		 * This can race with the interrupt handler taking the same
+		 * path if an interrupt fires just after the IER write;
+		 * rescheduling NAPI in that case is harmless.
+ 		 */
+-		if (macb_tx_complete_pending(queue)) {
+		if ((queue_readl(queue, ISR) & MACB_BIT(TCOMP)) ||
+		    macb_tx_complete_pending(queue)) {
+ 			queue_writel(queue, IDR, MACB_BIT(TCOMP));
+ 			if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
+ 				queue_writel(queue, ISR, MACB_BIT(TCOMP));
+--
+2.44.0
@@ -0,0 +1,143 @@
+From 0000000000000000000000000000000000000003 Mon Sep 17 00:00:00 2001
+From: Lukasz Raczylo <[email protected]>
+Date: Fri, 24 Apr 2026 00:00:00 +0000
+Subject: [PATCH 3/3] net: macb: add TX stall watchdog to recover from lost
+ TCOMP
+
+Patches 1/3 and 2/3 close two races by which a TCOMP interrupt can
+be lost on PCIe-attached macb instances.  This patch adds a
+defence-in-depth safety net: a per-queue delayed_work that calls
+macb_tx_restart() if queue->tx_tail has not advanced in one second
+despite the ring being non-empty.
+
+The watchdog introduces no new recovery logic.  macb_tx_restart()
+already exists, is correctly locked, and already checks the
+hardware's TBQP against the driver's head index before writing
+TSTART: on a healthy ring it is a no-op at the hardware level.  All
+the watchdog adds is the trigger.
+
+If patches 1/3 and 2/3 completely eliminate the stall, this code
+never does anything beyond a spin_lock/unlock and a branch per
+second per queue.  If a further race remains -- hardware or
+driver-level -- this turns a multi-minute silent hang into a
+one-second bump.
+
+On our 24-node Raspberry Pi 5 fleet this was empirically needed:
+before the patches in this series, multiple nodes per day hit the
+stall and required external watchdog intervention to recover.
+
+Link: https://github.com/cilium/cilium/issues/43198
+Link: https://bugs.launchpad.net/ubuntu/+source/linux-raspi/+bug/2133877
+Signed-off-by: Lukasz Raczylo <[email protected]>
+---
+ drivers/net/ethernet/cadence/macb.h      |  4 ++
+ drivers/net/ethernet/cadence/macb_main.c | 63 ++++++++++++++++++++++++
+ 2 files changed, 67 insertions(+)
+
+diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h
+--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
+@@ -1294,6 +1294,11 @@
+ 	struct work_struct	tx_error_task;
+ 	bool			txubr_pending;
+ 	bool			tx_pending;
+
+	/* TX stall watchdog -- see macb_tx_stall_watchdog() in macb_main.c */
+	struct delayed_work	tx_stall_watchdog_work;
+	unsigned int		tx_stall_last_tail;
+
+ 	struct napi_struct	napi_tx;
+ 
+ 	dma_addr_t		rx_ring_dma;
+diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
+--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
+@@ -2030,6 +2030,59 @@
+ 	return work_done;
+ }
+ 
+#define MACB_TX_STALL_INTERVAL_MS	1000
+
+/*
+ * TX stall watchdog.
+ *
+ * Defence-in-depth against lost TCOMP interrupts.  macb already has a
+ * recovery chain (tx_pending -> txubr_pending -> macb_tx_restart())
+ * that fires on TCOMP; if TCOMP itself is lost the TX ring stalls
+ * silently until something else kicks TSTART.  This watchdog runs
+ * once per second per queue, snapshots tx_tail, and calls
+ * macb_tx_restart() if the ring is non-empty and tx_tail has not
+ * advanced since the previous tick.
+ *
+ * macb_tx_restart() already checks the hardware's TBQP against the
+ * driver's head index before re-asserting TSTART, so on a healthy
+ * ring this is a no-op at the hardware level.  The watchdog only
+ * adds the missing trigger.
+ */
+static void macb_tx_stall_watchdog(struct work_struct *work)
+{
+	struct macb_queue *queue = container_of(to_delayed_work(work),
+						struct macb_queue,
+						tx_stall_watchdog_work);
+	struct macb *bp = queue->bp;
+	unsigned int cur_tail, cur_head;
+	bool stalled = false;
+	unsigned long flags;
+
+	if (!netif_running(bp->dev))
+		return;
+
+	spin_lock_irqsave(&queue->tx_ptr_lock, flags);
+	cur_tail = queue->tx_tail;
+	cur_head = queue->tx_head;
+	if (cur_head != cur_tail &&
+	    cur_tail == queue->tx_stall_last_tail)
+		stalled = true;
+	else
+		queue->tx_stall_last_tail = cur_tail;
+	spin_unlock_irqrestore(&queue->tx_ptr_lock, flags);
+
+	if (stalled) {
+		netdev_warn_once(bp->dev,
+				 "TX stall detected on queue %u (tail=%u head=%u); re-kicking TSTART\n",
+				 (unsigned int)(queue - bp->queues),
+				 cur_tail, cur_head);
+		macb_tx_restart(queue);
+	}
+
+	schedule_delayed_work(&queue->tx_stall_watchdog_work,
+			      msecs_to_jiffies(MACB_TX_STALL_INTERVAL_MS));
+}
+
+ static void macb_hresp_error_task(struct work_struct *work)
+ {
+ 	struct macb *bp = from_work(bp, work, hresp_err_bh_work);
+@@ -3297,6 +3350,9 @@
+ 	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
+ 		napi_enable(&queue->napi_rx);
+ 		napi_enable(&queue->napi_tx);
+		queue->tx_stall_last_tail = queue->tx_tail;
+		schedule_delayed_work(&queue->tx_stall_watchdog_work,
+				      msecs_to_jiffies(MACB_TX_STALL_INTERVAL_MS));
+ 	}
+ 
+ 	macb_init_hw(bp);
+@@ -3343,6 +3399,7 @@
+ 	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
+ 		napi_disable(&queue->napi_rx);
+ 		napi_disable(&queue->napi_tx);
+		cancel_delayed_work_sync(&queue->tx_stall_watchdog_work);
+ 		netdev_tx_reset_queue(netdev_get_tx_queue(dev, q));
+ 	}
+ 
+@@ -4941,6 +4998,8 @@
+ 		}
+ 
+ 		INIT_WORK(&queue->tx_error_task, macb_tx_error_task);
+		INIT_DELAYED_WORK(&queue->tx_stall_watchdog_work,
+				  macb_tx_stall_watchdog);
+ 		q++;
+ 	}
+ 
+--
+2.44.0