Add macb silent TX stall fix series to kernel build

Three-patch series targeting the BCM2712/RP1 (Raspberry Pi 5)
silent TX hang documented at:
  * https://github.com/cilium/cilium/issues/43198
  * https://bugs.launchpad.net/ubuntu/+source/linux-raspi/+bug/2133877

  0001: flush PCIe posted write after TSTART doorbell
  0002: re-check ISR after IER re-enable in macb_tx_poll
  0003: add TX stall watchdog fallback for lost TCOMP

New patches live in patches/linux/ and are copied into
checkouts/pkgs/kernel/build/patches/ via a new 'patches-linux'
Makefile target, wired into the existing 'patches' aggregate.

Verified to apply cleanly against raspberrypi/linux @ f2f68e79f16f
(the ref pinned by the preceding commit).

Author of the patches: Lukasz Raczylo <lukasz@raczylo.com>.
This commit is contained in:
2026-04-24 18:47:51 +01:00
parent 511fbc0e97
commit 2ed45176d6
4 changed files with 323 additions and 4 deletions
+12 -4
View File
@@ -63,7 +63,7 @@ checkouts-clean:
#
# Patches
#
.PHONY: patches-pkgs patches-talos patches-sbc-raspberrypi patches patches
.PHONY: patches-pkgs patches-talos patches-sbc-raspberrypi patches-linux patches
patches-pkgs:
cd "$(CHECKOUTS_DIRECTORY)/pkgs" && \
git am "$(PATCHES_DIRECTORY)/siderolabs/pkgs/0001-Patched-for-Raspberry-Pi-5.patch"
@@ -77,10 +77,18 @@ patches-sbc-raspberrypi:
cd "$(CHECKOUTS_DIRECTORY)/sbc-raspberrypi" && \
git am "$(PATCHES_DIRECTORY)/siderolabs/sbc-raspberrypi/0001-Patched-for-Raspberry-Pi-5.patch"
patches: patches-pkgs patches-talos patches-sbc-raspberrypi
# Drop local kernel .patch files into the pkgs kernel/build patch dir so
# they are picked up by the patch loop in kernel/build/pkg.yaml. Sorted
# numeric filenames preserve apply order.
patches-linux:
@if [ -d "$(PATCHES_DIRECTORY)/linux" ] && ls "$(PATCHES_DIRECTORY)/linux"/*.patch >/dev/null 2>&1; then \
mkdir -p "$(CHECKOUTS_DIRECTORY)/pkgs/kernel/build/patches" && \
cp -v "$(PATCHES_DIRECTORY)/linux"/*.patch "$(CHECKOUTS_DIRECTORY)/pkgs/kernel/build/patches/"; \
else \
echo "No local kernel patches in $(PATCHES_DIRECTORY)/linux, skipping"; \
fi
# Backwards-compatible alias
patches: patches
patches: patches-pkgs patches-talos patches-sbc-raspberrypi patches-linux
.PHONY: kernel
kernel:
@@ -0,0 +1,70 @@
From 0000000000000000000000000000000000000001 Mon Sep 17 00:00:00 2001
From: Lukasz Raczylo <lukasz@raczylo.com>
Date: Fri, 24 Apr 2026 00:00:00 +0000
Subject: [PATCH 1/3] net: macb: flush PCIe posted write after TSTART doorbell
macb_start_xmit() and macb_tx_restart() both kick transmission by
OR-ing MACB_BIT(TSTART) into NCR. On PCIe-attached macb instances --
notably BCM2712 + RP1 PCIe south bridge on Raspberry Pi 5 -- the
doorbell write is a posted PCIe write that can sit in the fabric's
write queue until something drains it. A source-level comment at
the TSTART site already acknowledges the problem:
/* TSTART write might get dropped, so make the IRQ retrigger
* a buffer read */
and arms a recovery handshake via queue->tx_pending /
queue->txubr_pending that is picked up on the next TCOMP interrupt.
That recovery path only runs if a TCOMP interrupt actually fires;
if the lost doorbell means no TX starts, there is no TCOMP, and the
ring stalls silently.
Add a read-back of NCR after the TSTART write. The read serialises
the PCIe posted-write queue and ensures the doorbell reaches the MAC
before macb_start_xmit() / macb_tx_restart() return. The existing
'TSTART might get dropped' handshake is preserved as a safety net
for cases where the fabric genuinely drops the write despite the
read barrier, but with this barrier it should rarely if ever be
needed on PCIe-attached parts.
Observed to be the most common trigger for the silent TX stall
documented in the linked reports.
Link: https://github.com/cilium/cilium/issues/43198
Link: https://bugs.launchpad.net/ubuntu/+source/linux-raspi/+bug/2133877
Signed-off-by: Lukasz Raczylo <lukasz@raczylo.com>
---
drivers/net/ethernet/cadence/macb_main.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -1949,6 +1949,13 @@
spin_lock(&bp->lock);
macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
+ /*
+ * Flush the PCIe posted-write queue so the TSTART doorbell
+ * reliably reaches the MAC. Without this, the write can sit
+ * in the fabric and the MAC never advances, causing a silent
+ * TX stall.
+ */
+ (void)macb_readl(bp, NCR);
spin_unlock(&bp->lock);
out_tx_ptr_unlock:
@@ -2630,6 +2637,11 @@
queue->tx_pending = 1;
macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
+ /*
+ * Flush the PCIe posted-write queue; see the comment in
+ * macb_tx_restart() for the reasoning.
+ */
+ (void)macb_readl(bp, NCR);
spin_unlock(&bp->lock);
if (CIRC_SPACE(queue->tx_head, queue->tx_tail, bp->tx_ring_size) < 1)
--
2.44.0
@@ -0,0 +1,98 @@
From 0000000000000000000000000000000000000002 Mon Sep 17 00:00:00 2001
From: Lukasz Raczylo <lukasz@raczylo.com>
Date: Fri, 24 Apr 2026 00:00:00 +0000
Subject: [PATCH 2/3] net: macb: re-check ISR after IER re-enable in
macb_tx_poll
macb_tx_poll() runs with TCOMP masked, drains the TX ring, then
calls napi_complete_done() and re-enables TCOMP via IER. An
existing comment in the function notes:
/* Packet completions only seem to propagate to raise
* interrupts when interrupts are enabled at the time, so if
* packets were sent while interrupts were disabled,
* they will not cause another interrupt to be generated when
* interrupts are re-enabled.
*/
and mitigates this by calling macb_tx_complete_pending() to look
for a completed descriptor whose TX_USED bit the hardware has
DMA'd but whose completion we processed without ever seeing an
interrupt for.
macb_tx_complete_pending() only inspects driver-visible ring state
(descriptor->ctrl, after rmb()). On PCIe-attached parts (BCM2712 +
RP1 on Raspberry Pi 5 in particular) the descriptor DMA write that
sets TX_USED can still be in flight in the PCIe fabric when we
check. The read-memory-barrier synchronises the CPU view of earlier
CPU writes, but does not force the peripheral's in-flight DMA to
retire. In that window the check returns false, napi exits, the
IER re-enable does not re-fire (the quirk above), and the queue
stalls silently.
Re-check the hardware's own ISR state as well. Reading a MAC
register after IER re-enable serves two purposes:
(1) It drains any in-flight PCIe DMA writes of descriptor state,
so a subsequent macb_tx_complete_pending() sees an accurate
view of TX_USED.
(2) It directly observes whether the hardware currently has a
pending TCOMP signal, catching the case the existing driver
comment describes (completions raised while masked, not
re-fired).
If either path indicates pending work, schedule NAPI again.
Combined with the PCIe posted-write flush in patch 1/3, this closes
the observed silent-TX-stall path on BCM2712/RP1 reported at the
links below.
Link: https://github.com/cilium/cilium/issues/43198
Link: https://bugs.launchpad.net/ubuntu/+source/linux-raspi/+bug/2133877
Signed-off-by: Lukasz Raczylo <lukasz@raczylo.com>
---
drivers/net/ethernet/cadence/macb_main.c | 25 +++++++++++++++++-------
1 file changed, 18 insertions(+), 7 deletions(-)
diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -2000,17 +2000,25 @@
if (work_done < budget && napi_complete_done(napi, work_done)) {
queue_writel(queue, IER, MACB_BIT(TCOMP));
- /* Packet completions only seem to propagate to raise
- * interrupts when interrupts are enabled at the time, so if
- * packets were sent while interrupts were disabled,
- * they will not cause another interrupt to be generated when
- * interrupts are re-enabled.
- * Check for this case here to avoid losing a wakeup. This can
- * potentially race with the interrupt handler doing the same
- * actions if an interrupt is raised just after enabling them,
- * but this should be harmless.
+ /*
+ * TCOMP events that fire while the interrupt is masked do
+ * not re-fire when IER is re-enabled. Catch this two ways
+ * to avoid losing a wakeup:
+ *
+ * (1) Read ISR -- catches completions the hardware flagged
+ * but that we did not see as an interrupt. The MMIO
+ * read doubles as a PCIe read barrier, flushing any
+ * in-flight descriptor TX_USED DMA writes into memory.
+ * (2) macb_tx_complete_pending() inspects the ring after
+ * that flush, catching a descriptor whose TX_USED is
+ * now visible as a result of the barrier.
+ *
+ * This can race with the interrupt handler taking the same
+ * path if an interrupt fires just after the IER write;
+ * rescheduling NAPI in that case is harmless.
*/
- if (macb_tx_complete_pending(queue)) {
+ if ((queue_readl(queue, ISR) & MACB_BIT(TCOMP)) ||
+ macb_tx_complete_pending(queue)) {
queue_writel(queue, IDR, MACB_BIT(TCOMP));
if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
queue_writel(queue, ISR, MACB_BIT(TCOMP));
--
2.44.0
@@ -0,0 +1,143 @@
From 0000000000000000000000000000000000000003 Mon Sep 17 00:00:00 2001
From: Lukasz Raczylo <lukasz@raczylo.com>
Date: Fri, 24 Apr 2026 00:00:00 +0000
Subject: [PATCH 3/3] net: macb: add TX stall watchdog to recover from lost
TCOMP
Patches 1/3 and 2/3 close two races by which a TCOMP interrupt can
be lost on PCIe-attached macb instances. This patch adds a
defence-in-depth safety net: a per-queue delayed_work that calls
macb_tx_restart() if queue->tx_tail has not advanced in one second
despite the ring being non-empty.
The watchdog introduces no new recovery logic. macb_tx_restart()
already exists, is correctly locked, and already checks the
hardware's TBQP against the driver's head index before writing
TSTART: on a healthy ring it is a no-op at the hardware level. All
the watchdog adds is the trigger.
If patches 1/3 and 2/3 completely eliminate the stall, this code
never does anything beyond a spin_lock/unlock and a branch per
second per queue. If a further race remains -- hardware or
driver-level -- this turns a multi-minute silent hang into a
one-second bump.
On our 24-node Raspberry Pi 5 fleet this was empirically needed:
before the patches in this series, multiple nodes per day hit the
stall and required external watchdog intervention to recover.
Link: https://github.com/cilium/cilium/issues/43198
Link: https://bugs.launchpad.net/ubuntu/+source/linux-raspi/+bug/2133877
Signed-off-by: Lukasz Raczylo <lukasz@raczylo.com>
---
drivers/net/ethernet/cadence/macb.h | 4 ++
drivers/net/ethernet/cadence/macb_main.c | 63 ++++++++++++++++++++++++
2 files changed, 67 insertions(+)
diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h
--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -1294,6 +1294,11 @@
struct work_struct tx_error_task;
bool txubr_pending;
bool tx_pending;
+
+ /* TX stall watchdog -- see macb_tx_stall_watchdog() in macb_main.c */
+ struct delayed_work tx_stall_watchdog_work;
+ unsigned int tx_stall_last_tail;
+
struct napi_struct napi_tx;
dma_addr_t rx_ring_dma;
diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -2030,6 +2030,59 @@
return work_done;
}
+#define MACB_TX_STALL_INTERVAL_MS 1000
+
+/*
+ * TX stall watchdog.
+ *
+ * Defence-in-depth against lost TCOMP interrupts. macb already has a
+ * recovery chain (tx_pending -> txubr_pending -> macb_tx_restart())
+ * that fires on TCOMP; if TCOMP itself is lost the TX ring stalls
+ * silently until something else kicks TSTART. This watchdog runs
+ * once per second per queue, snapshots tx_tail, and calls
+ * macb_tx_restart() if the ring is non-empty and tx_tail has not
+ * advanced since the previous tick.
+ *
+ * macb_tx_restart() already checks the hardware's TBQP against the
+ * driver's head index before re-asserting TSTART, so on a healthy
+ * ring this is a no-op at the hardware level. The watchdog only
+ * adds the missing trigger.
+ */
+static void macb_tx_stall_watchdog(struct work_struct *work)
+{
+ struct macb_queue *queue = container_of(to_delayed_work(work),
+ struct macb_queue,
+ tx_stall_watchdog_work);
+ struct macb *bp = queue->bp;
+ unsigned int cur_tail, cur_head;
+ bool stalled = false;
+ unsigned long flags;
+
+ if (!netif_running(bp->dev))
+ return;
+
+ spin_lock_irqsave(&queue->tx_ptr_lock, flags);
+ cur_tail = queue->tx_tail;
+ cur_head = queue->tx_head;
+ if (cur_head != cur_tail &&
+ cur_tail == queue->tx_stall_last_tail)
+ stalled = true;
+ else
+ queue->tx_stall_last_tail = cur_tail;
+ spin_unlock_irqrestore(&queue->tx_ptr_lock, flags);
+
+ if (stalled) {
+ netdev_warn_once(bp->dev,
+ "TX stall detected on queue %u (tail=%u head=%u); re-kicking TSTART\n",
+ (unsigned int)(queue - bp->queues),
+ cur_tail, cur_head);
+ macb_tx_restart(queue);
+ }
+
+ schedule_delayed_work(&queue->tx_stall_watchdog_work,
+ msecs_to_jiffies(MACB_TX_STALL_INTERVAL_MS));
+}
+
static void macb_hresp_error_task(struct work_struct *work)
{
struct macb *bp = from_work(bp, work, hresp_err_bh_work);
@@ -3297,6 +3350,9 @@
for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
napi_enable(&queue->napi_rx);
napi_enable(&queue->napi_tx);
+ queue->tx_stall_last_tail = queue->tx_tail;
+ schedule_delayed_work(&queue->tx_stall_watchdog_work,
+ msecs_to_jiffies(MACB_TX_STALL_INTERVAL_MS));
}
macb_init_hw(bp);
@@ -3343,6 +3399,7 @@
for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
napi_disable(&queue->napi_rx);
napi_disable(&queue->napi_tx);
+ cancel_delayed_work_sync(&queue->tx_stall_watchdog_work);
netdev_tx_reset_queue(netdev_get_tx_queue(dev, q));
}
@@ -4941,6 +4998,8 @@
}
INIT_WORK(&queue->tx_error_task, macb_tx_error_task);
+ INIT_DELAYED_WORK(&queue->tx_stall_watchdog_work,
+ macb_tx_stall_watchdog);
q++;
}
--
2.44.0