mirror of
https://github.com/lukaszraczylo/talos-builder.git
synced 2026-06-05 23:03:36 +00:00
Add macb silent TX stall fix series to kernel build
Three-patch series targeting the BCM2712/RP1 (Raspberry Pi 5) silent TX hang documented at: * https://github.com/cilium/cilium/issues/43198 * https://bugs.launchpad.net/ubuntu/+source/linux-raspi/+bug/2133877 0001: flush PCIe posted write after TSTART doorbell 0002: re-check ISR after IER re-enable in macb_tx_poll 0003: add TX stall watchdog fallback for lost TCOMP New patches live in patches/linux/ and are copied into checkouts/pkgs/kernel/build/patches/ via a new 'patches-linux' Makefile target, wired into the existing 'patches' aggregate. Verified to apply cleanly against raspberrypi/linux @ f2f68e79f16f (the ref pinned by the preceding commit). Author of the patches: Lukasz Raczylo <lukasz@raczylo.com>.
This commit is contained in:
@@ -63,7 +63,7 @@ checkouts-clean:
|
||||
#
|
||||
# Patches
|
||||
#
|
||||
.PHONY: patches-pkgs patches-talos patches-sbc-raspberrypi patches patches
|
||||
.PHONY: patches-pkgs patches-talos patches-sbc-raspberrypi patches-linux patches
|
||||
patches-pkgs:
|
||||
cd "$(CHECKOUTS_DIRECTORY)/pkgs" && \
|
||||
git am "$(PATCHES_DIRECTORY)/siderolabs/pkgs/0001-Patched-for-Raspberry-Pi-5.patch"
|
||||
@@ -77,10 +77,18 @@ patches-sbc-raspberrypi:
|
||||
cd "$(CHECKOUTS_DIRECTORY)/sbc-raspberrypi" && \
|
||||
git am "$(PATCHES_DIRECTORY)/siderolabs/sbc-raspberrypi/0001-Patched-for-Raspberry-Pi-5.patch"
|
||||
|
||||
patches: patches-pkgs patches-talos patches-sbc-raspberrypi
|
||||
# Drop local kernel .patch files into the pkgs kernel/build patch dir so
|
||||
# they are picked up by the patch loop in kernel/build/pkg.yaml. Sorted
|
||||
# numeric filenames preserve apply order.
|
||||
patches-linux:
|
||||
@if [ -d "$(PATCHES_DIRECTORY)/linux" ] && ls "$(PATCHES_DIRECTORY)/linux"/*.patch >/dev/null 2>&1; then \
|
||||
mkdir -p "$(CHECKOUTS_DIRECTORY)/pkgs/kernel/build/patches" && \
|
||||
cp -v "$(PATCHES_DIRECTORY)/linux"/*.patch "$(CHECKOUTS_DIRECTORY)/pkgs/kernel/build/patches/"; \
|
||||
else \
|
||||
echo "No local kernel patches in $(PATCHES_DIRECTORY)/linux, skipping"; \
|
||||
fi
|
||||
|
||||
# Backwards-compatible alias
|
||||
patches: patches
|
||||
patches: patches-pkgs patches-talos patches-sbc-raspberrypi patches-linux
|
||||
|
||||
.PHONY: kernel
|
||||
kernel:
|
||||
|
||||
@@ -0,0 +1,70 @@
|
||||
From 0000000000000000000000000000000000000001 Mon Sep 17 00:00:00 2001
|
||||
From: Lukasz Raczylo <lukasz@raczylo.com>
|
||||
Date: Fri, 24 Apr 2026 00:00:00 +0000
|
||||
Subject: [PATCH 1/3] net: macb: flush PCIe posted write after TSTART doorbell
|
||||
|
||||
macb_start_xmit() and macb_tx_restart() both kick transmission by
|
||||
OR-ing MACB_BIT(TSTART) into NCR. On PCIe-attached macb instances --
|
||||
notably BCM2712 + RP1 PCIe south bridge on Raspberry Pi 5 -- the
|
||||
doorbell write is a posted PCIe write that can sit in the fabric's
|
||||
write queue until something drains it. A source-level comment at
|
||||
the TSTART site already acknowledges the problem:
|
||||
|
||||
/* TSTART write might get dropped, so make the IRQ retrigger
|
||||
* a buffer read */
|
||||
|
||||
and arms a recovery handshake via queue->tx_pending /
|
||||
queue->txubr_pending that is picked up on the next TCOMP interrupt.
|
||||
That recovery path only runs if a TCOMP interrupt actually fires;
|
||||
if the lost doorbell means no TX starts, there is no TCOMP, and the
|
||||
ring stalls silently.
|
||||
|
||||
Add a read-back of NCR after the TSTART write. The read serialises
|
||||
the PCIe posted-write queue and ensures the doorbell reaches the MAC
|
||||
before macb_start_xmit() / macb_tx_restart() return. The existing
|
||||
'TSTART might get dropped' handshake is preserved as a safety net
|
||||
for cases where the fabric genuinely drops the write despite the
|
||||
read barrier, but with this barrier it should rarely if ever be
|
||||
needed on PCIe-attached parts.
|
||||
|
||||
Observed to be the most common trigger for the silent TX stall
|
||||
documented in the linked reports.
|
||||
|
||||
Link: https://github.com/cilium/cilium/issues/43198
|
||||
Link: https://bugs.launchpad.net/ubuntu/+source/linux-raspi/+bug/2133877
|
||||
Signed-off-by: Lukasz Raczylo <lukasz@raczylo.com>
|
||||
---
|
||||
drivers/net/ethernet/cadence/macb_main.c | 14 ++++++++++++++
|
||||
1 file changed, 14 insertions(+)
|
||||
|
||||
diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
|
||||
--- a/drivers/net/ethernet/cadence/macb_main.c
|
||||
+++ b/drivers/net/ethernet/cadence/macb_main.c
|
||||
@@ -1949,6 +1949,13 @@
|
||||
|
||||
spin_lock(&bp->lock);
|
||||
macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
|
||||
+ /*
|
||||
+ * Flush the PCIe posted-write queue so the TSTART doorbell
|
||||
+ * reliably reaches the MAC. Without this, the write can sit
|
||||
+ * in the fabric and the MAC never advances, causing a silent
|
||||
+ * TX stall.
|
||||
+ */
|
||||
+ (void)macb_readl(bp, NCR);
|
||||
spin_unlock(&bp->lock);
|
||||
|
||||
out_tx_ptr_unlock:
|
||||
@@ -2630,6 +2637,11 @@
|
||||
queue->tx_pending = 1;
|
||||
|
||||
macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
|
||||
+ /*
|
||||
+ * Flush the PCIe posted-write queue; see the comment in
|
||||
+ * macb_tx_restart() for the reasoning.
|
||||
+ */
|
||||
+ (void)macb_readl(bp, NCR);
|
||||
spin_unlock(&bp->lock);
|
||||
|
||||
if (CIRC_SPACE(queue->tx_head, queue->tx_tail, bp->tx_ring_size) < 1)
|
||||
--
|
||||
2.44.0
|
||||
@@ -0,0 +1,98 @@
|
||||
From 0000000000000000000000000000000000000002 Mon Sep 17 00:00:00 2001
|
||||
From: Lukasz Raczylo <lukasz@raczylo.com>
|
||||
Date: Fri, 24 Apr 2026 00:00:00 +0000
|
||||
Subject: [PATCH 2/3] net: macb: re-check ISR after IER re-enable in
|
||||
macb_tx_poll
|
||||
|
||||
macb_tx_poll() runs with TCOMP masked, drains the TX ring, then
|
||||
calls napi_complete_done() and re-enables TCOMP via IER. An
|
||||
existing comment in the function notes:
|
||||
|
||||
/* Packet completions only seem to propagate to raise
|
||||
* interrupts when interrupts are enabled at the time, so if
|
||||
* packets were sent while interrupts were disabled,
|
||||
* they will not cause another interrupt to be generated when
|
||||
* interrupts are re-enabled.
|
||||
*/
|
||||
|
||||
and mitigates this by calling macb_tx_complete_pending() to look
|
||||
for a completed descriptor whose TX_USED bit the hardware has
|
||||
DMA'd but whose completion we processed without ever seeing an
|
||||
interrupt for.
|
||||
|
||||
macb_tx_complete_pending() only inspects driver-visible ring state
|
||||
(descriptor->ctrl, after rmb()). On PCIe-attached parts (BCM2712 +
|
||||
RP1 on Raspberry Pi 5 in particular) the descriptor DMA write that
|
||||
sets TX_USED can still be in flight in the PCIe fabric when we
|
||||
check. The read-memory-barrier synchronises the CPU view of earlier
|
||||
CPU writes, but does not force the peripheral's in-flight DMA to
|
||||
retire. In that window the check returns false, napi exits, the
|
||||
IER re-enable does not re-fire (the quirk above), and the queue
|
||||
stalls silently.
|
||||
|
||||
Re-check the hardware's own ISR state as well. Reading a MAC
|
||||
register after IER re-enable serves two purposes:
|
||||
|
||||
(1) It drains any in-flight PCIe DMA writes of descriptor state,
|
||||
so a subsequent macb_tx_complete_pending() sees an accurate
|
||||
view of TX_USED.
|
||||
|
||||
(2) It directly observes whether the hardware currently has a
|
||||
pending TCOMP signal, catching the case the existing driver
|
||||
comment describes (completions raised while masked, not
|
||||
re-fired).
|
||||
|
||||
If either path indicates pending work, schedule NAPI again.
|
||||
|
||||
Combined with the PCIe posted-write flush in patch 1/3, this closes
|
||||
the observed silent-TX-stall path on BCM2712/RP1 reported at the
|
||||
links below.
|
||||
|
||||
Link: https://github.com/cilium/cilium/issues/43198
|
||||
Link: https://bugs.launchpad.net/ubuntu/+source/linux-raspi/+bug/2133877
|
||||
Signed-off-by: Lukasz Raczylo <lukasz@raczylo.com>
|
||||
---
|
||||
drivers/net/ethernet/cadence/macb_main.c | 25 +++++++++++++++++-------
|
||||
1 file changed, 18 insertions(+), 7 deletions(-)
|
||||
|
||||
diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
|
||||
--- a/drivers/net/ethernet/cadence/macb_main.c
|
||||
+++ b/drivers/net/ethernet/cadence/macb_main.c
|
||||
@@ -2000,17 +2000,25 @@
|
||||
if (work_done < budget && napi_complete_done(napi, work_done)) {
|
||||
queue_writel(queue, IER, MACB_BIT(TCOMP));
|
||||
|
||||
- /* Packet completions only seem to propagate to raise
|
||||
- * interrupts when interrupts are enabled at the time, so if
|
||||
- * packets were sent while interrupts were disabled,
|
||||
- * they will not cause another interrupt to be generated when
|
||||
- * interrupts are re-enabled.
|
||||
- * Check for this case here to avoid losing a wakeup. This can
|
||||
- * potentially race with the interrupt handler doing the same
|
||||
- * actions if an interrupt is raised just after enabling them,
|
||||
- * but this should be harmless.
|
||||
+ /*
|
||||
+ * TCOMP events that fire while the interrupt is masked do
|
||||
+ * not re-fire when IER is re-enabled. Catch this two ways
|
||||
+ * to avoid losing a wakeup:
|
||||
+ *
|
||||
+ * (1) Read ISR -- catches completions the hardware flagged
|
||||
+ * but that we did not see as an interrupt. The MMIO
|
||||
+ * read doubles as a PCIe read barrier, flushing any
|
||||
+ * in-flight descriptor TX_USED DMA writes into memory.
|
||||
+ * (2) macb_tx_complete_pending() inspects the ring after
|
||||
+ * that flush, catching a descriptor whose TX_USED is
|
||||
+ * now visible as a result of the barrier.
|
||||
+ *
|
||||
+ * This can race with the interrupt handler taking the same
|
||||
+ * path if an interrupt fires just after the IER write;
|
||||
+ * rescheduling NAPI in that case is harmless.
|
||||
*/
|
||||
- if (macb_tx_complete_pending(queue)) {
|
||||
+ if ((queue_readl(queue, ISR) & MACB_BIT(TCOMP)) ||
|
||||
+ macb_tx_complete_pending(queue)) {
|
||||
queue_writel(queue, IDR, MACB_BIT(TCOMP));
|
||||
if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
|
||||
queue_writel(queue, ISR, MACB_BIT(TCOMP));
|
||||
--
|
||||
2.44.0
|
||||
@@ -0,0 +1,143 @@
|
||||
From 0000000000000000000000000000000000000003 Mon Sep 17 00:00:00 2001
|
||||
From: Lukasz Raczylo <lukasz@raczylo.com>
|
||||
Date: Fri, 24 Apr 2026 00:00:00 +0000
|
||||
Subject: [PATCH 3/3] net: macb: add TX stall watchdog to recover from lost
|
||||
TCOMP
|
||||
|
||||
Patches 1/3 and 2/3 close two races by which a TCOMP interrupt can
|
||||
be lost on PCIe-attached macb instances. This patch adds a
|
||||
defence-in-depth safety net: a per-queue delayed_work that calls
|
||||
macb_tx_restart() if queue->tx_tail has not advanced in one second
|
||||
despite the ring being non-empty.
|
||||
|
||||
The watchdog introduces no new recovery logic. macb_tx_restart()
|
||||
already exists, is correctly locked, and already checks the
|
||||
hardware's TBQP against the driver's head index before writing
|
||||
TSTART: on a healthy ring it is a no-op at the hardware level. All
|
||||
the watchdog adds is the trigger.
|
||||
|
||||
If patches 1/3 and 2/3 completely eliminate the stall, this code
|
||||
never does anything beyond a spin_lock/unlock and a branch per
|
||||
second per queue. If a further race remains -- hardware or
|
||||
driver-level -- this turns a multi-minute silent hang into a
|
||||
one-second bump.
|
||||
|
||||
On our 24-node Raspberry Pi 5 fleet this was empirically needed:
|
||||
before the patches in this series, multiple nodes per day hit the
|
||||
stall and required external watchdog intervention to recover.
|
||||
|
||||
Link: https://github.com/cilium/cilium/issues/43198
|
||||
Link: https://bugs.launchpad.net/ubuntu/+source/linux-raspi/+bug/2133877
|
||||
Signed-off-by: Lukasz Raczylo <lukasz@raczylo.com>
|
||||
---
|
||||
drivers/net/ethernet/cadence/macb.h | 4 ++
|
||||
drivers/net/ethernet/cadence/macb_main.c | 63 ++++++++++++++++++++++++
|
||||
2 files changed, 67 insertions(+)
|
||||
|
||||
diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h
|
||||
--- a/drivers/net/ethernet/cadence/macb.h
|
||||
+++ b/drivers/net/ethernet/cadence/macb.h
|
||||
@@ -1294,6 +1294,11 @@
|
||||
struct work_struct tx_error_task;
|
||||
bool txubr_pending;
|
||||
bool tx_pending;
|
||||
+
|
||||
+ /* TX stall watchdog -- see macb_tx_stall_watchdog() in macb_main.c */
|
||||
+ struct delayed_work tx_stall_watchdog_work;
|
||||
+ unsigned int tx_stall_last_tail;
|
||||
+
|
||||
struct napi_struct napi_tx;
|
||||
|
||||
dma_addr_t rx_ring_dma;
|
||||
diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
|
||||
--- a/drivers/net/ethernet/cadence/macb_main.c
|
||||
+++ b/drivers/net/ethernet/cadence/macb_main.c
|
||||
@@ -2030,6 +2030,59 @@
|
||||
return work_done;
|
||||
}
|
||||
|
||||
+#define MACB_TX_STALL_INTERVAL_MS 1000
|
||||
+
|
||||
+/*
|
||||
+ * TX stall watchdog.
|
||||
+ *
|
||||
+ * Defence-in-depth against lost TCOMP interrupts. macb already has a
|
||||
+ * recovery chain (tx_pending -> txubr_pending -> macb_tx_restart())
|
||||
+ * that fires on TCOMP; if TCOMP itself is lost the TX ring stalls
|
||||
+ * silently until something else kicks TSTART. This watchdog runs
|
||||
+ * once per second per queue, snapshots tx_tail, and calls
|
||||
+ * macb_tx_restart() if the ring is non-empty and tx_tail has not
|
||||
+ * advanced since the previous tick.
|
||||
+ *
|
||||
+ * macb_tx_restart() already checks the hardware's TBQP against the
|
||||
+ * driver's head index before re-asserting TSTART, so on a healthy
|
||||
+ * ring this is a no-op at the hardware level. The watchdog only
|
||||
+ * adds the missing trigger.
|
||||
+ */
|
||||
+static void macb_tx_stall_watchdog(struct work_struct *work)
|
||||
+{
|
||||
+ struct macb_queue *queue = container_of(to_delayed_work(work),
|
||||
+ struct macb_queue,
|
||||
+ tx_stall_watchdog_work);
|
||||
+ struct macb *bp = queue->bp;
|
||||
+ unsigned int cur_tail, cur_head;
|
||||
+ bool stalled = false;
|
||||
+ unsigned long flags;
|
||||
+
|
||||
+ if (!netif_running(bp->dev))
|
||||
+ return;
|
||||
+
|
||||
+ spin_lock_irqsave(&queue->tx_ptr_lock, flags);
|
||||
+ cur_tail = queue->tx_tail;
|
||||
+ cur_head = queue->tx_head;
|
||||
+ if (cur_head != cur_tail &&
|
||||
+ cur_tail == queue->tx_stall_last_tail)
|
||||
+ stalled = true;
|
||||
+ else
|
||||
+ queue->tx_stall_last_tail = cur_tail;
|
||||
+ spin_unlock_irqrestore(&queue->tx_ptr_lock, flags);
|
||||
+
|
||||
+ if (stalled) {
|
||||
+ netdev_warn_once(bp->dev,
|
||||
+ "TX stall detected on queue %u (tail=%u head=%u); re-kicking TSTART\n",
|
||||
+ (unsigned int)(queue - bp->queues),
|
||||
+ cur_tail, cur_head);
|
||||
+ macb_tx_restart(queue);
|
||||
+ }
|
||||
+
|
||||
+ schedule_delayed_work(&queue->tx_stall_watchdog_work,
|
||||
+ msecs_to_jiffies(MACB_TX_STALL_INTERVAL_MS));
|
||||
+}
|
||||
+
|
||||
static void macb_hresp_error_task(struct work_struct *work)
|
||||
{
|
||||
struct macb *bp = from_work(bp, work, hresp_err_bh_work);
|
||||
@@ -3297,6 +3350,9 @@
|
||||
for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
|
||||
napi_enable(&queue->napi_rx);
|
||||
napi_enable(&queue->napi_tx);
|
||||
+ queue->tx_stall_last_tail = queue->tx_tail;
|
||||
+ schedule_delayed_work(&queue->tx_stall_watchdog_work,
|
||||
+ msecs_to_jiffies(MACB_TX_STALL_INTERVAL_MS));
|
||||
}
|
||||
|
||||
macb_init_hw(bp);
|
||||
@@ -3343,6 +3399,7 @@
|
||||
for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
|
||||
napi_disable(&queue->napi_rx);
|
||||
napi_disable(&queue->napi_tx);
|
||||
+ cancel_delayed_work_sync(&queue->tx_stall_watchdog_work);
|
||||
netdev_tx_reset_queue(netdev_get_tx_queue(dev, q));
|
||||
}
|
||||
|
||||
@@ -4941,6 +4998,8 @@
|
||||
}
|
||||
|
||||
INIT_WORK(&queue->tx_error_task, macb_tx_error_task);
|
||||
+ INIT_DELAYED_WORK(&queue->tx_stall_watchdog_work,
|
||||
+ macb_tx_stall_watchdog);
|
||||
q++;
|
||||
}
|
||||
|
||||
--
|
||||
2.44.0
|
||||
Reference in New Issue
Block a user