diff --git a/Makefile b/Makefile index 74ff7af..c514e6b 100644 --- a/Makefile +++ b/Makefile @@ -63,7 +63,7 @@ checkouts-clean: # # Patches # -.PHONY: patches-pkgs patches-talos patches-sbc-raspberrypi patches patches +.PHONY: patches-pkgs patches-talos patches-sbc-raspberrypi patches-linux patches patches-pkgs: cd "$(CHECKOUTS_DIRECTORY)/pkgs" && \ git am "$(PATCHES_DIRECTORY)/siderolabs/pkgs/0001-Patched-for-Raspberry-Pi-5.patch" @@ -77,10 +77,18 @@ patches-sbc-raspberrypi: cd "$(CHECKOUTS_DIRECTORY)/sbc-raspberrypi" && \ git am "$(PATCHES_DIRECTORY)/siderolabs/sbc-raspberrypi/0001-Patched-for-Raspberry-Pi-5.patch" -patches: patches-pkgs patches-talos patches-sbc-raspberrypi +# Drop local kernel .patch files into the pkgs kernel/build patch dir so +# they are picked up by the patch loop in kernel/build/pkg.yaml. Sorted +# numeric filenames preserve apply order. +patches-linux: + @if [ -d "$(PATCHES_DIRECTORY)/linux" ] && ls "$(PATCHES_DIRECTORY)/linux"/*.patch >/dev/null 2>&1; then \ + mkdir -p "$(CHECKOUTS_DIRECTORY)/pkgs/kernel/build/patches" && \ + cp -v "$(PATCHES_DIRECTORY)/linux"/*.patch "$(CHECKOUTS_DIRECTORY)/pkgs/kernel/build/patches/"; \ + else \ + echo "No local kernel patches in $(PATCHES_DIRECTORY)/linux, skipping"; \ + fi -# Backwards-compatible alias -patches: patches +patches: patches-pkgs patches-talos patches-sbc-raspberrypi patches-linux .PHONY: kernel kernel: diff --git a/patches/linux/0001-net-macb-flush-PCIe-posted-write-after-TSTART-doorbe.patch b/patches/linux/0001-net-macb-flush-PCIe-posted-write-after-TSTART-doorbe.patch new file mode 100644 index 0000000..1ad061f --- /dev/null +++ b/patches/linux/0001-net-macb-flush-PCIe-posted-write-after-TSTART-doorbe.patch @@ -0,0 +1,70 @@ +From 0000000000000000000000000000000000000001 Mon Sep 17 00:00:00 2001 +From: Lukasz Raczylo +Date: Fri, 24 Apr 2026 00:00:00 +0000 +Subject: [PATCH 1/3] net: macb: flush PCIe posted write after TSTART doorbell + +macb_start_xmit() and macb_tx_restart() both kick transmission by +OR-ing MACB_BIT(TSTART) into NCR. On PCIe-attached macb instances -- +notably BCM2712 + RP1 PCIe south bridge on Raspberry Pi 5 -- the +doorbell write is a posted PCIe write that can sit in the fabric's +write queue until something drains it. A source-level comment at +the TSTART site already acknowledges the problem: + + /* TSTART write might get dropped, so make the IRQ retrigger + * a buffer read */ + +and arms a recovery handshake via queue->tx_pending / +queue->txubr_pending that is picked up on the next TCOMP interrupt. +That recovery path only runs if a TCOMP interrupt actually fires; +if the lost doorbell means no TX starts, there is no TCOMP, and the +ring stalls silently. + +Add a read-back of NCR after the TSTART write. The read serialises +the PCIe posted-write queue and ensures the doorbell reaches the MAC +before macb_start_xmit() / macb_tx_restart() return. The existing +'TSTART might get dropped' handshake is preserved as a safety net +for cases where the fabric genuinely drops the write despite the +read barrier, but with this barrier it should rarely if ever be +needed on PCIe-attached parts. + +Observed to be the most common trigger for the silent TX stall +documented in the linked reports. + +Link: https://github.com/cilium/cilium/issues/43198 +Link: https://bugs.launchpad.net/ubuntu/+source/linux-raspi/+bug/2133877 +Signed-off-by: Lukasz Raczylo +--- + drivers/net/ethernet/cadence/macb_main.c | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c +--- a/drivers/net/ethernet/cadence/macb_main.c ++++ b/drivers/net/ethernet/cadence/macb_main.c +@@ -1949,6 +1949,13 @@ + + spin_lock(&bp->lock); + macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART)); ++ /* ++ * Flush the PCIe posted-write queue so the TSTART doorbell ++ * reliably reaches the MAC. Without this, the write can sit ++ * in the fabric and the MAC never advances, causing a silent ++ * TX stall. ++ */ ++ (void)macb_readl(bp, NCR); + spin_unlock(&bp->lock); + + out_tx_ptr_unlock: +@@ -2630,6 +2637,11 @@ + queue->tx_pending = 1; + + macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART)); ++ /* ++ * Flush the PCIe posted-write queue; see the comment in ++ * macb_tx_restart() for the reasoning. ++ */ ++ (void)macb_readl(bp, NCR); + spin_unlock(&bp->lock); + + if (CIRC_SPACE(queue->tx_head, queue->tx_tail, bp->tx_ring_size) < 1) +-- +2.44.0 diff --git a/patches/linux/0002-net-macb-re-check-ISR-after-IER-re-enable-in-macb_tx.patch b/patches/linux/0002-net-macb-re-check-ISR-after-IER-re-enable-in-macb_tx.patch new file mode 100644 index 0000000..2a5f6dc --- /dev/null +++ b/patches/linux/0002-net-macb-re-check-ISR-after-IER-re-enable-in-macb_tx.patch @@ -0,0 +1,98 @@ +From 0000000000000000000000000000000000000002 Mon Sep 17 00:00:00 2001 +From: Lukasz Raczylo +Date: Fri, 24 Apr 2026 00:00:00 +0000 +Subject: [PATCH 2/3] net: macb: re-check ISR after IER re-enable in + macb_tx_poll + +macb_tx_poll() runs with TCOMP masked, drains the TX ring, then +calls napi_complete_done() and re-enables TCOMP via IER. An +existing comment in the function notes: + + /* Packet completions only seem to propagate to raise + * interrupts when interrupts are enabled at the time, so if + * packets were sent while interrupts were disabled, + * they will not cause another interrupt to be generated when + * interrupts are re-enabled. + */ + +and mitigates this by calling macb_tx_complete_pending() to look +for a completed descriptor whose TX_USED bit the hardware has +DMA'd but whose completion we processed without ever seeing an +interrupt for. + +macb_tx_complete_pending() only inspects driver-visible ring state +(descriptor->ctrl, after rmb()). On PCIe-attached parts (BCM2712 + +RP1 on Raspberry Pi 5 in particular) the descriptor DMA write that +sets TX_USED can still be in flight in the PCIe fabric when we +check. The read-memory-barrier synchronises the CPU view of earlier +CPU writes, but does not force the peripheral's in-flight DMA to +retire. In that window the check returns false, napi exits, the +IER re-enable does not re-fire (the quirk above), and the queue +stalls silently. + +Re-check the hardware's own ISR state as well. Reading a MAC +register after IER re-enable serves two purposes: + + (1) It drains any in-flight PCIe DMA writes of descriptor state, + so a subsequent macb_tx_complete_pending() sees an accurate + view of TX_USED. + + (2) It directly observes whether the hardware currently has a + pending TCOMP signal, catching the case the existing driver + comment describes (completions raised while masked, not + re-fired). + +If either path indicates pending work, schedule NAPI again. + +Combined with the PCIe posted-write flush in patch 1/3, this closes +the observed silent-TX-stall path on BCM2712/RP1 reported at the +links below. + +Link: https://github.com/cilium/cilium/issues/43198 +Link: https://bugs.launchpad.net/ubuntu/+source/linux-raspi/+bug/2133877 +Signed-off-by: Lukasz Raczylo +--- + drivers/net/ethernet/cadence/macb_main.c | 25 +++++++++++++++++------- + 1 file changed, 18 insertions(+), 7 deletions(-) + +diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c +--- a/drivers/net/ethernet/cadence/macb_main.c ++++ b/drivers/net/ethernet/cadence/macb_main.c +@@ -2000,17 +2000,25 @@ + if (work_done < budget && napi_complete_done(napi, work_done)) { + queue_writel(queue, IER, MACB_BIT(TCOMP)); + +- /* Packet completions only seem to propagate to raise +- * interrupts when interrupts are enabled at the time, so if +- * packets were sent while interrupts were disabled, +- * they will not cause another interrupt to be generated when +- * interrupts are re-enabled. +- * Check for this case here to avoid losing a wakeup. This can +- * potentially race with the interrupt handler doing the same +- * actions if an interrupt is raised just after enabling them, +- * but this should be harmless. ++ /* ++ * TCOMP events that fire while the interrupt is masked do ++ * not re-fire when IER is re-enabled. Catch this two ways ++ * to avoid losing a wakeup: ++ * ++ * (1) Read ISR -- catches completions the hardware flagged ++ * but that we did not see as an interrupt. The MMIO ++ * read doubles as a PCIe read barrier, flushing any ++ * in-flight descriptor TX_USED DMA writes into memory. ++ * (2) macb_tx_complete_pending() inspects the ring after ++ * that flush, catching a descriptor whose TX_USED is ++ * now visible as a result of the barrier. ++ * ++ * This can race with the interrupt handler taking the same ++ * path if an interrupt fires just after the IER write; ++ * rescheduling NAPI in that case is harmless. + */ +- if (macb_tx_complete_pending(queue)) { ++ if ((queue_readl(queue, ISR) & MACB_BIT(TCOMP)) || ++ macb_tx_complete_pending(queue)) { + queue_writel(queue, IDR, MACB_BIT(TCOMP)); + if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE) + queue_writel(queue, ISR, MACB_BIT(TCOMP)); +-- +2.44.0 diff --git a/patches/linux/0003-net-macb-add-TX-stall-watchdog-to-recover-from-lost-.patch b/patches/linux/0003-net-macb-add-TX-stall-watchdog-to-recover-from-lost-.patch new file mode 100644 index 0000000..9d10ed2 --- /dev/null +++ b/patches/linux/0003-net-macb-add-TX-stall-watchdog-to-recover-from-lost-.patch @@ -0,0 +1,143 @@ +From 0000000000000000000000000000000000000003 Mon Sep 17 00:00:00 2001 +From: Lukasz Raczylo +Date: Fri, 24 Apr 2026 00:00:00 +0000 +Subject: [PATCH 3/3] net: macb: add TX stall watchdog to recover from lost + TCOMP + +Patches 1/3 and 2/3 close two races by which a TCOMP interrupt can +be lost on PCIe-attached macb instances. This patch adds a +defence-in-depth safety net: a per-queue delayed_work that calls +macb_tx_restart() if queue->tx_tail has not advanced in one second +despite the ring being non-empty. + +The watchdog introduces no new recovery logic. macb_tx_restart() +already exists, is correctly locked, and already checks the +hardware's TBQP against the driver's head index before writing +TSTART: on a healthy ring it is a no-op at the hardware level. All +the watchdog adds is the trigger. + +If patches 1/3 and 2/3 completely eliminate the stall, this code +never does anything beyond a spin_lock/unlock and a branch per +second per queue. If a further race remains -- hardware or +driver-level -- this turns a multi-minute silent hang into a +one-second bump. + +On our 24-node Raspberry Pi 5 fleet this was empirically needed: +before the patches in this series, multiple nodes per day hit the +stall and required external watchdog intervention to recover. + +Link: https://github.com/cilium/cilium/issues/43198 +Link: https://bugs.launchpad.net/ubuntu/+source/linux-raspi/+bug/2133877 +Signed-off-by: Lukasz Raczylo +--- + drivers/net/ethernet/cadence/macb.h | 4 ++ + drivers/net/ethernet/cadence/macb_main.c | 63 ++++++++++++++++++++++++ + 2 files changed, 67 insertions(+) + +diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h +--- a/drivers/net/ethernet/cadence/macb.h ++++ b/drivers/net/ethernet/cadence/macb.h +@@ -1294,6 +1294,11 @@ + struct work_struct tx_error_task; + bool txubr_pending; + bool tx_pending; ++ ++ /* TX stall watchdog -- see macb_tx_stall_watchdog() in macb_main.c */ ++ struct delayed_work tx_stall_watchdog_work; ++ unsigned int tx_stall_last_tail; ++ + struct napi_struct napi_tx; + + dma_addr_t rx_ring_dma; +diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c +--- a/drivers/net/ethernet/cadence/macb_main.c ++++ b/drivers/net/ethernet/cadence/macb_main.c +@@ -2030,6 +2030,59 @@ + return work_done; + } + ++#define MACB_TX_STALL_INTERVAL_MS 1000 ++ ++/* ++ * TX stall watchdog. ++ * ++ * Defence-in-depth against lost TCOMP interrupts. macb already has a ++ * recovery chain (tx_pending -> txubr_pending -> macb_tx_restart()) ++ * that fires on TCOMP; if TCOMP itself is lost the TX ring stalls ++ * silently until something else kicks TSTART. This watchdog runs ++ * once per second per queue, snapshots tx_tail, and calls ++ * macb_tx_restart() if the ring is non-empty and tx_tail has not ++ * advanced since the previous tick. ++ * ++ * macb_tx_restart() already checks the hardware's TBQP against the ++ * driver's head index before re-asserting TSTART, so on a healthy ++ * ring this is a no-op at the hardware level. The watchdog only ++ * adds the missing trigger. ++ */ ++static void macb_tx_stall_watchdog(struct work_struct *work) ++{ ++ struct macb_queue *queue = container_of(to_delayed_work(work), ++ struct macb_queue, ++ tx_stall_watchdog_work); ++ struct macb *bp = queue->bp; ++ unsigned int cur_tail, cur_head; ++ bool stalled = false; ++ unsigned long flags; ++ ++ if (!netif_running(bp->dev)) ++ return; ++ ++ spin_lock_irqsave(&queue->tx_ptr_lock, flags); ++ cur_tail = queue->tx_tail; ++ cur_head = queue->tx_head; ++ if (cur_head != cur_tail && ++ cur_tail == queue->tx_stall_last_tail) ++ stalled = true; ++ else ++ queue->tx_stall_last_tail = cur_tail; ++ spin_unlock_irqrestore(&queue->tx_ptr_lock, flags); ++ ++ if (stalled) { ++ netdev_warn_once(bp->dev, ++ "TX stall detected on queue %u (tail=%u head=%u); re-kicking TSTART\n", ++ (unsigned int)(queue - bp->queues), ++ cur_tail, cur_head); ++ macb_tx_restart(queue); ++ } ++ ++ schedule_delayed_work(&queue->tx_stall_watchdog_work, ++ msecs_to_jiffies(MACB_TX_STALL_INTERVAL_MS)); ++} ++ + static void macb_hresp_error_task(struct work_struct *work) + { + struct macb *bp = from_work(bp, work, hresp_err_bh_work); +@@ -3297,6 +3350,9 @@ + for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) { + napi_enable(&queue->napi_rx); + napi_enable(&queue->napi_tx); ++ queue->tx_stall_last_tail = queue->tx_tail; ++ schedule_delayed_work(&queue->tx_stall_watchdog_work, ++ msecs_to_jiffies(MACB_TX_STALL_INTERVAL_MS)); + } + + macb_init_hw(bp); +@@ -3343,6 +3399,7 @@ + for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) { + napi_disable(&queue->napi_rx); + napi_disable(&queue->napi_tx); ++ cancel_delayed_work_sync(&queue->tx_stall_watchdog_work); + netdev_tx_reset_queue(netdev_get_tx_queue(dev, q)); + } + +@@ -4941,6 +4998,8 @@ + } + + INIT_WORK(&queue->tx_error_task, macb_tx_error_task); ++ INIT_DELAYED_WORK(&queue->tx_stall_watchdog_work, ++ macb_tx_stall_watchdog); + q++; + } + +-- +2.44.0