tulip bug, alternative fix, #3 (tested)

Stephen R. van den Berg srb@cuci.nl
Sun Dec 12 15:44:32 1999


Wolfgang Walter wrote:
>I sent description of the bugs to this list and to Donald Becker. As I haven't
>got an answer from him, yet, I don't know if he is already working on a fix or
>what he thinks about this patch. Maybe he's already working on a better fix.

Well, I looked at the patch.  I haven't yet double-checked the exact
I/O commands, but a casual comparison of what you did with past discussions
on the list reveals that you simply disable the "Too much work during
interrupt" compensation.  Donald remarked that this is not a good idea.
I'm not sure whether he's right (I haven't had time to check the logic yet),
but simply skipping it seems almost too easy.

Comparing his patches for v0.91u with the ones you supplied, I came up
with a merged version (still suitable for standalone operation with
2.2.x) that takes the INTR_MITIGATION support into account (I'm not even
sure whether that can make a difference in this lockup case).

Oh yes, incidentally, your patches seem relative to a different 0.91g version
than the one I have (and picked up from Donald's website).  Tiny differences,
but enough to screw up the diffs/patch.

Could people review this merged version (perhaps even test it?) and
tell me whether I made a mess of it?  This patch is relative to
the v0.91g that I got from Donald's website.

--- tulip.c.91g	Sat Dec 11 19:37:43 1999
+++ ../tulip.c	Sun Dec 12 06:27:12 1999
@@ -18,7 +18,7 @@
 */
 
 #define SMP_CHECK
-static const char version[] = "tulip.c:v0.91g 7/16/99 becker@cesdis.gsfc.nasa.gov\n";
+static const char version[] = "tulip.c:v0.91gB 7/16/99 becker@cesdis.gsfc.nasa.gov\n";
 
 /* A few user-configurable values. */
 
@@ -51,6 +51,7 @@
    bonding and packet priority.
    There are no ill effects from too-large receive rings. */
 #define TX_RING_SIZE	16
+#define TX_QUEUE_LEN	10
 #define RX_RING_SIZE	32
 
 /* Set the copy breakpoint for the copy-only-tiny-buffer Rx structure. */
@@ -361,7 +362,7 @@
 	HAS_MII=1, HAS_MEDIA_TABLE=2, CSR12_IN_SROM=4, ALWAYS_CHECK_MII=8,
 	HAS_PWRDWN=0x10, MC_HASH_ONLY=0x20, /* Hash-only multicast filter. */
 	HAS_PNICNWAY=0x80, HAS_NWAY143=0x40,	/* Uses internal NWay xcvr. */
-	HAS_8023X=0x100,
+	HAS_INTR_MITIGATION=0x100, HAS_8023X=0x400,
 };
 static struct tulip_chip_table {
 	char *chip_name;
@@ -375,7 +376,8 @@
   { "Digital DS21140 Tulip", 128, 0x0001ebef,
 	HAS_MII | HAS_MEDIA_TABLE | CSR12_IN_SROM, tulip_timer },
   { "Digital DS21143 Tulip", 128, 0x0801fbff,
-	HAS_MII | HAS_MEDIA_TABLE | ALWAYS_CHECK_MII | HAS_PWRDWN | HAS_NWAY143,
+	HAS_MII | HAS_MEDIA_TABLE | ALWAYS_CHECK_MII | HAS_PWRDWN | HAS_NWAY143
+	| HAS_INTR_MITIGATION,
 	t21142_timer },
   { "Lite-On 82c168 PNIC", 256, 0x0001ebef,
 	HAS_MII | HAS_PNICNWAY, pnic_timer },
@@ -433,7 +435,7 @@
 
 /* The bits in the CSR5 status registers, mostly interrupt sources. */
 enum status_bits {
-	TimerInt=0x800, TPLnkFail=0x1000, TPLnkPass=0x10,
+	TimerInt=0x800, SytemError=0x2000, TPLnkFail=0x1000, TPLnkPass=0x10,
 	NormalIntr=0x10000, AbnormalIntr=0x8000,
 	RxJabber=0x200, RxDied=0x100, RxNoBuf=0x80, RxIntr=0x40,
 	TxFIFOUnderflow=0x20, TxJabber=0x08, TxNoBuf=0x04, TxDied=0x02, TxIntr=0x01,
@@ -530,6 +532,9 @@
 	int cur_index;						/* Current media index. */
 	int saved_if_port;
 	unsigned char pci_bus, pci_devfn;
+	int ttimer;
+	int susp_rx;
+	unsigned long nir;
 	int pad0, pad1;						/* Used for 8-byte alignment */
 };
 
@@ -548,6 +553,7 @@
 static void tulip_tx_timeout(struct device *dev);
 static void tulip_init_ring(struct device *dev);
 static int tulip_start_xmit(struct sk_buff *skb, struct device *dev);
+static int tulip_refill_rx(struct device *dev);
 static int tulip_rx(struct device *dev);
 static void tulip_interrupt(int irq, void *dev_instance, struct pt_regs *regs);
 static int tulip_close(struct device *dev);
@@ -2484,6 +2490,9 @@
 	tp->tx_full = 0;
 	tp->cur_rx = tp->cur_tx = 0;
 	tp->dirty_rx = tp->dirty_tx = 0;
+	tp->susp_rx = 0;
+	tp->ttimer = 0;
+	tp->nir = 0;
 
 	for (i = 0; i < RX_RING_SIZE; i++) {
 		tp->rx_ring[i].status = 0x00000000;
@@ -2545,11 +2554,11 @@
 	tp->tx_skbuff[entry] = skb;
 	tp->tx_ring[entry].buffer1 = virt_to_bus(skb->data);
 
-	if (tp->cur_tx - tp->dirty_tx < TX_RING_SIZE/2) {/* Typical path */
+	if (tp->cur_tx - tp->dirty_tx < TX_QUEUE_LEN/2) {/* Typical path */
 		flag = 0x60000000; /* No interrupt */
-	} else if (tp->cur_tx - tp->dirty_tx == TX_RING_SIZE/2) {
+	} else if (tp->cur_tx - tp->dirty_tx == TX_QUEUE_LEN/2) {
 		flag = 0xe0000000; /* Tx-done intr. */
-	} else if (tp->cur_tx - tp->dirty_tx < TX_RING_SIZE - 2) {
+	} else if (tp->cur_tx - tp->dirty_tx < TX_QUEUE_LEN - 2) {
 		flag = 0x60000000; /* No Tx-done intr. */
 	} else {		/* Leave room for set_rx_mode() to fill entries. */
 		tp->tx_full = 1;
@@ -2578,7 +2587,15 @@
 	struct device *dev = (struct device *)dev_instance;
 	struct tulip_private *tp = (struct tulip_private *)dev->priv;
 	long ioaddr = dev->base_addr;
-	int csr5, work_budget = max_interrupt_work;
+	int csr5;
+	int entry;
+	int missed;
+	int rx = 0;
+	int tx = 0;
+	int oi = 0;
+	int maxrx = RX_RING_SIZE;
+	int maxtx = TX_RING_SIZE;
+	int maxoi = TX_RING_SIZE;
 
 #if defined(__i386__) && defined(SMP_CHECK)
 	if (test_and_set_bit(0, (void*)&dev->interrupt)) {
@@ -2596,6 +2613,8 @@
 	dev->interrupt = 1;
 #endif
 
+	tp->nir++;
+
 	do {
 		csr5 = inl(ioaddr + CSR5);
 		/* Acknowledge all of the current interrupt sources ASAP. */
@@ -2608,10 +2627,12 @@
 		if ((csr5 & (NormalIntr|AbnormalIntr)) == 0)
 			break;
 
-		if (csr5 & (RxIntr | RxNoBuf))
-			work_budget -= tulip_rx(dev);
+		if (csr5 & (RxIntr | RxNoBuf)) {
+			rx += tulip_rx(dev);
+			tulip_refill_rx(dev);
+		}
 
-		if (csr5 & (TxNoBuf | TxDied | TxIntr)) {
+		if (csr5 & (TxNoBuf | TxDied | TxIntr | TimerInt)) {
 			unsigned int dirty_tx;
 
 			for (dirty_tx = tp->dirty_tx; tp->cur_tx - dirty_tx > 0;
@@ -2656,6 +2677,7 @@
 				/* Free the original skb. */
 				dev_free_skb(tp->tx_skbuff[entry]);
 				tp->tx_skbuff[entry] = 0;
+				tx++;
 			}
 
 #ifndef final_version
@@ -2667,7 +2689,7 @@
 #endif
 
 			if (tp->tx_full && dev->tbusy
-				&& tp->cur_tx - dirty_tx  < TX_RING_SIZE - 2) {
+				&& tp->cur_tx - dirty_tx  < TX_QUEUE_LEN - 4) {
 				/* The ring is no longer full, clear tbusy. */
 				tp->tx_full = 0;
 				dev->tbusy = 0;
@@ -2698,38 +2720,77 @@
 				/* Restart the transmit process. */
 				outl(tp->csr6 | 0x0002, ioaddr + CSR6);
 				outl(tp->csr6 | 0x2002, ioaddr + CSR6);
+				outl(0, ioaddr + CSR1);
 			}
 			if (csr5 & RxDied) {		/* Missed a Rx frame. */
 				tp->stats.rx_errors++;
 				tp->stats.rx_missed_errors += inl(ioaddr + CSR8) & 0xffff;
 				outl(tp->csr6 | 0x2002, ioaddr + CSR6);
 			}
-			if (csr5 & TimerInt) {
-				if (tulip_debug > 2)
-					printk(KERN_ERR "%s: Re-enabling interrupts, %8.8x.\n",
-						   dev->name, csr5);
-				outl(tulip_tbl[tp->chip_id].valid_intrs, ioaddr + CSR7);
-			}
 			if (csr5 & (TPLnkPass | TPLnkFail | 0x08000000)) {
 				if (tp->link_change)
 					(tp->link_change)(dev, csr5);
 			}
+			if (csr5 & SytemError) {
+				printk(KERN_ERR "%s: (%lu) System Error occured\n", dev->name, tp->nir);
+			}
 			/* Clear all error sources, included undocumented ones! */
 			outl(0x0800f7ba, ioaddr + CSR5);
+			oi++;
 		}
-		if (--work_budget < 0) {
+		if (csr5 & TimerInt) {
+#if 0
+			if (tulip_debug > 2)
+				printk(KERN_ERR "%s: Re-enabling interrupts, %8.8x.\n",
+					   dev->name, csr5);
+			outl(tulip_tbl[tp->chip_id].valid_intrs, ioaddr + CSR7);
+#endif
+			tp->ttimer = 0;
+			oi++;
+		}
+		if (tx > maxtx || rx > maxrx || oi > maxoi) {
+#if 0
 			if (tulip_debug > 1)
+#endif
 				printk(KERN_WARNING "%s: Too much work during an interrupt, "
-					   "csr5=0x%8.8x.\n", dev->name, csr5);
+					   "csr5=0x%8.8x. (%lu) (%d,%d,%d)\n", dev->name, csr5, tp->nir, tx, rx, oi);
 			/* Acknowledge all interrupt sources. */
+#if 1
 			outl(0x8001ffff, ioaddr + CSR5);
-			/* Clear all interrupting sources, set timer to re-enable. */
-			outl(((~csr5) & 0x0001ebef) | 0x0800, ioaddr + CSR7);
-			outl(12, ioaddr + CSR11);
+			if (tp->flags & HAS_INTR_MITIGATION)
+				outl(0x45240000, ioaddr + CSR11);
+			else {
+				/* Mask all interrupting sources, set timer to re-enable. */
+				outl(((~csr5) & 0x0001ebef) | AbnormalIntr | TimerInt,
+					 ioaddr + CSR7);
+				outl(0x0012, ioaddr + CSR11);
+			}
+ 			tp->ttimer = 1;
+#endif
 			break;
 		}
 	} while (1);
 
+	tulip_refill_rx(dev);
+
+	/* Check whether the card is in suspend mode. */
+	entry = tp->dirty_rx % RX_RING_SIZE;
+	if (tp->rx_skbuff[entry] == NULL) {
+		printk(KERN_WARNING "%s: in rx suspend mode: (%lu) (tp->cur_rx = %u, ttimer = %d, rx = %d) go/stay in suspend mode\n", dev->name, tp->nir, tp->cur_rx, tp->ttimer, rx);
+		if (tp->ttimer == 0 || (inl(ioaddr + CSR11) & 0xffff) == 0) {
+			printk(KERN_WARNING "%s: in rx suspend mode: (%lu) set timer\n", dev->name, tp->nir);
+			outl(tulip_tbl[tp->chip_id].valid_intrs | TimerInt,
+				ioaddr + CSR7);
+			outl(TimerInt, ioaddr + CSR5);
+			outl(12, ioaddr + CSR11);
+			tp->ttimer = 1;
+		}
+	}
+
+	if ((missed = inl(ioaddr + CSR8) & 0x1ffff)) {
+		tp->stats.rx_dropped += missed & 0x10000 ? 0x10000 : missed;
+	}
+
 	if (tulip_debug > 3)
 		printk(KERN_DEBUG "%s: exiting interrupt, csr5=%#4.4x.\n",
 			   dev->name, inl(ioaddr + CSR5));
@@ -2742,13 +2803,36 @@
 	return;
 }
 
+static int tulip_refill_rx(struct device *dev)
+{
+	struct tulip_private *tp = (struct tulip_private *)dev->priv;
+	int entry;
+	int refilled = 0;
+
+	/* Refill the Rx ring buffers. */
+	for (; tp->cur_rx - tp->dirty_rx > 0; tp->dirty_rx++) {
+		entry = tp->dirty_rx % RX_RING_SIZE;
+		if (tp->rx_skbuff[entry] == NULL) {
+			struct sk_buff *skb;
+			skb = tp->rx_skbuff[entry] = dev_alloc_skb(PKT_BUF_SZ);
+			if (skb == NULL)
+				break;
+			skb->dev = dev;			/* Mark as being used by this device. */
+			tp->rx_ring[entry].buffer1 = virt_to_bus(skb->tail);
+			refilled++;
+		}
+		tp->rx_ring[entry].status = cpu_to_le32(DescOwned);
+	}
+	return refilled;
+}
+
 static int
 tulip_rx(struct device *dev)
 {
 	struct tulip_private *tp = (struct tulip_private *)dev->priv;
 	int entry = tp->cur_rx % RX_RING_SIZE;
 	int rx_work_limit = tp->dirty_rx + RX_RING_SIZE - tp->cur_rx;
-	int work_done = 0;
+	int received = 0;
 
 	if (tulip_debug > 4)
 		printk(KERN_DEBUG " In tulip_rx(), entry %d %8.8x.\n", entry,
@@ -2810,7 +2894,6 @@
 				memcpy(skb_put(skb, pkt_len),
 					   bus_to_virt(tp->rx_ring[entry].buffer1), pkt_len);
 #endif
-				work_done++;
 			} else { 	/* Pass up the skb already on the Rx ring. */
 				char *temp = skb_put(skb = tp->rx_skbuff[entry], pkt_len);
 				tp->rx_skbuff[entry] = NULL;
@@ -2830,25 +2913,11 @@
 			tp->stats.rx_bytes += pkt_len;
 #endif
 		}
+		received++;
 		entry = (++tp->cur_rx) % RX_RING_SIZE;
 	}
 
-	/* Refill the Rx ring buffers. */
-	for (; tp->cur_rx - tp->dirty_rx > 0; tp->dirty_rx++) {
-		entry = tp->dirty_rx % RX_RING_SIZE;
-		if (tp->rx_skbuff[entry] == NULL) {
-			struct sk_buff *skb;
-			skb = tp->rx_skbuff[entry] = dev_alloc_skb(PKT_BUF_SZ);
-			if (skb == NULL)
-				break;
-			skb->dev = dev;			/* Mark as being used by this device. */
-			tp->rx_ring[entry].buffer1 = virt_to_bus(skb->tail);
-			work_done++;
-		}
-		tp->rx_ring[entry].status = DescOwned;
-	}
-
-	return work_done;
+	return received;
 }
 
 static int
-- 
Sincerely,                                                          srb@cuci.nl
           Stephen R. van den Berg (AKA BuGless).

Climate is what you expect.  Weather is what you get.