two bugs and a patch

Love lha at stacken.kth.se
Sat Jan 6 23:22:34 CET 2001


Nickolai Zeldovich <kolya at mit.edu> writes:

> It looks like rxi_DecongestionEvent decrements the refcount on its
> rx_peer too early, so it can potentially be recycled while it is
> still running (arlad crashed on me once earlier today, and rx_peer
> was filled with 0xAA, which my osi_Free memsets all freed memory to.)
> Attached below is a patch which should fix this.

That might be true if rxi_Start() blocks, I've applied the patch since it
seems more right. It seems to be that it might be a problem if
rxi_DecongestionEvent() can run concurrently for same peer, that might
cause problems. I just can't figure out if its happening.
 
> Should there be some per-entry locking on rx_connHashTable, and maybe
> some other hash tables in arla as well? It looks like LWP is preemptive
> so something of this sort would be required, unless I'm confused.

LWP isn't preemptive, its cooperative. Locking is needed when a thread can
block in select or yield. In case you have multi-homed fileservers, the
patch attached might help you.

Love



Version: 2

When changing the peer, update all connections using the old peer.

Index: rx.c
===================================================================
RCS file: /afs/stacken.kth.se/src/SourceRepository/arla/rx/rx.c,v
retrieving revision 1.17
diff -u -w -r1.17 rx/rx.c
--- rx/rx.c	2001/01/05 20:42:10	1.17
+++ rx/rx.c	2001/01/06 22:03:04
@@ -306,6 +306,7 @@
     conn->cid = cid;
     conn->epoch = rx_epoch;
     conn->peer = rxi_FindPeer(shost, sport);
+    queue_Append(&conn->peer->connQueue, conn);
     conn->serviceId = sservice;
     conn->securityObject = securityObject;
     conn->securityData = (void *) 0;
@@ -423,6 +424,7 @@
     /* Make sure that the connection is completely reset before deleting it. */
     rxi_ResetConnection(conn);
 
+    queue_Remove(conn);
     if (--conn->peer->refCount == 0)
 	conn->peer->idleWhen = clock_Sec();
 
@@ -1262,6 +1264,7 @@
 rxi_DestroyPeer(struct rx_peer * peer)
 {
     rxi_RemovePeer(peer);
+    assert(queue_IsEmpty(&peer->connQueue));
     rxi_FreePeer(peer);
     rx_stats.nPeerStructs--;
 }
@@ -1282,9 +1285,19 @@
 	    break;
     }
     if (pp != NULL) {
+	struct rx_connection *conn, *next;
+
 	pp->refCount  += peer->refCount;
 	pp->nSent     += peer->nSent;
 	pp->reSends   += peer->reSends;
+
+	for (queue_Scan(&peer->connQueue, conn, next, rx_connection)) {
+	    conn->peer = pp;
+	    queue_Remove(conn);
+	    queue_Append(&pp->connQueue, conn);
+	}
+
+	assert(queue_IsEmpty(&peer->connQueue));
 	rxi_FreePeer(peer);
 	rx_stats.nPeerStructs--;
 	return pp;
@@ -1361,6 +1374,7 @@
 	conn->next = rx_connHashTable[hashindex];
 	rx_connHashTable[hashindex] = conn;
 	conn->peer = rxi_FindPeer(host, port);
+	queue_Append(&conn->peer->connQueue, conn);
 	conn->maxPacketSize = MIN(conn->peer->packetSize, OLD_MAX_PACKET_SIZE);
 	conn->type = RX_SERVER_CONNECTION;
 	conn->lastSendTime = clock_Sec();	/* don't GC immediately */
Index: rx.h
===================================================================
RCS file: /afs/stacken.kth.se/src/SourceRepository/arla/rx/rx.h,v
retrieving revision 1.18
retrieving revision 1.19
diff -u -w -u -w -r1.18 -r1.19
--- rx/rx.h	2000/11/26 19:42:14	1.18
+++ rx/rx.h	2001/01/06 21:54:20	1.19
@@ -422,6 +422,7 @@
  */
 struct rx_peer {
     struct rx_peer *next;	       /* Next in hash conflict or free list */
+    struct rx_queue connQueue;	       /* a list of all conn use this peer */
     u_long host;		       /* Remote IP address, in net byte
 				        * order */
     u_short port;		       /* Remote UDP port, in net byte order */
@@ -489,6 +490,7 @@
  * limited multiple asynchronous conversations.
  */
 struct rx_connection {
+    struct rx_queue queue_item;        /* conns on same peer */
     struct rx_connection *next;	       /* on hash chain _or_ free list */
     struct rx_peer *peer;
 #ifdef	RX_ENABLE_LOCKS





More information about the Arla-drinkers mailing list