static void
new_switch(struct switch_ *sw, struct vconn *vconn, const char *name)
{
- sw->rconn = rconn_new_from_vconn(name, 128, vconn);
+ sw->rconn = rconn_new_from_vconn(name, vconn);
sw->lswitch = lswitch_create(sw->rconn, learn_macs,
setup_flows ? max_idle : -1);
}
void *l7; /* Application data. */
struct buffer *next; /* Next in a list of buffers. */
+ void *private; /* Private pointer for use by owner. */
};
void buffer_use(struct buffer *, void *, size_t);
void queue_clear(struct queue *);
void queue_advance_head(struct queue *, struct buffer *next);
void queue_push_tail(struct queue *, struct buffer *);
+struct buffer *queue_pop_head(struct queue *);
#endif /* queue.h */
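/* Illustration (not part of the patch): the owner-pointer pattern that the
 * new 'private' field and queue_pop_head() support.  A hypothetical
 * producer stashes a counter on each buffer it queues; whoever later
 * dequeues the buffer recovers the counter and decrements it.  rconn_send()
 * and flush_queue() below use the same idea. */
static void
producer(struct queue *q, struct buffer *b, int *counter)
{
    b->private = counter;       /* Remember whose count to adjust. */
    ++*counter;
    queue_push_tail(q, b);
}

static void
consumer(struct queue *q)
{
    struct buffer *b = queue_pop_head(q);
    int *counter = b->private;  /* Implicit void * to int * conversion. */
    if (counter) {
        --*counter;             /* The packet is no longer queued. */
    }
    buffer_delete(b);
}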
struct vconn;
-struct rconn *rconn_new(const char *name, int txq_limit,
+struct rconn *rconn_new(const char *name,
int inactivity_probe_interval, int max_backoff);
-struct rconn *rconn_new_from_vconn(const char *name, int txq_limit,
- struct vconn *);
-struct rconn *rconn_create(int txq_limit, int inactivity_probe_interval,
- int max_backoff);
+struct rconn *rconn_new_from_vconn(const char *name, struct vconn *);
+struct rconn *rconn_create(int inactivity_probe_interval, int max_backoff);
int rconn_connect(struct rconn *, const char *name);
void rconn_connect_unreliably(struct rconn *,
const char *name, struct vconn *vconn);
void rconn_run_wait(struct rconn *);
struct buffer *rconn_recv(struct rconn *);
void rconn_recv_wait(struct rconn *);
-int rconn_send(struct rconn *, struct buffer *);
-int rconn_force_send(struct rconn *, struct buffer *);
-bool rconn_is_full(const struct rconn *);
+int rconn_send(struct rconn *, struct buffer *, int *n_queued);
+int rconn_send_with_limit(struct rconn *, struct buffer *,
+ int *n_queued, int queue_limit);
unsigned int rconn_packets_sent(const struct rconn *);
const char *rconn_get_name(const struct rconn *);
b->size = 0;
b->l2 = b->l3 = b->l4 = b->l7 = NULL;
b->next = NULL;
+ b->private = NULL;
}
/* Initializes 'b' as a buffer with an initial capacity of 'size' bytes. */
uint64_t datapath_id;
time_t last_features_request;
struct mac_learning *ml; /* NULL to act as hub instead of switch. */
+
+ /* Number of outgoing queued packets on the rconn. */
+ int n_queued;
};
static void queue_tx(struct lswitch *, struct rconn *, struct buffer *);
struct lswitch *
lswitch_create(struct rconn *rconn, bool learn_macs, int max_idle)
{
- struct lswitch *sw = xmalloc(sizeof *sw);
- memset(sw, 0, sizeof *sw);
+ struct lswitch *sw = xcalloc(1, sizeof *sw);
sw->max_idle = max_idle;
sw->datapath_id = 0;
sw->last_features_request = time_now() - 1;
static void
queue_tx(struct lswitch *sw, struct rconn *rconn, struct buffer *b)
{
- int retval = rconn_send(rconn, b);
+ int retval = rconn_send_with_limit(rconn, b, &sw->n_queued, 10);
if (retval) {
-        if (retval == EAGAIN) {
-            /* FIXME: ratelimit. */
        /* FIXME: ratelimit. */
        VLOG_WARN("%s: send: %s", rconn_get_name(rconn), strerror(retval));
-        }
-        buffer_delete(b);
}
}
check_queue(q);
}
+/* Removes the first buffer from 'q', which must not be empty, and returns
+ * it. The caller must free the buffer (with buffer_delete()) when it is no
+ * longer needed. */
+struct buffer *
+queue_pop_head(struct queue *q)
+{
+ struct buffer *head = q->head;
+ queue_advance_head(q, head->next);
+ return head;
+}
+
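/* Usage sketch (hypothetical, not in the patch): because queue_pop_head()
 * transfers ownership to the caller, draining a queue is a pop-and-delete
 * loop, just as flush_queue() in rconn.c does. */
static void
drain_queue(struct queue *q)
{
    while (q->n > 0) {
        buffer_delete(queue_pop_head(q));
    }
}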
/* Checks the internal integrity of 'q'. For use in debugging. */
static void
check_queue(struct queue *q)
bool reliable;
struct queue txq;
- int txq_limit;
int backoff;
int max_backoff;
static int try_send(struct rconn *);
static int reconnect(struct rconn *);
static void disconnect(struct rconn *, int error);
+static void flush_queue(struct rconn *);
static void question_connectivity(struct rconn *);
/* Creates a new rconn, connects it (reliably) to 'name', and returns it. */
struct rconn *
-rconn_new(const char *name, int txq_limit, int inactivity_probe_interval,
- int max_backoff)
+rconn_new(const char *name, int inactivity_probe_interval, int max_backoff)
{
- struct rconn *rc = rconn_create(txq_limit, inactivity_probe_interval,
- max_backoff);
+ struct rconn *rc = rconn_create(inactivity_probe_interval, max_backoff);
rconn_connect(rc, name);
return rc;
}
/* Creates a new rconn, connects it (unreliably) to 'vconn', and returns it. */
struct rconn *
-rconn_new_from_vconn(const char *name, int txq_limit, struct vconn *vconn)
+rconn_new_from_vconn(const char *name, struct vconn *vconn)
{
- struct rconn *rc = rconn_create(txq_limit, 60, 0);
+ struct rconn *rc = rconn_create(60, 0);
rconn_connect_unreliably(rc, name, vconn);
return rc;
}
/* Creates and returns a new rconn.
- *
- * 'txq_limit' is the maximum length of the send queue, in packets.
*
* 'probe_interval' is a number of seconds. If the interval passes once
* without an OpenFlow message being received from the peer, the rconn sends
* an inactivity probe. If the interval passes again without a message being
* received, the rconn disconnects and re-connects to the peer. Setting
* 'probe_interval' to 0 disables this behavior.
*
* 'max_backoff' is the maximum number of seconds between attempts to connect
* to the peer. The actual interval starts at 1 second and doubles on each
* failure until it reaches 'max_backoff'. If 0 is specified, the default of
* 60 seconds is used. */
struct rconn *
-rconn_create(int txq_limit, int probe_interval, int max_backoff)
+rconn_create(int probe_interval, int max_backoff)
{
struct rconn *rc = xcalloc(1, sizeof *rc);
rc->reliable = false;
queue_init(&rc->txq);
- assert(txq_limit > 0);
- rc->txq_limit = txq_limit;
rc->backoff = 0;
rc->max_backoff = max_backoff ? max_backoff : 60;
if (rc) {
free(rc->name);
vconn_close(rc->vconn);
+ flush_queue(rc);
queue_destroy(&rc->txq);
free(rc);
}
}
}
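/* Lifecycle sketch under the new signatures (hypothetical caller; the vconn
 * name "tcp:10.0.0.1" is only an example): probe for inactivity every 15
 * seconds and wait at most 60 seconds between reconnection attempts.  There
 * is no txq_limit argument anymore; queue limits are now the sender's
 * business, via 'n_queued' or rconn_send_with_limit() below. */
static void
example_lifecycle(void)
{
    struct rconn *rc = rconn_create(15, 60);
    rconn_connect(rc, "tcp:10.0.0.1");
    /* ...call rconn_run(rc) and rconn_recv(rc) from the main loop... */
    rconn_destroy(rc);
}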
-/* Sends 'b' on 'rc'. Returns 0 if successful, EAGAIN if at least 'txq_limit'
- * packets are already queued, otherwise a positive errno value. */
+/* Sends 'b' on 'rc'. Returns 0 if successful (in which case 'b' is
+ * destroyed), or ENOTCONN if 'rc' is not currently connected (in which case
+ * the caller retains ownership of 'b').
+ *
+ * If 'n_queued' is non-null, then '*n_queued' will be incremented while the
+ * packet is in flight, then decremented when it has been sent (or discarded
+ * due to disconnection). Because 'b' may be sent (or discarded) before this
+ * function returns, the caller may not be able to observe any change in
+ * '*n_queued'.
+ *
+ * There is no rconn_send_wait() function: an rconn has a send queue that it
+ * takes care of sending if you call rconn_run(), which will have the side
+ * effect of waking up poll_block(). */
int
-do_send(struct rconn *rc, struct buffer *b, int txq_limit)
+rconn_send(struct rconn *rc, struct buffer *b, int *n_queued)
{
if (rc->vconn) {
- if (rc->txq.n < txq_limit) {
- queue_push_tail(&rc->txq, b);
- if (rc->txq.n == 1) {
- try_send(rc);
- }
- return 0;
- } else {
- return EAGAIN;
+ b->private = n_queued;
+ if (n_queued) {
+ ++*n_queued;
}
+ queue_push_tail(&rc->txq, b);
+ if (rc->txq.n == 1) {
+ try_send(rc);
+ }
+ return 0;
} else {
return ENOTCONN;
}
}
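/* Caller-side sketch (hypothetical helper): on ENOTCONN the caller still
 * owns the buffer and must free it; on success the rconn owns 'b' and
 * '*n_queued' stays incremented until the packet is sent or discarded. */
static void
send_or_drop(struct rconn *rc, struct buffer *b, int *n_queued)
{
    if (rconn_send(rc, b, n_queued)) {
        buffer_delete(b);       /* Not connected; we still own 'b'. */
    }
}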
-/* Sends 'b' on 'rc'. Returns 0 if successful, EAGAIN if the send queue is
- * full, or ENOTCONN if 'rc' is not currently connected.
+/* Sends 'b' on 'rc'. Increments '*n_queued' while the packet is in flight; it
+ * will be decremented when it has been sent (or discarded due to
+ * disconnection). Returns 0 if successful, EAGAIN if '*n_queued' is already
* at least as large as 'queue_limit', or ENOTCONN if 'rc' is not currently
+ * connected. Regardless of return value, 'b' is destroyed.
+ *
+ * Because 'b' may be sent (or discarded) before this function returns, the
+ * caller may not be able to observe any change in '*n_queued'.
*
* There is no rconn_send_wait() function: an rconn has a send queue that it
* takes care of sending if you call rconn_run(), which will have the side
* effect of waking up poll_block(). */
int
-rconn_send(struct rconn *rc, struct buffer *b)
-{
- return do_send(rc, b, rc->txq_limit);
-}
-
-/* Sends 'b' on 'rc'. Returns 0 if successful, EAGAIN if the send queue is
- * full, otherwise a positive errno value.
- *
- * Compared to rconn_send(), this function relaxes the queue limit, allowing
- * more packets than usual to be queued. */
-int
-rconn_force_send(struct rconn *rc, struct buffer *b)
+rconn_send_with_limit(struct rconn *rc, struct buffer *b,
+ int *n_queued, int queue_limit)
{
- return do_send(rc, b, 2 * rc->txq_limit);
-}
-
-/* Returns true if 'rc''s send buffer is full,
- * false if it has room for at least one more packet. */
-bool
-rconn_is_full(const struct rconn *rc)
-{
- return rc->txq.n >= rc->txq_limit;
+ int retval;
+ retval = *n_queued >= queue_limit ? EAGAIN : rconn_send(rc, b, n_queued);
+ if (retval) {
+ buffer_delete(b);
+ }
+ return retval;
}
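/* Caller-side sketch (hypothetical 'struct client'): unlike rconn_send(),
 * 'b' is always consumed here, so the call site needs no cleanup and only
 * decides whether an error is worth logging; ENOTCONN may be expected while
 * the rconn is reconnecting, so this sketch skips it. */
static void
example_limited_send(struct rconn *rc, struct buffer *b, struct client *cl)
{
    int error = rconn_send_with_limit(rc, b, &cl->n_queued, 10);
    if (error && error != ENOTCONN) {
        VLOG_WARN("%s: send: %s", rconn_get_name(rc), strerror(error));
    }
}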
/* Returns the total number of packets successfully sent on the underlying
{
int retval = 0;
struct buffer *next = rc->txq.head->next;
+ int *n_queued = rc->txq.head->private;
retval = vconn_send(rc->vconn, rc->txq.head);
    if (retval) {
        if (retval != EAGAIN) {
            disconnect(rc, retval);
        }
        return retval;
    }
rc->packets_sent++;
+ if (n_queued) {
+ --*n_queued;
+ }
queue_advance_head(&rc->txq, next);
return 0;
}
}
vconn_close(rc->vconn);
rc->vconn = NULL;
- queue_clear(&rc->txq);
+ flush_queue(rc);
}
if (now >= rc->backoff_deadline) {
}
}
+/* Drops all the packets from 'rc''s send queue and decrements their queue
+ * counts. */
+static void
+flush_queue(struct rconn *rc)
+{
+ while (rc->txq.n > 0) {
+ struct buffer *b = queue_pop_head(&rc->txq);
+ int *n_queued = b->private;
+ if (n_queued) {
+ --*n_queued;
+ }
+ buffer_delete(b);
+ }
+}
+
static unsigned int
elapsed_in_this_state(const struct rconn *rc)
{
struct half {
struct rconn *rconn;
struct buffer *rxbuf;
+ int n_txq; /* No. of packets queued for tx on 'rconn'. */
};
struct relay {
daemonize();
/* Connect to datapath. */
- local_rconn = rconn_create(1, 0, s.max_backoff);
+ local_rconn = rconn_create(0, s.max_backoff);
rconn_connect(local_rconn, s.nl_name);
/* Connect to controller. */
- remote_rconn = rconn_create(1, s.probe_interval, s.max_backoff);
+ remote_rconn = rconn_create(s.probe_interval, s.max_backoff);
if (s.controller_name) {
retval = rconn_connect(remote_rconn, s.controller_name);
if (retval == EAFNOSUPPORT) {
}
/* Create and return relay. */
- r1 = rconn_create(1, 0, 0);
+ r1 = rconn_create(0, 0);
rconn_connect_unreliably(r1, nl_name_without_subscription, new_local);
free(nl_name_without_subscription);
- r2 = rconn_create(1, 0, 0);
+ r2 = rconn_create(0, 0);
rconn_connect_unreliably(r2, "passive", new_remote);
return relay_create(r1, r2, true);
static struct relay *
relay_create(struct rconn *local, struct rconn *remote, bool is_mgmt_conn)
{
- struct relay *r;
- int i;
-
- r = xmalloc(sizeof *r);
+ struct relay *r = xcalloc(1, sizeof *r);
r->halves[HALF_LOCAL].rconn = local;
r->halves[HALF_REMOTE].rconn = remote;
- for (i = 0; i < 2; i++) {
- r->halves[i].rxbuf = NULL;
- }
r->is_mgmt_conn = is_mgmt_conn;
return r;
}
}
}
- if (this->rxbuf) {
- int retval = rconn_send(peer->rconn, this->rxbuf);
+ if (this->rxbuf && !this->n_txq) {
+ int retval = rconn_send(peer->rconn, this->rxbuf,
+ &this->n_txq);
if (retval != EAGAIN) {
if (!retval) {
progress = true;
struct mac_learning *ml;
struct netdev *of_device;
uint8_t mac[ETH_ADDR_LEN];
+ int n_queued;
};
static void
-queue_tx(struct rconn *rc, struct buffer *b)
+queue_tx(struct rconn *rc, struct in_band_data *in_band, struct buffer *b)
{
- if (rconn_force_send(rc, b)) {
- buffer_delete(b);
- }
+ rconn_send_with_limit(rc, b, &in_band->n_queued, 10);
}
static const uint8_t *
&& in_port == mac_learning_lookup(in_band->ml,
controller_mac)) {
/* Drop controller traffic that arrives on the controller port. */
- queue_tx(rc, make_add_flow(&flow, ntohl(opi->buffer_id),
- in_band->s->max_idle, 0));
+ queue_tx(rc, in_band, make_add_flow(&flow, ntohl(opi->buffer_id),
+ in_band->s->max_idle, 0));
return true;
} else {
return false;
if (out_port != OFPP_FLOOD) {
/* The output port is known, so add a new flow. */
- queue_tx(rc, make_add_simple_flow(&flow, ntohl(opi->buffer_id),
- out_port, in_band->s->max_idle));
+ queue_tx(rc, in_band,
+ make_add_simple_flow(&flow, ntohl(opi->buffer_id),
+ out_port, in_band->s->max_idle));
/* If the switch didn't buffer the packet, we need to send a copy. */
if (ntohl(opi->buffer_id) == UINT32_MAX) {
- queue_tx(rc, make_unbuffered_packet_out(&pkt, in_port, out_port));
+ queue_tx(rc, in_band,
+ make_unbuffered_packet_out(&pkt, in_port, out_port));
}
} else {
/* We don't know that MAC. Send along the packet without setting up a
b = make_buffered_packet_out(ntohl(opi->buffer_id),
in_port, out_port);
}
- queue_tx(rc, b);
+ queue_tx(rc, in_band, b);
}
return true;
}
struct in_band_data *in_band;
int retval;
- in_band = xmalloc(sizeof *in_band);
+ in_band = xcalloc(1, sizeof *in_band);
in_band->s = s;
in_band->ml = mac_learning_create();
retval = netdev_open(s->of_name, NETDEV_ETH_TYPE_NONE,
struct remote {
struct list node;
struct rconn *rconn;
+#define TXQ_LIMIT 128 /* Max number of packets to queue for tx. */
+ int n_txq; /* Number of packets queued for tx on rconn. */
/* Support for reliable, multi-message replies to requests.
*
}
break;
}
- remote_create(dp, rconn_new_from_vconn("passive", 128, new_vconn));
+ remote_create(dp, rconn_new_from_vconn("passive", new_vconn));
}
}
}
}
buffer_delete(buffer);
} else {
- if (!rconn_is_full(r->rconn)) {
+ if (r->n_txq < TXQ_LIMIT) {
int error = r->cb_dump(dp, r->cb_aux);
if (error <= 0) {
if (error) {
int retval;
update_openflow_length(buffer);
- retval = rconn_send(rconn, buffer);
+ retval = (remote->n_txq < TXQ_LIMIT
+ ? rconn_send(rconn, buffer, &remote->n_txq)
+ : EAGAIN);
if (retval) {
VLOG_WARN("send to %s failed: %s",
rconn_get_name(rconn), strerror(retval));
fatal(0, "missing controller argument; use --help for usage");
}
- rconn = rconn_create(128, 60, max_backoff);
+ rconn = rconn_create(60, max_backoff);
error = rconn_connect(rconn, argv[optind]);
if (error == EAFNOSUPPORT) {
fatal(0, "no support for %s vconn", argv[optind]);