From 7d7447df778f7c9eacd0506369103e823bf20d6f Mon Sep 17 00:00:00 2001
From: Ben Pfaff <blp@nicira.com>
Date: Mon, 16 Apr 2012 16:01:01 -0700
Subject: [PATCH] netlink: Postpone choosing sequence numbers until send time.

Choosing sequence numbers at time of creating a packet means that
nl_sock_transact_multiple() has to search for the sequence number
of a reply, because the sequence numbers of the requests aren't
necessarily sequential.  This commit makes it possible to avoid
the search, by deferring choice of sequence numbers until the
time that we send the packets.  It doesn't actually modify
nl_sock_transact_multiple(), which will happen in a later commit.

Previously, I was concerned about a theoretical race condition
described in a comment in the old versino of this code:

    This implementation uses sequence numbers that are unique
    process-wide, to avoid a hypothetical race: send request, close
    socket, open new socket that reuses the old socket's PID value,
    send request on new socket, receive reply from kernel to old
    socket but with same PID and sequence number.  (This race could be
    avoided other ways, e.g. by preventing PIDs from being quickly
    reused).

However, I no longer believe that this can be a real problem,
because Netlink operates synchronously.  The reply to a request
will always arrive before the socket can be closed and a new
socket opened with the old socket's PID.

Signed-off-by: Ben Pfaff <blp@nicira.com>
---
 lib/netlink-socket.c | 34 +++++++++++++++++++++++++++-------
 lib/netlink.c        | 29 +++++------------------------
 2 files changed, 32 insertions(+), 31 deletions(-)

diff --git a/lib/netlink-socket.c b/lib/netlink-socket.c
index 15a800b4..9501e453 100644
--- a/lib/netlink-socket.c
+++ b/lib/netlink-socket.c
@@ -54,6 +54,7 @@ COVERAGE_DEFINE(netlink_sent);
  * information.  Also, at high logging levels we log *all* Netlink messages. */
 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 600);
 
+static uint32_t nl_sock_allocate_seq(struct nl_sock *, unsigned int n);
 static void log_nlmsg(const char *function, int error,
                       const void *message, size_t size, int protocol);
 
@@ -62,6 +63,7 @@ static void log_nlmsg(const char *function, int error,
 struct nl_sock
 {
     int fd;
+    uint32_t next_seq;
     uint32_t pid;
     int protocol;
     struct nl_dump *dump;
@@ -122,6 +124,7 @@ nl_sock_create(int protocol, struct nl_sock **sockp)
     }
     sock->protocol = protocol;
     sock->dump = NULL;
+    sock->next_seq = 1;
 
     rcvbuf = 1024 * 1024;
     if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUFFORCE,
@@ -256,6 +259,7 @@ nl_sock_send__(struct nl_sock *sock, const struct ofpbuf *msg, bool wait)
     int error;
 
     nlmsg->nlmsg_len = msg->size;
+    nlmsg->nlmsg_seq = nl_sock_allocate_seq(sock, 1);
     nlmsg->nlmsg_pid = sock->pid;
     do {
         int retval;
@@ -522,7 +526,6 @@ nl_sock_transact_multiple__(struct nl_sock *sock,
  *
  * Before sending each message, this function will finalize nlmsg_len in each
  * 'request' to match the ofpbuf's size, and set nlmsg_pid to 'sock''s pid.
- * NLM_F_ACK will be added to some requests' nlmsg_flags.
  *
  * Bare Netlink is an unreliable transport protocol.  This function layers
  * reliable delivery and reply semantics on top of bare Netlink.  See
@@ -597,9 +600,9 @@ nl_sock_transact_multiple(struct nl_sock *sock,
  * on failure '*replyp' is set to NULL.  If 'replyp' is null, then the kernel's
  * reply, if any, is discarded.
  *
- * nlmsg_len in 'msg' will be finalized to match msg->size, and nlmsg_pid will
- * be set to 'sock''s pid, before the message is sent.  NLM_F_ACK will be set
- * in nlmsg_flags.
+ * Before the message is sent, nlmsg_len in 'request' will be finalized to
+ * match msg->size, nlmsg_pid will be set to 'sock''s pid, and nlmsg_seq will
+ * be initialized, NLM_F_ACK will be set in nlmsg_flags.
  *
  * The caller is responsible for destroying 'request'.
  *
@@ -721,9 +724,6 @@ void
 nl_dump_start(struct nl_dump *dump,
               struct nl_sock *sock, const struct ofpbuf *request)
 {
-    struct nlmsghdr *nlmsghdr = nl_msg_nlmsghdr(request);
-    nlmsghdr->nlmsg_flags |= NLM_F_DUMP | NLM_F_ACK;
-    dump->seq = nlmsghdr->nlmsg_seq;
     dump->buffer = NULL;
     if (sock->dump) {
         /* 'sock' already has an ongoing dump.  Clone the socket because
@@ -737,7 +737,10 @@ nl_dump_start(struct nl_dump *dump,
         dump->sock = sock;
         dump->status = 0;
     }
+
+    nl_msg_nlmsghdr(request)->nlmsg_flags |= NLM_F_DUMP | NLM_F_ACK;
     dump->status = nl_sock_send__(sock, request, true);
+    dump->seq = nl_msg_nlmsghdr(request)->nlmsg_seq;
 }
 
 /* Helper function for nl_dump_next(). */
@@ -1057,6 +1060,23 @@ nl_lookup_genl_family(const char *name, int *number)
     return *number > 0 ? 0 : -*number;
 }
 
+static uint32_t
+nl_sock_allocate_seq(struct nl_sock *sock, unsigned int n)
+{
+    uint32_t seq = sock->next_seq;
+
+    sock->next_seq += n;
+
+    /* Make it impossible for the next request for sequence numbers to wrap
+     * around to 0.  Start over with 1 to avoid ever using a sequence number of
+     * 0, because the kernel uses sequence number 0 for notifications. */
+    if (sock->next_seq >= UINT32_MAX / 2) {
+        sock->next_seq = 1;
+    }
+
+    return seq;
+}
+
 static void
 nlmsghdr_to_string(const struct nlmsghdr *h, int protocol, struct ds *ds)
 {
diff --git a/lib/netlink.c b/lib/netlink.c
index 64450497..6e0701c4 100644
--- a/lib/netlink.c
+++ b/lib/netlink.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008, 2009, 2010, 2011 Nicira Networks.
+ * Copyright (c) 2008, 2009, 2010, 2011, 2012 Nicira Networks.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -88,26 +88,6 @@ nl_msg_reserve(struct ofpbuf *msg, size_t size)
     ofpbuf_prealloc_tailroom(msg, NLMSG_ALIGN(size));
 }
 
-static uint32_t
-get_nlmsg_seq(void)
-{
-    /* Next nlmsghdr sequence number.
-     *
-     * This implementation uses sequence numbers that are unique process-wide,
-     * to avoid a hypothetical race: send request, close socket, open new
-     * socket that reuses the old socket's PID value, send request on new
-     * socket, receive reply from kernel to old socket but with same PID and
-     * sequence number.  (This race could be avoided other ways, e.g. by
-     * preventing PIDs from being quickly reused). */
-    static uint32_t next_seq;
-
-    if (next_seq == 0) {
-        /* Pick initial sequence number. */
-        next_seq = getpid() ^ time_wall();
-    }
-    return next_seq++;
-}
-
 /* Puts a nlmsghdr at the beginning of 'msg', which must be initially empty.
  * Uses the given 'type' and 'flags'.  'expected_payload' should be
  * an estimate of the number of payload bytes to be supplied; if the size of
@@ -121,8 +101,9 @@ get_nlmsg_seq(void)
  * is often NLM_F_REQUEST indicating that a request is being made, commonly
  * or'd with NLM_F_ACK to request an acknowledgement.
  *
- * Sets the new nlmsghdr's nlmsg_pid field to 0 for now.  nl_sock_send() will
- * fill it in just before sending the message.
+ * Sets the new nlmsghdr's nlmsg_len, nlmsg_seq, and nlmsg_pid fields to 0 for
+ * now.  Functions that send Netlink messages will fill these in just before
+ * sending the message.
  *
  * nl_msg_put_genlmsghdr() is more convenient for composing a Generic Netlink
  * message. */
@@ -139,7 +120,7 @@ nl_msg_put_nlmsghdr(struct ofpbuf *msg,
     nlmsghdr->nlmsg_len = 0;
     nlmsghdr->nlmsg_type = type;
     nlmsghdr->nlmsg_flags = flags;
-    nlmsghdr->nlmsg_seq = get_nlmsg_seq();
+    nlmsghdr->nlmsg_seq = 0;
     nlmsghdr->nlmsg_pid = 0;
 }
 
-- 
2.30.2