2 * Copyright (c) 2008, 2009 Nicira Networks.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
18 #include "reconnect.h"
23 #include "poll-loop.h"
25 #define THIS_MODULE VLM_reconnect
30 STATE(BACKOFF, 1 << 1) \
31 STATE(CONNECTING, 1 << 2) \
32 STATE(ACTIVE, 1 << 3) \
34 STATE(RECONNECT, 1 << 5)
36 #define STATE(NAME, VALUE) S_##NAME = VALUE,
42 is_connected_state(enum state state)
44 return (state & (S_ACTIVE | S_IDLE)) != 0;
56 long long int state_entered;
58 long long int last_received;
59 long long int last_connected;
61 /* These values are simply for statistics reporting, not otherwise used
62 * directly by anything internal. */
63 long long int creation_time;
64 unsigned int n_attempted_connections, n_successful_connections;
65 unsigned int total_connected_duration;
69 static void reconnect_transition__(struct reconnect *, long long int now,
71 static long long int reconnect_deadline__(const struct reconnect *);
74 reconnect_state_name__(enum state state)
77 #define STATE(NAME, VALUE) case S_##NAME: return #NAME;
84 /* Creates and returns a new reconnect FSM with default settings. The FSM is
85 * initially disabled. The caller will likely want to call reconnect_enable()
86 * and reconnect_set_name() on the returned object. */
88 reconnect_create(long long int now)
90 struct reconnect *fsm = xzalloc(sizeof *fsm);
92 fsm->name = xstrdup("void");
93 fsm->min_backoff = 1000;
94 fsm->max_backoff = 8000;
95 fsm->probe_interval = 5000;
98 fsm->state_entered = now;
100 fsm->last_received = now;
101 fsm->last_connected = now;
102 fsm->creation_time = now;
109 reconnect_destroy(struct reconnect *fsm)
117 /* Returns 'fsm''s name. */
119 reconnect_get_name(const struct reconnect *fsm)
124 /* Sets 'fsm''s name to 'name'. If 'name' is null, then "void" is used
127 * The name set for 'fsm' is used in log messages. */
129 reconnect_set_name(struct reconnect *fsm, const char *name)
132 fsm->name = xstrdup(name ? name : "void");
135 /* Return the minimum number of milliseconds to back off between consecutive
136 * connection attempts. The default is 1000 ms. */
138 reconnect_get_min_backoff(const struct reconnect *fsm)
140 return fsm->min_backoff;
143 /* Return the maximum number of milliseconds to back off between consecutive
144 * connection attempts. The default is 8000 ms. */
146 reconnect_get_max_backoff(const struct reconnect *fsm)
148 return fsm->max_backoff;
151 /* Returns the "probe interval" for 'fsm' in milliseconds. If this is zero, it
152 * disables the connection keepalive feature. If it is nonzero, then if the
153 * interval passes while 'fsm' is connected and without reconnect_received()
154 * being called for 'fsm', reconnect_run() returns RECONNECT_PROBE. If the
155 * interval passes again without reconnect_received() being called,
156 * reconnect_run() returns RECONNECT_DISCONNECT for 'fsm'. */
158 reconnect_get_probe_interval(const struct reconnect *fsm)
160 return fsm->probe_interval;
163 /* Configures the backoff parameters for 'fsm'. 'min_backoff' is the minimum
164 * number of milliseconds, and 'max_backoff' is the maximum, between connection
167 * 'min_backoff' must be at least 1000, and 'max_backoff' must be greater than
168 * or equal to 'min_backoff'. */
170 reconnect_set_backoff(struct reconnect *fsm, int min_backoff, int max_backoff)
172 fsm->min_backoff = MAX(min_backoff, 1000);
173 fsm->max_backoff = max_backoff ? MAX(max_backoff, 1000) : 8000;
174 if (fsm->min_backoff > fsm->max_backoff) {
175 fsm->max_backoff = fsm->min_backoff;
178 if (fsm->state == S_BACKOFF && fsm->backoff > max_backoff) {
179 fsm->backoff = max_backoff;
183 /* Sets the "probe interval" for 'fsm' to 'probe_interval', in milliseconds.
184 * If this is zero, it disables the connection keepalive feature. If it is
185 * nonzero, then if the interval passes while 'fsm' is connected and without
186 * reconnect_received() being called for 'fsm', reconnect_run() returns
187 * RECONNECT_PROBE. If the interval passes again without reconnect_received()
188 * being called, reconnect_run() returns RECONNECT_DISCONNECT for 'fsm'.
190 * If 'probe_interval' is nonzero, then it will be forced to a value of at
193 reconnect_set_probe_interval(struct reconnect *fsm, int probe_interval)
195 fsm->probe_interval = probe_interval ? MAX(1000, probe_interval) : 0;
198 /* Returns true if 'fsm' has been enabled with reconnect_enable(). Calling
199 * another function that indicates a change in connection state, such as
200 * reconnect_disconnected() or reconnect_force_reconnect(), will also enable
201 * a reconnect FSM. */
203 reconnect_is_enabled(const struct reconnect *fsm)
205 return fsm->state != S_VOID;
208 /* If 'fsm' is disabled (the default for newly created FSMs), enables it, so
209 * that the next call to reconnect_run() for 'fsm' will return
212 * If 'fsm' is not disabled, this function has no effect. */
214 reconnect_enable(struct reconnect *fsm, long long int now)
216 if (fsm->state == S_VOID) {
217 reconnect_transition__(fsm, now, S_BACKOFF);
222 /* Disables 'fsm'. Until 'fsm' is enabled again, reconnect_run() will always
225 reconnect_disable(struct reconnect *fsm, long long int now)
227 if (fsm->state != S_VOID) {
228 reconnect_transition__(fsm, now, S_VOID);
232 /* If 'fsm' is enabled and currently connected (or attempting to connect),
233 * forces reconnect_run() for 'fsm' to return RECONNECT_DISCONNECT the next
234 * time it is called, which should cause the client to drop the connection (or
235 * attempt), back off, and then reconnect. */
237 reconnect_force_reconnect(struct reconnect *fsm, long long int now)
239 if (fsm->state & (S_CONNECTING | S_ACTIVE | S_IDLE)) {
240 reconnect_transition__(fsm, now, S_RECONNECT);
244 /* Tell 'fsm' that the connection dropped or that a connection attempt failed.
245 * 'error' specifies the reason: a positive value represents an errno value,
246 * EOF indicates that the connection was closed by the peer (e.g. read()
247 * returned 0), and 0 indicates no specific error.
249 * The FSM will back off, then reconnect. */
251 reconnect_disconnected(struct reconnect *fsm, long long int now, int error)
253 if (fsm->state != S_BACKOFF) {
254 /* Report what happened. */
255 if (fsm->state & (S_ACTIVE | S_IDLE)) {
257 VLOG_WARN("%s: connection dropped (%s)",
258 fsm->name, strerror(error));
259 } else if (error == EOF) {
260 VLOG_INFO("%s: connection closed by peer", fsm->name);
262 VLOG_INFO("%s: connection dropped", fsm->name);
266 VLOG_WARN("%s: connection attempt failed (%s)",
267 fsm->name, strerror(error));
269 VLOG_INFO("%s: connection attempt timed out", fsm->name);
274 if (fsm->state & (S_ACTIVE | S_IDLE)
275 && fsm->last_received - fsm->last_connected >= fsm->backoff) {
276 fsm->backoff = fsm->min_backoff;
278 if (fsm->backoff < fsm->min_backoff) {
279 fsm->backoff = fsm->min_backoff;
280 } else if (fsm->backoff >= fsm->max_backoff / 2) {
281 fsm->backoff = fsm->max_backoff;
285 VLOG_INFO("%s: waiting %.3g seconds before reconnect\n",
286 fsm->name, fsm->backoff / 1000.0);
288 reconnect_transition__(fsm, now, S_BACKOFF);
292 /* Tell 'fsm' that a connection attempt is in progress.
294 * The FSM will start a timer, after which the connection attempt will be
295 * aborted (by returning RECONNECT_DISCONNECT from reconect_run()). */
297 reconnect_connecting(struct reconnect *fsm, long long int now)
299 if (fsm->state != S_CONNECTING) {
300 VLOG_INFO("%s: connecting...", fsm->name);
301 reconnect_transition__(fsm, now, S_CONNECTING);
305 /* Tell 'fsm' that the connection was successful.
307 * The FSM will start the probe interval timer, which is reset by
308 * reconnect_received(). If the timer expires, a probe will be sent (by
309 * returning RECONNECT_PROBE from reconnect_run()). If the timer expires
310 * again without being reset, the connection will be aborted (by returning
311 * RECONNECT_DISCONNECT from reconnect_run()). */
313 reconnect_connected(struct reconnect *fsm, long long int now)
315 if (!is_connected_state(fsm->state)) {
316 reconnect_connecting(fsm, now);
318 VLOG_INFO("%s: connected", fsm->name);
319 reconnect_transition__(fsm, now, S_ACTIVE);
320 fsm->last_connected = now;
/* Informs 'fsm' that a connection attempt failed.  'error' is forwarded
 * unchanged to reconnect_disconnected() as the reason for the failure.
 *
 * The FSM will back off and attempt to reconnect. */
void
reconnect_connect_failed(struct reconnect *fsm, long long int now, int error)
{
    /* Make sure the attempt itself is registered with the FSM... */
    reconnect_connecting(fsm, now);

    /* ...and then report its failure. */
    reconnect_disconnected(fsm, now, error);
}
334 /* Tell 'fsm' that some data was received. This resets the probe interval
335 * timer, so that the connection is known not to be idle. */
337 reconnect_received(struct reconnect *fsm, long long int now)
339 if (fsm->state != S_ACTIVE) {
340 reconnect_transition__(fsm, now, S_ACTIVE);
342 fsm->last_received = now;
346 reconnect_transition__(struct reconnect *fsm, long long int now,
349 if (fsm->state == S_CONNECTING) {
350 fsm->n_attempted_connections++;
351 if (state == S_ACTIVE) {
352 fsm->n_successful_connections++;
355 if (is_connected_state(fsm->state) != is_connected_state(state)) {
356 if (is_connected_state(fsm->state)) {
357 fsm->total_connected_duration += now - fsm->last_connected;
362 VLOG_DBG("%s: entering %s", fsm->name, reconnect_state_name__(state));
364 fsm->state_entered = now;
368 reconnect_deadline__(const struct reconnect *fsm)
370 assert(fsm->state_entered != LLONG_MIN);
371 switch (fsm->state) {
376 return fsm->state_entered + fsm->backoff;
379 return fsm->state_entered + MAX(1000, fsm->backoff);
382 if (fsm->probe_interval) {
383 long long int base = MAX(fsm->last_received, fsm->state_entered);
384 return base + fsm->probe_interval;
389 return fsm->state_entered + fsm->probe_interval;
392 return fsm->state_entered;
398 /* Assesses whether any action should be taken on 'fsm'. The return value is
401 * - 0: The client need not take any action.
403 * - RECONNECT_CONNECT: The client should start a connection attempt and
404 * indicate this by calling reconnect_connecting(). If the connection
405 * attempt has definitely succeeded, it should call
406 * reconnect_connected(). If the connection attempt has definitely
407 * failed, it should call reconnect_connect_failed().
409 * The FSM is smart enough to back off correctly after successful
410 * connections that quickly abort, so it is OK to call
411 * reconnect_connected() after a low-level successful connection
412 * (e.g. connect()) even if the connection might soon abort due to a
413 * failure at a high-level (e.g. SSL negotiation failure).
415 * - RECONNECT_DISCONNECT: The client should abort the current connection
416 * or connection attempt and call reconnect_disconnected() or
417 * reconnect_connect_failed() to indicate it.
419 * - RECONNECT_PROBE: The client should send some kind of request to the
420 * peer that will elicit a response, to ensure that the connection is
421 * indeed in working order. (This will only be returned if the "probe
422 * interval" is nonzero--see reconnect_set_probe_interval()).
424 enum reconnect_action
425 reconnect_run(struct reconnect *fsm, long long int now)
427 if (now >= reconnect_deadline__(fsm)) {
428 switch (fsm->state) {
433 return RECONNECT_CONNECT;
436 return RECONNECT_DISCONNECT;
439 VLOG_DBG("%s: idle %lld ms, sending inactivity probe", fsm->name,
440 now - MAX(fsm->last_received, fsm->state_entered));
441 reconnect_transition__(fsm, now, S_IDLE);
442 return RECONNECT_PROBE;
445 VLOG_ERR("%s: no response to inactivity probe after %.3g "
446 "seconds, disconnecting",
447 fsm->name, (now - fsm->state_entered) / 1000.0);
448 return RECONNECT_DISCONNECT;
451 return RECONNECT_DISCONNECT;
456 return fsm->state == S_CONNECTING ? RECONNECT_CONNECT : 0;
460 /* Causes the next call to poll_block() to wake up when reconnect_run() should
461 * be called on 'fsm'. */
463 reconnect_wait(struct reconnect *fsm, long long int now)
465 int timeout = reconnect_timeout(fsm, now);
467 poll_timer_wait(timeout);
471 /* Returns the number of milliseconds after which reconnect_run() should be
472 * called on 'fsm' if nothing else notable happens in the meantime, or a
473 * negative number if this is currently unnecessary. */
475 reconnect_timeout(struct reconnect *fsm, long long int now)
477 long long int deadline = reconnect_deadline__(fsm);
478 if (deadline != LLONG_MAX) {
479 long long int remaining = deadline - now;
480 return MAX(0, MIN(INT_MAX, remaining));
485 /* Returns true if 'fsm' is currently believed to be connected, that is, if
486 * reconnect_connected() was called more recently than any call to
487 * reconnect_connect_failed() or reconnect_disconnected() or
488 * reconnect_disable(), and false otherwise. */
490 reconnect_is_connected(const struct reconnect *fsm)
492 return is_connected_state(fsm->state);
495 /* Returns the number of milliseconds for which 'fsm' has been continuously
496 * connected to its peer. (If 'fsm' is not currently connected, this is 0.) */
498 reconnect_get_connection_duration(const struct reconnect *fsm,
501 return reconnect_is_connected(fsm) ? now - fsm->last_connected : 0;
504 /* Copies various statistics for 'fsm' into '*stats'. */
506 reconnect_get_stats(const struct reconnect *fsm, long long int now,
507 struct reconnect_stats *stats)
509 stats->creation_time = fsm->creation_time;
510 stats->last_received = fsm->last_received;
511 stats->last_connected = fsm->last_connected;
512 stats->backoff = fsm->backoff;
513 stats->seqno = fsm->seqno;
514 stats->is_connected = reconnect_is_connected(fsm);
515 stats->current_connection_duration
516 = reconnect_get_connection_duration(fsm, now);
517 stats->total_connected_duration = (stats->current_connection_duration
518 + fsm->total_connected_duration);
519 stats->n_attempted_connections = fsm->n_attempted_connections;
520 stats->n_successful_connections = fsm->n_successful_connections;
521 stats->state = reconnect_state_name__(fsm->state);
522 stats->state_elapsed = now - fsm->state_entered;