shrub/v/raft.c

1724 lines
43 KiB
C
Raw Normal View History

/* v/raft.c
**
** This file is in the public domain.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <uv.h>
#include "all.h"
#include "v/vere.h"
2014-02-27 03:46:34 +04:00
2014-03-06 22:18:36 +04:00
/* u2_rent: Log entry wire format.
*/
2014-03-05 05:43:40 +04:00
typedef struct {
c3_w tem_w; // Log entry term
c3_w typ_w; // Entry type, %ra|%ov
2014-03-06 22:18:36 +04:00
c3_w len_w; // Word length of blob
2014-03-05 05:43:40 +04:00
c3_w* bob_w; // Blob
} u2_rent;
2014-03-06 22:18:36 +04:00
/* u2_rmsg: Raft RPC wire format.
*/
2014-03-05 05:43:40 +04:00
typedef struct _u2_rmsg {
2014-03-06 22:18:36 +04:00
c3_w ver_w; // version, mug('a')...
2014-03-05 05:43:40 +04:00
c3_d len_d; // Words in message
c3_w tem_w; // Current term
c3_w typ_w; // %apen|%revo|%rasp
union {
struct {
c3_w suc_w; // Request successful
} rasp;
struct {
c3_d lai_d; // Last log index
c3_w lat_w; // Last log term
2014-03-06 22:18:36 +04:00
c3_w nam_w; // Name word length
2014-03-05 05:43:40 +04:00
c3_c* nam_c; // Requestor name
union {
struct {
c3_d cit_d; // Leader commitIndex
c3_d ent_d; // Number of entries
u2_rent* ent_u; // Entries
} apen;
};
} rest;
};
} u2_rmsg;
2014-03-06 03:38:06 +04:00
static ssize_t _raft_rmsg_read(const u2_rbuf* buf_u, u2_rmsg* msg_u);
static void _raft_rmsg_send(u2_rcon* ron_u, const u2_rmsg* msg_u);
2014-03-05 05:43:40 +04:00
static void _raft_rmsg_free(u2_rmsg* msg_u);
2014-02-27 03:46:34 +04:00
static void _raft_conn_dead(u2_rcon* ron_u);
2014-03-03 23:30:00 +04:00
static u2_bean _raft_remove_run(u2_rcon* ron_u);
2014-03-06 07:40:35 +04:00
static void _raft_send_rasp(u2_rcon* ron_u, c3_t suc_t);
static void _raft_rreq_free(u2_rreq* req_u);
2014-03-06 03:38:06 +04:00
static void _raft_time_cb(uv_timer_t* tim_u, c3_i sas_i);
/* _raft_rnam_free(): free a peer name and the strings it owns.
**
**   The name must be unlinked: no connection and no successor.
**   Passing 0 is a no-op.
*/
static void
_raft_rnam_free(u2_rnam* nam_u)
{
  if ( 0 == nam_u ) {
    return;
  }
  c3_assert(0 == nam_u->ron_u);
  c3_assert(0 == nam_u->nex_u);

  free(nam_u->por_c);
  free(nam_u->nam_c);
  free(nam_u->str_c);
  free(nam_u);
}
/* _raft_readname(): parse the first siz_w bytes of str_c as host:port.
**
**   Returns a freshly allocated name holding the full string (str_c),
**   the part before the first ':' (nam_c) and the part after it (por_c),
**   or 0 if the string contains no ':'.
*/
static u2_rnam*
_raft_readname(const c3_c* str_c, c3_w siz_w)
{
  u2_rnam* nam_u = calloc(1, sizeof(*nam_u));
  c3_c*    col_c;

  //  Private, NUL-terminated copy of the input slice.
  nam_u->str_c = malloc(siz_w + 1);
  strncpy(nam_u->str_c, str_c, siz_w);
  nam_u->str_c[siz_w] = '\0';

  col_c = strchr(nam_u->str_c, ':');
  if ( 0 == col_c ) {
    uL(fprintf(uH, "raft: invalid name %s\n", nam_u->str_c));
    _raft_rnam_free(nam_u);
    return 0;
  }

  {
    //  Host length plus one byte for the terminator.
    c3_w nam_w = (col_c - nam_u->str_c) + 1;

    nam_u->nam_c = malloc(nam_w);
    uv_strlcpy(nam_u->nam_c, nam_u->str_c, nam_w);
    nam_u->por_c = strdup(col_c + 1);
  }
  return nam_u;
}
2014-02-28 04:35:50 +04:00
/* u2_raft_readopt(): parse a string into a list of raft peers.
**
**   arg_c is a comma-separated list of host:port names.  The entry whose
**   host equals our_c and whose port equals oup_s (ourselves) is dropped
**   from the result.
**
**   Returns the head of the peer list.  If the first name of arg_c is
**   malformed or has an invalid port, the already-parsed tail is freed
**   and 0 is returned.
*/
u2_rnam*
u2_raft_readopt(const c3_c* arg_c, c3_c* our_c, c3_s oup_s)
{
  u2_rnam* nam_u;
  u2_rnam* nex_u;
  c3_c*    com_c;

  if ( 0 == (com_c = strchr(arg_c, ',')) ) {
    nam_u = _raft_readname(arg_c, strlen(arg_c));
    nex_u = 0;
  }
  else {
    nam_u = _raft_readname(arg_c, com_c - arg_c);
    nex_u = u2_raft_readopt(com_c + 1, our_c, oup_s);
  }

  if ( nam_u ) {
    c3_c*         end_c;
    //  Keep the full strtoul() width so that out-of-range values are
    //  rejected rather than silently truncated.
    unsigned long por_l = strtoul(nam_u->por_c, &end_c, 10);

    if ( '\0' == *nam_u->por_c || '\0' != *end_c || por_l > 65535 ) {
      uL(fprintf(uH, "raft: invalid port %s\n", nam_u->por_c));
      _raft_rnam_free(nam_u);
    }
    else if ( oup_s == por_l && 0 == strcmp(our_c, nam_u->nam_c) ) {
      //  This entry is us; skip it.
      _raft_rnam_free(nam_u);
      return nex_u;
    }
    else {
      nam_u->nex_u = nex_u;
      return nam_u;
    }
  }

  //  Failure: release the whole tail.  _raft_rnam_free() insists on an
  //  unlinked name, so detach each element before freeing it.
  while ( nex_u ) {
    u2_rnam* nxt_u = nex_u->nex_u;

    nex_u->nex_u = 0;
    _raft_rnam_free(nex_u);
    nex_u = nxt_u;
  }
  return 0;
}
2014-02-27 03:46:34 +04:00
/* _raft_alloc(): libuv allocation callback; returns a malloc'd buffer
**                of the suggested size.
*/
static uv_buf_t
_raft_alloc(uv_handle_t* had_u, size_t siz_i)
{
  uv_buf_t buf_u;

  buf_u.base = malloc(siz_i);
  buf_u.len  = siz_i;

  return buf_u;
}
/* _raft_election_rand(): pseudorandom part of the election timeout,
**                        rand() scaled into the range 0..150.
*/
static c3_w
_raft_election_rand()
{
  c3_w ran_w = ((float) rand() / RAND_MAX) * 150;

  return ran_w;
}
2014-03-01 02:52:00 +04:00
/* _raft_promote(): become leader.
**
**   A single-node cluster (pop_w == 1) is promoted directly; otherwise we
**   must be a candidate, and the timer is restarted with a 50ms timeout.
*/
static void
_raft_promote(u2_raft* raf_u)
{
  if ( u2_raty_lead != raf_u->typ_e ) {
    uL(fprintf(uH, "raft: promoting to leader\n"));

    if ( 1 != raf_u->pop_w ) {
      c3_i sas_i;

      c3_assert(u2_raty_cand == raf_u->typ_e);

      sas_i = uv_timer_stop(&raf_u->tim_u);
      c3_assert(0 == sas_i);

      raf_u->typ_e = u2_raty_lead;

      sas_i = uv_timer_start(&raf_u->tim_u, _raft_time_cb, 50, 0);
      c3_assert(0 == sas_i);
    }
    else {
      raf_u->typ_e = u2_raty_lead;
    }
  }
  else {
    uL(fprintf(uH, "raft: double promote; ignoring\n"));
  }

  /* TODO */
  if ( 1 == raf_u->pop_w ) {
    u2_sist_boot();
    if ( u2_no == u2_Host.ops_u.bat ) {
      u2_lo_lead(u2A);
    }
  }
}
/* _raft_demote(): step down to follower.
**
**   A leader currently exits the process; a candidate forgets its vote
**   and becomes a follower.
*/
static void
_raft_demote(u2_raft* raf_u)
{
  if ( u2_raty_lead == raf_u->typ_e ) {
    uL(fprintf(uH, "raft: demoting leader\n"));
    /* TODO just start dropping events */
    exit(1);
  }

  c3_assert(u2_raty_cand == raf_u->typ_e);
  uL(fprintf(uH, "raft: demoting to follower\n"));

  //  Drop the vote, both in memory and in the "vote" record.
  free(raf_u->vog_c);
  raf_u->vog_c = 0;
  u2_sist_nil("vote");

  raf_u->vot_w = 0;
  raf_u->typ_e = u2_raty_foll;
}
2014-03-07 04:04:52 +04:00
/* _raft_note_term(): note a term from the network, demoting if it is newer.
2014-03-06 07:40:35 +04:00
*/
static void
2014-03-07 04:04:52 +04:00
_raft_note_term(u2_raft* raf_u, c3_w tem_w)
2014-03-06 07:40:35 +04:00
{
if ( raf_u->tem_w < tem_w ) {
uL(fprintf(uH, "raft: got term from network: %d\n", tem_w));
raf_u->tem_w = tem_w;
u2_sist_put("term", (c3_y*)&raf_u->tem_w, sizeof(c3_w));
2014-03-06 07:40:35 +04:00
c3_assert(raf_u->typ_e != u2_raty_none);
if ( raf_u->typ_e == u2_raty_foll ) {
c3_assert(0 == raf_u->vot_w);
} else _raft_demote(raf_u);
2014-03-01 02:52:00 +04:00
}
}
2014-03-06 03:38:06 +04:00
/* _raft_rest_name(): update conn name from incoming request.
**
** If this connection already has a name, make sure the passed name
** matches. Otherwise, try to associate it with a name, killing old
** connections to that name.
2014-02-28 04:35:50 +04:00
*/
static void
2014-03-06 03:38:06 +04:00
_raft_rest_name(u2_rcon* ron_u, const c3_c* nam_c)
2014-02-27 03:46:34 +04:00
{
2014-03-06 03:38:06 +04:00
if ( 0 != ron_u->nam_u ) {
if ( 0 != strcmp(ron_u->nam_u->str_c, nam_c) ) {
uL(fprintf(uH, "raft: names disagree o:%s n:%s\n",
ron_u->nam_u->str_c, nam_c));
_raft_conn_dead(ron_u);
}
}
else {
2014-02-27 03:46:34 +04:00
u2_raft* raf_u = ron_u->raf_u;
u2_rnam* nam_u = raf_u->nam_u;
while ( nam_u ) {
2014-03-06 03:38:06 +04:00
if ( 0 == strcmp(nam_u->str_c, nam_c) ) {
2014-02-27 03:46:34 +04:00
if ( nam_u->ron_u ) {
2014-03-03 23:30:00 +04:00
c3_assert(nam_u->ron_u != ron_u);
2014-03-07 05:25:53 +04:00
//uL(fprintf(uH, "raft: closing old conn %p to %s (%p)\n",
// nam_u->ron_u, nam_u->str_c, ron_u));
2014-02-27 03:46:34 +04:00
_raft_conn_dead(nam_u->ron_u);
}
uL(fprintf(uH, "raft: incoming conn from %s\n", nam_u->str_c));
2014-02-27 03:46:34 +04:00
nam_u->ron_u = ron_u;
ron_u->nam_u = nam_u;
_raft_remove_run(ron_u);
break;
}
else nam_u = nam_u->nex_u;
}
2014-03-06 03:38:06 +04:00
if ( 0 == ron_u->nam_u ) {
uL(fprintf(uH, "connection from unkown peer %s\n", nam_c));
_raft_conn_dead(ron_u);
}
2014-02-27 03:46:34 +04:00
}
2014-03-06 03:38:06 +04:00
}
2014-02-27 03:46:34 +04:00
2014-03-06 03:38:06 +04:00
/* _raft_do_rest(): effects common to every incoming request.
**
**   Candidates and followers restart their timer with a timeout of
**   150ms plus a random component; then the connection name and the
**   term are updated from the message.
*/
static void
_raft_do_rest(u2_rcon* ron_u, const u2_rmsg* msg_u)
{
  u2_raft* raf_u = ron_u->raf_u;

  switch ( raf_u->typ_e ) {
    default: break;

    case u2_raty_cand:
    case u2_raty_foll: {
      c3_i sas_i = uv_timer_stop(&raf_u->tim_u);

      c3_assert(0 == sas_i);
      sas_i = uv_timer_start(&raf_u->tim_u, &_raft_time_cb,
                             150 + _raft_election_rand(), 0);
      c3_assert(0 == sas_i);
      break;
    }
  }

  _raft_rest_name(ron_u, msg_u->rest.nam_c);
  _raft_note_term(raf_u, msg_u->tem_w);
}
2014-03-05 05:43:40 +04:00
/* _raft_do_apen(): handle an incoming AppendEntries request.
**
**   Only the effects shared by all requests are applied so far.
*/
static void
_raft_do_apen(u2_rcon* ron_u, const u2_rmsg* msg_u)
{
  c3_assert(c3__apen == msg_u->typ_w);

  _raft_do_rest(ron_u, msg_u);

  /* TODO respond */
}
2014-03-06 07:40:35 +04:00
/* _raft_apen_done(): handle the response to an AppendEntries request.
**
**   Currently only checks that the request really was an %apen.
*/
static void
_raft_apen_done(u2_rreq* req_u, c3_w suc_w)
{
  c3_assert(c3__apen == req_u->msg_u->typ_w);

  /* TODO */
}
2014-03-05 05:43:40 +04:00
/* _raft_do_revo(): handle an incoming RequestVote.
**
**   After the common request handling, replies with success iff the
**   request's term is not older than ours, we have no recorded vote or
**   the recorded vote is for this requestor, and the requestor's log
**   (last term, then last index) is not behind ours.
**
**   If the common handling rejected the connection (unknown or
**   mismatched peer name), no response is sent.
*/
static void
_raft_do_revo(u2_rcon* ron_u, const u2_rmsg* msg_u)
{
  u2_raft* raf_u = ron_u->raf_u;

  c3_assert(c3__revo == msg_u->typ_w);
  _raft_do_rest(ron_u, msg_u);

  //  _raft_do_rest() kills connections from peers it cannot identify,
  //  leaving nam_u unset.  That is a property of remote input, not an
  //  internal invariant, so drop the request instead of asserting.
  if ( 0 == ron_u->nam_u || u2_no == ron_u->liv ) {
    return;
  }

  if ( msg_u->tem_w >= raf_u->tem_w &&
       (0 == raf_u->vog_c ||
        0 == strcmp(raf_u->vog_c, ron_u->nam_u->str_c)) &&
       (raf_u->lat_w < msg_u->rest.lat_w ||
        (raf_u->lat_w == msg_u->rest.lat_w &&
         raf_u->ent_w <= msg_u->rest.lai_d)) )
  {
    _raft_send_rasp(ron_u, 1);
  }
  else _raft_send_rasp(ron_u, 0);
}
/* _raft_revo_done(): handle the response to a RequestVote request.
**
**   A successful response adds one vote; once the votes exceed half of
**   the population we promote ourselves.
*/
static void
_raft_revo_done(u2_rreq* req_u, c3_w suc_w)
{
  u2_raft* raf_u = req_u->ron_u->raf_u;

  c3_assert(c3__revo == req_u->msg_u->typ_w);

  if ( suc_w ) {
    raf_u->vot_w += 1;
  }
  if ( raf_u->vot_w <= raf_u->pop_w / 2 ) {
    return;
  }
  uL(fprintf(uH, "raft: got majority of %d for term %d\n",
                 raf_u->vot_w, raf_u->tem_w));
  _raft_promote(raf_u);
}
2014-02-28 04:35:50 +04:00
/* _raft_do_rasp(): act on an incoming raft RPC response.
*/
2014-02-27 03:46:34 +04:00
static void
2014-03-05 05:43:40 +04:00
_raft_do_rasp(u2_rcon* ron_u, u2_rmsg* msg_u)
{
2014-03-06 07:40:35 +04:00
u2_raft* raf_u = ron_u->raf_u;
2014-03-05 05:43:40 +04:00
c3_assert(c3__rasp == msg_u->typ_w);
2014-02-27 03:46:34 +04:00
if ( 0 == ron_u->nam_u ) {
2014-03-06 07:40:35 +04:00
uL(fprintf(uH, "raft: invalid connection from unknown host\n"));
2014-02-27 03:46:34 +04:00
_raft_conn_dead(ron_u);
}
else {
2014-03-06 07:40:35 +04:00
u2_rreq* req_u = ron_u->out_u;
if ( !req_u ) {
uL(fprintf(uH, "raft: response with no request from %s\n",
ron_u->nam_u->str_c));
2014-03-06 07:40:35 +04:00
_raft_conn_dead(ron_u);
}
else {
switch ( req_u->msg_u->typ_w ) {
default: {
uL(fprintf(uH, "raft: bogus request type %x?!\n",
req_u->msg_u->typ_w));
c3_assert(0);
}
case c3__apen: {
_raft_apen_done(req_u, msg_u->rasp.suc_w);
break;
}
case c3__revo: {
_raft_revo_done(req_u, msg_u->rasp.suc_w);
break;
}
}
2014-03-07 04:04:52 +04:00
_raft_note_term(raf_u, msg_u->tem_w);
2014-03-06 07:40:35 +04:00
ron_u->out_u = req_u->nex_u;
if ( 0 == req_u->nex_u ) {
c3_assert(req_u == ron_u->tou_u);
ron_u->tou_u = 0;
}
_raft_rreq_free(req_u);
}
2014-02-27 03:46:34 +04:00
}
}
2014-03-06 03:38:06 +04:00
/* _raft_rmsg_read(): read a u2_rmsg from a buffer.
2014-03-05 05:43:40 +04:00
**
** Returns <0 on parse failure.
2014-03-06 03:38:06 +04:00
** Returns 0 on partial data.
** Returns bytes read on successful read.
2014-03-05 05:43:40 +04:00
**
** If successful, caller must eventually call _raft_free_rmsg() on msg_u.
2014-02-28 04:35:50 +04:00
*/
2014-03-05 05:43:40 +04:00
static ssize_t
2014-03-06 03:38:06 +04:00
_raft_rmsg_read(const u2_rbuf* buf_u, u2_rmsg* msg_u)
2014-02-27 03:46:34 +04:00
{
2014-03-05 05:43:40 +04:00
ssize_t red_i = 0;
2014-03-06 03:38:06 +04:00
c3_d ben_d;
2014-02-27 03:46:34 +04:00
2014-03-06 03:38:06 +04:00
if ( buf_u->len_w < sizeof(c3_w) + sizeof(c3_d) ) {
2014-03-05 05:43:40 +04:00
return 0;
}
2014-03-06 03:38:06 +04:00
memcpy(&msg_u->ver_w, buf_u->buf_y + red_i, sizeof(c3_w));
red_i += sizeof(c3_w);
if ( msg_u->ver_w != u2_cr_mug('a') ) {
uL(fprintf(uH, "raft: versions don't match: %x %x\n",
msg_u->ver_w, u2_cr_mug('a')));
return -1;
}
memcpy(&msg_u->len_d, buf_u->buf_y + red_i, sizeof(c3_d));
2014-03-05 05:43:40 +04:00
red_i += sizeof(c3_d);
2014-02-27 03:46:34 +04:00
2014-03-06 03:38:06 +04:00
if ( msg_u->len_d < 4 ) {
2014-03-07 23:59:37 +04:00
uL(fprintf(uH, "raft: length too short (a) %llu\n", msg_u->len_d));
2014-03-05 05:43:40 +04:00
return -1;
}
2014-03-06 03:38:06 +04:00
ben_d = 4ULL * msg_u->len_d;
if ( buf_u->len_w < ben_d ) {
return 0;
2014-03-05 05:43:40 +04:00
}
2014-03-06 03:38:06 +04:00
if ( ben_d < red_i + 2 * sizeof(c3_w) ) {
2014-03-07 23:59:37 +04:00
uL(fprintf(uH, "raft: length too short (b) %llu\n", msg_u->len_d));
2014-03-06 03:38:06 +04:00
return -1;
2014-03-05 05:43:40 +04:00
}
memcpy(&msg_u->tem_w, buf_u->buf_y + red_i, sizeof(c3_w));
red_i += sizeof(c3_w);
memcpy(&msg_u->typ_w, buf_u->buf_y + red_i, sizeof(c3_w));
red_i += sizeof(c3_w);
switch ( msg_u->typ_w ) {
default: {
2014-03-06 03:38:06 +04:00
uL(fprintf(uH, "raft: unknown msg type %x\n", msg_u->typ_w));
2014-03-05 05:43:40 +04:00
return -1;
}
case c3__rasp: {
2014-03-06 03:38:06 +04:00
if ( ben_d < red_i + sizeof(c3_w) ) {
2014-03-07 23:59:37 +04:00
uL(fprintf(uH, "raft: length too short (c) %llu\n", msg_u->len_d));
2014-03-06 03:38:06 +04:00
return -1;
2014-02-27 03:46:34 +04:00
}
2014-03-05 05:43:40 +04:00
memcpy(&msg_u->rasp.suc_w, buf_u->buf_y + red_i, sizeof(c3_w));
red_i += sizeof(c3_w);
break;
}
case c3__apen: case c3__revo: {
2014-03-06 03:38:06 +04:00
if ( ben_d < red_i + sizeof(c3_d) + 2 * sizeof(c3_w) ) {
2014-03-07 23:59:37 +04:00
uL(fprintf(uH, "raft: length too short (d) %llu\n", msg_u->len_d));
2014-03-06 03:38:06 +04:00
return -1;
2014-03-05 05:43:40 +04:00
}
memcpy(&msg_u->rest.lai_d, buf_u->buf_y + red_i, sizeof(c3_d));
red_i += sizeof(c3_d);
memcpy(&msg_u->rest.lat_w, buf_u->buf_y + red_i, sizeof(c3_w));
red_i += sizeof(c3_w);
memcpy(&msg_u->rest.nam_w, buf_u->buf_y + red_i, sizeof(c3_w));
red_i += sizeof(c3_w);
2014-03-06 03:38:06 +04:00
if ( ben_d < red_i + 4 * msg_u->rest.nam_w ) {
2014-03-07 23:59:37 +04:00
uL(fprintf(uH, "raft: length too short (e) %llu\n", msg_u->len_d));
2014-03-06 03:38:06 +04:00
return -1;
2014-03-05 05:43:40 +04:00
}
2014-03-06 03:38:06 +04:00
msg_u->rest.nam_c = malloc(4 * msg_u->rest.nam_w);
uv_strlcpy(msg_u->rest.nam_c, (const char*)(buf_u->buf_y + red_i),
4 * msg_u->rest.nam_w);
red_i += 4 * msg_u->rest.nam_w;
2014-03-05 05:43:40 +04:00
break;
}
}
2014-02-27 03:46:34 +04:00
2014-03-05 05:43:40 +04:00
if ( c3__apen == msg_u->typ_w ) {
2014-03-06 03:38:06 +04:00
if ( ben_d < red_i + 2 * sizeof(c3_d) ) {
2014-03-07 23:59:37 +04:00
uL(fprintf(uH, "raft: length too short (f) %llu\n", msg_u->len_d));
2014-03-06 03:38:06 +04:00
red_i = -1;
2014-03-05 05:43:40 +04:00
goto fail;
}
memcpy(&msg_u->rest.apen.cit_d, buf_u->buf_y + red_i, sizeof(c3_d));
red_i += sizeof(c3_d);
memcpy(&msg_u->rest.apen.ent_d, buf_u->buf_y + red_i, sizeof(c3_d));
red_i += sizeof(c3_d);
2014-02-27 03:46:34 +04:00
2014-03-05 05:43:40 +04:00
msg_u->rest.apen.ent_u = calloc(
1, msg_u->rest.apen.ent_d * sizeof(u2_rent));
{
c3_d i_d;
u2_rent* ent_u = msg_u->rest.apen.ent_u;
for ( i_d = 0; i_d < msg_u->rest.apen.ent_d; i_d++ ) {
2014-03-06 03:38:06 +04:00
if ( ben_d < red_i + 3 * sizeof(c3_w) ) {
2014-03-07 23:59:37 +04:00
uL(fprintf(uH, "raft: length too short (g) %llu\n", msg_u->len_d));
2014-03-06 03:38:06 +04:00
red_i = -1;
2014-03-05 05:43:40 +04:00
goto fail;
}
memcpy(&ent_u[i_d].tem_w, buf_u->buf_y + red_i, sizeof(c3_w));
red_i += sizeof(c3_w);
memcpy(&ent_u[i_d].typ_w, buf_u->buf_y + red_i, sizeof(c3_w));
red_i += sizeof(c3_w);
memcpy(&ent_u[i_d].len_w, buf_u->buf_y + red_i, sizeof(c3_w));
red_i += sizeof(c3_w);
2014-03-06 08:23:33 +04:00
if ( ben_d < red_i + 4 * ent_u[i_d].len_w ) {
2014-03-07 23:59:37 +04:00
uL(fprintf(uH, "raft: length too short (h) %llu\n", msg_u->len_d));
2014-03-06 03:38:06 +04:00
red_i = -1;
goto fail;
2014-02-27 03:46:34 +04:00
}
2014-03-06 08:23:33 +04:00
ent_u[i_d].bob_w = malloc(4 * ent_u[i_d].len_w);
memcpy(ent_u[i_d].bob_w, buf_u->buf_y + red_i, 4 * ent_u[i_d].len_w);
red_i += 4 * ent_u[i_d].len_w;
2014-03-05 05:43:40 +04:00
}
}
}
2014-03-06 03:38:06 +04:00
if ( red_i != ben_d ) {
uL(fprintf(uH, "raft: sizes don't match r:%ld w:%llu\n", red_i, ben_d));
2014-03-05 05:43:40 +04:00
red_i = -1;
goto fail;
}
2014-02-27 03:46:34 +04:00
2014-03-05 05:43:40 +04:00
out:
return red_i;
fail:
_raft_rmsg_free(msg_u);
goto out;
}
/* _raft_rbuf_grow(): append data to the buffer, reallocating if needed.
**
**   buf_u may be 0, in which case a new buffer is allocated.
**   Returns new buffer location, as realloc.  Allocation failure is
**   treated as fatal (asserted) rather than dereferenced.
*/
static u2_rbuf*
_raft_rbuf_grow(u2_rbuf* buf_u, const c3_y* buf_y, size_t siz_i)
{
  if ( 0 == buf_u ) {
    buf_u = malloc(sizeof(*buf_u) + siz_i);
    c3_assert(buf_u);
    buf_u->len_w = 0;
    buf_u->cap_w = siz_i;
  }

  if ( buf_u->cap_w < buf_u->len_w + siz_i ) {
    //  Grow to at least double, or to exactly what is needed if larger.
    c3_w     cap_w = c3_max(2 * buf_u->cap_w, buf_u->len_w + siz_i);
    u2_rbuf* new_u = realloc(buf_u, sizeof(*buf_u) + cap_w);

    c3_assert(new_u);
    buf_u = new_u;
    buf_u->cap_w = cap_w;
  }

  memcpy(buf_u->buf_y + buf_u->len_w, buf_y, siz_i);
  buf_u->len_w += siz_i;

  return buf_u;
}
/* _raft_bytes_send():
*/
2014-03-06 03:38:06 +04:00
static void
_raft_bytes_send(u2_rcon* ron_u, const void* ptr_v, size_t siz_i)
2014-03-06 03:38:06 +04:00
{
ron_u->wri_u = _raft_rbuf_grow(ron_u->wri_u, ptr_v, siz_i);
2014-03-06 03:38:06 +04:00
}
/* _raft_rmsg_send(): send a u2_rmsg over the wire.
**
**   Serializes msg_u into the connection's write buffer, field by field,
**   in the order _raft_rmsg_read() expects.  Name and blob lengths are
**   word counts, so four bytes are written per word.  The number of
**   bytes written must agree with the word count in msg_u->len_d.
*/
static void
_raft_rmsg_send(u2_rcon* ron_u, const u2_rmsg* msg_u)
{
  c3_d len_d = sizeof(c3_d) + 3 * sizeof(c3_w);

  _raft_bytes_send(ron_u, &msg_u->ver_w, sizeof(c3_w));
  _raft_bytes_send(ron_u, &msg_u->len_d, sizeof(c3_d));
  _raft_bytes_send(ron_u, &msg_u->tem_w, sizeof(c3_w));
  _raft_bytes_send(ron_u, &msg_u->typ_w, sizeof(c3_w));

  switch ( msg_u->typ_w ) {
    default: {
      uL(fprintf(uH, "raft: send: unknown message type\n"));
      c3_assert(0);
    }
    /* fallthrough */
    case c3__rasp: {
      len_d += sizeof(c3_w);
      _raft_bytes_send(ron_u, &msg_u->rasp.suc_w, sizeof(c3_w));
      break;
    }
    case c3__apen: case c3__revo: {
      c3_d nab_d = 4ULL * msg_u->rest.nam_w;

      len_d += sizeof(c3_d) + 2 * sizeof(c3_w) + nab_d;
      _raft_bytes_send(ron_u, &msg_u->rest.lai_d, sizeof(c3_d));
      _raft_bytes_send(ron_u, &msg_u->rest.lat_w, sizeof(c3_w));
      _raft_bytes_send(ron_u, &msg_u->rest.nam_w, sizeof(c3_w));
      _raft_bytes_send(ron_u, msg_u->rest.nam_c, nab_d);
      break;
    }
  }

  if ( c3__apen == msg_u->typ_w ) {
    c3_d     i_d;
    u2_rent* ent_u = msg_u->rest.apen.ent_u;

    len_d += 2 * sizeof(c3_d);
    _raft_bytes_send(ron_u, &msg_u->rest.apen.cit_d, sizeof(c3_d));
    _raft_bytes_send(ron_u, &msg_u->rest.apen.ent_d, sizeof(c3_d));

    for ( i_d = 0; i_d < msg_u->rest.apen.ent_d; i_d++ ) {
      //  len_w counts words; the reader consumes 4 * len_w blob bytes.
      c3_d bob_d = 4ULL * ent_u[i_d].len_w;

      len_d += 3 * sizeof(c3_w) + bob_d;
      _raft_bytes_send(ron_u, &ent_u[i_d].tem_w, sizeof(c3_w));
      _raft_bytes_send(ron_u, &ent_u[i_d].typ_w, sizeof(c3_w));
      _raft_bytes_send(ron_u, &ent_u[i_d].len_w, sizeof(c3_w));
      _raft_bytes_send(ron_u, ent_u[i_d].bob_w, bob_d);
    }
  }

  c3_assert(len_d == 4 * msg_u->len_d);
}
2014-03-07 23:48:06 +04:00
/* _raft_rmsg_free(): free a u2_rmsg's resources (but not the msg itself).
*/
2014-03-05 05:43:40 +04:00
static void
_raft_rmsg_free(u2_rmsg* msg_u) {
if ( c3__apen == msg_u->typ_w && msg_u->rest.apen.ent_u ) {
c3_d i_d;
for ( i_d = 0; i_d < msg_u->rest.apen.ent_d; i_d++ ) {
free(msg_u->rest.apen.ent_u[i_d].bob_w);
}
free(msg_u->rest.apen.ent_u);
msg_u->rest.apen.ent_u = 0;
}
if ( c3__apen == msg_u->typ_w || c3__revo == msg_u->typ_w ) {
free(msg_u->rest.nam_c);
msg_u->rest.nam_c = 0;
}
}
2014-03-07 23:48:06 +04:00
/* _u2_write_t: a libuv write request bundled with the buffer it sends.
**
**   The request is the first member, so the uv_write_t pointer handed
**   to the write callback can be cast back to this struct.
*/
struct _u2_write_t {
  uv_write_t wri_u;                     //  libuv write request
  c3_y*      buf_y;                     //  bytes being written
};
2014-03-07 23:48:06 +04:00
/* _raft_write_cb(): write completion callback.
**
**   Kills the connection if the write failed, then frees the request
**   and its buffer.
*/
static void
_raft_write_cb(uv_write_t* wri_u, c3_i sas_i)
{
  struct _u2_write_t* ruq_u = (struct _u2_write_t*)wri_u;

  if ( 0 != sas_i ) {
    uL(fprintf(uH, "raft: write_cb: %s\n",
                   uv_strerror(uv_last_error(u2L))));
    _raft_conn_dead((u2_rcon*)wri_u->handle);
  }

  free(ruq_u->buf_y);
  free(ruq_u);
}
2014-03-05 05:43:40 +04:00
/* _raft_conn_work(): read and write requests and responses.
**
**   If new data has been flagged (ron_u->red), parses as many complete
**   messages as the read buffer holds and dispatches each one by type.
**   Then, if the write buffer is non-empty, copies it into a write
**   request and hands it to libuv.
**
**   A handler may kill the connection; once that happens no further
**   messages are dispatched and nothing is written, since the handle is
**   already closing.
*/
static void
_raft_conn_work(u2_rcon* ron_u)
{
  c3_assert(u2_yes == ron_u->liv);

  if ( u2_yes == ron_u->red ) {
    c3_assert(ron_u->red_u);
    ron_u->red = u2_no;

    while ( u2_yes == ron_u->liv ) {
      u2_rmsg msg_u;
      ssize_t ret_i = _raft_rmsg_read(ron_u->red_u, &msg_u);

      if ( ret_i < 0 ) {
        if ( ron_u->nam_u ) {
          uL(fprintf(uH, "raft: conn_work: error reading from %s\n",
                         ron_u->nam_u->str_c));
        }
        else {
          uL(fprintf(uH, "raft: conn_work: error reading\n"));
        }
        _raft_conn_dead(ron_u);
        break;
      }
      else if ( ret_i == 0 ) {
        //  Partial message; wait for more data.
        break;
      }
      else {
        if ( 4 * msg_u.len_d != ret_i ) {
          uL(fprintf(uH, "raft: conn_work: lengths don't match\n"));
          c3_assert(0);
        }
        else {
          //  Consume the parsed bytes from the front of the buffer.
          c3_assert(ron_u->red_u->len_w >= ret_i);
          memmove(ron_u->red_u->buf_y,
                  ron_u->red_u->buf_y + ret_i,
                  ron_u->red_u->len_w - ret_i);
          ron_u->red_u->len_w -= ret_i;

          switch ( msg_u.typ_w ) {
            default: {
              uL(fprintf(uH, "raft: work: unknown message type %x\n",
                             msg_u.typ_w));
              break;
            }
            case c3__apen: {
              _raft_do_apen(ron_u, &msg_u);
              break;
            }
            case c3__revo: {
              _raft_do_revo(ron_u, &msg_u);
              break;
            }
            case c3__rasp: {
              _raft_do_rasp(ron_u, &msg_u);
              break;
            }
          }
          _raft_rmsg_free(&msg_u);
        }
      }
    }
  }

  //  Nothing can be written to a connection that is being closed.
  if ( u2_no == ron_u->liv ) {
    return;
  }

  if ( ron_u->wri_u && ron_u->wri_u->len_w > 0 ) {
    uv_buf_t            buf_u;
    struct _u2_write_t* req_u = malloc(sizeof(*req_u));

    //  libuv needs the bytes to stay put until the write completes, so
    //  send a private copy; _raft_write_cb() frees it.
    req_u->buf_y = malloc(ron_u->wri_u->len_w);
    memcpy(req_u->buf_y, ron_u->wri_u->buf_y, ron_u->wri_u->len_w);
    buf_u.base = (char*)req_u->buf_y;
    buf_u.len = ron_u->wri_u->len_w;

    if ( 0 != uv_write((uv_write_t*)req_u,
                       (uv_stream_t*)&ron_u->wax_u,
                       &buf_u,
                       1,
                       _raft_write_cb) )
    {
      uL(fprintf(uH, "raft: conn_work (write): %s\n",
                     uv_strerror(uv_last_error(u2L))));
      free(req_u->buf_y);
      free(req_u);
    }
    else {
      ron_u->wri_u->len_w = 0;
    }
  }
}
2014-02-28 04:35:50 +04:00
/* _raft_conn_read_cb(): generic connection read callback.
*/
2014-02-27 03:46:34 +04:00
static void
_raft_conn_read_cb(uv_stream_t* tcp_u,
ssize_t siz_i,
uv_buf_t buf_u)
{
u2_rcon* ron_u = (u2_rcon*)tcp_u;
u2_lo_open();
{
if ( siz_i < 0 ) {
uv_err_t las_u = uv_last_error(u2L);
if ( UV_EOF != las_u.code ) {
uL(fprintf(uH, "raft: read: %s\n", uv_strerror(las_u)));
}
_raft_conn_dead(ron_u);
}
2014-03-06 03:38:06 +04:00
else if ( siz_i == 0 ) {
// do nothing
}
2014-02-27 03:46:34 +04:00
else {
2014-03-07 05:25:53 +04:00
if ( u2_yes == ron_u->liv ) {
ron_u->red_u = _raft_rbuf_grow(ron_u->red_u, (c3_y*)buf_u.base, siz_i);
2014-03-07 05:25:53 +04:00
ron_u->red = u2_yes;
_raft_conn_work(ron_u);
}
else uL(fprintf(uH, "XX raft: read on dead conn %p\n", ron_u));
2014-02-27 03:46:34 +04:00
}
}
2014-03-06 03:38:06 +04:00
free(buf_u.base);
2014-02-27 03:46:34 +04:00
u2_lo_shut(u2_no);
}
2014-02-28 04:35:50 +04:00
/* _raft_conn_new(): allocate a new raft connection.
*/
2014-03-03 23:30:00 +04:00
static u2_rcon*
2014-02-27 03:46:34 +04:00
_raft_conn_new(u2_raft* raf_u)
{
u2_rcon* ron_u = malloc(sizeof(*ron_u));
uv_tcp_init(u2L, &ron_u->wax_u);
2014-03-05 05:43:40 +04:00
ron_u->red_u = 0;
ron_u->out_u = ron_u->tou_u = 0;
ron_u->red_u = 0;
ron_u->red = u2_no;
ron_u->wri_u = 0;
2014-03-03 23:30:00 +04:00
ron_u->nam_u = 0;
ron_u->raf_u = raf_u;
ron_u->nex_u = 0;
ron_u->liv = u2_no;
2014-02-27 03:46:34 +04:00
2014-03-03 23:30:00 +04:00
return ron_u;
2014-02-27 03:46:34 +04:00
}
2014-02-28 04:35:50 +04:00
/* _raft_remove_run(): remove a connection from the list of unknowns.
*/
2014-03-03 23:30:00 +04:00
static u2_bean
2014-02-27 03:46:34 +04:00
_raft_remove_run(u2_rcon* ron_u)
{
u2_raft* raf_u = ron_u->raf_u;
2014-03-03 23:30:00 +04:00
u2_bean suc = u2_no;
2014-02-27 03:46:34 +04:00
if ( raf_u->run_u == ron_u ) {
raf_u->run_u = ron_u->nex_u;
2014-03-03 23:30:00 +04:00
suc = u2_yes;
2014-02-27 03:46:34 +04:00
}
else {
u2_rcon* pre_u = raf_u->run_u;
while ( pre_u ) {
if ( pre_u->nex_u == ron_u ) {
pre_u->nex_u = ron_u->nex_u;
2014-03-03 23:30:00 +04:00
suc = u2_yes;
2014-02-27 03:46:34 +04:00
break;
}
else pre_u = pre_u->nex_u;
}
}
2014-03-03 23:30:00 +04:00
return suc;
2014-02-27 03:46:34 +04:00
}
2014-03-06 03:38:06 +04:00
/* _raft_rreq_new(): allocate a request with an (unfilled) message and
**                   append it to the connection's outstanding queue.
*/
static u2_rreq*
_raft_rreq_new(u2_rcon* ron_u)
{
  u2_rreq* req_u = malloc(sizeof(*req_u));

  req_u->msg_u = malloc(sizeof(*req_u->msg_u));
  req_u->ron_u = ron_u;
  req_u->nex_u = 0;

  if ( 0 == ron_u->tou_u ) {
    //  Empty queue: the request becomes the head as well as the tail.
    c3_assert(0 == ron_u->out_u);
    ron_u->out_u = req_u;
  }
  else {
    c3_assert(ron_u->out_u);
    ron_u->tou_u->nex_u = req_u;
  }
  ron_u->tou_u = req_u;

  return req_u;
}
2014-03-05 05:43:40 +04:00
static void
2014-03-06 07:40:35 +04:00
_raft_rreq_free(u2_rreq* req_u)
2014-03-05 05:43:40 +04:00
{
2014-03-06 07:40:35 +04:00
_raft_rmsg_free(req_u->msg_u);
free(req_u->msg_u); // XX
free(req_u);
2014-03-05 05:43:40 +04:00
}
2014-02-28 04:35:50 +04:00
/* _raft_conn_free(): unlink a connection and free its resources.
*/
2014-02-27 03:46:34 +04:00
static void
_raft_conn_free(uv_handle_t* had_u)
{
u2_rcon* ron_u = (void*)had_u;
2014-03-07 05:25:53 +04:00
u2_raft* raf_u = ron_u->raf_u;
2014-02-27 03:46:34 +04:00
2014-03-03 23:34:20 +04:00
//uL(fprintf(uH, "raft: conn_free %p\n", ron_u));
2014-03-07 05:25:53 +04:00
// Unlink references.
if ( ron_u->nam_u ) {
c3_assert(u2_no == _raft_remove_run(ron_u));
if ( ron_u->nam_u->ron_u == ron_u ) {
ron_u->nam_u->ron_u = 0;
}
}
else {
u2_bean suc = _raft_remove_run(ron_u);
c3_assert(u2_yes == suc);
// Slow, expensive debug assert.
{
u2_rnam* nam_u = raf_u->nam_u;
while ( nam_u ) {
c3_assert(nam_u->ron_u != ron_u);
nam_u = nam_u->nex_u;
}
}
}
// Free requests.
2014-03-06 07:40:35 +04:00
{
u2_rreq* req_u = ron_u->out_u;
if ( 0 == req_u ) {
c3_assert(0 == ron_u->tou_u);
}
else {
while ( req_u ) {
if ( 0 == req_u->nex_u ) {
c3_assert(req_u == ron_u->tou_u);
}
ron_u->out_u = req_u->nex_u;
_raft_rreq_free(req_u);
req_u = ron_u->out_u;
}
}
}
2014-03-06 03:38:06 +04:00
free(ron_u->red_u);
free(ron_u->wri_u);
2014-02-27 03:46:34 +04:00
free(ron_u);
}
2014-02-28 04:35:50 +04:00
/* _raft_conn_dead(): kill a connection.
*/
2014-02-27 03:46:34 +04:00
static void
_raft_conn_dead(u2_rcon* ron_u)
{
if ( u2_no == ron_u->liv ) {
//uL(fprintf(uH, "raft: conn already dead %p\n", ron_u));
return;
}
2014-03-06 03:38:06 +04:00
else {
2014-03-08 01:24:33 +04:00
uL(fprintf(uH, "raft: conn_dead %p\n", ron_u));
2014-03-07 05:25:53 +04:00
ron_u->liv = u2_no;
2014-03-06 03:38:06 +04:00
}
2014-03-07 05:25:53 +04:00
uv_read_stop((uv_stream_t*)&ron_u->wax_u);
2014-02-27 03:46:34 +04:00
uv_close((uv_handle_t*)&ron_u->wax_u, _raft_conn_free);
}
2014-02-28 04:35:50 +04:00
/* _raft_listen_cb(): generic listen callback.
*/
2014-02-27 03:46:34 +04:00
static void
_raft_listen_cb(uv_stream_t* str_u, c3_i sas_i)
{
u2_raft* raf_u = (u2_raft*)str_u;
if ( 0 != sas_i ) {
uL(fprintf(uH, "raft: listen_cb: %s\n",
uv_strerror(uv_last_error(u2L))));
}
else {
2014-03-03 23:30:00 +04:00
u2_rcon* ron_u = _raft_conn_new(raf_u);
if ( 0 != uv_accept((uv_stream_t*)&raf_u->wax_u,
(uv_stream_t*)&ron_u->wax_u) )
{
uL(fprintf(uH, "raft: accept: %s\n",
uv_strerror(uv_last_error(u2L))));
uv_close((uv_handle_t*)&ron_u->wax_u, 0);
free(ron_u);
}
else {
ron_u->liv = u2_yes;
2014-03-03 23:30:00 +04:00
uv_read_start((uv_stream_t*)&ron_u->wax_u,
_raft_alloc,
_raft_conn_read_cb);
ron_u->nex_u = raf_u->run_u;
raf_u->run_u = ron_u;
}
2014-02-27 03:46:34 +04:00
}
}
2014-03-01 01:22:11 +04:00
/* _raft_connect_cb(): outgoing connection callback.
**
**   Frees the connect request.  On success the connection is marked
**   live, reading starts and pending work is done; on failure the
**   handle is closed and the connection freed.
*/
static void
_raft_connect_cb(uv_connect_t* con_u, c3_i sas_i)
{
  u2_rcon* ron_u = con_u->data;

  free(con_u);

  if ( 0 == sas_i ) {
    c3_assert(ron_u->nam_u);
    uL(fprintf(uH, "raft: connected to %s\n", ron_u->nam_u->str_c));

    ron_u->liv = u2_yes;
    uv_read_start((uv_stream_t*)&ron_u->wax_u,
                  _raft_alloc,
                  _raft_conn_read_cb);
    _raft_conn_work(ron_u);
  }
  else {
    uL(fprintf(uH, "raft: connect_cb: %s\n",
                   uv_strerror(uv_last_error(u2L))));
    uv_close((uv_handle_t*)&ron_u->wax_u, _raft_conn_free);
  }
}
/* _raft_getaddrinfo_cb(): generic getaddrinfo callback.
**
**   Starts a TCP connect to the first resolved address that libuv
**   accepts.  If the lookup failed or no address could be used, the
**   handle is closed and the connection is freed from the close
**   callback.  The address list and the lookup request are always
**   released.
*/
static void
_raft_getaddrinfo_cb(uv_getaddrinfo_t* raq_u,
                     c3_i              sas_i,
                     struct addrinfo*  add_u)
{
  struct addrinfo* res_u = 0;
  uv_connect_t*    con_u = malloc(sizeof(*con_u));
  u2_rcon*         ron_u = raq_u->data;

  con_u->data = ron_u;

  if ( 0 != sas_i ) {
    uL(fprintf(uH, "raft: getaddrinfo_cb: %s\n",
                   uv_strerror(uv_last_error(u2L))));
  }
  else {
    for ( res_u = add_u; res_u; res_u = res_u->ai_next ) {
      if ( 0 != uv_tcp_connect(con_u,
                               &ron_u->wax_u,
                               *(struct sockaddr_in*)res_u->ai_addr,
                               _raft_connect_cb) )
      {
        //  Leave the handle open so that the next address can be tried
        //  on it; it is closed below if none works.
        uL(fprintf(uH, "raft: getaddrinfo_cb: %s\n",
                       uv_strerror(uv_last_error(u2L))));
        continue;
      }
      else {
        break;                                          //  Found one
      }
    }
  }

  if ( !res_u ) {
    uL(fprintf(uH, "raft: getaddrinfo_cb: no address matched\n"));
    free(con_u);
    //  The handle is registered with the loop, so it has to be closed
    //  before its memory may go away; _raft_conn_free() runs after.
    uv_close((uv_handle_t*)&ron_u->wax_u, _raft_conn_free);
  }

  uv_freeaddrinfo(add_u);
  free(raq_u);
}
2014-02-28 04:35:50 +04:00
/* _raft_conn_all(): ensure that we are connected to each peer.
*/
2014-02-27 03:46:34 +04:00
static void
_raft_conn_all(u2_raft* raf_u, void (*con_f)(u2_rcon* ron_u))
{
u2_rnam* nam_u = raf_u->nam_u;
u2_rcon* ron_u;
while ( nam_u ) {
2014-03-08 07:11:22 +04:00
if ( 0 == nam_u->ron_u || u2_no == nam_u->ron_u->liv ) {
2014-02-27 03:46:34 +04:00
struct addrinfo hit_u;
uv_getaddrinfo_t* raq_u = malloc(sizeof(*raq_u));
2014-03-07 05:25:53 +04:00
ron_u = _raft_conn_new(raf_u);
//uL(fprintf(uH, "raft: new conn to %s:%s %p\n",
// nam_u->nam_c, nam_u->por_c, ron_u));
2014-03-01 23:55:46 +04:00
2014-02-27 03:46:34 +04:00
memset(&hit_u, 0, sizeof(hit_u));
2014-03-01 01:22:11 +04:00
hit_u.ai_family = AF_INET;
2014-02-27 03:46:34 +04:00
hit_u.ai_socktype = SOCK_STREAM;
2014-03-01 23:55:46 +04:00
hit_u.ai_protocol = IPPROTO_TCP;
2014-02-27 03:46:34 +04:00
raq_u->data = ron_u;
if ( 0 != uv_getaddrinfo(u2L,
raq_u,
_raft_getaddrinfo_cb,
nam_u->nam_c,
2014-03-01 23:55:46 +04:00
nam_u->por_c,
&hit_u) )
2014-02-27 03:46:34 +04:00
{
uL(fprintf(uH, "raft: getaddrinfo: %s\n",
uv_strerror(uv_last_error(u2L))));
uv_close((uv_handle_t*)&ron_u->wax_u, 0);
free(raq_u);
free(ron_u);
c3_assert(0);
}
else {
ron_u->nam_u = nam_u;
nam_u->ron_u = ron_u;
}
con_f(nam_u->ron_u);
2014-02-27 03:46:34 +04:00
}
2014-03-03 23:30:00 +04:00
else {
2014-03-03 23:34:20 +04:00
//uL(fprintf(uH, "raft: existing connection %p for %s\n",
// nam_u->ron_u, nam_u->str_c));
con_f(nam_u->ron_u);
if ( u2_yes == nam_u->ron_u->liv ) {
_raft_conn_work(nam_u->ron_u);
}
2014-03-03 23:30:00 +04:00
}
2014-02-27 03:46:34 +04:00
nam_u = nam_u->nex_u;
}
}
2014-03-06 22:18:36 +04:00
/* _raft_write_base(): Populate the base fields of a u2_rmsg.
**
** Should not be called directly.
*/
2014-03-06 07:40:35 +04:00
static void
2014-03-06 22:18:36 +04:00
_raft_write_base(u2_rcon* ron_u, u2_rmsg* msg_u)
2014-03-06 07:40:35 +04:00
{
u2_raft* raf_u = ron_u->raf_u;
2014-03-06 08:23:33 +04:00
msg_u->ver_w = u2_cr_mug('a');
msg_u->tem_w = raf_u->tem_w;
msg_u->len_d = 5;
2014-03-06 07:40:35 +04:00
}
2014-03-06 22:18:36 +04:00
/* _raft_write_rest(): Write fields for an RPC request to msg_u.
**
** Should not be called directly.
*/
2014-02-27 03:46:34 +04:00
static void
2014-03-06 08:23:33 +04:00
_raft_write_rest(u2_rcon* ron_u, c3_d lai_d, c3_w lat_w, u2_rmsg* msg_u)
2014-02-27 03:46:34 +04:00
{
2014-03-06 03:38:06 +04:00
u2_raft* raf_u = ron_u->raf_u;
c3_assert(ron_u->nam_u);
2014-03-06 22:18:36 +04:00
_raft_write_base(ron_u, msg_u);
2014-03-06 08:23:33 +04:00
msg_u->rest.lai_d = lai_d;
msg_u->rest.lat_w = lat_w;
2014-03-06 03:38:06 +04:00
msg_u->rest.nam_w = 1 + strlen(raf_u->str_c) / 4;
msg_u->rest.nam_c = calloc(1, 4 * msg_u->rest.nam_w);
uv_strlcpy(msg_u->rest.nam_c, raf_u->str_c, 4 * msg_u->rest.nam_w);
2014-03-06 08:23:33 +04:00
msg_u->len_d += 4 + msg_u->rest.nam_w;
}
2014-03-06 03:38:06 +04:00
2014-03-06 22:18:36 +04:00
/* _raft_write_apen(): Write fields for an AppendEntries request.
*/
2014-03-06 08:23:33 +04:00
static void
_raft_write_apen(u2_rcon* ron_u,
c3_d lai_d, c3_w lat_w,
c3_d cit_d, c3_d ent_d, u2_rent* ent_u,
u2_rmsg* msg_u)
{
_raft_write_rest(ron_u, lai_d, lat_w, msg_u);
2014-03-06 22:18:36 +04:00
msg_u->typ_w = c3__apen;
2014-03-06 08:23:33 +04:00
msg_u->rest.apen.cit_d = cit_d;
msg_u->rest.apen.ent_d = ent_d;
msg_u->len_d += 4;
msg_u->rest.apen.ent_u = ent_u;
{
c3_d i_d;
for ( i_d = 0; i_d < ent_d; i_d++ ) {
msg_u->len_d += 3 + ent_u[i_d].len_w;
}
}
}
2014-03-06 22:18:36 +04:00
/* _raft_write_revo(): Write fields for a RequestVote request.
*/
2014-03-06 08:23:33 +04:00
static void
_raft_write_revo(u2_rcon* ron_u, u2_rmsg* msg_u)
{
u2_raft* raf_u = ron_u->raf_u;
_raft_write_rest(ron_u, raf_u->ent_w, raf_u->lat_w, msg_u);
2014-03-06 22:18:36 +04:00
msg_u->typ_w = c3__revo;
2014-03-06 08:23:33 +04:00
}
2014-03-06 22:18:36 +04:00
/* _raft_send_rasp(): Send a rasp (raft response) to a peer.
*/
2014-03-06 08:23:33 +04:00
static void
_raft_send_rasp(u2_rcon* ron_u, c3_t suc_t)
{
u2_rmsg msg_u;
2014-03-06 22:18:36 +04:00
_raft_write_base(ron_u, &msg_u);
msg_u.typ_w = c3__rasp;
2014-03-06 08:23:33 +04:00
msg_u.rasp.suc_w = suc_t;
msg_u.len_d += 1;
_raft_rmsg_send(ron_u, &msg_u);
2014-03-06 08:23:33 +04:00
}
2014-03-06 22:18:36 +04:00
/* _raft_send_beat(): send a heartbeat (empty AppendEntries) to a peer.
**
**  Creates a new request on ron_u and fills its message as an
**  AppendEntries with every field zero and no entries.
*/
static void
_raft_send_beat(u2_rcon* ron_u)
{
  u2_rreq* req_u = _raft_rreq_new(ron_u);

  c3_log_every(50, "raft: beat 50\n");

  _raft_write_apen(ron_u, 0, 0, 0, 0, 0, req_u->msg_u);
  _raft_rmsg_send(ron_u, req_u->msg_u);
}
2014-03-06 22:18:36 +04:00
/* _raft_send_revo(): send a RequestVote to a peer.
**
** Creates a new request.
2014-02-28 04:35:50 +04:00
*/
2014-02-27 03:46:34 +04:00
static void
_raft_send_revo(u2_rcon* ron_u)
{
2014-03-06 03:38:06 +04:00
u2_rreq* req_u = _raft_rreq_new(ron_u);
u2_rmsg* msg_u = req_u->msg_u;
2014-03-06 08:23:33 +04:00
_raft_write_revo(ron_u, msg_u);
_raft_rmsg_send(ron_u, msg_u);
2014-02-27 03:46:34 +04:00
}
2014-02-28 04:35:50 +04:00
/* _raft_start_election(): bump term, vote for self, solicit votes from peers.
*/
2014-02-27 03:46:34 +04:00
static void
_raft_start_election(u2_raft* raf_u)
{
raf_u->tem_w++;
u2_sist_put("term", (c3_y*)&raf_u->tem_w, sizeof(c3_w));
2014-02-27 03:46:34 +04:00
uL(fprintf(uH, "raft: starting election [tem:%d]\n", raf_u->tem_w));
raf_u->vot_w = 1;
raf_u->vog_c = strdup(raf_u->str_c);
u2_sist_put("vote", (c3_y*)raf_u->vog_c, strlen(raf_u->vog_c));
2014-02-27 03:46:34 +04:00
_raft_conn_all(raf_u, _raft_send_revo);
}
2014-02-28 04:35:50 +04:00
/* _raft_heartbeat(): send a heartbeat to all peers.
*/
2014-02-27 03:46:34 +04:00
static void
_raft_heartbeat(u2_raft* raf_u)
{
_raft_conn_all(raf_u, _raft_send_beat);
}
2014-02-28 04:35:50 +04:00
/* _raft_time_cb(): generic timer callback.
**
**  Called on election timeouts for non-leaders, and at heartbeat interval for
**  leaders.
**
**  A follower is promoted to candidate and then handled as one: the
**  timer is restarted with 150 + _raft_election_rand() and an election
**  begins.  A leader restarts the timer at 50 and sends heartbeats.
*/
static void
_raft_time_cb(uv_timer_t* tim_u, c3_i sas_i)
{
  u2_raft* raf_u = tim_u->data;

  c3_assert(sas_i == 0);

  switch ( raf_u->typ_e ) {
    default: {
      uL(fprintf(uH, "raft: time_cb: unknown server state\n"));
      c3_assert(0);
    }
    case u2_raty_foll: {
      uL(fprintf(uH, "raft: promoting to candidate\n"));
      raf_u->typ_e = u2_raty_cand;
      //  fall through: a fresh candidate starts an election at once
    }
    case u2_raty_cand: {
      sas_i = uv_timer_start(tim_u, _raft_time_cb,
                             150 + _raft_election_rand(), 0);
      c3_assert(sas_i == 0);

      _raft_start_election(raf_u);
      break;
    }
    case u2_raty_lead: {
      sas_i = uv_timer_start(tim_u, _raft_time_cb, 50, 0);
      c3_assert(sas_i == 0);

      _raft_heartbeat(raf_u);
      break;
    }
  }
}
/* _raft_foll_init(): begin, follower mode.
**
**  Parses the peer list from the host options, builds our own
**  "name:port" string, binds and listens on the raft TCP port, and
**  starts the first election timeout.
*/
static void
_raft_foll_init(u2_raft* raf_u)
{
  uL(fprintf(uH, "raft: starting follower\n"));
  raf_u->typ_e = u2_raty_foll;

  //  Read the peers; the population is ourselves plus each peer.
  //
  {
    u2_rnam* nam_u = u2_raft_readopt(u2_Host.ops_u.raf_c,
                                     u2_Host.ops_u.nam_c,
                                     u2_Host.ops_u.rop_s);

    if ( 0 == nam_u ) {
      uL(fprintf(uH, "raft: couldn't parse arg '%s'\n", u2_Host.ops_u.raf_c));
      u2_lo_bail(u2A);
    }

    raf_u->nam_u = nam_u;
    raf_u->pop_w = 1;
    for ( ; nam_u; nam_u = nam_u->nex_u ) {
      raf_u->pop_w++;
    }
  }

  //  Our own name is "host:port"; the sizeof covers the longest
  //  ":port" suffix and the terminating NUL.
  //
  {
    c3_i siz_i = strlen(u2_Host.ops_u.nam_c) + sizeof(":65536");
    c3_i wri_i;

    raf_u->str_c = malloc(siz_i);
    wri_i = snprintf(raf_u->str_c, siz_i, "%s:%d",
                     u2_Host.ops_u.nam_c, u2_Host.ops_u.rop_s);
    c3_assert(wri_i < siz_i);
  }

  //  Listen for peers on every interface.
  //
  {
    struct sockaddr_in add_u = uv_ip4_addr("0.0.0.0", u2_Host.ops_u.rop_s);

    if ( 0 != uv_tcp_init(u2L, &raf_u->wax_u) ) {
      uL(fprintf(uH, "raft: init: %s\n", uv_strerror(uv_last_error(u2L))));
      c3_assert(0);
    }
    if ( 0 != uv_tcp_bind(&raf_u->wax_u, add_u) ) {
      uL(fprintf(uH, "raft: bind: %s\n", uv_strerror(uv_last_error(u2L))));
      c3_assert(0);
    }
    if ( 0 != uv_listen((uv_stream_t*)&raf_u->wax_u, 16, _raft_listen_cb) ) {
      uL(fprintf(uH, "raft: listen: %s\n", uv_strerror(uv_last_error(u2L))));
      c3_assert(0);
    }
    else {
      uL(fprintf(uH, "raft: on TCP %d\n", u2_Host.ops_u.rop_s));
    }
  }

  //  Arm the first election timeout.
  //
  uv_timer_start(&raf_u->tim_u, _raft_time_cb, _raft_election_rand(), 0);
}
/* _raft_lone_init(): begin, single-instance mode.
**
**  With a population of one there are no peers, so the instance is
**  handed straight to _raft_promote().
*/
static void
_raft_lone_init(u2_raft* raf_u)
{
  uL(fprintf(uH, "raft: single-instance mode\n"));

  raf_u->pop_w = 1;
  _raft_promote(raf_u);
}
2014-03-01 02:52:00 +04:00
/* u2_raft_init(): start Raft process.
2014-02-28 04:35:50 +04:00
*/
void
2014-03-01 02:52:00 +04:00
u2_raft_init()
{
u2_raft* raf_u = u2R;
ssize_t ret_i;
if ( (ret_i = u2_sist_has("term")) >= 0 ) {
c3_assert(ret_i == sizeof(c3_w));
u2_sist_get("term", (c3_y*)&raf_u->tem_w);
uL(fprintf(uH, "raft: found term %u\n", raf_u->tem_w));
}
if ( (ret_i = u2_sist_has("vote")) >= 0 ) {
raf_u->vog_c = malloc(ret_i);
u2_sist_get("vote", (c3_y*)raf_u->vog_c);
uL(fprintf(uH, "raft: found vote %s\n", raf_u->vog_c));
}
2014-03-06 07:40:35 +04:00
2014-03-08 04:47:30 +04:00
// Initialize timer -- used in both single and multi-instance mode,
// for different things.
2014-03-08 04:46:22 +04:00
uv_timer_init(u2L, &raf_u->tim_u);
raf_u->tim_u.data = raf_u;
if ( 0 == u2_Host.ops_u.raf_c ) {
_raft_lone_init(raf_u);
}
else {
_raft_foll_init(raf_u);
}
}
/* _raft_sure(): apply and save an input ovum and its result.
**
**  If the new core cor differs from rec_u->roc, [vir ovo] is pushed onto
**  rec_u->roe and cor replaces roc.  Otherwise ovo is dropped, [vir ~]
**  is pushed, and cor is released.
*/
static void
_raft_sure(u2_reck* rec_u, u2_noun ovo, u2_noun vir, u2_noun cor)
{
  //  Whatever worked, save it.  (XX - should be concurrent with execute.)
  //  We'd like more events that don't change the state but need work here.
  //
  u2_mug(cor);
  u2_mug(rec_u->roc);

  if ( u2_no == u2_sing(cor, rec_u->roc) ) {
    //  State changed: keep the event and adopt the new core.
    //
    rec_u->roe = u2nc(u2nc(vir, ovo), rec_u->roe);

    u2z(rec_u->roc);
    rec_u->roc = cor;
    return;
  }

  //  State unchanged: keep only the effects.
  //
  u2z(ovo);
  rec_u->roe = u2nc(u2nc(vir, u2_nul), rec_u->roe);
  u2z(cor);
}
/* _raft_lame(): handle an application failure.
**
**  Builds a replacement event bov from the failed ovo (%hole for an
**  %exit on a %hear card, %crud otherwise) and pokes it.  If that also
**  fails, a %warn event is poked; if that fails too, the failure is
**  only logged.
**
**  NOTE(review): tan is only ever copied with u2k() in this function
**  and never released here -- confirm ownership.
*/
static void
_raft_lame(u2_reck* rec_u, u2_noun ovo, u2_noun why, u2_noun tan)
{
  u2_noun bov, gon;

#if 1
  {
    c3_c* oik_c = u2_cr_string(u2h(u2t(ovo)));

    free(oik_c);
  }
#endif

  //  Formal error in a network packet generates a hole card.
  //
  //  There should be a separate path for crypto failures,
  //  to prevent timing attacks, but isn't right now.  To deal
  //  with a crypto failure, just drop the packet.
  //
  if ( (c3__exit == why) && (c3__hear == u2h(u2t(ovo))) ) {
    u2_lo_punt(2, u2_ckb_flop(u2k(tan)));

    bov = u2nc(u2k(u2h(ovo)), u2nc(c3__hole, u2k(u2t(u2t(ovo)))));
    u2z(why);
  }
  else {
    bov = u2nc(u2k(u2h(ovo)), u2nt(c3__crud, why, u2k(tan)));
    u2_hevn_at(lad) = u2_nul;
  }
  u2z(ovo);

  //  First attempt: deliver the replacement event.
  //
  gon = u2_lo_soft(rec_u, 0, u2_reck_poke, u2k(bov));
  if ( u2_blip == u2h(gon) ) {
    u2_noun vir = u2k(u2h(u2t(gon)));
    u2_noun cor = u2k(u2t(u2t(gon)));

    _raft_sure(rec_u, bov, vir, cor);
    u2z(gon);
    return;
  }
  u2z(gon);

  //  Second attempt: deliver a bare warning on the same wire.
  //
  {
    u2_noun vab = u2nc(u2k(u2h(bov)),
                       u2nc(c3__warn, u2_ci_tape("crude crash!")));
    u2_noun nog = u2_lo_soft(rec_u, 0, u2_reck_poke, u2k(vab));

    if ( u2_blip == u2h(nog) ) {
      u2_noun vir = u2k(u2h(u2t(nog)));
      u2_noun cor = u2k(u2t(u2t(nog)));

      _raft_sure(rec_u, vab, vir, cor);
      u2z(nog);
    }
    else {
      u2z(nog);
      u2z(vab);

      uL(fprintf(uH, "crude: all delivery failed!\n"));
    }
  }
}
/* _raft_punk(): insert and apply an input ovum (unprotected).
**
**  Pokes ovo through u2_lo_soft() and then u2_reck_nick().  A failure
**  at either step is routed to _raft_lame(); success ends in
**  _raft_sure().
*/
static void
_raft_punk(u2_reck* rec_u, u2_noun ovo)
{
  u2_noun gon, nug;
  u2_noun vir, cor;
  c3_w    sec_w;

  //  XX this is wrong - the timer should be on the original hose.
  //
  sec_w = ( (c3__term == u2h(u2t(u2h(ovo)))) ||
            (c3__batz == u2h(u2t(u2h(ovo)))) ) ? 0 : 60;

  //  Control alarm loops.
  //
  if ( c3__wake != u2h(u2t(ovo)) ) {
    u2_Host.beh_u.run_w = 0;
  }

  gon = u2_lo_soft(rec_u, sec_w, u2_reck_poke, u2k(ovo));
  if ( u2_blip != u2h(gon) ) {
    u2_noun why = u2k(u2h(gon));
    u2_noun tan = u2k(u2t(gon));

    u2z(gon);
    _raft_lame(rec_u, ovo, why, tan);
    return;
  }

  vir = u2k(u2h(u2t(gon)));
  cor = u2k(u2t(u2t(gon)));
  u2z(gon);

  nug = u2_reck_nick(rec_u, vir, cor);
  if ( u2_blip != u2h(nug) ) {
    u2_noun why = u2k(u2h(nug));
    u2_noun tan = u2k(u2t(nug));

    u2z(nug);
    _raft_lame(rec_u, ovo, why, tan);
    return;
  }

  vir = u2k(u2h(u2t(nug)));
  cor = u2k(u2t(u2t(nug)));
  u2z(nug);

  _raft_sure(rec_u, ovo, vir, cor);
}
/* _raft_comm(): mark queued events up to entry bid_w as done and committed.
**
**  Walks rec_u's event queue from the head and stops at the first
**  event whose entry number is above bid_w.  The walk is bracketed by
**  u2_lo_open() and u2_lo_shut().
*/
static void
_raft_comm(u2_reck* rec_u, c3_w bid_w)
{
  u2_cart* egg_u;

  u2_lo_open();

  for ( egg_u = rec_u->ova.egg_u;
        egg_u && (egg_u->ent_w <= bid_w);
        egg_u = egg_u->nex_u )
  {
    egg_u->did = u2_yes;
    egg_u->cit = u2_yes;
  }

  u2_lo_shut(u2_yes);
}
/* _raft_comm_cb(): timer callback; commit everything up to our last entry.
**
**  The status argument sas_i is not examined.
*/
static void
_raft_comm_cb(uv_timer_t* tim_u, c3_i sas_i)
{
  u2_raft* raf_u = tim_u->data;

  _raft_comm(u2A, raf_u->ent_w);
}
/* _raft_push(): push a blob of len_w words onto the log as an %ov entry.
**
**  Must be called as leader with a non-empty blob.  Only the
**  single-instance case (pop_w == 1) is implemented: the blob is saved
**  with u2_sist_pack(), and a zero-delay timer is armed (unless the
**  timer is already active) so that _raft_comm_cb() marks it committed.
**
**  Returns the entry number assigned by u2_sist_pack().  The
**  multi-instance case asserts; the trailing return there only keeps
**  the function well-defined when assertions are compiled out.
*/
static c3_w
_raft_push(u2_raft* raf_u, c3_w* bob_w, c3_w len_w)
{
  c3_assert(raf_u->typ_e == u2_raty_lead);
  c3_assert(0 != bob_w && 0 < len_w);

  if ( 1 != raf_u->pop_w ) {
    uL(fprintf(uH, "raft: multi-instance push\n"));
    c3_assert(0);
    return 0;
  }

  raf_u->ent_w = u2_sist_pack(u2A, c3__ov, bob_w, len_w);
  raf_u->lat_w = raf_u->tem_w;  // XX

  if ( !uv_is_active((uv_handle_t*)&raf_u->tim_u) ) {
    uv_timer_start(&raf_u->tim_u, _raft_comm_cb, 0, 0);
  }
  return raf_u->ent_w;
}
/* _raft_kick_all(): kick a list of events, transferring.
**
**  Consumes vir: each item is handed to u2_reck_kick() in list order,
**  and each list cell is released as it is passed.
*/
static void
_raft_kick_all(u2_reck* rec_u, u2_noun vir)
{
  while ( u2_nul != vir ) {
    u2_noun ovo = u2k(u2h(vir));
    u2_noun nex = u2k(u2t(vir));

    //  Take our own references before dropping the cell.
    //
    u2z(vir);
    vir = nex;

    u2_reck_kick(rec_u, ovo);
  }
}
/* u2_raft_work(): work in rec_u.
**
**  A non-leader has no event queue and discards any pending rec_u->roe.
**  A leader works in three phases:
**
**    1. pop finished events off the head of the queue and kick the
**       effects of those that were committed;
**    2. poke the pending events on rec_u->roe (skipped, with an early
**       return, while u2R->lug_u.len_d is zero);
**    3. jam and encrypt each resulting event, push it with _raft_push(),
**       and append it to the queue; results with no event to save have
**       their effects kicked at once.
*/
void
u2_raft_work(u2_reck* rec_u)
{
  if ( u2R->typ_e != u2_raty_lead ) {
    c3_assert(rec_u->ova.egg_u == 0);
    if ( u2_nul != rec_u->roe ) {
      uL(fprintf(uH, "raft: dropping roe!!\n"));
      u2z(rec_u->roe);
      rec_u->roe = u2_nul;
    }
  }
  else {
    u2_cart* egg_u;
    u2_noun  ova;
    u2_noun  vir;
    u2_noun  nex;

    //  Apply effects from just-committed events, and delete finished events.
    //
    while ( rec_u->ova.egg_u ) {
      egg_u = rec_u->ova.egg_u;

      if ( u2_yes == egg_u->did ) {
        //  Copy out everything we still need: egg_u is freed below and
        //  must not be read afterwards.
        //
        u2_bean cit = egg_u->cit;

        vir = egg_u->vir;

        if ( egg_u == rec_u->ova.geg_u ) {
          c3_assert(egg_u->nex_u == 0);
          rec_u->ova.geg_u = rec_u->ova.egg_u = 0;
        }
        else {
          c3_assert(egg_u->nex_u != 0);
          rec_u->ova.egg_u = egg_u->nex_u;
        }
        free(egg_u);

        if ( u2_yes == cit ) {
          _raft_kick_all(rec_u, vir);
        }
        else {
          //  We poked an event, but Raft failed to persist it.
          //  TODO: gracefully recover.
          uL(fprintf(uH, "vere: event executed but not persisted\n"));
          c3_assert(0);
        }
      }
      else break;
    }

    //  Poke pending events, leaving the poked events and errors on rec_u->roe.
    //
    {
      if ( 0 == u2R->lug_u.len_d ) {
        return;
      }
      ova = u2_ckb_flop(rec_u->roe);
      rec_u->roe = u2_nul;

      while ( u2_nul != ova ) {
        _raft_punk(rec_u, u2k(u2t(u2h(ova))));
        c3_assert(u2_nul == u2h(u2h(ova)));

        nex = u2k(u2t(ova));
        u2z(ova); ova = nex;
      }
    }

    //  Cartify, jam, and encrypt this batch of events. Take a number, Raft will
    //  be with you shortly.
    {
      c3_w    bid_w;
      c3_w    len_w;
      c3_w*   bob_w;
      u2_noun ron;
      u2_noun ovo;

      ova = u2_ckb_flop(rec_u->roe);
      rec_u->roe = u2_nul;

      while ( u2_nul != ova ) {
        ovo = u2k(u2t(u2h(ova)));
        vir = u2k(u2h(u2h(ova)));
        nex = u2k(u2t(ova));
        u2z(ova); ova = nex;

        if ( u2_nul != ovo ) {
          egg_u = malloc(sizeof(*egg_u));
          egg_u->nex_u = 0;
          egg_u->cit = u2_no;
          egg_u->did = u2_no;
          egg_u->vir = vir;

          ron = u2_cke_jam(u2nc(u2k(rec_u->now), ovo));
          c3_assert(rec_u->key);
          ron = u2_dc("en:crya", u2k(rec_u->key), ron);

          len_w = u2_cr_met(5, ron);
          bob_w = malloc(len_w * 4L);
          u2_cr_words(0, len_w, bob_w, ron);
          u2z(ron);

          //  NOTE(review): bob_w is handed to _raft_push() and is not
          //  freed here -- confirm who owns it afterwards.
          //
          bid_w = _raft_push(u2R, bob_w, len_w);
          egg_u->ent_w = bid_w;

          //  Append to the tail of the event queue.
          //
          if ( 0 == rec_u->ova.geg_u ) {
            c3_assert(0 == rec_u->ova.egg_u);
            rec_u->ova.geg_u = rec_u->ova.egg_u = egg_u;
          }
          else {
            c3_assert(0 == rec_u->ova.geg_u->nex_u);
            rec_u->ova.geg_u->nex_u = egg_u;
            rec_u->ova.geg_u = egg_u;
          }
        }
        else {
          _raft_kick_all(rec_u, vir);
        }
      }
    }
  }
}