[quagga-dev 12138] Re: [PATCH 3/4] lib: use trie structure for prefix list matching

Nick Hilliard nick at inex.ie
Mon Apr 13 10:43:31 BST 2015


On 13/04/2015 10:21, David Lamparter wrote:
> This buys CPU with memory.  Memory usage on an IXP setup with 100k
> prefix list entries is an additional 4 MB on top of the 9.5 MB that it
> was before.
> 
> ---
> Not-Signed-off-by: David Lamparter <equinox at opensourcerouting.org>
> 
> This patch needs more documentation/comments; just mailing this out for
> people to scroll through and voice high-level comments.

Have scrolled through, I'm very interested in seeing this implemented in
quagga because it solves one of the major performance problems we're
running into.

The impact on smaller installations will be negligible in terms of memory
consumption.  Larger installations with 100k prefixes will have no problems
paying 10 cents for the extra memory required if it means that the CPU
isn't trashed on startup and during normal operation.

Pls aim towards committing this.

Nick

> ---
>  lib/memtypes.c  |   1 +
>  lib/plist.c     | 229 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
>  lib/plist_int.h |   7 ++
>  3 files changed, 231 insertions(+), 6 deletions(-)
> 
> diff --git a/lib/memtypes.c b/lib/memtypes.c
> index 1a0c11f..a5b76cc 100644
> --- a/lib/memtypes.c
> +++ b/lib/memtypes.c
> @@ -48,6 +48,7 @@ struct memory_list memory_list_lib[] =
>    { MTYPE_PREFIX_LIST,		"Prefix List"			},
>    { MTYPE_PREFIX_LIST_ENTRY,	"Prefix List Entry"		},
>    { MTYPE_PREFIX_LIST_STR,	"Prefix List Str"		},
> +  { MTYPE_PREFIX_LIST_TRIE,	"Prefix List Trie Table"	},
>    { MTYPE_ROUTE_MAP,		"Route map"			},
>    { MTYPE_ROUTE_MAP_NAME,	"Route map name"		},
>    { MTYPE_ROUTE_MAP_INDEX,	"Route map index"		},
> diff --git a/lib/plist.c b/lib/plist.c
> index d8700c8..1181df1 100644
> --- a/lib/plist.c
> +++ b/lib/plist.c
> @@ -32,6 +32,26 @@
>  
>  #include "plist_int.h"
>  
> +/* not currently changeable, code assumes bytes further down */
> +#define PLC_BITS	8
> +#define PLC_LEN		(1 << PLC_BITS)
> +#define PLC_MAXLEVELV4	2	/* /24 for IPv4 */
> +#define PLC_MAXLEVELV6	4	/* /48 for IPv6 */
> +#define PLC_MAXLEVEL	4	/* max(v4,v6) */
> +
> +struct pltrie_entry {
> +	union {
> +		struct pltrie_table *next_table;
> +		struct prefix_list_entry *final_chain;
> +	};
> +
> +	struct prefix_list_entry *up_chain;
> +};
> +
> +struct pltrie_table {
> +	struct pltrie_entry entries[PLC_LEN];
> +};
> +
>  /* List of struct prefix_list. */
>  struct prefix_list_list
>  {
> @@ -59,6 +79,9 @@ struct prefix_master
>  
>    /* Hook function which is executed when prefix_list is deleted. */
>    void (*delete_hook) (struct prefix_list *);
> +
> +  /* number of bytes that have a trie level */
> +  size_t trie_depth;
>  };
>  
>  /* Static structure of IPv4 prefix_list's master. */
> @@ -69,6 +92,8 @@ static struct prefix_master prefix_master_ipv4 =
>    1,
>    NULL,
>    NULL,
> +  NULL,
> +  PLC_MAXLEVELV4,
>  };
>  
>  #ifdef HAVE_IPV6
> @@ -80,27 +105,33 @@ static struct prefix_master prefix_master_ipv6 =
>    1,
>    NULL,
>    NULL,
> +  NULL,
> +  PLC_MAXLEVELV6,
>  };
>  #endif /* HAVE_IPV6*/
>  
>  /* Static structure of BGP ORF prefix_list's master. */
> -static struct prefix_master prefix_master_orf_v4 = 
> -{ 
> +static struct prefix_master prefix_master_orf_v4 =
> +{
>    {NULL, NULL},
>    {NULL, NULL},
>    1,
>    NULL,
>    NULL,
> +  NULL,
> +  PLC_MAXLEVELV4,
>  };
>  
>  /* Static structure of BGP ORF prefix_list's master. */
> -static struct prefix_master prefix_master_orf_v6 = 
> -{ 
> +static struct prefix_master prefix_master_orf_v6 =
> +{
>    {NULL, NULL},
>    {NULL, NULL},
>    1,
>    NULL,
>    NULL,
> +  NULL,
> +  PLC_MAXLEVELV6,
>  };
>  
>  static struct prefix_master *
> @@ -205,6 +236,7 @@ prefix_list_insert (afi_t afi, int orf, const char *name)
>    plist = prefix_list_new ();
>    plist->name = XSTRDUP (MTYPE_PREFIX_LIST_STR, name);
>    plist->master = master;
> +  plist->trie = XCALLOC (MTYPE_PREFIX_LIST_TRIE, sizeof (struct pltrie_table));
>  
>    /* If name is made by all digit character.  We treat it as
>       number. */
> @@ -332,7 +364,9 @@ prefix_list_delete (struct prefix_list *plist)
>  
>    if (plist->name)
>      XFREE (MTYPE_PREFIX_LIST_STR, plist->name);
> -  
> +
> +  XFREE (MTYPE_PREFIX_LIST_TRIE, plist->trie);
> +
>    prefix_list_free (plist);
>    
>    if (master->delete_hook)
> @@ -436,10 +470,87 @@ prefix_list_entry_lookup (struct prefix_list *plist, struct prefix *prefix,
>  }
>  
>  static void
> +trie_walk_affected (size_t validbits, struct pltrie_table *table, uint8_t byte,
> +                    struct prefix_list_entry *object,
> +                    void (*fn)(struct prefix_list_entry *object,
> +                               struct prefix_list_entry **updptr))
> +{
> +  uint8_t mask;
> +  uint16_t bwalk;
> +
> +  if (validbits > PLC_BITS)
> +    {
> +      fn (object, &table->entries[byte].final_chain);
> +      return;
> +    }
> +
> +  mask = (1 << (8 - validbits)) - 1;
> +  for (bwalk = byte & ~mask; bwalk <= byte + mask; bwalk++)
> +    {
> +      fn (object, &table->entries[bwalk].up_chain);
> +    }
> +}
> +
> +static void trie_uninstall_fn (struct prefix_list_entry *object,
> +                               struct prefix_list_entry **updptr)
> +{
> +  for (; *updptr; updptr = &(*updptr)->next_best)
> +    if (*updptr == object)
> +      {
> +        *updptr = object->next_best;
> +        break;
> +      }
> +}
> +
> +static int
> +trie_table_empty (struct pltrie_table *table)
> +{
> +  size_t i;
> +  for (i = 0; i < PLC_LEN; i++)
> +    if (table->entries[i].next_table || table->entries[i].final_chain)
> +      return 0;
> +  return 1;
> +}
> +
> +static void
> +prefix_list_trie_del (struct prefix_list *plist,
> +		      struct prefix_list_entry *pentry)
> +{
> +  size_t depth, maxdepth = plist->master->trie_depth;
> +  uint8_t *bytes = &pentry->prefix.u.prefix;
> +  size_t validbits = pentry->prefix.prefixlen;
> +  struct pltrie_table *table, **tables[PLC_MAXLEVEL];
> +
> +  table = plist->trie;
> +  for (depth = 0; validbits > PLC_BITS && depth < maxdepth - 1; depth++)
> +    {
> +      uint8_t byte = bytes[depth];
> +      assert (table->entries[byte].next_table);
> +
> +      tables[depth + 1] = &table->entries[byte].next_table;
> +      table = table->entries[byte].next_table;
> +
> +      validbits -= PLC_BITS;
> +    }
> +
> +  trie_walk_affected (validbits, table, bytes[depth], pentry, trie_uninstall_fn);
> +
> +  for (; depth > 0; depth--)
> +    if (trie_table_empty (*tables[depth]))
> +      {
> +        XFREE (MTYPE_PREFIX_LIST_TRIE, *tables[depth]);
> +        *tables[depth] = NULL;
> +      }
> +}
> +
> +
> +static void
>  prefix_list_entry_delete (struct prefix_list *plist, 
>  			  struct prefix_list_entry *pentry,
>  			  int update_list)
>  {
> +  prefix_list_trie_del (plist, pentry);
> +
>    if (plist == NULL || pentry == NULL)
>      return;
>    if (pentry->prev)
> @@ -467,6 +578,52 @@ prefix_list_entry_delete (struct prefix_list *plist,
>      }
>  }
>  
> +static void trie_install_fn (struct prefix_list_entry *object,
> +                             struct prefix_list_entry **updptr)
> +{
> +  while (*updptr)
> +    {
> +      if (*updptr == object)
> +        return;
> +      if ((*updptr)->prefix.prefixlen < object->prefix.prefixlen)
> +        break;
> +      if ((*updptr)->seq > object->seq)
> +        break;
> +      updptr = &(*updptr)->next_best;
> +    }
> +
> +  if (!object->next_best)
> +    object->next_best = *updptr;
> +  else
> +    assert (object->next_best == *updptr || !*updptr);
> +
> +  *updptr = object;
> +}
> +
> +static void
> +prefix_list_trie_add (struct prefix_list *plist,
> +		      struct prefix_list_entry *pentry)
> +{
> +  size_t depth = plist->master->trie_depth;
> +  uint8_t *bytes = &pentry->prefix.u.prefix;
> +  size_t validbits = pentry->prefix.prefixlen;
> +  struct pltrie_table *table;
> +
> +  table = plist->trie;
> +  while (validbits > PLC_BITS && depth > 1)
> +    {
> +      if (!table->entries[*bytes].next_table)
> +        table->entries[*bytes].next_table = XCALLOC (MTYPE_PREFIX_LIST_TRIE,
> +                sizeof(struct pltrie_table));
> +      table = table->entries[*bytes].next_table;
> +      bytes++;
> +      depth--;
> +      validbits -= PLC_BITS;
> +    }
> +
> +  trie_walk_affected (validbits, table, *bytes, pentry, trie_install_fn);
> +}
> +
>  static void
>  prefix_list_entry_add (struct prefix_list *plist,
>  		       struct prefix_list_entry *pentry)
> @@ -512,6 +669,8 @@ prefix_list_entry_add (struct prefix_list *plist,
>        plist->tail = pentry;
>      }
>  
> +  prefix_list_trie_add (plist, pentry);
> +
>    /* Increment count. */
>    plist->count++;
>  
> @@ -566,7 +725,7 @@ prefix_list_entry_match (struct prefix_list_entry *pentry, struct prefix *p)
>  }
>  
>  enum prefix_list_type
> -prefix_list_apply (struct prefix_list *plist, void *object)
> +prefix_list_apply_old (struct prefix_list *plist, void *object)
>  {
>    struct prefix_list_entry *pentry;
>    struct prefix *p;
> @@ -592,6 +751,64 @@ prefix_list_apply (struct prefix_list *plist, void *object)
>    return PREFIX_DENY;
>  }
>  
> +enum prefix_list_type
> +prefix_list_apply (struct prefix_list *plist, void *object)
> +{
> +  struct prefix_list_entry *pentry, *pbest = NULL;
> +
> +  struct prefix *p = (struct prefix *) object;
> +  uint8_t *byte = &p->u.prefix;
> +  size_t depth = plist->master->trie_depth;
> +  size_t validbits = p->prefixlen;
> +  struct pltrie_table *table;
> +
> +  if (plist == NULL)
> +    return PREFIX_DENY;
> +
> +  if (plist->count == 0)
> +    return PREFIX_PERMIT;
> +
> +  table = plist->trie;
> +  while (1)
> +    {
> +      for (pentry = table->entries[*byte].up_chain; pentry; pentry = pentry->next_best)
> +        {
> +          if (pbest && pbest->seq < pentry->seq)
> +            continue;
> +          if (prefix_list_entry_match (pentry, p))
> +            pbest = pentry;
> +        }
> +
> +      if (validbits <= PLC_BITS)
> +        break;
> +      validbits -= PLC_BITS;
> +
> +      if (--depth)
> +        {
> +          if (!table->entries[*byte].next_table)
> +            break;
> +
> +          table = table->entries[*byte].next_table;
> +          byte++;
> +          continue;
> +        }
> +
> +      for (pentry = table->entries[*byte].final_chain; pentry; pentry = pentry->next_best)
> +        {
> +          if (pbest && pbest->seq < pentry->seq)
> +            continue;
> +          if (prefix_list_entry_match (pentry, p))
> +            pbest = pentry;
> +        }
> +      break;
> +    }
> +
> +  if (pbest == NULL)
> +    return PREFIX_DENY;
> +
> +  return pbest->type;
> +}
> +
>  static void __attribute__ ((unused))
>  prefix_list_print (struct prefix_list *plist)
>  {
> diff --git a/lib/plist_int.h b/lib/plist_int.h
> index 6459579..e6e5901 100644
> --- a/lib/plist_int.h
> +++ b/lib/plist_int.h
> @@ -29,6 +29,8 @@ enum prefix_name_type
>    PREFIX_TYPE_NUMBER
>  };
>  
> +struct pltrie_table;
> +
>  struct prefix_list
>  {
>    char *name;
> @@ -44,6 +46,8 @@ struct prefix_list
>    struct prefix_list_entry *head;
>    struct prefix_list_entry *tail;
>  
> +  struct pltrie_table *trie;
> +
>    struct prefix_list *next;
>    struct prefix_list *prev;
>  };
> @@ -66,6 +70,9 @@ struct prefix_list_entry
>  
>    struct prefix_list_entry *next;
>    struct prefix_list_entry *prev;
> +
> +  /* up the chain for best match search */
> +  struct prefix_list_entry *next_best;
>  };
>  
>  #endif /* _QUAGGA_PLIST_INT_H */
> 


-- 
Network Ability Ltd. | Chief Technical Officer | Tel: +353 1 6169698
52 Lower Sandwith St | INEX - Internet Neutral |
Dublin 2, Ireland    | Exchange Association    | Email: nick at inex.ie




More information about the Quagga-dev mailing list