[quagga-dev 12136] [PATCH 3/4] lib: use trie structure for prefix list matching

David Lamparter equinox at opensourcerouting.org
Mon Apr 13 09:21:36 BST 2015


Prefix lists were implemented with a simple linear list that is scanned
sequentially.  This is, of course, extremely inefficient as it scales by
O(n).  This patch adds a trie-ish data structure that allows quickly
descending based on the prefix.

Note that the trie structure used here is designed for real-world use,
hence it uses a relatively crude fixed-size bytewise table instead of
some fancy balancing scheme.  It is quite cacheline efficient.

Using real-world routeserver prefix lists, matching against a fulltable
dump:

entries   before     after    factor
   9103   63.8s      .0124s   5142x
    772    4.52s     .0101s    445.3x
     86     .445s    .0098s     45.51x
      7     .0379s   .0099s      3.834x
      2     .0136s   .0095s      1.440x
      1     .0084s   .0095s       .879x

This buys CPU with memory.  Memory usage on an IXP setup with 100k
prefix list entries is an additional 4 MB on top of the 9.5 MB that it
was before.

---
Not-Signed-off-by: David Lamparter <equinox at opensourcerouting.org>

This patch needs more documentation/comments; just mailing this out for
people to scroll through and voice high-level comments.  Also, the test
tools aren't cleaned up yet.
---
 lib/memtypes.c  |   1 +
 lib/plist.c     | 229 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 lib/plist_int.h |   7 ++
 3 files changed, 231 insertions(+), 6 deletions(-)

diff --git a/lib/memtypes.c b/lib/memtypes.c
index 1a0c11f..a5b76cc 100644
--- a/lib/memtypes.c
+++ b/lib/memtypes.c
@@ -48,6 +48,7 @@ struct memory_list memory_list_lib[] =
   { MTYPE_PREFIX_LIST,		"Prefix List"			},
   { MTYPE_PREFIX_LIST_ENTRY,	"Prefix List Entry"		},
   { MTYPE_PREFIX_LIST_STR,	"Prefix List Str"		},
+  { MTYPE_PREFIX_LIST_TRIE,	"Prefix List Trie Table"	},
   { MTYPE_ROUTE_MAP,		"Route map"			},
   { MTYPE_ROUTE_MAP_NAME,	"Route map name"		},
   { MTYPE_ROUTE_MAP_INDEX,	"Route map index"		},
diff --git a/lib/plist.c b/lib/plist.c
index d8700c8..1181df1 100644
--- a/lib/plist.c
+++ b/lib/plist.c
@@ -32,6 +32,26 @@
 
 #include "plist_int.h"
 
+/* not currently changeable, code assumes bytes further down */
+#define PLC_BITS	8
+#define PLC_LEN		(1 << PLC_BITS)
+#define PLC_MAXLEVELV4	2	/* /24 for IPv4 */
+#define PLC_MAXLEVELV6	4	/* /48 for IPv6 */
+#define PLC_MAXLEVEL	4	/* max(v4,v6) */
+
+struct pltrie_entry {
+	union {
+		struct pltrie_table *next_table;
+		struct prefix_list_entry *final_chain;
+	};
+
+	struct prefix_list_entry *up_chain;
+};
+
+struct pltrie_table {
+	struct pltrie_entry entries[PLC_LEN];
+};
+
 /* List of struct prefix_list. */
 struct prefix_list_list
 {
@@ -59,6 +79,9 @@ struct prefix_master
 
   /* Hook function which is executed when prefix_list is deleted. */
   void (*delete_hook) (struct prefix_list *);
+
+  /* number of bytes that have a trie level */
+  size_t trie_depth;
 };
 
 /* Static structure of IPv4 prefix_list's master. */
@@ -69,6 +92,8 @@ static struct prefix_master prefix_master_ipv4 =
   1,
   NULL,
   NULL,
+  NULL,
+  PLC_MAXLEVELV4,
 };
 
 #ifdef HAVE_IPV6
@@ -80,27 +105,33 @@ static struct prefix_master prefix_master_ipv6 =
   1,
   NULL,
   NULL,
+  NULL,
+  PLC_MAXLEVELV6,
 };
 #endif /* HAVE_IPV6*/
 
 /* Static structure of BGP ORF prefix_list's master. */
-static struct prefix_master prefix_master_orf_v4 = 
-{ 
+static struct prefix_master prefix_master_orf_v4 =
+{
   {NULL, NULL},
   {NULL, NULL},
   1,
   NULL,
   NULL,
+  NULL,
+  PLC_MAXLEVELV4,
 };
 
 /* Static structure of BGP ORF prefix_list's master. */
-static struct prefix_master prefix_master_orf_v6 = 
-{ 
+static struct prefix_master prefix_master_orf_v6 =
+{
   {NULL, NULL},
   {NULL, NULL},
   1,
   NULL,
   NULL,
+  NULL,
+  PLC_MAXLEVELV6,
 };
 
 static struct prefix_master *
@@ -205,6 +236,7 @@ prefix_list_insert (afi_t afi, int orf, const char *name)
   plist = prefix_list_new ();
   plist->name = XSTRDUP (MTYPE_PREFIX_LIST_STR, name);
   plist->master = master;
+  plist->trie = XCALLOC (MTYPE_PREFIX_LIST_TRIE, sizeof (struct pltrie_table));
 
   /* If name is made by all digit character.  We treat it as
      number. */
@@ -332,7 +364,9 @@ prefix_list_delete (struct prefix_list *plist)
 
   if (plist->name)
     XFREE (MTYPE_PREFIX_LIST_STR, plist->name);
-  
+
+  XFREE (MTYPE_PREFIX_LIST_TRIE, plist->trie);
+
   prefix_list_free (plist);
   
   if (master->delete_hook)
@@ -436,10 +470,87 @@ prefix_list_entry_lookup (struct prefix_list *plist, struct prefix *prefix,
 }
 
 static void
+trie_walk_affected (size_t validbits, struct pltrie_table *table, uint8_t byte,
+                    struct prefix_list_entry *object,
+                    void (*fn)(struct prefix_list_entry *object,
+                               struct prefix_list_entry **updptr))
+{
+  uint8_t mask;
+  uint16_t bwalk;
+
+  if (validbits > PLC_BITS)
+    {
+      fn (object, &table->entries[byte].final_chain);
+      return;
+    }
+
+  mask = (1 << (8 - validbits)) - 1;
+  for (bwalk = byte & ~mask; bwalk <= byte + mask; bwalk++)
+    {
+      fn (object, &table->entries[bwalk].up_chain);
+    }
+}
+
+static void trie_uninstall_fn (struct prefix_list_entry *object,
+                               struct prefix_list_entry **updptr)
+{
+  for (; *updptr; updptr = &(*updptr)->next_best)
+    if (*updptr == object)
+      {
+        *updptr = object->next_best;
+        break;
+      }
+}
+
+static int
+trie_table_empty (struct pltrie_table *table)
+{
+  size_t i;
+  for (i = 0; i < PLC_LEN; i++)
+    if (table->entries[i].next_table || table->entries[i].final_chain)
+      return 0;
+  return 1;
+}
+
+static void
+prefix_list_trie_del (struct prefix_list *plist,
+		      struct prefix_list_entry *pentry)
+{
+  size_t depth, maxdepth = plist->master->trie_depth;
+  uint8_t *bytes = &pentry->prefix.u.prefix;
+  size_t validbits = pentry->prefix.prefixlen;
+  struct pltrie_table *table, **tables[PLC_MAXLEVEL];
+
+  table = plist->trie;
+  for (depth = 0; validbits > PLC_BITS && depth < maxdepth - 1; depth++)
+    {
+      uint8_t byte = bytes[depth];
+      assert (table->entries[byte].next_table);
+
+      tables[depth + 1] = &table->entries[byte].next_table;
+      table = table->entries[byte].next_table;
+
+      validbits -= PLC_BITS;
+    }
+
+  trie_walk_affected (validbits, table, bytes[depth], pentry, trie_uninstall_fn);
+
+  for (; depth > 0; depth--)
+    if (trie_table_empty (*tables[depth]))
+      {
+        XFREE (MTYPE_PREFIX_LIST_TRIE, *tables[depth]);
+        *tables[depth] = NULL;
+      }
+}
+
+
+static void
 prefix_list_entry_delete (struct prefix_list *plist, 
 			  struct prefix_list_entry *pentry,
 			  int update_list)
 {
+  prefix_list_trie_del (plist, pentry);
+
   if (plist == NULL || pentry == NULL)
     return;
   if (pentry->prev)
@@ -467,6 +578,52 @@ prefix_list_entry_delete (struct prefix_list *plist,
     }
 }
 
+static void trie_install_fn (struct prefix_list_entry *object,
+                             struct prefix_list_entry **updptr)
+{
+  while (*updptr)
+    {
+      if (*updptr == object)
+        return;
+      if ((*updptr)->prefix.prefixlen < object->prefix.prefixlen)
+        break;
+      if ((*updptr)->seq > object->seq)
+        break;
+      updptr = &(*updptr)->next_best;
+    }
+
+  if (!object->next_best)
+    object->next_best = *updptr;
+  else
+    assert (object->next_best == *updptr || !*updptr);
+
+  *updptr = object;
+}
+
+static void
+prefix_list_trie_add (struct prefix_list *plist,
+		      struct prefix_list_entry *pentry)
+{
+  size_t depth = plist->master->trie_depth;
+  uint8_t *bytes = &pentry->prefix.u.prefix;
+  size_t validbits = pentry->prefix.prefixlen;
+  struct pltrie_table *table;
+
+  table = plist->trie;
+  while (validbits > PLC_BITS && depth > 1)
+    {
+      if (!table->entries[*bytes].next_table)
+        table->entries[*bytes].next_table = XCALLOC (MTYPE_PREFIX_LIST_TRIE,
+                sizeof(struct pltrie_table));
+      table = table->entries[*bytes].next_table;
+      bytes++;
+      depth--;
+      validbits -= PLC_BITS;
+    }
+
+  trie_walk_affected (validbits, table, *bytes, pentry, trie_install_fn);
+}
+
 static void
 prefix_list_entry_add (struct prefix_list *plist,
 		       struct prefix_list_entry *pentry)
@@ -512,6 +669,8 @@ prefix_list_entry_add (struct prefix_list *plist,
       plist->tail = pentry;
     }
 
+  prefix_list_trie_add (plist, pentry);
+
   /* Increment count. */
   plist->count++;
 
@@ -566,7 +725,7 @@ prefix_list_entry_match (struct prefix_list_entry *pentry, struct prefix *p)
 }
 
 enum prefix_list_type
-prefix_list_apply (struct prefix_list *plist, void *object)
+prefix_list_apply_old (struct prefix_list *plist, void *object)
 {
   struct prefix_list_entry *pentry;
   struct prefix *p;
@@ -592,6 +751,64 @@ prefix_list_apply (struct prefix_list *plist, void *object)
   return PREFIX_DENY;
 }
 
+enum prefix_list_type
+prefix_list_apply (struct prefix_list *plist, void *object)
+{
+  struct prefix_list_entry *pentry, *pbest = NULL;
+
+  struct prefix *p = (struct prefix *) object;
+  uint8_t *byte = &p->u.prefix;
+  size_t depth = plist->master->trie_depth;
+  size_t validbits = p->prefixlen;
+  struct pltrie_table *table;
+
+  if (plist == NULL)
+    return PREFIX_DENY;
+
+  if (plist->count == 0)
+    return PREFIX_PERMIT;
+
+  table = plist->trie;
+  while (1)
+    {
+      for (pentry = table->entries[*byte].up_chain; pentry; pentry = pentry->next_best)
+        {
+          if (pbest && pbest->seq < pentry->seq)
+            continue;
+          if (prefix_list_entry_match (pentry, p))
+            pbest = pentry;
+        }
+
+      if (validbits <= PLC_BITS)
+        break;
+      validbits -= PLC_BITS;
+
+      if (--depth)
+        {
+          if (!table->entries[*byte].next_table)
+            break;
+
+          table = table->entries[*byte].next_table;
+          byte++;
+          continue;
+        }
+
+      for (pentry = table->entries[*byte].final_chain; pentry; pentry = pentry->next_best)
+        {
+          if (pbest && pbest->seq < pentry->seq)
+            continue;
+          if (prefix_list_entry_match (pentry, p))
+            pbest = pentry;
+        }
+      break;
+    }
+
+  if (pbest == NULL)
+    return PREFIX_DENY;
+
+  return pbest->type;
+}
+
 static void __attribute__ ((unused))
 prefix_list_print (struct prefix_list *plist)
 {
diff --git a/lib/plist_int.h b/lib/plist_int.h
index 6459579..e6e5901 100644
--- a/lib/plist_int.h
+++ b/lib/plist_int.h
@@ -29,6 +29,8 @@ enum prefix_name_type
   PREFIX_TYPE_NUMBER
 };
 
+struct pltrie_table;
+
 struct prefix_list
 {
   char *name;
@@ -44,6 +46,8 @@ struct prefix_list
   struct prefix_list_entry *head;
   struct prefix_list_entry *tail;
 
+  struct pltrie_table *trie;
+
   struct prefix_list *next;
   struct prefix_list *prev;
 };
@@ -66,6 +70,9 @@ struct prefix_list_entry
 
   struct prefix_list_entry *next;
   struct prefix_list_entry *prev;
+
+  /* up the chain for best match search */
+  struct prefix_list_entry *next_best;
 };
 
 #endif /* _QUAGGA_PLIST_INT_H */
-- 
2.0.4





More information about the Quagga-dev mailing list