Main Page | Class List | File List | Class Members | File Members

inetpeer.c

Go to the documentation of this file.
00001 /* 00002 * INETPEER - A storage for permanent information about peers 00003 * 00004 * This source is covered by the GNU GPL, the same as all kernel sources. 00005 * 00006 * Version: $Id: inetpeer.c,v 1.7 2001/09/20 21:22:50 davem Exp $ 00007 * 00008 * Authors: Andrey V. Savochkin <saw@msu.ru> 00009 */ 00010 00011 #include <linux/types.h> 00012 #include <linux/slab.h> 00013 #include <linux/interrupt.h> 00014 #include <linux/spinlock.h> 00015 #include <linux/random.h> 00016 #include <linux/sched.h> 00017 #include <linux/timer.h> 00018 #include <linux/time.h> 00019 #include <linux/kernel.h> 00020 #include <linux/mm.h> 00021 #include <net/inetpeer.h> 00022 00023 /* 00024 * Theory of operations. 00025 * We keep one entry for each peer IP address. The nodes contains long-living 00026 * information about the peer which doesn't depend on routes. 00027 * At this moment this information consists only of ID field for the next 00028 * outgoing IP packet. This field is incremented with each packet as encoded 00029 * in inet_getid() function (include/net/inetpeer.h). 00030 * At the moment of writing this notes identifier of IP packets is generated 00031 * to be unpredictable using this code only for packets subjected 00032 * (actually or potentially) to defragmentation. I.e. DF packets less than 00033 * PMTU in size uses a constant ID and do not use this code (see 00034 * ip_select_ident() in include/net/ip.h). 00035 * 00036 * Route cache entries hold references to our nodes. 00037 * New cache entries get references via lookup by destination IP address in 00038 * the avl tree. The reference is grabbed only when it's needed i.e. only 00039 * when we try to output IP packet which needs an unpredictable ID (see 00040 * __ip_select_ident() in net/ipv4/route.c). 00041 * Nodes are removed only when reference counter goes to 0. 00042 * When it's happened the node may be removed when a sufficient amount of 00043 * time has been passed since its last use. The less-recently-used entry can 00044 * also be removed if the pool is overloaded i.e. if the total amount of 00045 * entries is greater-or-equal than the threshold. 00046 * 00047 * Node pool is organised as an AVL tree. 00048 * Such an implementation has been chosen not just for fun. It's a way to 00049 * prevent easy and efficient DoS attacks by creating hash collisions. A huge 00050 * amount of long living nodes in a single hash slot would significantly delay 00051 * lookups performed with disabled BHs. 00052 * 00053 * Serialisation issues. 00054 * 1. Nodes may appear in the tree only with the pool write lock held. 00055 * 2. Nodes may disappear from the tree only with the pool write lock held 00056 * AND reference count being 0. 00057 * 3. Nodes appears and disappears from unused node list only under 00058 * "inet_peer_unused_lock". 00059 * 4. Global variable peer_total is modified under the pool lock. 00060 * 5. struct inet_peer fields modification: 00061 * avl_left, avl_right, avl_parent, avl_height: pool lock 00062 * unused_next, unused_prevp: unused node list lock 00063 * refcnt: atomically against modifications on other CPU; 00064 * usually under some other lock to prevent node disappearing 00065 * dtime: unused node list lock 00066 * v4daddr: unchangeable 00067 * ip_id_count: idlock 00068 */ 00069 00070 /* Exported for inet_getid inline function. */ 00071 spinlock_t inet_peer_idlock = SPIN_LOCK_UNLOCKED; 00072 00073 static kmem_cache_t *peer_cachep; 00074 00075 #define node_height(x) x->avl_height 00076 static struct inet_peer peer_fake_node = { 00077 avl_left : &peer_fake_node, 00078 avl_right : &peer_fake_node, 00079 avl_height : 0 00080 }; 00081 #define peer_avl_empty (&peer_fake_node) 00082 static struct inet_peer *peer_root = peer_avl_empty; 00083 static rwlock_t peer_pool_lock = RW_LOCK_UNLOCKED; 00084 #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ 00085 00086 static volatile int peer_total; 00087 /* Exported for sysctl_net_ipv4. */ 00088 int inet_peer_threshold = 65536 + 128; /* start to throw entries more 00089 * aggressively at this stage */ 00090 int inet_peer_minttl = 120 * HZ; /* TTL under high load: 120 sec */ 00091 int inet_peer_maxttl = 10 * 60 * HZ; /* usual time to live: 10 min */ 00092 00093 /* Exported for inet_putpeer inline function. */ 00094 struct inet_peer *inet_peer_unused_head, 00095 **inet_peer_unused_tailp = &inet_peer_unused_head; 00096 spinlock_t inet_peer_unused_lock = SPIN_LOCK_UNLOCKED; 00097 #define PEER_MAX_CLEANUP_WORK 30 00098 00099 static void peer_check_expire(unsigned long dummy); 00100 static struct timer_list peer_periodic_timer = 00101 { { NULL, NULL }, 0, 0, &peer_check_expire }; 00102 00103 /* Exported for sysctl_net_ipv4. */ 00104 int inet_peer_gc_mintime = 10 * HZ, 00105 inet_peer_gc_maxtime = 120 * HZ; 00106 00107 /* Called from ip_output.c:ip_init */ 00108 void __init inet_initpeers(void) 00109 { 00110 struct sysinfo si; 00111 00112 /* Use the straight interface to information about memory. */ 00113 si_meminfo(&si); 00114 /* The values below were suggested by Alexey Kuznetsov 00115 * <kuznet@ms2.inr.ac.ru>. I don't have any opinion about the values 00116 * myself. --SAW 00117 */ 00118 if (si.totalram <= (32768*1024)/PAGE_SIZE) 00119 inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */ 00120 if (si.totalram <= (16384*1024)/PAGE_SIZE) 00121 inet_peer_threshold >>= 1; /* about 512KB */ 00122 if (si.totalram <= (8192*1024)/PAGE_SIZE) 00123 inet_peer_threshold >>= 2; /* about 128KB */ 00124 00125 peer_cachep = kmem_cache_create("inet_peer_cache", 00126 sizeof(struct inet_peer), 00127 0, SLAB_HWCACHE_ALIGN, 00128 NULL, NULL); 00129 00130 /* All the timers, started at system startup tend 00131 to synchronize. Perturb it a bit. 00132 */ 00133 peer_periodic_timer.expires = jiffies 00134 + net_random() % inet_peer_gc_maxtime 00135 + inet_peer_gc_maxtime; 00136 add_timer(&peer_periodic_timer); 00137 } 00138 00139 /* Called with or without local BH being disabled. */ 00140 static void unlink_from_unused(struct inet_peer *p) 00141 { 00142 spin_lock_bh(&inet_peer_unused_lock); 00143 if (p->unused_prevp != NULL) { 00144 /* On unused list. */ 00145 *p->unused_prevp = p->unused_next; 00146 if (p->unused_next != NULL) 00147 p->unused_next->unused_prevp = p->unused_prevp; 00148 else 00149 inet_peer_unused_tailp = p->unused_prevp; 00150 p->unused_prevp = NULL; /* mark it as removed */ 00151 } 00152 spin_unlock_bh(&inet_peer_unused_lock); 00153 } 00154 00155 /* Called with local BH disabled and the pool lock held. */ 00156 #define lookup(daddr) \ 00157 ({ \ 00158 struct inet_peer *u, **v; \ 00159 stackptr = stack; \ 00160 *stackptr++ = &peer_root; \ 00161 for (u = peer_root; u != peer_avl_empty; ) { \ 00162 if (daddr == u->v4daddr) \ 00163 break; \ 00164 if (daddr < u->v4daddr) \ 00165 v = &u->avl_left; \ 00166 else \ 00167 v = &u->avl_right; \ 00168 *stackptr++ = v; \ 00169 u = *v; \ 00170 } \ 00171 u; \ 00172 }) 00173 00174 /* Called with local BH disabled and the pool write lock held. */ 00175 #define lookup_rightempty(start) \ 00176 ({ \ 00177 struct inet_peer *u, **v; \ 00178 *stackptr++ = &start->avl_left; \ 00179 v = &start->avl_left; \ 00180 for (u = *v; u->avl_right != peer_avl_empty; ) { \ 00181 v = &u->avl_right; \ 00182 *stackptr++ = v; \ 00183 u = *v; \ 00184 } \ 00185 u; \ 00186 }) 00187 00188 /* Called with local BH disabled and the pool write lock held. 00189 * Variable names are the proof of operation correctness. 00190 * Look into mm/map_avl.c for more detail description of the ideas. */ 00191 static void peer_avl_rebalance(struct inet_peer **stack[], 00192 struct inet_peer ***stackend) 00193 { 00194 struct inet_peer **nodep, *node, *l, *r; 00195 int lh, rh; 00196 00197 while (stackend > stack) { 00198 nodep = *--stackend; 00199 node = *nodep; 00200 l = node->avl_left; 00201 r = node->avl_right; 00202 lh = node_height(l); 00203 rh = node_height(r); 00204 if (lh > rh + 1) { /* l: RH+2 */ 00205 struct inet_peer *ll, *lr, *lrl, *lrr; 00206 int lrh; 00207 ll = l->avl_left; 00208 lr = l->avl_right; 00209 lrh = node_height(lr); 00210 if (lrh <= node_height(ll)) { /* ll: RH+1 */ 00211 node->avl_left = lr; /* lr: RH or RH+1 */ 00212 node->avl_right = r; /* r: RH */ 00213 node->avl_height = lrh + 1; /* RH+1 or RH+2 */ 00214 l->avl_left = ll; /* ll: RH+1 */ 00215 l->avl_right = node; /* node: RH+1 or RH+2 */ 00216 l->avl_height = node->avl_height + 1; 00217 *nodep = l; 00218 } else { /* ll: RH, lr: RH+1 */ 00219 lrl = lr->avl_left; /* lrl: RH or RH-1 */ 00220 lrr = lr->avl_right; /* lrr: RH or RH-1 */ 00221 node->avl_left = lrr; /* lrr: RH or RH-1 */ 00222 node->avl_right = r; /* r: RH */ 00223 node->avl_height = rh + 1; /* node: RH+1 */ 00224 l->avl_left = ll; /* ll: RH */ 00225 l->avl_right = lrl; /* lrl: RH or RH-1 */ 00226 l->avl_height = rh + 1; /* l: RH+1 */ 00227 lr->avl_left = l; /* l: RH+1 */ 00228 lr->avl_right = node; /* node: RH+1 */ 00229 lr->avl_height = rh + 2; 00230 *nodep = lr; 00231 } 00232 } else if (rh > lh + 1) { /* r: LH+2 */ 00233 struct inet_peer *rr, *rl, *rlr, *rll; 00234 int rlh; 00235 rr = r->avl_right; 00236 rl = r->avl_left; 00237 rlh = node_height(rl); 00238 if (rlh <= node_height(rr)) { /* rr: LH+1 */ 00239 node->avl_right = rl; /* rl: LH or LH+1 */ 00240 node->avl_left = l; /* l: LH */ 00241 node->avl_height = rlh + 1; /* LH+1 or LH+2 */ 00242 r->avl_right = rr; /* rr: LH+1 */ 00243 r->avl_left = node; /* node: LH+1 or LH+2 */ 00244 r->avl_height = node->avl_height + 1; 00245 *nodep = r; 00246 } else { /* rr: RH, rl: RH+1 */ 00247 rlr = rl->avl_right; /* rlr: LH or LH-1 */ 00248 rll = rl->avl_left; /* rll: LH or LH-1 */ 00249 node->avl_right = rll; /* rll: LH or LH-1 */ 00250 node->avl_left = l; /* l: LH */ 00251 node->avl_height = lh + 1; /* node: LH+1 */ 00252 r->avl_right = rr; /* rr: LH */ 00253 r->avl_left = rlr; /* rlr: LH or LH-1 */ 00254 r->avl_height = lh + 1; /* r: LH+1 */ 00255 rl->avl_right = r; /* r: LH+1 */ 00256 rl->avl_left = node; /* node: LH+1 */ 00257 rl->avl_height = lh + 2; 00258 *nodep = rl; 00259 } 00260 } else { 00261 node->avl_height = (lh > rh ? lh : rh) + 1; 00262 } 00263 } 00264 } 00265 00266 /* Called with local BH disabled and the pool write lock held. */ 00267 #define link_to_pool(n) \ 00268 do { \ 00269 n->avl_height = 1; \ 00270 n->avl_left = peer_avl_empty; \ 00271 n->avl_right = peer_avl_empty; \ 00272 **--stackptr = n; \ 00273 peer_avl_rebalance(stack, stackptr); \ 00274 } while(0) 00275 00276 /* May be called with local BH enabled. */ 00277 static void unlink_from_pool(struct inet_peer *p) 00278 { 00279 int do_free; 00280 00281 do_free = 0; 00282 00283 write_lock_bh(&peer_pool_lock); 00284 /* Check the reference counter. It was artificially incremented by 1 00285 * in cleanup() function to prevent sudden disappearing. If the 00286 * reference count is still 1 then the node is referenced only as `p' 00287 * here and from the pool. So under the exclusive pool lock it's safe 00288 * to remove the node and free it later. */ 00289 if (atomic_read(&p->refcnt) == 1) { 00290 struct inet_peer **stack[PEER_MAXDEPTH]; 00291 struct inet_peer ***stackptr, ***delp; 00292 if (lookup(p->v4daddr) != p) 00293 BUG(); 00294 delp = stackptr - 1; /* *delp[0] == p */ 00295 if (p->avl_left == peer_avl_empty) { 00296 *delp[0] = p->avl_right; 00297 --stackptr; 00298 } else { 00299 /* look for a node to insert instead of p */ 00300 struct inet_peer *t; 00301 t = lookup_rightempty(p); 00302 if (*stackptr[-1] != t) 00303 BUG(); 00304 **--stackptr = t->avl_left; 00305 /* t is removed, t->v4daddr > x->v4daddr for any 00306 * x in p->avl_left subtree. 00307 * Put t in the old place of p. */ 00308 *delp[0] = t; 00309 t->avl_left = p->avl_left; 00310 t->avl_right = p->avl_right; 00311 t->avl_height = p->avl_height; 00312 if (delp[1] != &p->avl_left) 00313 BUG(); 00314 delp[1] = &t->avl_left; /* was &p->avl_left */ 00315 } 00316 peer_avl_rebalance(stack, stackptr); 00317 peer_total--; 00318 do_free = 1; 00319 } 00320 write_unlock_bh(&peer_pool_lock); 00321 00322 if (do_free) 00323 kmem_cache_free(peer_cachep, p); 00324 else 00325 /* The node is used again. Decrease the reference counter 00326 * back. The loop "cleanup -> unlink_from_unused 00327 * -> unlink_from_pool -> putpeer -> link_to_unused 00328 * -> cleanup (for the same node)" 00329 * doesn't really exist because the entry will have a 00330 * recent deletion time and will not be cleaned again soon. */ 00331 inet_putpeer(p); 00332 } 00333 00334 /* May be called with local BH enabled. */ 00335 static int cleanup_once(unsigned long ttl) 00336 { 00337 struct inet_peer *p; 00338 00339 /* Remove the first entry from the list of unused nodes. */ 00340 spin_lock_bh(&inet_peer_unused_lock); 00341 p = inet_peer_unused_head; 00342 if (p != NULL) { 00343 if (time_after(p->dtime + ttl, jiffies)) { 00344 /* Do not prune fresh entries. */ 00345 spin_unlock_bh(&inet_peer_unused_lock); 00346 return -1; 00347 } 00348 inet_peer_unused_head = p->unused_next; 00349 if (p->unused_next != NULL) 00350 p->unused_next->unused_prevp = p->unused_prevp; 00351 else 00352 inet_peer_unused_tailp = p->unused_prevp; 00353 p->unused_prevp = NULL; /* mark as not on the list */ 00354 /* Grab an extra reference to prevent node disappearing 00355 * before unlink_from_pool() call. */ 00356 atomic_inc(&p->refcnt); 00357 } 00358 spin_unlock_bh(&inet_peer_unused_lock); 00359 00360 if (p == NULL) 00361 /* It means that the total number of USED entries has 00362 * grown over inet_peer_threshold. It shouldn't really 00363 * happen because of entry limits in route cache. */ 00364 return -1; 00365 00366 unlink_from_pool(p); 00367 return 0; 00368 } 00369 00370 /* Called with or without local BH being disabled. */ 00371 struct inet_peer *inet_getpeer(__u32 daddr, int create) 00372 { 00373 struct inet_peer *p, *n; 00374 struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr; 00375 00376 /* Look up for the address quickly. */ 00377 read_lock_bh(&peer_pool_lock); 00378 p = lookup(daddr); 00379 if (p != peer_avl_empty) 00380 atomic_inc(&p->refcnt); 00381 read_unlock_bh(&peer_pool_lock); 00382 00383 if (p != peer_avl_empty) { 00384 /* The existing node has been found. */ 00385 /* Remove the entry from unused list if it was there. */ 00386 unlink_from_unused(p); 00387 return p; 00388 } 00389 00390 if (!create) 00391 return NULL; 00392 00393 /* Allocate the space outside the locked region. */ 00394 n = kmem_cache_alloc(peer_cachep, GFP_ATOMIC); 00395 if (n == NULL) 00396 return NULL; 00397 n->v4daddr = daddr; 00398 atomic_set(&n->refcnt, 1); 00399 n->ip_id_count = secure_ip_id(daddr); 00400 n->tcp_ts_stamp = 0; 00401 00402 write_lock_bh(&peer_pool_lock); 00403 /* Check if an entry has suddenly appeared. */ 00404 p = lookup(daddr); 00405 if (p != peer_avl_empty) 00406 goto out_free; 00407 00408 /* Link the node. */ 00409 link_to_pool(n); 00410 n->unused_prevp = NULL; /* not on the list */ 00411 peer_total++; 00412 write_unlock_bh(&peer_pool_lock); 00413 00414 if (peer_total >= inet_peer_threshold) 00415 /* Remove one less-recently-used entry. */ 00416 cleanup_once(0); 00417 00418 return n; 00419 00420 out_free: 00421 /* The appropriate node is already in the pool. */ 00422 atomic_inc(&p->refcnt); 00423 write_unlock_bh(&peer_pool_lock); 00424 /* Remove the entry from unused list if it was there. */ 00425 unlink_from_unused(p); 00426 /* Free preallocated the preallocated node. */ 00427 kmem_cache_free(peer_cachep, n); 00428 return p; 00429 } 00430 00431 /* Called with local BH disabled. */ 00432 static void peer_check_expire(unsigned long dummy) 00433 { 00434 int i; 00435 int ttl; 00436 00437 if (peer_total >= inet_peer_threshold) 00438 ttl = inet_peer_minttl; 00439 else 00440 ttl = inet_peer_maxttl 00441 - (inet_peer_maxttl - inet_peer_minttl) / HZ * 00442 peer_total / inet_peer_threshold * HZ; 00443 for (i = 0; i < PEER_MAX_CLEANUP_WORK && !cleanup_once(ttl); i++); 00444 00445 /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime 00446 * interval depending on the total number of entries (more entries, 00447 * less interval). */ 00448 peer_periodic_timer.expires = jiffies 00449 + inet_peer_gc_maxtime 00450 - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * 00451 peer_total / inet_peer_threshold * HZ; 00452 add_timer(&peer_periodic_timer); 00453 }

Generated on Wed Dec 1 21:25:30 2004 for Linux 2.4.23 Networking by doxygen 1.3.8