Main Page | Class List | File List | Class Members | File Members

route.c

Go to the documentation of this file.
00001 /* 00002 * INET An implementation of the TCP/IP protocol suite for the LINUX 00003 * operating system. INET is implemented using the BSD Socket 00004 * interface as the means of communication with the user level. 00005 * 00006 * ROUTE - implementation of the IP router. 00007 * 00008 * Version: $Id: route.c,v 1.102.2.1 2002/01/12 07:43:57 davem Exp $ 00009 * 00010 * Authors: Ross Biro, <bir7@leland.Stanford.Edu> 00011 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 00012 * Alan Cox, <gw4pts@gw4pts.ampr.org> 00013 * Linus Torvalds, <Linus.Torvalds@helsinki.fi> 00014 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 00015 * 00016 * Fixes: 00017 * Alan Cox : Verify area fixes. 00018 * Alan Cox : cli() protects routing changes 00019 * Rui Oliveira : ICMP routing table updates 00020 * (rco@di.uminho.pt) Routing table insertion and update 00021 * Linus Torvalds : Rewrote bits to be sensible 00022 * Alan Cox : Added BSD route gw semantics 00023 * Alan Cox : Super /proc >4K 00024 * Alan Cox : MTU in route table 00025 * Alan Cox : MSS actually. Also added the window 00026 * clamper. 00027 * Sam Lantinga : Fixed route matching in rt_del() 00028 * Alan Cox : Routing cache support. 00029 * Alan Cox : Removed compatibility cruft. 00030 * Alan Cox : RTF_REJECT support. 00031 * Alan Cox : TCP irtt support. 00032 * Jonathan Naylor : Added Metric support. 00033 * Miquel van Smoorenburg : BSD API fixes. 00034 * Miquel van Smoorenburg : Metrics. 00035 * Alan Cox : Use __u32 properly 00036 * Alan Cox : Aligned routing errors more closely with BSD 00037 * our system is still very different. 00038 * Alan Cox : Faster /proc handling 00039 * Alexey Kuznetsov : Massive rework to support tree based routing, 00040 * routing caches and better behaviour. 00041 * 00042 * Olaf Erb : irtt wasn't being copied right. 00043 * Bjorn Ekwall : Kerneld route support. 
00044 * Alan Cox : Multicast fixed (I hope) 00045 * Pavel Krauz : Limited broadcast fixed 00046 * Mike McLagan : Routing by source 00047 * Alexey Kuznetsov : End of old history. Splitted to fib.c and 00048 * route.c and rewritten from scratch. 00049 * Andi Kleen : Load-limit warning messages. 00050 * Vitaly E. Lavrov : Transparent proxy revived after year coma. 00051 * Vitaly E. Lavrov : Race condition in ip_route_input_slow. 00052 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow. 00053 * Vladimir V. Ivanov : IP rule info (flowid) is really useful. 00054 * Marc Boucher : routing by fwmark 00055 * Robert Olsson : Added rt_cache statistics 00056 * 00057 * This program is free software; you can redistribute it and/or 00058 * modify it under the terms of the GNU General Public License 00059 * as published by the Free Software Foundation; either version 00060 * 2 of the License, or (at your option) any later version. 00061 */ 00062 00063 #include <linux/config.h> 00064 #include <asm/uaccess.h> 00065 #include <asm/system.h> 00066 #include <asm/bitops.h> 00067 #include <linux/types.h> 00068 #include <linux/kernel.h> 00069 #include <linux/sched.h> 00070 #include <linux/mm.h> 00071 #include <linux/string.h> 00072 #include <linux/socket.h> 00073 #include <linux/sockios.h> 00074 #include <linux/errno.h> 00075 #include <linux/in.h> 00076 #include <linux/inet.h> 00077 #include <linux/netdevice.h> 00078 #include <linux/proc_fs.h> 00079 #include <linux/init.h> 00080 #include <linux/skbuff.h> 00081 #include <linux/rtnetlink.h> 00082 #include <linux/inetdevice.h> 00083 #include <linux/igmp.h> 00084 #include <linux/pkt_sched.h> 00085 #include <linux/mroute.h> 00086 #include <linux/netfilter_ipv4.h> 00087 #include <linux/random.h> 00088 #include <linux/jhash.h> 00089 #include <net/protocol.h> 00090 #include <net/ip.h> 00091 #include <net/route.h> 00092 #include <net/inetpeer.h> 00093 #include <net/sock.h> 00094 #include <net/ip_fib.h> 00095 #include <net/arp.h> 
#include <net/tcp.h>
#include <net/icmp.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

/* Not referenced in the part of the file documented here. */
#define IP_MAX_MTU	0xFFF0

/*
 * Initial value of ip_rt_gc_timeout and of the adaptive "expire"
 * threshold kept by rt_garbage_collect(); expressed in jiffies.
 */
#define RT_GC_TIMEOUT (300*HZ)

/*
 * Run-time tunables of the route cache.  All time values are in jiffies.
 */

/* Delay substituted by rt_cache_flush() when the caller passes delay < 0. */
int ip_rt_min_delay		= 2 * HZ;
/* rt_cache_flush() sets the flush deadline to "now + ip_rt_max_delay". */
int ip_rt_max_delay		= 10 * HZ;
/* Entry count at or above which rt_garbage_collect() reports overflow. */
int ip_rt_max_size;
/* Age limit used by rt_check_expire() and upper bound of the GC "expire". */
int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
/* Period with which rt_check_expire() re-arms rt_periodic_timer. */
int ip_rt_gc_interval		= 60 * HZ;
/* Minimum spacing between two effective rt_garbage_collect() runs. */
int ip_rt_gc_min_interval	= HZ / 2;
/* ip_rt_send_redirect() stops sending after this many redirects. */
int ip_rt_redirect_number	= 9;
/* Base spacing between redirects; shifted left by the number already sent. */
int ip_rt_redirect_load		= HZ / 50;
/* Idle time after which ip_rt_send_redirect() resets its redirect counter. */
int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
/* Tokens consumed by ip_error() for every ICMP error it sends. */
int ip_rt_error_cost		= HZ;
/* Upper bound on the tokens ip_error() lets a route accumulate. */
int ip_rt_error_burst		= 5 * HZ;
/* Chain-length / table-load factor used by the GC and by rt_intern_hash(). */
int ip_rt_gc_elasticity		= 8;
/* Lifetime given to a learned path MTU via dst_set_expires(). */
int ip_rt_mtu_expires		= 10 * 60 * HZ;
/* Learned path MTUs below this are raised to it and the MTU is locked. */
int ip_rt_min_pmtu		= 512 + 20 + 20;
/* Not referenced in the part of the file documented here. */
int ip_rt_min_advmss		= 256;
/* Period with which rt_secret_rebuild() re-arms rt_secret_timer. */
int ip_rt_secret_interval	= 10 * 60 * HZ;
/*
 * Deadline armed by rt_cache_flush() for a pending delayed flush;
 * zero means no deadline is set (rt_run_flush() clears it).
 */
static unsigned long rt_deadline;

/* Debug print helper: forwards its arguments to printk() at KERN_DEBUG. */
#define RTprint(a...)	printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;	/* delayed cache flush */
static struct timer_list rt_periodic_timer;	/* periodic expiry scan */
static struct timer_list rt_secret_timer;	/* periodic rt_secret_rebuild() */

/*
 *	Interface to generic destination cache.
00132 */ 00133 00134 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); 00135 static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst, 00136 struct sk_buff *skb); 00137 static void ipv4_dst_destroy(struct dst_entry *dst); 00138 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); 00139 static void ipv4_link_failure(struct sk_buff *skb); 00140 static int rt_garbage_collect(void); 00141 00142 00143 struct dst_ops ipv4_dst_ops = { 00144 family: AF_INET, 00145 protocol: __constant_htons(ETH_P_IP), 00146 gc: rt_garbage_collect, 00147 check: ipv4_dst_check, 00148 reroute: ipv4_dst_reroute, 00149 destroy: ipv4_dst_destroy, 00150 negative_advice: ipv4_negative_advice, 00151 link_failure: ipv4_link_failure, 00152 entry_size: sizeof(struct rtable), 00153 }; 00154 00155 #define ECN_OR_COST(class) TC_PRIO_##class 00156 00157 __u8 ip_tos2prio[16] = { 00158 TC_PRIO_BESTEFFORT, 00159 ECN_OR_COST(FILLER), 00160 TC_PRIO_BESTEFFORT, 00161 ECN_OR_COST(BESTEFFORT), 00162 TC_PRIO_BULK, 00163 ECN_OR_COST(BULK), 00164 TC_PRIO_BULK, 00165 ECN_OR_COST(BULK), 00166 TC_PRIO_INTERACTIVE, 00167 ECN_OR_COST(INTERACTIVE), 00168 TC_PRIO_INTERACTIVE, 00169 ECN_OR_COST(INTERACTIVE), 00170 TC_PRIO_INTERACTIVE_BULK, 00171 ECN_OR_COST(INTERACTIVE_BULK), 00172 TC_PRIO_INTERACTIVE_BULK, 00173 ECN_OR_COST(INTERACTIVE_BULK) 00174 }; 00175 00176 00177 /* 00178 * Route cache. 00179 */ 00180 00181 /* The locking scheme is rather straight forward: 00182 * 00183 * 1) A BH protected rwlocks protect buckets of the central route hash. 00184 * 2) Only writers remove entries, and they hold the lock 00185 * as they look at rtable reference counts. 00186 * 3) Only readers acquire references to rtable entries, 00187 * they do so with atomic increments and with the 00188 * lock held. 
00189 */ 00190 00191 struct rt_hash_bucket { 00192 struct rtable *chain; 00193 rwlock_t lock; 00194 } __attribute__((__aligned__(8))); 00195 00196 static struct rt_hash_bucket *rt_hash_table; 00197 static unsigned rt_hash_mask; 00198 static int rt_hash_log; 00199 static unsigned int rt_hash_rnd; 00200 00201 struct rt_cache_stat rt_cache_stat[NR_CPUS]; 00202 00203 static int rt_intern_hash(unsigned hash, struct rtable *rth, 00204 struct rtable **res); 00205 00206 static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos) 00207 { 00208 return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd) 00209 & rt_hash_mask); 00210 } 00211 00212 static int rt_cache_get_info(char *buffer, char **start, off_t offset, 00213 int length) 00214 { 00215 int len = 0; 00216 off_t pos = 128; 00217 char temp[256]; 00218 struct rtable *r; 00219 int i; 00220 00221 if (offset < 128) { 00222 sprintf(buffer, "%-127s\n", 00223 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" 00224 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" 00225 "HHUptod\tSpecDst"); 00226 len = 128; 00227 } 00228 00229 for (i = rt_hash_mask; i >= 0; i--) { 00230 read_lock_bh(&rt_hash_table[i].lock); 00231 for (r = rt_hash_table[i].chain; r; r = r->u.rt_next) { 00232 /* 00233 * Spin through entries until we are ready 00234 */ 00235 pos += 128; 00236 00237 if (pos <= offset) { 00238 len = 0; 00239 continue; 00240 } 00241 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t" 00242 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X", 00243 r->u.dst.dev ? r->u.dst.dev->name : "*", 00244 (unsigned long)r->rt_dst, 00245 (unsigned long)r->rt_gateway, 00246 r->rt_flags, 00247 atomic_read(&r->u.dst.__refcnt), 00248 r->u.dst.__use, 00249 0, 00250 (unsigned long)r->rt_src, 00251 (r->u.dst.advmss ? 00252 (int) r->u.dst.advmss + 40 : 0), 00253 r->u.dst.window, 00254 (int)((r->u.dst.rtt >> 3) + r->u.dst.rttvar), 00255 r->key.tos, 00256 r->u.dst.hh ? 00257 atomic_read(&r->u.dst.hh->hh_refcnt) : 00258 -1, 00259 r->u.dst.hh ? 
00260 (r->u.dst.hh->hh_output == 00261 dev_queue_xmit) : 0, 00262 r->rt_spec_dst); 00263 sprintf(buffer + len, "%-127s\n", temp); 00264 len += 128; 00265 if (pos >= offset+length) { 00266 read_unlock_bh(&rt_hash_table[i].lock); 00267 goto done; 00268 } 00269 } 00270 read_unlock_bh(&rt_hash_table[i].lock); 00271 } 00272 00273 done: 00274 *start = buffer + len - (pos - offset); 00275 len = pos - offset; 00276 if (len > length) 00277 len = length; 00278 return len; 00279 } 00280 00281 static int rt_cache_stat_get_info(char *buffer, char **start, off_t offset, int length) 00282 { 00283 unsigned int dst_entries = atomic_read(&ipv4_dst_ops.entries); 00284 int i, lcpu; 00285 int len = 0; 00286 00287 for (lcpu = 0; lcpu < smp_num_cpus; lcpu++) { 00288 i = cpu_logical_map(lcpu); 00289 00290 len += sprintf(buffer+len, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", 00291 dst_entries, 00292 rt_cache_stat[i].in_hit, 00293 rt_cache_stat[i].in_slow_tot, 00294 rt_cache_stat[i].in_slow_mc, 00295 rt_cache_stat[i].in_no_route, 00296 rt_cache_stat[i].in_brd, 00297 rt_cache_stat[i].in_martian_dst, 00298 rt_cache_stat[i].in_martian_src, 00299 00300 rt_cache_stat[i].out_hit, 00301 rt_cache_stat[i].out_slow_tot, 00302 rt_cache_stat[i].out_slow_mc, 00303 00304 rt_cache_stat[i].gc_total, 00305 rt_cache_stat[i].gc_ignored, 00306 rt_cache_stat[i].gc_goal_miss, 00307 rt_cache_stat[i].gc_dst_overflow, 00308 rt_cache_stat[i].in_hlist_search, 00309 rt_cache_stat[i].out_hlist_search 00310 00311 ); 00312 } 00313 len -= offset; 00314 00315 if (len > length) 00316 len = length; 00317 if (len < 0) 00318 len = 0; 00319 00320 *start = buffer + offset; 00321 return len; 00322 } 00323 00324 static __inline__ void rt_free(struct rtable *rt) 00325 { 00326 dst_free(&rt->u.dst); 00327 } 00328 00329 static __inline__ void rt_drop(struct rtable *rt) 00330 { 00331 ip_rt_put(rt); 00332 dst_free(&rt->u.dst); 00333 } 00334 00335 static __inline__ int rt_fast_clean(struct 
rtable *rth) 00336 { 00337 /* Kill broadcast/multicast entries very aggresively, if they 00338 collide in hash table with more useful entries */ 00339 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && 00340 rth->key.iif && rth->u.rt_next; 00341 } 00342 00343 static __inline__ int rt_valuable(struct rtable *rth) 00344 { 00345 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || 00346 rth->u.dst.expires; 00347 } 00348 00349 static __inline__ int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) 00350 { 00351 unsigned long age; 00352 int ret = 0; 00353 00354 if (atomic_read(&rth->u.dst.__refcnt)) 00355 goto out; 00356 00357 ret = 1; 00358 if (rth->u.dst.expires && 00359 time_after_eq(jiffies, rth->u.dst.expires)) 00360 goto out; 00361 00362 age = jiffies - rth->u.dst.lastuse; 00363 ret = 0; 00364 if ((age <= tmo1 && !rt_fast_clean(rth)) || 00365 (age <= tmo2 && rt_valuable(rth))) 00366 goto out; 00367 ret = 1; 00368 out: return ret; 00369 } 00370 00371 /* Bits of score are: 00372 * 31: very valuable 00373 * 30: not quite useless 00374 * 29..0: usage counter 00375 */ 00376 static inline u32 rt_score(struct rtable *rt) 00377 { 00378 u32 score = jiffies - rt->u.dst.lastuse; 00379 00380 score = ~score & ~(3<<30); 00381 00382 if (rt_valuable(rt)) 00383 score |= (1<<31); 00384 00385 if (!rt->key.iif || 00386 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) 00387 score |= (1<<30); 00388 00389 return score; 00390 } 00391 00392 /* This runs via a timer and thus is always in BH context. 
*/ 00393 static void SMP_TIMER_NAME(rt_check_expire)(unsigned long dummy) 00394 { 00395 static int rover; 00396 int i = rover, t; 00397 struct rtable *rth, **rthp; 00398 unsigned long now = jiffies; 00399 00400 for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; 00401 t -= ip_rt_gc_timeout) { 00402 unsigned long tmo = ip_rt_gc_timeout; 00403 00404 i = (i + 1) & rt_hash_mask; 00405 rthp = &rt_hash_table[i].chain; 00406 00407 write_lock(&rt_hash_table[i].lock); 00408 while ((rth = *rthp) != NULL) { 00409 if (rth->u.dst.expires) { 00410 /* Entry is expired even if it is in use */ 00411 if (time_before_eq(now, rth->u.dst.expires)) { 00412 tmo >>= 1; 00413 rthp = &rth->u.rt_next; 00414 continue; 00415 } 00416 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { 00417 tmo >>= 1; 00418 rthp = &rth->u.rt_next; 00419 continue; 00420 } 00421 00422 /* Cleanup aged off entries. */ 00423 *rthp = rth->u.rt_next; 00424 rt_free(rth); 00425 } 00426 write_unlock(&rt_hash_table[i].lock); 00427 00428 /* Fallback loop breaker. */ 00429 if (time_after(jiffies, now)) 00430 break; 00431 } 00432 rover = i; 00433 mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval); 00434 } 00435 00436 SMP_TIMER_DEFINE(rt_check_expire, rt_gc_task); 00437 00438 /* This can run from both BH and non-BH contexts, the latter 00439 * in the case of a forced flush event. 
00440 */ 00441 static void SMP_TIMER_NAME(rt_run_flush)(unsigned long dummy) 00442 { 00443 int i; 00444 struct rtable *rth, *next; 00445 00446 rt_deadline = 0; 00447 00448 get_random_bytes(&rt_hash_rnd, 4); 00449 00450 for (i = rt_hash_mask; i >= 0; i--) { 00451 write_lock_bh(&rt_hash_table[i].lock); 00452 rth = rt_hash_table[i].chain; 00453 if (rth) 00454 rt_hash_table[i].chain = NULL; 00455 write_unlock_bh(&rt_hash_table[i].lock); 00456 00457 for (; rth; rth = next) { 00458 next = rth->u.rt_next; 00459 rt_free(rth); 00460 } 00461 } 00462 } 00463 00464 SMP_TIMER_DEFINE(rt_run_flush, rt_cache_flush_task); 00465 00466 static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED; 00467 00468 void rt_cache_flush(int delay) 00469 { 00470 unsigned long now = jiffies; 00471 int user_mode = !in_softirq(); 00472 00473 if (delay < 0) 00474 delay = ip_rt_min_delay; 00475 00476 spin_lock_bh(&rt_flush_lock); 00477 00478 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) { 00479 long tmo = (long)(rt_deadline - now); 00480 00481 /* If flush timer is already running 00482 and flush request is not immediate (delay > 0): 00483 00484 if deadline is not achieved, prolongate timer to "delay", 00485 otherwise fire it at deadline time. 00486 */ 00487 00488 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay) 00489 tmo = 0; 00490 00491 if (delay > tmo) 00492 delay = tmo; 00493 } 00494 00495 if (delay <= 0) { 00496 spin_unlock_bh(&rt_flush_lock); 00497 SMP_TIMER_NAME(rt_run_flush)(0); 00498 return; 00499 } 00500 00501 if (rt_deadline == 0) 00502 rt_deadline = now + ip_rt_max_delay; 00503 00504 mod_timer(&rt_flush_timer, now+delay); 00505 spin_unlock_bh(&rt_flush_lock); 00506 } 00507 00508 static void rt_secret_rebuild(unsigned long dummy) 00509 { 00510 unsigned long now = jiffies; 00511 00512 rt_cache_flush(0); 00513 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval); 00514 } 00515 00516 /* 00517 Short description of GC goals. 
00518 00519 We want to build algorithm, which will keep routing cache 00520 at some equilibrium point, when number of aged off entries 00521 is kept approximately equal to newly generated ones. 00522 00523 Current expiration strength is variable "expire". 00524 We try to adjust it dynamically, so that if networking 00525 is idle expires is large enough to keep enough of warm entries, 00526 and when load increases it reduces to limit cache size. 00527 */ 00528 00529 static int rt_garbage_collect(void) 00530 { 00531 static unsigned long expire = RT_GC_TIMEOUT; 00532 static unsigned long last_gc; 00533 static int rover; 00534 static int equilibrium; 00535 struct rtable *rth, **rthp; 00536 unsigned long now = jiffies; 00537 int goal; 00538 00539 /* 00540 * Garbage collection is pretty expensive, 00541 * do not make it too frequently. 00542 */ 00543 00544 rt_cache_stat[smp_processor_id()].gc_total++; 00545 00546 if (now - last_gc < ip_rt_gc_min_interval && 00547 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) { 00548 rt_cache_stat[smp_processor_id()].gc_ignored++; 00549 goto out; 00550 } 00551 00552 /* Calculate number of entries, which we want to expire now. */ 00553 goal = atomic_read(&ipv4_dst_ops.entries) - 00554 (ip_rt_gc_elasticity << rt_hash_log); 00555 if (goal <= 0) { 00556 if (equilibrium < ipv4_dst_ops.gc_thresh) 00557 equilibrium = ipv4_dst_ops.gc_thresh; 00558 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; 00559 if (goal > 0) { 00560 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1); 00561 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; 00562 } 00563 } else { 00564 /* We are in dangerous area. Try to reduce cache really 00565 * aggressively. 
00566 */ 00567 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1); 00568 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal; 00569 } 00570 00571 if (now - last_gc >= ip_rt_gc_min_interval) 00572 last_gc = now; 00573 00574 if (goal <= 0) { 00575 equilibrium += goal; 00576 goto work_done; 00577 } 00578 00579 do { 00580 int i, k; 00581 00582 for (i = rt_hash_mask, k = rover; i >= 0; i--) { 00583 unsigned long tmo = expire; 00584 00585 k = (k + 1) & rt_hash_mask; 00586 rthp = &rt_hash_table[k].chain; 00587 write_lock_bh(&rt_hash_table[k].lock); 00588 while ((rth = *rthp) != NULL) { 00589 if (!rt_may_expire(rth, tmo, expire)) { 00590 tmo >>= 1; 00591 rthp = &rth->u.rt_next; 00592 continue; 00593 } 00594 *rthp = rth->u.rt_next; 00595 rt_free(rth); 00596 goal--; 00597 } 00598 write_unlock_bh(&rt_hash_table[k].lock); 00599 if (goal <= 0) 00600 break; 00601 } 00602 rover = k; 00603 00604 if (goal <= 0) 00605 goto work_done; 00606 00607 /* Goal is not achieved. We stop process if: 00608 00609 - if expire reduced to zero. Otherwise, expire is halfed. 00610 - if table is not full. 00611 - if we are called from interrupt. 00612 - jiffies check is just fallback/debug loop breaker. 00613 We will not spin here for long time in any case. 
00614 */ 00615 00616 rt_cache_stat[smp_processor_id()].gc_goal_miss++; 00617 00618 if (expire == 0) 00619 break; 00620 00621 expire >>= 1; 00622 #if RT_CACHE_DEBUG >= 2 00623 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, 00624 atomic_read(&ipv4_dst_ops.entries), goal, i); 00625 #endif 00626 00627 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) 00628 goto out; 00629 } while (!in_softirq() && time_before_eq(jiffies, now)); 00630 00631 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) 00632 goto out; 00633 if (net_ratelimit()) 00634 printk(KERN_WARNING "dst cache overflow\n"); 00635 rt_cache_stat[smp_processor_id()].gc_dst_overflow++; 00636 return 1; 00637 00638 work_done: 00639 expire += ip_rt_gc_min_interval; 00640 if (expire > ip_rt_gc_timeout || 00641 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh) 00642 expire = ip_rt_gc_timeout; 00643 #if RT_CACHE_DEBUG >= 2 00644 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, 00645 atomic_read(&ipv4_dst_ops.entries), goal, rover); 00646 #endif 00647 out: return 0; 00648 } 00649 00650 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) 00651 { 00652 struct rtable *rth, **rthp; 00653 unsigned long now; 00654 struct rtable *cand, **candp; 00655 u32 min_score; 00656 int chain_length; 00657 int attempts = !in_softirq(); 00658 00659 restart: 00660 chain_length = 0; 00661 min_score = ~(u32)0; 00662 cand = NULL; 00663 candp = NULL; 00664 now = jiffies; 00665 00666 rthp = &rt_hash_table[hash].chain; 00667 00668 write_lock_bh(&rt_hash_table[hash].lock); 00669 while ((rth = *rthp) != NULL) { 00670 if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) { 00671 /* Put it first */ 00672 *rthp = rth->u.rt_next; 00673 rth->u.rt_next = rt_hash_table[hash].chain; 00674 rt_hash_table[hash].chain = rth; 00675 00676 rth->u.dst.__use++; 00677 dst_hold(&rth->u.dst); 00678 rth->u.dst.lastuse = now; 00679 write_unlock_bh(&rt_hash_table[hash].lock); 00680 00681 rt_drop(rt); 00682 *rp = 
rth; 00683 return 0; 00684 } 00685 00686 if (!atomic_read(&rth->u.dst.__refcnt)) { 00687 u32 score = rt_score(rth); 00688 00689 if (score <= min_score) { 00690 cand = rth; 00691 candp = rthp; 00692 min_score = score; 00693 } 00694 } 00695 00696 chain_length++; 00697 00698 rthp = &rth->u.rt_next; 00699 } 00700 00701 if (cand) { 00702 /* ip_rt_gc_elasticity used to be average length of chain 00703 * length, when exceeded gc becomes really aggressive. 00704 * 00705 * The second limit is less certain. At the moment it allows 00706 * only 2 entries per bucket. We will see. 00707 */ 00708 if (chain_length > ip_rt_gc_elasticity) { 00709 *candp = cand->u.rt_next; 00710 rt_free(cand); 00711 } 00712 } 00713 00714 /* Try to bind route to arp only if it is output 00715 route or unicast forwarding path. 00716 */ 00717 if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) { 00718 int err = arp_bind_neighbour(&rt->u.dst); 00719 if (err) { 00720 write_unlock_bh(&rt_hash_table[hash].lock); 00721 00722 if (err != -ENOBUFS) { 00723 rt_drop(rt); 00724 return err; 00725 } 00726 00727 /* Neighbour tables are full and nothing 00728 can be released. Try to shrink route cache, 00729 it is most likely it holds some neighbour records. 
00730 */ 00731 if (attempts-- > 0) { 00732 int saved_elasticity = ip_rt_gc_elasticity; 00733 int saved_int = ip_rt_gc_min_interval; 00734 ip_rt_gc_elasticity = 1; 00735 ip_rt_gc_min_interval = 0; 00736 rt_garbage_collect(); 00737 ip_rt_gc_min_interval = saved_int; 00738 ip_rt_gc_elasticity = saved_elasticity; 00739 goto restart; 00740 } 00741 00742 if (net_ratelimit()) 00743 printk(KERN_WARNING "Neighbour table overflow.\n"); 00744 rt_drop(rt); 00745 return -ENOBUFS; 00746 } 00747 } 00748 00749 rt->u.rt_next = rt_hash_table[hash].chain; 00750 #if RT_CACHE_DEBUG >= 2 00751 if (rt->u.rt_next) { 00752 struct rtable *trt; 00753 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash, 00754 NIPQUAD(rt->rt_dst)); 00755 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next) 00756 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst)); 00757 printk("\n"); 00758 } 00759 #endif 00760 rt_hash_table[hash].chain = rt; 00761 write_unlock_bh(&rt_hash_table[hash].lock); 00762 *rp = rt; 00763 return 0; 00764 } 00765 00766 void rt_bind_peer(struct rtable *rt, int create) 00767 { 00768 static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED; 00769 struct inet_peer *peer; 00770 00771 peer = inet_getpeer(rt->rt_dst, create); 00772 00773 spin_lock_bh(&rt_peer_lock); 00774 if (rt->peer == NULL) { 00775 rt->peer = peer; 00776 peer = NULL; 00777 } 00778 spin_unlock_bh(&rt_peer_lock); 00779 if (peer) 00780 inet_putpeer(peer); 00781 } 00782 00783 /* 00784 * Peer allocation may fail only in serious out-of-memory conditions. However 00785 * we still can generate some output. 00786 * Random ID selection looks a bit dangerous because we have no chances to 00787 * select ID being unique in a reasonable period of time. 00788 * But broken packet identifier may be better than no packet at all. 
*/
/*
 * Fallback IP ID selection, used by __ip_select_ident() when no peer is
 * attached to the route.  The ID is the low 16 bits of secure_ip_id()
 * applied to the previous fallback value XORed with the destination
 * address; the full result becomes the new fallback value.  Both steps
 * run under ip_fb_id_lock.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

/*
 * Choose the ID field of @iph.  When @dst is a route with (or able to
 * get) an attached peer, the ID comes from inet_getid() on that peer;
 * otherwise ip_select_fb_ident() is used.  A NULL @dst is logged.
 */
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If peer is attached to destination, it is never detached,
		   so that we need not to grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));

	ip_select_fb_ident(iph);
}

/*
 * Release the caller's reference on @rt and, if @rt is still linked in
 * bucket @hash, unlink and free it.  Everything happens under the
 * bucket's write lock.
 */
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp;

	write_lock_bh(&rt_hash_table[hash].lock);
	ip_rt_put(rt);
	for (rthp = &rt_hash_table[hash].chain; *rthp;
	     rthp = &(*rthp)->u.rt_next)
		if (*rthp == rt) {
			*rthp = rt->u.rt_next;
			rt_free(rt);
			break;
		}
	write_unlock_bh(&rt_hash_table[hash].lock);
}

/*
 * Process a redirect for @daddr that names @new_gw in place of @old_gw.
 *
 * @old_gw: gateway currently used (must match the cached route)
 * @daddr:  destination the redirect is about
 * @new_gw: gateway advised by the redirect
 * @saddr:  source address of the redirected traffic
 * @tos:    TOS of the redirected traffic (masked with IPTOS_RT_MASK)
 * @dev:    device involved; cached routes must use it
 *
 * After the sanity checks on @new_gw, the output-route cache entries
 * keyed by {saddr or 0} x {dev->ifindex or 0} are examined.  A matching
 * entry is cloned with the new gateway and RTCF_REDIRECTED set; if the
 * clone can be bound to a neighbour in a valid state, it replaces the
 * old entry in the cache.
 */
void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
		    u32 saddr, u8 tos, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	u32  skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };

	tos &= IPTOS_RT_MASK;

	if (!in_dev)
		return;

	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash_code(daddr,
						     skeys[i] ^ (ikeys[k] << 5),
						     tos);

			rthp=&rt_hash_table[hash].chain;

			read_lock(&rt_hash_table[hash].lock);
			while ((rth = *rthp) != NULL) {
				struct rtable *rt;

				/* Not the key we are looking for: next entry. */
				if (rth->key.dst != daddr ||
				    rth->key.src != skeys[i] ||
				    rth->key.tos != tos ||
				    rth->key.oif != ikeys[k] ||
				    rth->key.iif != 0) {
					rthp = &rth->u.rt_next;
					continue;
				}

				/* Key matches but the route does not fit the
				 * redirect: give up on this bucket.
				 */
				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				/* Take a reference before the lock is dropped. */
				dst_hold(&rth->u.dst);
				read_unlock(&rt_hash_table[hash].lock);

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				rt->u.dst.__use		= 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.hh		= NULL;
				rt->u.dst.obsolete	= 0;

				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				/* The copied peer pointer needs its own reference. */
				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				/* rt_del() also releases the reference taken above. */
				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				goto do_next;
			}
			read_unlock(&rt_hash_table[hash].lock);
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
			"%u.%u.%u.%u ignored.\n"
			" Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
			"tos %02x\n",
		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
		       NIPQUAD(saddr), NIPQUAD(daddr), tos);
#endif
	in_dev_put(in_dev);
}

/*
 * "negative_advice" hook of ipv4_dst_ops.
 *
 * An obsolete route is released, and a redirected or expiring route is
 * removed from the cache via rt_del(); NULL is returned in both cases.
 * Any other route (or a NULL @dst) is returned unchanged.
 */
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable*)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->u.dst.expires) {
			unsigned hash = rt_hash_code(rt->key.dst,
						     rt->key.src ^
							(rt->key.oif << 5),
						     rt->key.tos);
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ip_rt_advice: redirect to "
					  "%u.%u.%u.%u/%02x dropped\n",
				NIPQUAD(rt->rt_dst), rt->key.tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1.
The first ip_rt_redirect_number redirects are sent 00994 * with exponential backoff, then we stop sending them at all, 00995 * assuming that the host ignores our redirects. 00996 * 2. If we did not see packets requiring redirects 00997 * during ip_rt_redirect_silence, we assume that the host 00998 * forgot redirected route and start to send redirects again. 00999 * 01000 * This algorithm is much cheaper and more intelligent than dumb load limiting 01001 * in icmp.c. 01002 * 01003 * NOTE. Do not forget to inhibit load limiting for redirects (redundant) 01004 * and "frag. need" (breaks PMTU discovery) in icmp.c. 01005 */ 01006 01007 void ip_rt_send_redirect(struct sk_buff *skb) 01008 { 01009 struct rtable *rt = (struct rtable*)skb->dst; 01010 struct in_device *in_dev = in_dev_get(rt->u.dst.dev); 01011 01012 if (!in_dev) 01013 return; 01014 01015 if (!IN_DEV_TX_REDIRECTS(in_dev)) 01016 goto out; 01017 01018 /* No redirected packets during ip_rt_redirect_silence; 01019 * reset the algorithm. 01020 */ 01021 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence)) 01022 rt->u.dst.rate_tokens = 0; 01023 01024 /* Too many ignored redirects; do not send anything 01025 * set u.dst.rate_last to the last seen redirected packet. 01026 */ 01027 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) { 01028 rt->u.dst.rate_last = jiffies; 01029 goto out; 01030 } 01031 01032 /* Check for load limit; set rate_last to the latest sent 01033 * redirect. 
01034 */ 01035 if (time_after(jiffies, 01036 (rt->u.dst.rate_last + 01037 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) { 01038 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); 01039 rt->u.dst.rate_last = jiffies; 01040 ++rt->u.dst.rate_tokens; 01041 #ifdef CONFIG_IP_ROUTE_VERBOSE 01042 if (IN_DEV_LOG_MARTIANS(in_dev) && 01043 rt->u.dst.rate_tokens == ip_rt_redirect_number && 01044 net_ratelimit()) 01045 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores " 01046 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n", 01047 NIPQUAD(rt->rt_src), rt->rt_iif, 01048 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway)); 01049 #endif 01050 } 01051 out: 01052 in_dev_put(in_dev); 01053 } 01054 01055 static int ip_error(struct sk_buff *skb) 01056 { 01057 struct rtable *rt = (struct rtable*)skb->dst; 01058 unsigned long now; 01059 int code; 01060 01061 switch (rt->u.dst.error) { 01062 case EINVAL: 01063 default: 01064 goto out; 01065 case EHOSTUNREACH: 01066 code = ICMP_HOST_UNREACH; 01067 break; 01068 case ENETUNREACH: 01069 code = ICMP_NET_UNREACH; 01070 break; 01071 case EACCES: 01072 code = ICMP_PKT_FILTERED; 01073 break; 01074 } 01075 01076 now = jiffies; 01077 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last; 01078 if (rt->u.dst.rate_tokens > ip_rt_error_burst) 01079 rt->u.dst.rate_tokens = ip_rt_error_burst; 01080 rt->u.dst.rate_last = now; 01081 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) { 01082 rt->u.dst.rate_tokens -= ip_rt_error_cost; 01083 icmp_send(skb, ICMP_DEST_UNREACH, code, 0); 01084 } 01085 01086 out: kfree_skb(skb); 01087 return 0; 01088 } 01089 01090 /* 01091 * The last two values are not from the RFC but 01092 * are needed for AMPRnet AX.25 paths. 
01093 */ 01094 01095 static unsigned short mtu_plateau[] = 01096 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; 01097 01098 static __inline__ unsigned short guess_mtu(unsigned short old_mtu) 01099 { 01100 int i; 01101 01102 for (i = 0; i < sizeof(mtu_plateau) / sizeof(mtu_plateau[0]); i++) 01103 if (old_mtu > mtu_plateau[i]) 01104 return mtu_plateau[i]; 01105 return 68; 01106 } 01107 01108 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) 01109 { 01110 int i; 01111 unsigned short old_mtu = ntohs(iph->tot_len); 01112 struct rtable *rth; 01113 u32 skeys[2] = { iph->saddr, 0, }; 01114 u32 daddr = iph->daddr; 01115 u8 tos = iph->tos & IPTOS_RT_MASK; 01116 unsigned short est_mtu = 0; 01117 01118 if (ipv4_config.no_pmtu_disc) 01119 return 0; 01120 01121 for (i = 0; i < 2; i++) { 01122 unsigned hash = rt_hash_code(daddr, skeys[i], tos); 01123 01124 read_lock(&rt_hash_table[hash].lock); 01125 for (rth = rt_hash_table[hash].chain; rth; 01126 rth = rth->u.rt_next) { 01127 if (rth->key.dst == daddr && 01128 rth->key.src == skeys[i] && 01129 rth->rt_dst == daddr && 01130 rth->rt_src == iph->saddr && 01131 rth->key.tos == tos && 01132 rth->key.iif == 0 && 01133 !(rth->u.dst.mxlock & (1 << RTAX_MTU))) { 01134 unsigned short mtu = new_mtu; 01135 01136 if (new_mtu < 68 || new_mtu >= old_mtu) { 01137 01138 /* BSD 4.2 compatibility hack :-( */ 01139 if (mtu == 0 && 01140 old_mtu >= rth->u.dst.pmtu && 01141 old_mtu >= 68 + (iph->ihl << 2)) 01142 old_mtu -= iph->ihl << 2; 01143 01144 mtu = guess_mtu(old_mtu); 01145 } 01146 if (mtu <= rth->u.dst.pmtu) { 01147 if (mtu < rth->u.dst.pmtu) { 01148 dst_confirm(&rth->u.dst); 01149 if (mtu < ip_rt_min_pmtu) { 01150 mtu = ip_rt_min_pmtu; 01151 rth->u.dst.mxlock |= 01152 (1 << RTAX_MTU); 01153 } 01154 rth->u.dst.pmtu = mtu; 01155 dst_set_expires(&rth->u.dst, 01156 ip_rt_mtu_expires); 01157 } 01158 est_mtu = mtu; 01159 } 01160 } 01161 } 01162 read_unlock(&rt_hash_table[hash].lock); 01163 } 01164 return 
est_mtu ? : new_mtu; 01165 } 01166 01167 void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu) 01168 { 01169 if (dst->pmtu > mtu && mtu >= 68 && 01170 !(dst->mxlock & (1 << RTAX_MTU))) { 01171 if (mtu < ip_rt_min_pmtu) { 01172 mtu = ip_rt_min_pmtu; 01173 dst->mxlock |= (1 << RTAX_MTU); 01174 } 01175 dst->pmtu = mtu; 01176 dst_set_expires(dst, ip_rt_mtu_expires); 01177 } 01178 } 01179 01180 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) 01181 { 01182 dst_release(dst); 01183 return NULL; 01184 } 01185 01186 static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst, 01187 struct sk_buff *skb) 01188 { 01189 return NULL; 01190 } 01191 01192 static void ipv4_dst_destroy(struct dst_entry *dst) 01193 { 01194 struct rtable *rt = (struct rtable *) dst; 01195 struct inet_peer *peer = rt->peer; 01196 01197 if (peer) { 01198 rt->peer = NULL; 01199 inet_putpeer(peer); 01200 } 01201 } 01202 01203 static void ipv4_link_failure(struct sk_buff *skb) 01204 { 01205 struct rtable *rt; 01206 01207 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); 01208 01209 rt = (struct rtable *) skb->dst; 01210 if (rt) 01211 dst_set_expires(&rt->u.dst, 0); 01212 } 01213 01214 static int ip_rt_bug(struct sk_buff *skb) 01215 { 01216 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n", 01217 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr), 01218 skb->dev ? skb->dev->name : "?"); 01219 kfree_skb(skb); 01220 return 0; 01221 } 01222 01223 /* 01224 We do not cache source address of outgoing interface, 01225 because it is used only by IP RR, TS and SRR options, 01226 so that it out of fast path. 01227 01228 BTW remember: "addr" is allowed to be not aligned 01229 in IP options! 
01230 */ 01231 01232 void ip_rt_get_source(u8 *addr, struct rtable *rt) 01233 { 01234 u32 src; 01235 struct fib_result res; 01236 01237 if (rt->key.iif == 0) 01238 src = rt->rt_src; 01239 else if (fib_lookup(&rt->key, &res) == 0) { 01240 #ifdef CONFIG_IP_ROUTE_NAT 01241 if (res.type == RTN_NAT) 01242 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, 01243 RT_SCOPE_UNIVERSE); 01244 else 01245 #endif 01246 src = FIB_RES_PREFSRC(res); 01247 fib_res_put(&res); 01248 } else 01249 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, 01250 RT_SCOPE_UNIVERSE); 01251 memcpy(addr, &src, 4); 01252 } 01253 01254 #ifdef CONFIG_NET_CLS_ROUTE 01255 static void set_class_tag(struct rtable *rt, u32 tag) 01256 { 01257 if (!(rt->u.dst.tclassid & 0xFFFF)) 01258 rt->u.dst.tclassid |= tag & 0xFFFF; 01259 if (!(rt->u.dst.tclassid & 0xFFFF0000)) 01260 rt->u.dst.tclassid |= tag & 0xFFFF0000; 01261 } 01262 #endif 01263 01264 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) 01265 { 01266 struct fib_info *fi = res->fi; 01267 01268 if (fi) { 01269 if (FIB_RES_GW(*res) && 01270 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 01271 rt->rt_gateway = FIB_RES_GW(*res); 01272 memcpy(&rt->u.dst.mxlock, fi->fib_metrics, 01273 sizeof(fi->fib_metrics)); 01274 if (fi->fib_mtu == 0) { 01275 rt->u.dst.pmtu = rt->u.dst.dev->mtu; 01276 if (rt->u.dst.mxlock & (1 << RTAX_MTU) && 01277 rt->rt_gateway != rt->rt_dst && 01278 rt->u.dst.pmtu > 576) 01279 rt->u.dst.pmtu = 576; 01280 } 01281 #ifdef CONFIG_NET_CLS_ROUTE 01282 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid; 01283 #endif 01284 } else 01285 rt->u.dst.pmtu = rt->u.dst.dev->mtu; 01286 01287 if (rt->u.dst.pmtu > IP_MAX_MTU) 01288 rt->u.dst.pmtu = IP_MAX_MTU; 01289 if (rt->u.dst.advmss == 0) 01290 rt->u.dst.advmss = max_t(unsigned int, rt->u.dst.dev->mtu - 40, 01291 ip_rt_min_advmss); 01292 if (rt->u.dst.advmss > 65535 - 40) 01293 rt->u.dst.advmss = 65535 - 40; 01294 01295 #ifdef CONFIG_NET_CLS_ROUTE 01296 #ifdef 
CONFIG_IP_MULTIPLE_TABLES 01297 set_class_tag(rt, fib_rules_tclass(res)); 01298 #endif 01299 set_class_tag(rt, itag); 01300 #endif 01301 rt->rt_type = res->type; 01302 } 01303 01304 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr, 01305 u8 tos, struct net_device *dev, int our) 01306 { 01307 unsigned hash; 01308 struct rtable *rth; 01309 u32 spec_dst; 01310 struct in_device *in_dev = in_dev_get(dev); 01311 u32 itag = 0; 01312 01313 /* Primary sanity checks. */ 01314 01315 if (in_dev == NULL) 01316 return -EINVAL; 01317 01318 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) || 01319 skb->protocol != htons(ETH_P_IP)) 01320 goto e_inval; 01321 01322 if (ZERONET(saddr)) { 01323 if (!LOCAL_MCAST(daddr)) 01324 goto e_inval; 01325 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 01326 } else if (fib_validate_source(saddr, 0, tos, 0, 01327 dev, &spec_dst, &itag) < 0) 01328 goto e_inval; 01329 01330 rth = dst_alloc(&ipv4_dst_ops); 01331 if (!rth) 01332 goto e_nobufs; 01333 01334 rth->u.dst.output= ip_rt_bug; 01335 01336 atomic_set(&rth->u.dst.__refcnt, 1); 01337 rth->u.dst.flags= DST_HOST; 01338 rth->key.dst = daddr; 01339 rth->rt_dst = daddr; 01340 rth->key.tos = tos; 01341 #ifdef CONFIG_IP_ROUTE_FWMARK 01342 rth->key.fwmark = skb->nfmark; 01343 #endif 01344 rth->key.src = saddr; 01345 rth->rt_src = saddr; 01346 #ifdef CONFIG_IP_ROUTE_NAT 01347 rth->rt_dst_map = daddr; 01348 rth->rt_src_map = saddr; 01349 #endif 01350 #ifdef CONFIG_NET_CLS_ROUTE 01351 rth->u.dst.tclassid = itag; 01352 #endif 01353 rth->rt_iif = 01354 rth->key.iif = dev->ifindex; 01355 rth->u.dst.dev = &loopback_dev; 01356 dev_hold(rth->u.dst.dev); 01357 rth->key.oif = 0; 01358 rth->rt_gateway = daddr; 01359 rth->rt_spec_dst= spec_dst; 01360 rth->rt_type = RTN_MULTICAST; 01361 rth->rt_flags = RTCF_MULTICAST; 01362 if (our) { 01363 rth->u.dst.input= ip_local_deliver; 01364 rth->rt_flags |= RTCF_LOCAL; 01365 } 01366 01367 #ifdef CONFIG_IP_MROUTE 01368 if (!LOCAL_MCAST(daddr) 
&& IN_DEV_MFORWARD(in_dev)) 01369 rth->u.dst.input = ip_mr_input; 01370 #endif 01371 rt_cache_stat[smp_processor_id()].in_slow_mc++; 01372 01373 in_dev_put(in_dev); 01374 hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos); 01375 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst); 01376 01377 e_nobufs: 01378 in_dev_put(in_dev); 01379 return -ENOBUFS; 01380 01381 e_inval: 01382 in_dev_put(in_dev); 01383 return -EINVAL; 01384 } 01385 01386 /* 01387 * NOTE. We drop all the packets that has local source 01388 * addresses, because every properly looped back packet 01389 * must have correct destination already attached by output routine. 01390 * 01391 * Such approach solves two big problems: 01392 * 1. Not simplex devices are handled properly. 01393 * 2. IP spoofing attempts are filtered with 100% of guarantee. 01394 */ 01395 01396 int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, 01397 u8 tos, struct net_device *dev) 01398 { 01399 struct rt_key key; 01400 struct fib_result res; 01401 struct in_device *in_dev = in_dev_get(dev); 01402 struct in_device *out_dev = NULL; 01403 unsigned flags = 0; 01404 u32 itag = 0; 01405 struct rtable * rth; 01406 unsigned hash; 01407 u32 spec_dst; 01408 int err = -EINVAL; 01409 int free_res = 0; 01410 01411 /* IP on this device is disabled. */ 01412 01413 if (!in_dev) 01414 goto out; 01415 01416 key.dst = daddr; 01417 key.src = saddr; 01418 key.tos = tos; 01419 #ifdef CONFIG_IP_ROUTE_FWMARK 01420 key.fwmark = skb->nfmark; 01421 #endif 01422 key.iif = dev->ifindex; 01423 key.oif = 0; 01424 key.scope = RT_SCOPE_UNIVERSE; 01425 01426 hash = rt_hash_code(daddr, saddr ^ (key.iif << 5), tos); 01427 01428 /* Check for the most weird martians, which can be not detected 01429 by fib_lookup. 
01430 */ 01431 01432 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr)) 01433 goto martian_source; 01434 01435 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0)) 01436 goto brd_input; 01437 01438 /* Accept zero addresses only to limited broadcast; 01439 * I even do not know to fix it or not. Waiting for complains :-) 01440 */ 01441 if (ZERONET(saddr)) 01442 goto martian_source; 01443 01444 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr)) 01445 goto martian_destination; 01446 01447 /* 01448 * Now we are ready to route packet. 01449 */ 01450 if ((err = fib_lookup(&key, &res)) != 0) { 01451 if (!IN_DEV_FORWARD(in_dev)) 01452 goto e_inval; 01453 goto no_route; 01454 } 01455 free_res = 1; 01456 01457 rt_cache_stat[smp_processor_id()].in_slow_tot++; 01458 01459 #ifdef CONFIG_IP_ROUTE_NAT 01460 /* Policy is applied before mapping destination, 01461 but rerouting after map should be made with old source. 01462 */ 01463 01464 if (1) { 01465 u32 src_map = saddr; 01466 if (res.r) 01467 src_map = fib_rules_policy(saddr, &res, &flags); 01468 01469 if (res.type == RTN_NAT) { 01470 key.dst = fib_rules_map_destination(daddr, &res); 01471 fib_res_put(&res); 01472 free_res = 0; 01473 if (fib_lookup(&key, &res)) 01474 goto e_inval; 01475 free_res = 1; 01476 if (res.type != RTN_UNICAST) 01477 goto e_inval; 01478 flags |= RTCF_DNAT; 01479 } 01480 key.src = src_map; 01481 } 01482 #endif 01483 01484 if (res.type == RTN_BROADCAST) 01485 goto brd_input; 01486 01487 if (res.type == RTN_LOCAL) { 01488 int result; 01489 result = fib_validate_source(saddr, daddr, tos, 01490 loopback_dev.ifindex, 01491 dev, &spec_dst, &itag); 01492 if (result < 0) 01493 goto martian_source; 01494 if (result) 01495 flags |= RTCF_DIRECTSRC; 01496 spec_dst = daddr; 01497 goto local_input; 01498 } 01499 01500 if (!IN_DEV_FORWARD(in_dev)) 01501 goto e_inval; 01502 if (res.type != RTN_UNICAST) 01503 goto martian_destination; 01504 01505 #ifdef CONFIG_IP_ROUTE_MULTIPATH 01506 if (res.fi->fib_nhs 
> 1 && key.oif == 0) 01507 fib_select_multipath(&key, &res); 01508 #endif 01509 out_dev = in_dev_get(FIB_RES_DEV(res)); 01510 if (out_dev == NULL) { 01511 if (net_ratelimit()) 01512 printk(KERN_CRIT "Bug in ip_route_input_slow(). " 01513 "Please, report\n"); 01514 goto e_inval; 01515 } 01516 01517 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, 01518 &spec_dst, &itag); 01519 if (err < 0) 01520 goto martian_source; 01521 01522 if (err) 01523 flags |= RTCF_DIRECTSRC; 01524 01525 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) && 01526 (IN_DEV_SHARED_MEDIA(out_dev) || 01527 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res)))) 01528 flags |= RTCF_DOREDIRECT; 01529 01530 if (skb->protocol != htons(ETH_P_IP)) { 01531 /* Not IP (i.e. ARP). Do not create route, if it is 01532 * invalid for proxy arp. DNAT routes are always valid. 01533 */ 01534 if (out_dev == in_dev && !(flags & RTCF_DNAT)) 01535 goto e_inval; 01536 } 01537 01538 rth = dst_alloc(&ipv4_dst_ops); 01539 if (!rth) 01540 goto e_nobufs; 01541 01542 atomic_set(&rth->u.dst.__refcnt, 1); 01543 rth->u.dst.flags= DST_HOST; 01544 rth->key.dst = daddr; 01545 rth->rt_dst = daddr; 01546 rth->key.tos = tos; 01547 #ifdef CONFIG_IP_ROUTE_FWMARK 01548 rth->key.fwmark = skb->nfmark; 01549 #endif 01550 rth->key.src = saddr; 01551 rth->rt_src = saddr; 01552 rth->rt_gateway = daddr; 01553 #ifdef CONFIG_IP_ROUTE_NAT 01554 rth->rt_src_map = key.src; 01555 rth->rt_dst_map = key.dst; 01556 if (flags&RTCF_DNAT) 01557 rth->rt_gateway = key.dst; 01558 #endif 01559 rth->rt_iif = 01560 rth->key.iif = dev->ifindex; 01561 rth->u.dst.dev = out_dev->dev; 01562 dev_hold(rth->u.dst.dev); 01563 rth->key.oif = 0; 01564 rth->rt_spec_dst= spec_dst; 01565 01566 rth->u.dst.input = ip_forward; 01567 rth->u.dst.output = ip_output; 01568 01569 rt_set_nexthop(rth, &res, itag); 01570 01571 rth->rt_flags = flags; 01572 01573 #ifdef CONFIG_NET_FASTROUTE 01574 if (netdev_fastroute && 
!(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) { 01575 struct net_device *odev = rth->u.dst.dev; 01576 if (odev != dev && 01577 dev->accept_fastpath && 01578 odev->mtu >= dev->mtu && 01579 dev->accept_fastpath(dev, &rth->u.dst) == 0) 01580 rth->rt_flags |= RTCF_FAST; 01581 } 01582 #endif 01583 01584 intern: 01585 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); 01586 done: 01587 in_dev_put(in_dev); 01588 if (out_dev) 01589 in_dev_put(out_dev); 01590 if (free_res) 01591 fib_res_put(&res); 01592 out: return err; 01593 01594 brd_input: 01595 if (skb->protocol != htons(ETH_P_IP)) 01596 goto e_inval; 01597 01598 if (ZERONET(saddr)) 01599 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 01600 else { 01601 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, 01602 &itag); 01603 if (err < 0) 01604 goto martian_source; 01605 if (err) 01606 flags |= RTCF_DIRECTSRC; 01607 } 01608 flags |= RTCF_BROADCAST; 01609 res.type = RTN_BROADCAST; 01610 rt_cache_stat[smp_processor_id()].in_brd++; 01611 01612 local_input: 01613 rth = dst_alloc(&ipv4_dst_ops); 01614 if (!rth) 01615 goto e_nobufs; 01616 01617 rth->u.dst.output= ip_rt_bug; 01618 01619 atomic_set(&rth->u.dst.__refcnt, 1); 01620 rth->u.dst.flags= DST_HOST; 01621 rth->key.dst = daddr; 01622 rth->rt_dst = daddr; 01623 rth->key.tos = tos; 01624 #ifdef CONFIG_IP_ROUTE_FWMARK 01625 rth->key.fwmark = skb->nfmark; 01626 #endif 01627 rth->key.src = saddr; 01628 rth->rt_src = saddr; 01629 #ifdef CONFIG_IP_ROUTE_NAT 01630 rth->rt_dst_map = key.dst; 01631 rth->rt_src_map = key.src; 01632 #endif 01633 #ifdef CONFIG_NET_CLS_ROUTE 01634 rth->u.dst.tclassid = itag; 01635 #endif 01636 rth->rt_iif = 01637 rth->key.iif = dev->ifindex; 01638 rth->u.dst.dev = &loopback_dev; 01639 dev_hold(rth->u.dst.dev); 01640 rth->key.oif = 0; 01641 rth->rt_gateway = daddr; 01642 rth->rt_spec_dst= spec_dst; 01643 rth->u.dst.input= ip_local_deliver; 01644 rth->rt_flags = flags|RTCF_LOCAL; 01645 if (res.type == RTN_UNREACHABLE) { 01646 
rth->u.dst.input= ip_error; 01647 rth->u.dst.error= -err; 01648 rth->rt_flags &= ~RTCF_LOCAL; 01649 } 01650 rth->rt_type = res.type; 01651 goto intern; 01652 01653 no_route: 01654 rt_cache_stat[smp_processor_id()].in_no_route++; 01655 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); 01656 res.type = RTN_UNREACHABLE; 01657 goto local_input; 01658 01659 /* 01660 * Do not cache martian addresses: they should be logged (RFC1812) 01661 */ 01662 martian_destination: 01663 rt_cache_stat[smp_processor_id()].in_martian_dst++; 01664 #ifdef CONFIG_IP_ROUTE_VERBOSE 01665 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) 01666 printk(KERN_WARNING "martian destination %u.%u.%u.%u from " 01667 "%u.%u.%u.%u, dev %s\n", 01668 NIPQUAD(daddr), NIPQUAD(saddr), dev->name); 01669 #endif 01670 e_inval: 01671 err = -EINVAL; 01672 goto done; 01673 01674 e_nobufs: 01675 err = -ENOBUFS; 01676 goto done; 01677 01678 martian_source: 01679 01680 rt_cache_stat[smp_processor_id()].in_martian_src++; 01681 #ifdef CONFIG_IP_ROUTE_VERBOSE 01682 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) { 01683 /* 01684 * RFC1812 recommendation, if source is martian, 01685 * the only hint is MAC header. 
01686 */ 01687 printk(KERN_WARNING "martian source %u.%u.%u.%u from " 01688 "%u.%u.%u.%u, on dev %s\n", 01689 NIPQUAD(daddr), NIPQUAD(saddr), dev->name); 01690 if (dev->hard_header_len) { 01691 int i; 01692 unsigned char *p = skb->mac.raw; 01693 printk(KERN_WARNING "ll header: "); 01694 for (i = 0; i < dev->hard_header_len; i++, p++) { 01695 printk("%02x", *p); 01696 if (i < (dev->hard_header_len - 1)) 01697 printk(":"); 01698 } 01699 printk("\n"); 01700 } 01701 } 01702 #endif 01703 goto e_inval; 01704 } 01705 01712 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr, 01713 u8 tos, struct net_device *dev) 01714 { 01715 struct rtable * rth; 01716 unsigned hash; 01717 int iif = dev->ifindex; 01718 01719 tos &= IPTOS_RT_MASK; 01720 hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos); 01721 01722 read_lock(&rt_hash_table[hash].lock); 01723 for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) { 01724 if (rth->key.dst == daddr && 01725 rth->key.src == saddr && 01726 rth->key.iif == iif && 01727 rth->key.oif == 0 && 01728 #ifdef CONFIG_IP_ROUTE_FWMARK 01729 rth->key.fwmark == skb->nfmark && 01730 #endif 01731 rth->key.tos == tos) { 01732 rth->u.dst.lastuse = jiffies; 01733 dst_hold(&rth->u.dst); 01734 rth->u.dst.__use++; 01735 rt_cache_stat[smp_processor_id()].in_hit++; 01736 read_unlock(&rt_hash_table[hash].lock); 01737 skb->dst = (struct dst_entry*)rth; 01738 return 0; 01739 } 01740 rt_cache_stat[smp_processor_id()].in_hlist_search++; 01741 } 01742 read_unlock(&rt_hash_table[hash].lock); 01743 01744 /* Multicast recognition logic is moved from route cache to here. 01745 The problem was that too many Ethernet cards have broken/missing 01746 hardware multicast filters :-( As result the host on multicasting 01747 network acquires a lot of useless route cache entries, sort of 01748 SDR messages from all the world. Now we try to get rid of them. 
01749 Really, provided software IP multicast filter is organized 01750 reasonably (at least, hashed), it does not result in a slowdown 01751 comparing with route cache reject entries. 01752 Note, that multicast routers are not affected, because 01753 route cache entry is created eventually. 01754 */ 01755 if (MULTICAST(daddr)) { 01756 struct in_device *in_dev; 01757 01758 read_lock(&inetdev_lock); 01759 if ((in_dev = __in_dev_get(dev)) != NULL) { 01760 int our = ip_check_mc(in_dev, daddr, saddr); 01761 if (our 01762 #ifdef CONFIG_IP_MROUTE 01763 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) 01764 #endif 01765 ) { 01766 read_unlock(&inetdev_lock); 01767 return ip_route_input_mc(skb, daddr, saddr, 01768 tos, dev, our); 01769 } 01770 } 01771 read_unlock(&inetdev_lock); 01772 return -EINVAL; 01773 } 01774 return ip_route_input_slow(skb, daddr, saddr, tos, dev); 01775 } 01776 01777 /* 01778 * Major route resolver routine. 01779 */ 01780 01781 int ip_route_output_slow(struct rtable **rp, const struct rt_key *oldkey) 01782 { 01783 struct rt_key key; 01784 struct fib_result res; 01785 unsigned flags = 0; 01786 struct rtable *rth; 01787 struct net_device *dev_out = NULL; 01788 unsigned hash; 01789 int free_res = 0; 01790 int err; 01791 u32 tos; 01792 01793 tos = oldkey->tos & (IPTOS_RT_MASK | RTO_ONLINK); 01794 key.dst = oldkey->dst; 01795 key.src = oldkey->src; 01796 key.tos = tos & IPTOS_RT_MASK; 01797 key.iif = loopback_dev.ifindex; 01798 key.oif = oldkey->oif; 01799 #ifdef CONFIG_IP_ROUTE_FWMARK 01800 key.fwmark = oldkey->fwmark; 01801 #endif 01802 key.scope = (tos & RTO_ONLINK) ? 
RT_SCOPE_LINK : 01803 RT_SCOPE_UNIVERSE; 01804 res.fi = NULL; 01805 #ifdef CONFIG_IP_MULTIPLE_TABLES 01806 res.r = NULL; 01807 #endif 01808 01809 if (oldkey->src) { 01810 err = -EINVAL; 01811 if (MULTICAST(oldkey->src) || 01812 BADCLASS(oldkey->src) || 01813 ZERONET(oldkey->src)) 01814 goto out; 01815 01816 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 01817 dev_out = ip_dev_find(oldkey->src); 01818 if (dev_out == NULL) 01819 goto out; 01820 01821 /* I removed check for oif == dev_out->oif here. 01822 It was wrong by three reasons: 01823 1. ip_dev_find(saddr) can return wrong iface, if saddr is 01824 assigned to multiple interfaces. 01825 2. Moreover, we are allowed to send packets with saddr 01826 of another iface. --ANK 01827 */ 01828 01829 if (oldkey->oif == 0 01830 && (MULTICAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF)) { 01831 /* Special hack: user can direct multicasts 01832 and limited broadcast via necessary interface 01833 without fiddling with IP_MULTICAST_IF or IP_PKTINFO. 01834 This hack is not just for fun, it allows 01835 vic,vat and friends to work. 01836 They bind socket to loopback, set ttl to zero 01837 and expect that it will work. 01838 From the viewpoint of routing cache they are broken, 01839 because we are not allowed to build multicast path 01840 with loopback source addr (look, routing cache 01841 cannot know, that ttl is zero, so that packet 01842 will not leave this host and route is valid). 01843 Luckily, this hack is good workaround. 
01844 */ 01845 01846 key.oif = dev_out->ifindex; 01847 goto make_route; 01848 } 01849 if (dev_out) 01850 dev_put(dev_out); 01851 dev_out = NULL; 01852 } 01853 if (oldkey->oif) { 01854 dev_out = dev_get_by_index(oldkey->oif); 01855 err = -ENODEV; 01856 if (dev_out == NULL) 01857 goto out; 01858 if (__in_dev_get(dev_out) == NULL) { 01859 dev_put(dev_out); 01860 goto out; /* Wrong error code */ 01861 } 01862 err = -ENETDOWN; 01863 if (!(dev_out->flags&IFF_UP)) { 01864 dev_put(dev_out); 01865 goto out; 01866 } 01867 01868 if (LOCAL_MCAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF) { 01869 if (!key.src) 01870 key.src = inet_select_addr(dev_out, 0, 01871 RT_SCOPE_LINK); 01872 goto make_route; 01873 } 01874 if (!key.src) { 01875 if (MULTICAST(oldkey->dst)) 01876 key.src = inet_select_addr(dev_out, 0, 01877 key.scope); 01878 else if (!oldkey->dst) 01879 key.src = inet_select_addr(dev_out, 0, 01880 RT_SCOPE_HOST); 01881 } 01882 } 01883 01884 if (!key.dst) { 01885 key.dst = key.src; 01886 if (!key.dst) 01887 key.dst = key.src = htonl(INADDR_LOOPBACK); 01888 if (dev_out) 01889 dev_put(dev_out); 01890 dev_out = &loopback_dev; 01891 dev_hold(dev_out); 01892 key.oif = loopback_dev.ifindex; 01893 res.type = RTN_LOCAL; 01894 flags |= RTCF_LOCAL; 01895 goto make_route; 01896 } 01897 01898 if (fib_lookup(&key, &res)) { 01899 res.fi = NULL; 01900 if (oldkey->oif) { 01901 /* Apparently, routing tables are wrong. Assume, 01902 that the destination is on link. 01903 01904 WHY? DW. 01905 Because we are allowed to send to iface 01906 even if it has NO routes and NO assigned 01907 addresses. When oif is specified, routing 01908 tables are looked up with only one purpose: 01909 to catch if destination is gatewayed, rather than 01910 direct. Moreover, if MSG_DONTROUTE is set, 01911 we send packet, ignoring both routing tables 01912 and ifaddr state. --ANK 01913 01914 01915 We could make it even if oif is unknown, 01916 likely IPv6, but we do not. 
01917 */ 01918 01919 if (key.src == 0) 01920 key.src = inet_select_addr(dev_out, 0, 01921 RT_SCOPE_LINK); 01922 res.type = RTN_UNICAST; 01923 goto make_route; 01924 } 01925 if (dev_out) 01926 dev_put(dev_out); 01927 err = -ENETUNREACH; 01928 goto out; 01929 } 01930 free_res = 1; 01931 01932 if (res.type == RTN_NAT) 01933 goto e_inval; 01934 01935 if (res.type == RTN_LOCAL) { 01936 struct in_device *in_dev; 01937 u32 src; 01938 01939 if (dev_out) 01940 dev_put(dev_out); 01941 dev_out = FIB_RES_DEV(res); 01942 in_dev = in_dev_get(dev_out); 01943 src = key.src? : FIB_RES_PREFSRC(res); 01944 if (in_dev && IN_DEV_LOOP(in_dev) && src) { 01945 struct net_device *dev_src; 01946 01947 in_dev_put(in_dev); 01948 in_dev = NULL; 01949 dev_src = ip_dev_find(src); 01950 if (dev_src && dev_src != dev_out && 01951 (in_dev = in_dev_get(dev_src)) && 01952 IN_DEV_LOOP(in_dev)) { 01953 in_dev_put(in_dev); 01954 dev_out = dev_src; 01955 key.src = src; 01956 key.oif = dev_out->ifindex; 01957 res.type = RTN_UNICAST; 01958 if (res.fi) { 01959 fib_info_put(res.fi); 01960 res.fi = NULL; 01961 } 01962 goto make_route; 01963 } 01964 if (dev_src) 01965 dev_put(dev_src); 01966 } 01967 if (in_dev) 01968 in_dev_put(in_dev); 01969 if (!key.src) 01970 key.src = key.dst; 01971 dev_out = &loopback_dev; 01972 dev_hold(dev_out); 01973 key.oif = dev_out->ifindex; 01974 if (res.fi) 01975 fib_info_put(res.fi); 01976 res.fi = NULL; 01977 flags |= RTCF_LOCAL; 01978 goto make_route; 01979 } 01980 01981 #ifdef CONFIG_IP_ROUTE_MULTIPATH 01982 if (res.fi->fib_nhs > 1 && key.oif == 0) 01983 fib_select_multipath(&key, &res); 01984 else 01985 #endif 01986 if (!res.prefixlen && res.type == RTN_UNICAST && !key.oif) 01987 fib_select_default(&key, &res); 01988 01989 if (!key.src) 01990 key.src = FIB_RES_PREFSRC(res); 01991 01992 if (dev_out) 01993 dev_put(dev_out); 01994 dev_out = FIB_RES_DEV(res); 01995 dev_hold(dev_out); 01996 key.oif = dev_out->ifindex; 01997 01998 make_route: 01999 if (LOOPBACK(key.src) && 
!(dev_out->flags&IFF_LOOPBACK)) 02000 goto e_inval; 02001 02002 if (key.dst == 0xFFFFFFFF) 02003 res.type = RTN_BROADCAST; 02004 else if (MULTICAST(key.dst)) 02005 res.type = RTN_MULTICAST; 02006 else if (BADCLASS(key.dst) || ZERONET(key.dst)) 02007 goto e_inval; 02008 02009 if (dev_out->flags & IFF_LOOPBACK) 02010 flags |= RTCF_LOCAL; 02011 02012 if (res.type == RTN_BROADCAST) { 02013 flags |= RTCF_BROADCAST | RTCF_LOCAL; 02014 if (res.fi) { 02015 fib_info_put(res.fi); 02016 res.fi = NULL; 02017 } 02018 } else if (res.type == RTN_MULTICAST) { 02019 flags |= RTCF_MULTICAST|RTCF_LOCAL; 02020 read_lock(&inetdev_lock); 02021 if (!__in_dev_get(dev_out) || 02022 !ip_check_mc(__in_dev_get(dev_out),oldkey->dst,oldkey->src)) 02023 flags &= ~RTCF_LOCAL; 02024 read_unlock(&inetdev_lock); 02025 /* If multicast route do not exist use 02026 default one, but do not gateway in this case. 02027 Yes, it is hack. 02028 */ 02029 if (res.fi && res.prefixlen < 4) { 02030 fib_info_put(res.fi); 02031 res.fi = NULL; 02032 } 02033 } 02034 02035 rth = dst_alloc(&ipv4_dst_ops); 02036 if (!rth) 02037 goto e_nobufs; 02038 02039 atomic_set(&rth->u.dst.__refcnt, 1); 02040 rth->u.dst.flags= DST_HOST; 02041 rth->key.dst = oldkey->dst; 02042 rth->key.tos = tos; 02043 rth->key.src = oldkey->src; 02044 rth->key.iif = 0; 02045 rth->key.oif = oldkey->oif; 02046 #ifdef CONFIG_IP_ROUTE_FWMARK 02047 rth->key.fwmark = oldkey->fwmark; 02048 #endif 02049 rth->rt_dst = key.dst; 02050 rth->rt_src = key.src; 02051 #ifdef CONFIG_IP_ROUTE_NAT 02052 rth->rt_dst_map = key.dst; 02053 rth->rt_src_map = key.src; 02054 #endif 02055 rth->rt_iif = oldkey->oif ? 
: dev_out->ifindex; 02056 rth->u.dst.dev = dev_out; 02057 dev_hold(dev_out); 02058 rth->rt_gateway = key.dst; 02059 rth->rt_spec_dst= key.src; 02060 02061 rth->u.dst.output=ip_output; 02062 02063 rt_cache_stat[smp_processor_id()].out_slow_tot++; 02064 02065 if (flags & RTCF_LOCAL) { 02066 rth->u.dst.input = ip_local_deliver; 02067 rth->rt_spec_dst = key.dst; 02068 } 02069 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 02070 rth->rt_spec_dst = key.src; 02071 if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { 02072 rth->u.dst.output = ip_mc_output; 02073 rt_cache_stat[smp_processor_id()].out_slow_mc++; 02074 } 02075 #ifdef CONFIG_IP_MROUTE 02076 if (res.type == RTN_MULTICAST) { 02077 struct in_device *in_dev = in_dev_get(dev_out); 02078 if (in_dev) { 02079 if (IN_DEV_MFORWARD(in_dev) && 02080 !LOCAL_MCAST(oldkey->dst)) { 02081 rth->u.dst.input = ip_mr_input; 02082 rth->u.dst.output = ip_mc_output; 02083 } 02084 in_dev_put(in_dev); 02085 } 02086 } 02087 #endif 02088 } 02089 02090 rt_set_nexthop(rth, &res, 0); 02091 02092 rth->rt_flags = flags; 02093 02094 hash = rt_hash_code(oldkey->dst, oldkey->src ^ (oldkey->oif << 5), tos); 02095 err = rt_intern_hash(hash, rth, rp); 02096 done: 02097 if (free_res) 02098 fib_res_put(&res); 02099 if (dev_out) 02100 dev_put(dev_out); 02101 out: return err; 02102 02103 e_inval: 02104 err = -EINVAL; 02105 goto done; 02106 e_nobufs: 02107 err = -ENOBUFS; 02108 goto done; 02109 } 02110 02111 int ip_route_output_key(struct rtable **rp, const struct rt_key *key) 02112 { 02113 unsigned hash; 02114 struct rtable *rth; 02115 02116 hash = rt_hash_code(key->dst, key->src ^ (key->oif << 5), key->tos); 02117 02118 read_lock_bh(&rt_hash_table[hash].lock); 02119 for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) { 02120 if (rth->key.dst == key->dst && 02121 rth->key.src == key->src && 02122 rth->key.iif == 0 && 02123 rth->key.oif == key->oif && 02124 #ifdef CONFIG_IP_ROUTE_FWMARK 02125 rth->key.fwmark == key->fwmark && 
02126 #endif 02127 !((rth->key.tos ^ key->tos) & 02128 (IPTOS_RT_MASK | RTO_ONLINK))) { 02129 rth->u.dst.lastuse = jiffies; 02130 dst_hold(&rth->u.dst); 02131 rth->u.dst.__use++; 02132 rt_cache_stat[smp_processor_id()].out_hit++; 02133 read_unlock_bh(&rt_hash_table[hash].lock); 02134 *rp = rth; 02135 return 0; 02136 } 02137 rt_cache_stat[smp_processor_id()].out_hlist_search++; 02138 } 02139 read_unlock_bh(&rt_hash_table[hash].lock); 02140 02141 return ip_route_output_slow(rp, key); 02142 } 02143 02144 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, 02145 int nowait) 02146 { 02147 struct rtable *rt = (struct rtable*)skb->dst; 02148 struct rtmsg *r; 02149 struct nlmsghdr *nlh; 02150 unsigned char *b = skb->tail; 02151 struct rta_cacheinfo ci; 02152 #ifdef CONFIG_IP_MROUTE 02153 struct rtattr *eptr; 02154 #endif 02155 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r)); 02156 r = NLMSG_DATA(nlh); 02157 nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0; 02158 r->rtm_family = AF_INET; 02159 r->rtm_dst_len = 32; 02160 r->rtm_src_len = 0; 02161 r->rtm_tos = rt->key.tos; 02162 r->rtm_table = RT_TABLE_MAIN; 02163 r->rtm_type = rt->rt_type; 02164 r->rtm_scope = RT_SCOPE_UNIVERSE; 02165 r->rtm_protocol = RTPROT_UNSPEC; 02166 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; 02167 if (rt->rt_flags & RTCF_NOTIFY) 02168 r->rtm_flags |= RTM_F_NOTIFY; 02169 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst); 02170 if (rt->key.src) { 02171 r->rtm_src_len = 32; 02172 RTA_PUT(skb, RTA_SRC, 4, &rt->key.src); 02173 } 02174 if (rt->u.dst.dev) 02175 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex); 02176 #ifdef CONFIG_NET_CLS_ROUTE 02177 if (rt->u.dst.tclassid) 02178 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid); 02179 #endif 02180 if (rt->key.iif) 02181 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst); 02182 else if (rt->rt_src != rt->key.src) 02183 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src); 02184 if (rt->rt_dst != rt->rt_gateway) 02185 RTA_PUT(skb, 
RTA_GATEWAY, 4, &rt->rt_gateway); 02186 if (rtnetlink_put_metrics(skb, &rt->u.dst.mxlock) < 0) 02187 goto rtattr_failure; 02188 ci.rta_lastuse = jiffies - rt->u.dst.lastuse; 02189 ci.rta_used = rt->u.dst.__use; 02190 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt); 02191 if (rt->u.dst.expires) 02192 ci.rta_expires = rt->u.dst.expires - jiffies; 02193 else 02194 ci.rta_expires = 0; 02195 ci.rta_error = rt->u.dst.error; 02196 ci.rta_id = ci.rta_ts = ci.rta_tsage = 0; 02197 if (rt->peer) { 02198 ci.rta_id = rt->peer->ip_id_count; 02199 if (rt->peer->tcp_ts_stamp) { 02200 ci.rta_ts = rt->peer->tcp_ts; 02201 ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp; 02202 } 02203 } 02204 #ifdef CONFIG_IP_MROUTE 02205 eptr = (struct rtattr*)skb->tail; 02206 #endif 02207 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci); 02208 if (rt->key.iif) { 02209 #ifdef CONFIG_IP_MROUTE 02210 u32 dst = rt->rt_dst; 02211 02212 if (MULTICAST(dst) && !LOCAL_MCAST(dst) && 02213 ipv4_devconf.mc_forwarding) { 02214 int err = ipmr_get_route(skb, r, nowait); 02215 if (err <= 0) { 02216 if (!nowait) { 02217 if (err == 0) 02218 return 0; 02219 goto nlmsg_failure; 02220 } else { 02221 if (err == -EMSGSIZE) 02222 goto nlmsg_failure; 02223 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err; 02224 } 02225 } 02226 } else 02227 #endif 02228 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif); 02229 } 02230 02231 nlh->nlmsg_len = skb->tail - b; 02232 return skb->len; 02233 02234 nlmsg_failure: 02235 rtattr_failure: 02236 skb_trim(skb, b - skb->data); 02237 return -1; 02238 } 02239 /* inet_rtm_getroute - answer a netlink route query. in_skb and nlh are the request; arg is used as an array of struct rtattr pointers indexed by RTA_xxx - 1. A reply skb is allocated, the route is looked up with ip_route_input() when an RTA_IIF index is supplied and with ip_route_output() otherwise, rt_fill_info() builds an RTM_NEWROUTE message and netlink_unicast() sends it to the requesting pid. Returns 0 on success; -ENOBUFS if the skb cannot be allocated, -ENODEV if the iif index matches no device, -EMSGSIZE if rt_fill_info() returns a negative value, otherwise the error from the lookup or from netlink_unicast(). */ 02240 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) 02241 { 02242 struct rtattr **rta = arg; 02243 struct rtmsg *rtm = NLMSG_DATA(nlh); 02244 struct rtable *rt = NULL; 02245 u32 dst = 0; 02246 u32 src = 0; 02247 int iif = 0; 02248 int err = -ENOBUFS; 02249 struct sk_buff *skb; 02250 02251 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 02252 if (!skb) 02253 goto out; 02254 02255 /* Reserve room for
dummy headers, this skb can pass 02256 through good chunk of routing engine. 02257 */ 02258 skb->mac.raw = skb->data; 02259 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); 02260 /* Pick up the optional source, destination and input-interface attributes. */ 02261 if (rta[RTA_SRC - 1]) 02262 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4); 02263 if (rta[RTA_DST - 1]) 02264 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4); 02265 if (rta[RTA_IIF - 1]) 02266 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int)); 02267 /* A non-zero iif selects the input-path lookup on that device; otherwise the output-path lookup is used. */ 02268 if (iif) { 02269 struct net_device *dev = __dev_get_by_index(iif); 02270 err = -ENODEV; 02271 if (!dev) 02272 goto out_free; 02273 skb->protocol = htons(ETH_P_IP); 02274 skb->dev = dev; 02275 local_bh_disable(); 02276 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); 02277 local_bh_enable(); 02278 rt = (struct rtable*)skb->dst; /* The lookup succeeded but the entry records an error: report that error instead. */ 02279 if (!err && rt->u.dst.error) 02280 err = -rt->u.dst.error; 02281 } else { 02282 int oif = 0; 02283 if (rta[RTA_OIF - 1]) 02284 memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int)); 02285 err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif); 02286 } 02287 if (err) 02288 goto out_free; 02289 02290 skb->dst = &rt->u.dst; 02291 if (rtm->rtm_flags & RTM_F_NOTIFY) 02292 rt->rt_flags |= RTCF_NOTIFY; 02293 02294 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid; 02295 02296 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, 02297 RTM_NEWROUTE, 0); /* rt_fill_info() returned 0: nothing is sent, the skb is freed and 0 is returned. */ 02298 if (!err) 02299 goto out_free; 02300 if (err < 0) { 02301 err = -EMSGSIZE; 02302 goto out_free; 02303 } 02304 02305 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); /* A positive result from netlink_unicast() is reported to the caller as 0. */ 02306 if (err > 0) 02307 err = 0; 02308 out: return err; 02309 02310 out_free: 02311 kfree_skb(skb); 02312 goto out; 02313 } 02314 /* ip_rt_dump - dump the routing cache into skb on behalf of a netlink dump. Resumes at hash bucket cb->args[0] and chain position cb->args[1]. For each remaining entry it stores dst_clone() of the entry in skb->dst, emits an RTM_NEWROUTE record through rt_fill_info() and then passes the stored dst to dst_release(). Each chain is walked under its bucket read lock. The walk stops as soon as rt_fill_info() returns <= 0; the current bucket and index are written back to cb->args and skb->len is returned. */ 02315 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) 02316 { 02317 struct rtable *rt; 02318 int h, s_h; 02319 int idx, s_idx; 02320 02321 s_h = cb->args[0]; 02322 s_idx = idx = cb->args[1]; 02323 for (h = 0; h <= rt_hash_mask; h++) { /* Skip buckets before the resume point; buckets after it are walked from the head of the chain. */ 02324 if (h < s_h) continue; 02325 if (h > s_h) 02326 s_idx = 0; 02327
read_lock_bh(&rt_hash_table[h].lock); 02328 for (rt = rt_hash_table[h].chain, idx = 0; rt; 02329 rt = rt->u.rt_next, idx++) { 02330 if (idx < s_idx) 02331 continue; 02332 skb->dst = dst_clone(&rt->u.dst); 02333 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid, 02334 cb->nlh->nlmsg_seq, 02335 RTM_NEWROUTE, 1) <= 0) { 02336 dst_release(xchg(&skb->dst, NULL)); 02337 read_unlock_bh(&rt_hash_table[h].lock); 02338 goto done; 02339 } 02340 dst_release(xchg(&skb->dst, NULL)); 02341 } 02342 read_unlock_bh(&rt_hash_table[h].lock); 02343 } 02344 02345 done: 02346 cb->args[0] = h; 02347 cb->args[1] = idx; 02348 return skb->len; 02349 } 02350 /* ip_rt_multicast_event - calls rt_cache_flush(0); the in_dev argument is not used. */ 02351 void ip_rt_multicast_event(struct in_device *in_dev) 02352 { 02353 rt_cache_flush(0); 02354 } 02355 02356 #ifdef CONFIG_SYSCTL 02357 static int flush_delay; 02358 /* ipv4_sysctl_rtcache_flush - proc handler of the "flush" entry. On a write it runs proc_dointvec() on the entry (whose data pointer is flush_delay) and then calls rt_cache_flush(flush_delay), returning 0. Any non-write access returns -EINVAL. NOTE(review): the return value of proc_dointvec() is ignored, so a failed parse presumably still triggers a flush with whatever flush_delay already held - confirm this is acceptable. */ 02359 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, 02360 struct file *filp, void *buffer, 02361 size_t *lenp) 02362 { 02363 if (write) { 02364 proc_dointvec(ctl, write, filp, buffer, lenp); 02365 rt_cache_flush(flush_delay); 02366 return 0; 02367 } 02368 02369 return -EINVAL; 02370 } 02371 /* ipv4_sysctl_rtcache_flush_strategy - strategy callback of the "flush" entry. Requires newlen == sizeof(int) (else -EINVAL), fetches the delay from newval with get_user() (else -EFAULT), calls rt_cache_flush(delay) and returns 0. table, name, nlen, oldval, oldlenp and context are not used. */ 02372 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int *name, 02373 int nlen, void *oldval, 02374 size_t *oldlenp, void *newval, 02375 size_t newlen, void **context) 02376 { 02377 int delay; 02378 if (newlen != sizeof(int)) 02379 return -EINVAL; 02380 if (get_user(delay, (int *)newval)) 02381 return -EFAULT; 02382 rt_cache_flush(delay); 02383 return 0; 02384 } 02385 /* ipv4_route_table - sysctl entries for the IPv4 route cache. Every entry is an int (maxlen sizeof(int), mode 0644) bound to the variable named in its data field. "flush" uses the two flush handlers defined in this file; the other entries use either proc_dointvec alone or the proc_dointvec_jiffies / sysctl_jiffies pair. The last entry is { 0 } (presumably the terminator expected by the sysctl core - confirm). */ 02386 ctl_table ipv4_route_table[] = { 02387 { 02388 ctl_name: NET_IPV4_ROUTE_FLUSH, 02389 procname: "flush", 02390 data: &flush_delay, 02391 maxlen: sizeof(int), 02392 mode: 0644, 02393 proc_handler: &ipv4_sysctl_rtcache_flush, 02394 strategy: &ipv4_sysctl_rtcache_flush_strategy, 02395 }, 02396 { 02397 ctl_name: NET_IPV4_ROUTE_MIN_DELAY, 02398 procname: "min_delay", 02399 data: &ip_rt_min_delay, 02400 maxlen: sizeof(int), 02401 mode: 0644, 02402 proc_handler: &proc_dointvec_jiffies, 02403
strategy: &sysctl_jiffies, 02404 }, 02405 { 02406 ctl_name: NET_IPV4_ROUTE_MAX_DELAY, 02407 procname: "max_delay", 02408 data: &ip_rt_max_delay, 02409 maxlen: sizeof(int), 02410 mode: 0644, 02411 proc_handler: &proc_dointvec_jiffies, 02412 strategy: &sysctl_jiffies, 02413 }, 02414 { 02415 ctl_name: NET_IPV4_ROUTE_GC_THRESH, 02416 procname: "gc_thresh", 02417 data: &ipv4_dst_ops.gc_thresh, 02418 maxlen: sizeof(int), 02419 mode: 0644, 02420 proc_handler: &proc_dointvec, 02421 }, 02422 { 02423 ctl_name: NET_IPV4_ROUTE_MAX_SIZE, 02424 procname: "max_size", 02425 data: &ip_rt_max_size, 02426 maxlen: sizeof(int), 02427 mode: 0644, 02428 proc_handler: &proc_dointvec, 02429 }, 02430 { 02431 ctl_name: NET_IPV4_ROUTE_GC_MIN_INTERVAL, 02432 procname: "gc_min_interval", 02433 data: &ip_rt_gc_min_interval, 02434 maxlen: sizeof(int), 02435 mode: 0644, 02436 proc_handler: &proc_dointvec_jiffies, 02437 strategy: &sysctl_jiffies, 02438 }, 02439 { 02440 ctl_name: NET_IPV4_ROUTE_GC_TIMEOUT, 02441 procname: "gc_timeout", 02442 data: &ip_rt_gc_timeout, 02443 maxlen: sizeof(int), 02444 mode: 0644, 02445 proc_handler: &proc_dointvec_jiffies, 02446 strategy: &sysctl_jiffies, 02447 }, 02448 { 02449 ctl_name: NET_IPV4_ROUTE_GC_INTERVAL, 02450 procname: "gc_interval", 02451 data: &ip_rt_gc_interval, 02452 maxlen: sizeof(int), 02453 mode: 0644, 02454 proc_handler: &proc_dointvec_jiffies, 02455 strategy: &sysctl_jiffies, 02456 }, 02457 { 02458 ctl_name: NET_IPV4_ROUTE_REDIRECT_LOAD, 02459 procname: "redirect_load", 02460 data: &ip_rt_redirect_load, 02461 maxlen: sizeof(int), 02462 mode: 0644, 02463 proc_handler: &proc_dointvec, 02464 }, 02465 { 02466 ctl_name: NET_IPV4_ROUTE_REDIRECT_NUMBER, 02467 procname: "redirect_number", 02468 data: &ip_rt_redirect_number, 02469 maxlen: sizeof(int), 02470 mode: 0644, 02471 proc_handler: &proc_dointvec, 02472 }, 02473 { 02474 ctl_name: NET_IPV4_ROUTE_REDIRECT_SILENCE, 02475 procname: "redirect_silence", 02476 data: &ip_rt_redirect_silence, 02477 maxlen:
sizeof(int), 02478 mode: 0644, 02479 proc_handler: &proc_dointvec, 02480 }, 02481 { 02482 ctl_name: NET_IPV4_ROUTE_ERROR_COST, 02483 procname: "error_cost", 02484 data: &ip_rt_error_cost, 02485 maxlen: sizeof(int), 02486 mode: 0644, 02487 proc_handler: &proc_dointvec, 02488 }, 02489 { 02490 ctl_name: NET_IPV4_ROUTE_ERROR_BURST, 02491 procname: "error_burst", 02492 data: &ip_rt_error_burst, 02493 maxlen: sizeof(int), 02494 mode: 0644, 02495 proc_handler: &proc_dointvec, 02496 }, 02497 { 02498 ctl_name: NET_IPV4_ROUTE_GC_ELASTICITY, 02499 procname: "gc_elasticity", 02500 data: &ip_rt_gc_elasticity, 02501 maxlen: sizeof(int), 02502 mode: 0644, 02503 proc_handler: &proc_dointvec, 02504 }, 02505 { 02506 ctl_name: NET_IPV4_ROUTE_MTU_EXPIRES, 02507 procname: "mtu_expires", 02508 data: &ip_rt_mtu_expires, 02509 maxlen: sizeof(int), 02510 mode: 0644, 02511 proc_handler: &proc_dointvec_jiffies, 02512 strategy: &sysctl_jiffies, 02513 }, 02514 { 02515 ctl_name: NET_IPV4_ROUTE_MIN_PMTU, 02516 procname: "min_pmtu", 02517 data: &ip_rt_min_pmtu, 02518 maxlen: sizeof(int), 02519 mode: 0644, 02520 proc_handler: &proc_dointvec, 02521 }, 02522 { 02523 ctl_name: NET_IPV4_ROUTE_MIN_ADVMSS, 02524 procname: "min_adv_mss", 02525 data: &ip_rt_min_advmss, 02526 maxlen: sizeof(int), 02527 mode: 0644, 02528 proc_handler: &proc_dointvec, 02529 }, 02530 { 02531 ctl_name: NET_IPV4_ROUTE_SECRET_INTERVAL, 02532 procname: "secret_interval", 02533 data: &ip_rt_secret_interval, 02534 maxlen: sizeof(int), 02535 mode: 0644, 02536 proc_handler: &proc_dointvec_jiffies, 02537 strategy: &sysctl_jiffies, 02538 }, 02539 { 0 } 02540 }; 02541 #endif 02542 02543 #ifdef CONFIG_NET_CLS_ROUTE 02544 struct ip_rt_acct *ip_rt_acct; 02545 02546 /* This code sucks. But you should have seen it before! --RR */ 02547 02548 /* IP route accounting ptr for this logical cpu number.
*/ 02549 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + cpu_logical_map(i) * 256) 02550 /* ip_rt_acct_read - proc read handler for the route accounting table. Serves a window of length bytes at byte offset into a table of 256 struct ip_rt_acct: the window is copied from CPU 0's table into buffer, then the same window of every other CPU (1 .. smp_num_cpus - 1) is added to it one u32 at a time. offset and length must both be multiples of 4, otherwise -EIO is returned. *eof is set when the window reaches or passes the end of the table. Returns the (possibly clamped) length, or 0 when offset is at or past the end of the table. data is not used. */ 02551 static int ip_rt_acct_read(char *buffer, char **start, off_t offset, 02552 int length, int *eof, void *data) 02553 { 02554 unsigned int i; 02555 /* Only 4-byte aligned offsets and lengths are accepted. */ 02556 if ((offset & 3) || (length & 3)) 02557 return -EIO; 02558 02559 if (offset >= sizeof(struct ip_rt_acct) * 256) { 02560 *eof = 1; 02561 return 0; 02562 } 02563 /* Clamp the window to the end of the table. */ 02564 if (offset + length >= sizeof(struct ip_rt_acct) * 256) { 02565 length = sizeof(struct ip_rt_acct) * 256 - offset; 02566 *eof = 1; 02567 } 02568 /* From here on offset counts u32 words rather than bytes. */ 02569 offset /= sizeof(u32); 02570 02571 if (length > 0) { 02572 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset; 02573 u32 *dst = (u32 *) buffer; 02574 02575 /* Copy first cpu. */ 02576 *start = buffer; 02577 memcpy(dst, src, length); 02578 02579 /* Add the other cpus in, one int at a time */ 02580 for (i = 1; i < smp_num_cpus; i++) { 02581 unsigned int j; 02582 02583 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset; 02584 02585 for (j = 0; j < length/4; j++) 02586 dst[j] += src[j]; 02587 } 02588 } 02589 return length; 02590 } 02591 #endif 02592 /* ip_rt_init - boot-time setup of the IPv4 routing cache. Seeds rt_hash_rnd from num_physpages and jiffies, allocates and zeroes ip_rt_acct (CONFIG_NET_CLS_ROUTE only), creates the "ip_dst_cache" slab cache, sizes and allocates rt_hash_table, initialises every bucket, derives gc_thresh and ip_rt_max_size from the bucket count, calls devinet_init() and ip_fib_init(), arms the periodic and secret timers and registers the proc entries. A failed allocation ends in panic(). */ 02593 void __init ip_rt_init(void) 02594 { 02595 int i, order, goal; 02596 02597 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ 02598 (jiffies ^ (jiffies >> 7))); 02599 02600 #ifdef CONFIG_NET_CLS_ROUTE 02601 for (order = 0; 02602 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++) 02603 /* NOTHING */; 02604 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order); 02605 if (!ip_rt_acct) 02606 panic("IP: failed to allocate ip_rt_acct\n"); 02607 memset(ip_rt_acct, 0, PAGE_SIZE << order); 02608 #endif 02609 02610 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", 02611 sizeof(struct rtable), 02612 0, SLAB_HWCACHE_ALIGN, 02613 NULL, NULL); 02614 02615 if (!ipv4_dst_ops.kmem_cachep) 02616 panic("IP: failed to allocate ip_dst_cache\n"); 02617 /* goal is the wanted hash table size in pages; order becomes the smallest value with (1 << order) >= goal. */ 02618 goal = num_physpages >> (26 - PAGE_SHIFT); 02619 02620 for (order = 0; (1UL << order) <
goal; order++) 02621 /* NOTHING */; 02622 /* Try the computed order first and step down by one order after each failed allocation. NOTE(review): because the loop condition is --order > 0, an order-0 allocation is never attempted as a fallback - confirm this is intended. */ 02623 do { 02624 rt_hash_mask = (1UL << order) * PAGE_SIZE / 02625 sizeof(struct rt_hash_bucket); /* Round the bucket count down to a power of two. */ 02626 while (rt_hash_mask & (rt_hash_mask - 1)) 02627 rt_hash_mask--; 02628 rt_hash_table = (struct rt_hash_bucket *) 02629 __get_free_pages(GFP_ATOMIC, order); 02630 } while (rt_hash_table == NULL && --order > 0); 02631 02632 if (!rt_hash_table) 02633 panic("Failed to allocate IP route cache hash table\n"); 02634 02635 printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n", 02636 rt_hash_mask, 02637 (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024); 02638 /* rt_hash_log becomes log2 of the bucket count. */ 02639 for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++) 02640 /* NOTHING */; 02641 /* Up to here rt_hash_mask held the bucket count; from now on it is the highest bucket index. */ 02642 rt_hash_mask--; 02643 for (i = 0; i <= rt_hash_mask; i++) { 02644 rt_hash_table[i].lock = RW_LOCK_UNLOCKED; 02645 rt_hash_table[i].chain = NULL; 02646 } 02647 02648 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); 02649 ip_rt_max_size = (rt_hash_mask + 1) * 16; 02650 02651 devinet_init(); 02652 ip_fib_init(); 02653 02654 rt_flush_timer.function = rt_run_flush; 02655 rt_periodic_timer.function = rt_check_expire; 02656 rt_secret_timer.function = rt_secret_rebuild; 02657 02658 /* All the timers, started at system startup tend 02659 to synchronize. Perturb it a bit. 02660 */ 02661 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval + 02662 ip_rt_gc_interval; 02663 add_timer(&rt_periodic_timer); 02664 02665 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval + 02666 ip_rt_secret_interval; 02667 add_timer(&rt_secret_timer); 02668 02669 proc_net_create ("rt_cache", 0, rt_cache_get_info); 02670 proc_net_create ("rt_cache_stat", 0, rt_cache_stat_get_info); 02671 #ifdef CONFIG_NET_CLS_ROUTE 02672 create_proc_read_entry("net/rt_acct", 0, 0, ip_rt_acct_read, NULL); 02673 #endif 02674 }

Generated on Wed Dec 1 21:25:32 2004 for Linux 2.4.23 Networking by doxygen 1.3.8