Main Page | Class List | File List | Class Members | File Members

tcp_ipv4.c

Go to the documentation of this file.
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.237.2.1 2002/01/15 08:49:49 davem Exp $
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					open_request handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
00048 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which 00049 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind 00050 * a single port at the same time. 00051 */ 00052 00053 #include <linux/config.h> 00054 00055 #include <linux/types.h> 00056 #include <linux/fcntl.h> 00057 #include <linux/random.h> 00058 #include <linux/cache.h> 00059 #include <linux/jhash.h> 00060 #include <linux/init.h> 00061 00062 #include <net/icmp.h> 00063 #include <net/tcp.h> 00064 #include <net/ipv6.h> 00065 #include <net/inet_common.h> 00066 00067 #include <linux/inet.h> 00068 #include <linux/stddef.h> 00069 #include <linux/ipsec.h> 00070 00071 extern int sysctl_ip_dynaddr; 00072 extern int sysctl_ip_default_ttl; 00073 int sysctl_tcp_tw_reuse = 0; 00074 int sysctl_tcp_low_latency = 0; 00075 00076 /* Check TCP sequence numbers in ICMP packets. */ 00077 #define ICMP_MIN_LENGTH 8 00078 00079 /* Socket used for sending RSTs */ 00080 static struct inode tcp_inode; 00081 static struct socket *tcp_socket=&tcp_inode.u.socket_i; 00082 00083 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, 00084 struct sk_buff *skb); 00085 00086 /* 00087 * ALL members must be initialised to prevent gcc-2.7.2.3 miscompilation 00088 */ 00089 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = { 00090 __tcp_ehash: NULL, 00091 __tcp_bhash: NULL, 00092 __tcp_bhash_size: 0, 00093 __tcp_ehash_size: 0, 00094 __tcp_listening_hash: { NULL, }, 00095 __tcp_lhash_lock: RW_LOCK_UNLOCKED, 00096 __tcp_lhash_users: ATOMIC_INIT(0), 00097 __tcp_lhash_wait: 00098 __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait), 00099 __tcp_portalloc_lock: SPIN_LOCK_UNLOCKED 00100 }; 00101 00102 /* 00103 * This array holds the first and last local port number. 
00104 * For high-usage systems, use sysctl to change this to 00105 * 32768-61000 00106 */ 00107 int sysctl_local_port_range[2] = { 1024, 4999 }; 00108 int tcp_port_rover = (1024 - 1); 00109 00110 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport, 00111 __u32 faddr, __u16 fport) 00112 { 00113 int h = ((laddr ^ lport) ^ (faddr ^ fport)); 00114 h ^= h>>16; 00115 h ^= h>>8; 00116 return h & (tcp_ehash_size - 1); 00117 } 00118 00119 static __inline__ int tcp_sk_hashfn(struct sock *sk) 00120 { 00121 __u32 laddr = sk->rcv_saddr; 00122 __u16 lport = sk->num; 00123 __u32 faddr = sk->daddr; 00124 __u16 fport = sk->dport; 00125 00126 return tcp_hashfn(laddr, lport, faddr, fport); 00127 } 00128 00129 /* Allocate and initialize a new TCP local port bind bucket. 00130 * The bindhash mutex for snum's hash chain must be held here. 00131 */ 00132 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head, 00133 unsigned short snum) 00134 { 00135 struct tcp_bind_bucket *tb; 00136 00137 tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC); 00138 if(tb != NULL) { 00139 tb->port = snum; 00140 tb->fastreuse = 0; 00141 tb->owners = NULL; 00142 if((tb->next = head->chain) != NULL) 00143 tb->next->pprev = &tb->next; 00144 head->chain = tb; 00145 tb->pprev = &head->chain; 00146 } 00147 return tb; 00148 } 00149 00150 /* Caller must disable local BH processing. 
*/ 00151 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child) 00152 { 00153 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(child->num)]; 00154 struct tcp_bind_bucket *tb; 00155 00156 spin_lock(&head->lock); 00157 tb = (struct tcp_bind_bucket *)sk->prev; 00158 if ((child->bind_next = tb->owners) != NULL) 00159 tb->owners->bind_pprev = &child->bind_next; 00160 tb->owners = child; 00161 child->bind_pprev = &tb->owners; 00162 child->prev = (struct sock *) tb; 00163 spin_unlock(&head->lock); 00164 } 00165 00166 inline void tcp_inherit_port(struct sock *sk, struct sock *child) 00167 { 00168 local_bh_disable(); 00169 __tcp_inherit_port(sk, child); 00170 local_bh_enable(); 00171 } 00172 00173 static inline void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, unsigned short snum) 00174 { 00175 sk->num = snum; 00176 if ((sk->bind_next = tb->owners) != NULL) 00177 tb->owners->bind_pprev = &sk->bind_next; 00178 tb->owners = sk; 00179 sk->bind_pprev = &tb->owners; 00180 sk->prev = (struct sock *) tb; 00181 } 00182 00183 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb) 00184 { 00185 struct sock *sk2 = tb->owners; 00186 int sk_reuse = sk->reuse; 00187 00188 for( ; sk2 != NULL; sk2 = sk2->bind_next) { 00189 if (sk != sk2 && 00190 sk2->reuse <= 1 && 00191 !ipv6_only_sock(sk2) && 00192 (!sk->bound_dev_if || 00193 !sk2->bound_dev_if || 00194 sk->bound_dev_if == sk2->bound_dev_if)) { 00195 if (!sk_reuse || 00196 !sk2->reuse || 00197 sk2->state == TCP_LISTEN) { 00198 if (!sk2->rcv_saddr || 00199 !sk->rcv_saddr || 00200 (sk2->rcv_saddr == sk->rcv_saddr)) 00201 break; 00202 } 00203 } 00204 } 00205 return sk2 != NULL; 00206 } 00207 00208 /* Obtain a reference to a local port for the given sock, 00209 * if snum is zero it means select any available local port. 
00210 */ 00211 static int tcp_v4_get_port(struct sock *sk, unsigned short snum) 00212 { 00213 struct tcp_bind_hashbucket *head; 00214 struct tcp_bind_bucket *tb; 00215 int ret; 00216 00217 local_bh_disable(); 00218 if (snum == 0) { 00219 int low = sysctl_local_port_range[0]; 00220 int high = sysctl_local_port_range[1]; 00221 int remaining = (high - low) + 1; 00222 int rover; 00223 00224 spin_lock(&tcp_portalloc_lock); 00225 rover = tcp_port_rover; 00226 do { rover++; 00227 if ((rover < low) || (rover > high)) 00228 rover = low; 00229 head = &tcp_bhash[tcp_bhashfn(rover)]; 00230 spin_lock(&head->lock); 00231 for (tb = head->chain; tb; tb = tb->next) 00232 if (tb->port == rover) 00233 goto next; 00234 break; 00235 next: 00236 spin_unlock(&head->lock); 00237 } while (--remaining > 0); 00238 tcp_port_rover = rover; 00239 spin_unlock(&tcp_portalloc_lock); 00240 00241 /* Exhausted local port range during search? */ 00242 ret = 1; 00243 if (remaining <= 0) 00244 goto fail; 00245 00246 /* OK, here is the one we will use. HEAD is 00247 * non-NULL and we hold it's mutex. 
00248 */ 00249 snum = rover; 00250 tb = NULL; 00251 } else { 00252 head = &tcp_bhash[tcp_bhashfn(snum)]; 00253 spin_lock(&head->lock); 00254 for (tb = head->chain; tb != NULL; tb = tb->next) 00255 if (tb->port == snum) 00256 break; 00257 } 00258 if (tb != NULL && tb->owners != NULL) { 00259 if (sk->reuse > 1) 00260 goto success; 00261 if (tb->fastreuse > 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) { 00262 goto success; 00263 } else { 00264 ret = 1; 00265 if (tcp_bind_conflict(sk, tb)) 00266 goto fail_unlock; 00267 } 00268 } 00269 ret = 1; 00270 if (tb == NULL && 00271 (tb = tcp_bucket_create(head, snum)) == NULL) 00272 goto fail_unlock; 00273 if (tb->owners == NULL) { 00274 if (sk->reuse && sk->state != TCP_LISTEN) 00275 tb->fastreuse = 1; 00276 else 00277 tb->fastreuse = 0; 00278 } else if (tb->fastreuse && 00279 ((sk->reuse == 0) || (sk->state == TCP_LISTEN))) 00280 tb->fastreuse = 0; 00281 success: 00282 if (sk->prev == NULL) 00283 tcp_bind_hash(sk, tb, snum); 00284 BUG_TRAP(sk->prev == (struct sock *) tb); 00285 ret = 0; 00286 00287 fail_unlock: 00288 spin_unlock(&head->lock); 00289 fail: 00290 local_bh_enable(); 00291 return ret; 00292 } 00293 00294 /* Get rid of any references to a local port held by the 00295 * given sock. 
00296 */ 00297 inline void __tcp_put_port(struct sock *sk) 00298 { 00299 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(sk->num)]; 00300 struct tcp_bind_bucket *tb; 00301 00302 spin_lock(&head->lock); 00303 tb = (struct tcp_bind_bucket *) sk->prev; 00304 if (sk->bind_next) 00305 sk->bind_next->bind_pprev = sk->bind_pprev; 00306 *(sk->bind_pprev) = sk->bind_next; 00307 sk->prev = NULL; 00308 sk->num = 0; 00309 if (tb->owners == NULL) { 00310 if (tb->next) 00311 tb->next->pprev = tb->pprev; 00312 *(tb->pprev) = tb->next; 00313 kmem_cache_free(tcp_bucket_cachep, tb); 00314 } 00315 spin_unlock(&head->lock); 00316 } 00317 00318 void tcp_put_port(struct sock *sk) 00319 { 00320 local_bh_disable(); 00321 __tcp_put_port(sk); 00322 local_bh_enable(); 00323 } 00324 00325 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP. 00326 * Look, when several writers sleep and reader wakes them up, all but one 00327 * immediately hit write lock and grab all the cpus. Exclusive sleep solves 00328 * this, _but_ remember, it adds useless work on UP machines (wake up each 00329 * exclusive lock release). It should be ifdefed really. 
00330 */ 00331 00332 void tcp_listen_wlock(void) 00333 { 00334 write_lock(&tcp_lhash_lock); 00335 00336 if (atomic_read(&tcp_lhash_users)) { 00337 DECLARE_WAITQUEUE(wait, current); 00338 00339 add_wait_queue_exclusive(&tcp_lhash_wait, &wait); 00340 for (;;) { 00341 set_current_state(TASK_UNINTERRUPTIBLE); 00342 if (atomic_read(&tcp_lhash_users) == 0) 00343 break; 00344 write_unlock_bh(&tcp_lhash_lock); 00345 schedule(); 00346 write_lock_bh(&tcp_lhash_lock); 00347 } 00348 00349 __set_current_state(TASK_RUNNING); 00350 remove_wait_queue(&tcp_lhash_wait, &wait); 00351 } 00352 } 00353 00354 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible) 00355 { 00356 struct sock **skp; 00357 rwlock_t *lock; 00358 00359 BUG_TRAP(sk->pprev==NULL); 00360 if(listen_possible && sk->state == TCP_LISTEN) { 00361 skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; 00362 lock = &tcp_lhash_lock; 00363 tcp_listen_wlock(); 00364 } else { 00365 skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))].chain; 00366 lock = &tcp_ehash[sk->hashent].lock; 00367 write_lock(lock); 00368 } 00369 if((sk->next = *skp) != NULL) 00370 (*skp)->pprev = &sk->next; 00371 *skp = sk; 00372 sk->pprev = skp; 00373 sock_prot_inc_use(sk->prot); 00374 write_unlock(lock); 00375 if (listen_possible && sk->state == TCP_LISTEN) 00376 wake_up(&tcp_lhash_wait); 00377 } 00378 00379 static void tcp_v4_hash(struct sock *sk) 00380 { 00381 if (sk->state != TCP_CLOSE) { 00382 local_bh_disable(); 00383 __tcp_v4_hash(sk, 1); 00384 local_bh_enable(); 00385 } 00386 } 00387 00388 void tcp_unhash(struct sock *sk) 00389 { 00390 rwlock_t *lock; 00391 00392 if (!sk->pprev) 00393 goto ende; 00394 00395 if (sk->state == TCP_LISTEN) { 00396 local_bh_disable(); 00397 tcp_listen_wlock(); 00398 lock = &tcp_lhash_lock; 00399 } else { 00400 struct tcp_ehash_bucket *head = &tcp_ehash[sk->hashent]; 00401 lock = &head->lock; 00402 write_lock_bh(&head->lock); 00403 } 00404 00405 if(sk->pprev) { 00406 if(sk->next) 00407 
sk->next->pprev = sk->pprev; 00408 *sk->pprev = sk->next; 00409 sk->pprev = NULL; 00410 sock_prot_dec_use(sk->prot); 00411 } 00412 write_unlock_bh(lock); 00413 00414 ende: 00415 if (sk->state == TCP_LISTEN) 00416 wake_up(&tcp_lhash_wait); 00417 } 00418 00419 /* Don't inline this cruft. Here are some nice properties to 00420 * exploit here. The BSD API does not allow a listening TCP 00421 * to specify the remote port nor the remote address for the 00422 * connection. So always assume those are both wildcarded 00423 * during the search since they can never be otherwise. 00424 */ 00425 static struct sock *__tcp_v4_lookup_listener(struct sock *sk, u32 daddr, unsigned short hnum, int dif) 00426 { 00427 struct sock *result = NULL; 00428 int score, hiscore; 00429 00430 hiscore=-1; 00431 for(; sk; sk = sk->next) { 00432 if(sk->num == hnum && !ipv6_only_sock(sk)) { 00433 __u32 rcv_saddr = sk->rcv_saddr; 00434 00435 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 00436 score = sk->family == PF_INET ? 1 : 0; 00437 #else 00438 score = 1; 00439 #endif 00440 if(rcv_saddr) { 00441 if (rcv_saddr != daddr) 00442 continue; 00443 score+=2; 00444 } 00445 if (sk->bound_dev_if) { 00446 if (sk->bound_dev_if != dif) 00447 continue; 00448 score+=2; 00449 } 00450 if (score == 5) 00451 return sk; 00452 if (score > hiscore) { 00453 hiscore = score; 00454 result = sk; 00455 } 00456 } 00457 } 00458 return result; 00459 } 00460 00461 /* Optimize the common listener case. 
*/ 00462 inline struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif) 00463 { 00464 struct sock *sk; 00465 00466 read_lock(&tcp_lhash_lock); 00467 sk = tcp_listening_hash[tcp_lhashfn(hnum)]; 00468 if (sk) { 00469 if (sk->num == hnum && 00470 sk->next == NULL && 00471 (!sk->rcv_saddr || sk->rcv_saddr == daddr) && 00472 (sk->family == PF_INET || !ipv6_only_sock(sk)) && 00473 !sk->bound_dev_if) 00474 goto sherry_cache; 00475 sk = __tcp_v4_lookup_listener(sk, daddr, hnum, dif); 00476 } 00477 if (sk) { 00478 sherry_cache: 00479 sock_hold(sk); 00480 } 00481 read_unlock(&tcp_lhash_lock); 00482 return sk; 00483 } 00484 00485 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so 00486 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM 00487 * 00488 * Local BH must be disabled here. 00489 */ 00490 00491 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport, 00492 u32 daddr, u16 hnum, int dif) 00493 { 00494 struct tcp_ehash_bucket *head; 00495 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) 00496 __u32 ports = TCP_COMBINED_PORTS(sport, hnum); 00497 struct sock *sk; 00498 int hash; 00499 00500 /* Optimize here for direct hit, only listening connections can 00501 * have wildcards anyways. 00502 */ 00503 hash = tcp_hashfn(daddr, hnum, saddr, sport); 00504 head = &tcp_ehash[hash]; 00505 read_lock(&head->lock); 00506 for(sk = head->chain; sk; sk = sk->next) { 00507 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) 00508 goto hit; /* You sunk my battleship! */ 00509 } 00510 00511 /* Must check for a TIME_WAIT'er before going to listener hash. 
*/ 00512 for(sk = (head + tcp_ehash_size)->chain; sk; sk = sk->next) 00513 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) 00514 goto hit; 00515 read_unlock(&head->lock); 00516 00517 return NULL; 00518 00519 hit: 00520 sock_hold(sk); 00521 read_unlock(&head->lock); 00522 return sk; 00523 } 00524 00525 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, 00526 u32 daddr, u16 hnum, int dif) 00527 { 00528 struct sock *sk; 00529 00530 sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif); 00531 00532 if (sk) 00533 return sk; 00534 00535 return tcp_v4_lookup_listener(daddr, hnum, dif); 00536 } 00537 00538 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) 00539 { 00540 struct sock *sk; 00541 00542 local_bh_disable(); 00543 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif); 00544 local_bh_enable(); 00545 00546 return sk; 00547 } 00548 00549 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) 00550 { 00551 return secure_tcp_sequence_number(skb->nh.iph->daddr, 00552 skb->nh.iph->saddr, 00553 skb->h.th->dest, 00554 skb->h.th->source); 00555 } 00556 00557 /* called with local bh disabled */ 00558 static int __tcp_v4_check_established(struct sock *sk, __u16 lport, 00559 struct tcp_tw_bucket **twp) 00560 { 00561 u32 daddr = sk->rcv_saddr; 00562 u32 saddr = sk->daddr; 00563 int dif = sk->bound_dev_if; 00564 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) 00565 __u32 ports = TCP_COMBINED_PORTS(sk->dport, lport); 00566 int hash = tcp_hashfn(daddr, lport, saddr, sk->dport); 00567 struct tcp_ehash_bucket *head = &tcp_ehash[hash]; 00568 struct sock *sk2, **skp; 00569 struct tcp_tw_bucket *tw; 00570 00571 write_lock(&head->lock); 00572 00573 /* Check TIME-WAIT sockets first. 
*/ 00574 for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL; 00575 skp = &sk2->next) { 00576 tw = (struct tcp_tw_bucket*)sk2; 00577 00578 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) { 00579 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); 00580 00581 /* With PAWS, it is safe from the viewpoint 00582 of data integrity. Even without PAWS it 00583 is safe provided sequence spaces do not 00584 overlap i.e. at data rates <= 80Mbit/sec. 00585 00586 Actually, the idea is close to VJ's one, 00587 only timestamp cache is held not per host, 00588 but per port pair and TW bucket is used 00589 as state holder. 00590 00591 If TW bucket has been already destroyed we 00592 fall back to VJ's scheme and use initial 00593 timestamp retrieved from peer table. 00594 */ 00595 if (tw->ts_recent_stamp && 00596 (!twp || (sysctl_tcp_tw_reuse && 00597 xtime.tv_sec - tw->ts_recent_stamp > 1))) { 00598 if ((tp->write_seq = tw->snd_nxt+65535+2) == 0) 00599 tp->write_seq = 1; 00600 tp->ts_recent = tw->ts_recent; 00601 tp->ts_recent_stamp = tw->ts_recent_stamp; 00602 sock_hold(sk2); 00603 skp = &head->chain; 00604 goto unique; 00605 } else 00606 goto not_unique; 00607 } 00608 } 00609 tw = NULL; 00610 00611 /* And established part... */ 00612 for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) { 00613 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) 00614 goto not_unique; 00615 } 00616 00617 unique: 00618 /* Must record num and sport now. Otherwise we will see 00619 * in hash table socket with a funny identity. */ 00620 sk->num = lport; 00621 sk->sport = htons(lport); 00622 BUG_TRAP(sk->pprev==NULL); 00623 if ((sk->next = *skp) != NULL) 00624 (*skp)->pprev = &sk->next; 00625 00626 *skp = sk; 00627 sk->pprev = skp; 00628 sk->hashent = hash; 00629 sock_prot_inc_use(sk->prot); 00630 write_unlock(&head->lock); 00631 00632 if (twp) { 00633 *twp = tw; 00634 NET_INC_STATS_BH(TimeWaitRecycled); 00635 } else if (tw) { 00636 /* Silly. Should hash-dance instead... 
*/ 00637 tcp_tw_deschedule(tw); 00638 tcp_timewait_kill(tw); 00639 NET_INC_STATS_BH(TimeWaitRecycled); 00640 00641 tcp_tw_put(tw); 00642 } 00643 00644 return 0; 00645 00646 not_unique: 00647 write_unlock(&head->lock); 00648 return -EADDRNOTAVAIL; 00649 } 00650 00651 /* 00652 * Bind a port for a connect operation and hash it. 00653 */ 00654 static int tcp_v4_hash_connect(struct sock *sk) 00655 { 00656 unsigned short snum = sk->num; 00657 struct tcp_bind_hashbucket *head; 00658 struct tcp_bind_bucket *tb; 00659 00660 if (snum == 0) { 00661 int rover; 00662 int low = sysctl_local_port_range[0]; 00663 int high = sysctl_local_port_range[1]; 00664 int remaining = (high - low) + 1; 00665 struct tcp_tw_bucket *tw = NULL; 00666 00667 local_bh_disable(); 00668 00669 /* TODO. Actually it is not so bad idea to remove 00670 * tcp_portalloc_lock before next submission to Linus. 00671 * As soon as we touch this place at all it is time to think. 00672 * 00673 * Now it protects single _advisory_ variable tcp_port_rover, 00674 * hence it is mostly useless. 00675 * Code will work nicely if we just delete it, but 00676 * I am afraid in contented case it will work not better or 00677 * even worse: another cpu just will hit the same bucket 00678 * and spin there. 00679 * So some cpu salt could remove both contention and 00680 * memory pingpong. Any ideas how to do this in a nice way? 00681 */ 00682 spin_lock(&tcp_portalloc_lock); 00683 rover = tcp_port_rover; 00684 00685 do { 00686 rover++; 00687 if ((rover < low) || (rover > high)) 00688 rover = low; 00689 head = &tcp_bhash[tcp_bhashfn(rover)]; 00690 spin_lock(&head->lock); 00691 00692 /* Does not bother with rcv_saddr checks, 00693 * because the established check is already 00694 * unique enough. 
00695 */ 00696 for (tb = head->chain; tb; tb = tb->next) { 00697 if (tb->port == rover) { 00698 BUG_TRAP(tb->owners != NULL); 00699 if (tb->fastreuse >= 0) 00700 goto next_port; 00701 if (!__tcp_v4_check_established(sk, rover, &tw)) 00702 goto ok; 00703 goto next_port; 00704 } 00705 } 00706 00707 tb = tcp_bucket_create(head, rover); 00708 if (!tb) { 00709 spin_unlock(&head->lock); 00710 break; 00711 } 00712 tb->fastreuse = -1; 00713 goto ok; 00714 00715 next_port: 00716 spin_unlock(&head->lock); 00717 } while (--remaining > 0); 00718 tcp_port_rover = rover; 00719 spin_unlock(&tcp_portalloc_lock); 00720 00721 local_bh_enable(); 00722 00723 return -EADDRNOTAVAIL; 00724 00725 ok: 00726 /* All locks still held and bhs disabled */ 00727 tcp_port_rover = rover; 00728 spin_unlock(&tcp_portalloc_lock); 00729 00730 tcp_bind_hash(sk, tb, rover); 00731 if (!sk->pprev) { 00732 sk->sport = htons(rover); 00733 __tcp_v4_hash(sk, 0); 00734 } 00735 spin_unlock(&head->lock); 00736 00737 if (tw) { 00738 tcp_tw_deschedule(tw); 00739 tcp_timewait_kill(tw); 00740 tcp_tw_put(tw); 00741 } 00742 00743 local_bh_enable(); 00744 return 0; 00745 } 00746 00747 head = &tcp_bhash[tcp_bhashfn(snum)]; 00748 tb = (struct tcp_bind_bucket *)sk->prev; 00749 spin_lock_bh(&head->lock); 00750 if (tb->owners == sk && sk->bind_next == NULL) { 00751 __tcp_v4_hash(sk, 0); 00752 spin_unlock_bh(&head->lock); 00753 return 0; 00754 } else { 00755 int ret; 00756 spin_unlock(&head->lock); 00757 /* No definite answer... Walk to established hash table */ 00758 ret = __tcp_v4_check_established(sk, snum, NULL); 00759 local_bh_enable(); 00760 return ret; 00761 } 00762 } 00763 00764 /* This will initiate an outgoing connection. 
*/ 00765 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 00766 { 00767 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); 00768 struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; 00769 struct rtable *rt; 00770 u32 daddr, nexthop; 00771 int tmp; 00772 int err; 00773 00774 if (addr_len < sizeof(struct sockaddr_in)) 00775 return(-EINVAL); 00776 00777 if (usin->sin_family != AF_INET) 00778 return(-EAFNOSUPPORT); 00779 00780 nexthop = daddr = usin->sin_addr.s_addr; 00781 if (sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) { 00782 if (daddr == 0) 00783 return -EINVAL; 00784 nexthop = sk->protinfo.af_inet.opt->faddr; 00785 } 00786 00787 tmp = ip_route_connect(&rt, nexthop, sk->saddr, 00788 RT_CONN_FLAGS(sk), sk->bound_dev_if); 00789 if (tmp < 0) 00790 return tmp; 00791 00792 if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) { 00793 ip_rt_put(rt); 00794 return -ENETUNREACH; 00795 } 00796 00797 __sk_dst_set(sk, &rt->u.dst); 00798 sk->route_caps = rt->u.dst.dev->features; 00799 00800 if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr) 00801 daddr = rt->rt_dst; 00802 00803 if (!sk->saddr) 00804 sk->saddr = rt->rt_src; 00805 sk->rcv_saddr = sk->saddr; 00806 00807 if (tp->ts_recent_stamp && sk->daddr != daddr) { 00808 /* Reset inherited state */ 00809 tp->ts_recent = 0; 00810 tp->ts_recent_stamp = 0; 00811 tp->write_seq = 0; 00812 } 00813 00814 if (sysctl_tcp_tw_recycle && 00815 !tp->ts_recent_stamp && 00816 rt->rt_dst == daddr) { 00817 struct inet_peer *peer = rt_get_peer(rt); 00818 00819 /* VJ's idea. We save last timestamp seen from 00820 * the destination in peer table, when entering state TIME-WAIT 00821 * and initialize ts_recent from it, when trying new connection. 
00822 */ 00823 00824 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) { 00825 tp->ts_recent_stamp = peer->tcp_ts_stamp; 00826 tp->ts_recent = peer->tcp_ts; 00827 } 00828 } 00829 00830 sk->dport = usin->sin_port; 00831 sk->daddr = daddr; 00832 00833 tp->ext_header_len = 0; 00834 if (sk->protinfo.af_inet.opt) 00835 tp->ext_header_len = sk->protinfo.af_inet.opt->optlen; 00836 00837 tp->mss_clamp = 536; 00838 00839 /* Socket identity is still unknown (sport may be zero). 00840 * However we set state to SYN-SENT and not releasing socket 00841 * lock select source port, enter ourselves into the hash tables and 00842 * complete initalization after this. 00843 */ 00844 tcp_set_state(sk, TCP_SYN_SENT); 00845 err = tcp_v4_hash_connect(sk); 00846 if (err) 00847 goto failure; 00848 00849 if (!tp->write_seq) 00850 tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr, 00851 sk->sport, usin->sin_port); 00852 00853 sk->protinfo.af_inet.id = tp->write_seq^jiffies; 00854 00855 err = tcp_connect(sk); 00856 if (err) 00857 goto failure; 00858 00859 return 0; 00860 00861 failure: 00862 tcp_set_state(sk, TCP_CLOSE); 00863 __sk_dst_reset(sk); 00864 sk->route_caps = 0; 00865 sk->dport = 0; 00866 return err; 00867 } 00868 00869 static __inline__ int tcp_v4_iif(struct sk_buff *skb) 00870 { 00871 return ((struct rtable*)skb->dst)->rt_iif; 00872 } 00873 00874 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd) 00875 { 00876 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1)); 00877 } 00878 00879 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp, 00880 struct open_request ***prevp, 00881 __u16 rport, 00882 __u32 raddr, __u32 laddr) 00883 { 00884 struct tcp_listen_opt *lopt = tp->listen_opt; 00885 struct open_request *req, **prev; 00886 00887 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)]; 00888 (req = *prev) != NULL; 00889 prev = &req->dl_next) { 00890 if (req->rmt_port == rport && 00891 
req->af.v4_req.rmt_addr == raddr && 00892 req->af.v4_req.loc_addr == laddr && 00893 TCP_INET_FAMILY(req->class->family)) { 00894 BUG_TRAP(req->sk == NULL); 00895 *prevp = prev; 00896 return req; 00897 } 00898 } 00899 00900 return NULL; 00901 } 00902 00903 static void tcp_v4_synq_add(struct sock *sk, struct open_request *req) 00904 { 00905 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; 00906 struct tcp_listen_opt *lopt = tp->listen_opt; 00907 u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd); 00908 00909 req->expires = jiffies + TCP_TIMEOUT_INIT; 00910 req->retrans = 0; 00911 req->sk = NULL; 00912 req->dl_next = lopt->syn_table[h]; 00913 00914 write_lock(&tp->syn_wait_lock); 00915 lopt->syn_table[h] = req; 00916 write_unlock(&tp->syn_wait_lock); 00917 00918 tcp_synq_added(sk); 00919 } 00920 00921 00922 /* 00923 * This routine does path mtu discovery as defined in RFC1191. 00924 */ 00925 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu) 00926 { 00927 struct dst_entry *dst; 00928 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; 00929 00930 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs 00931 * send out by Linux are always <576bytes so they should go through 00932 * unfragmented). 00933 */ 00934 if (sk->state == TCP_LISTEN) 00935 return; 00936 00937 /* We don't check in the destentry if pmtu discovery is forbidden 00938 * on this route. We just assume that no packet_to_big packets 00939 * are send back when pmtu discovery is not active. 00940 * There is a small race when the user changes this flag in the 00941 * route, but I think that's acceptable. 00942 */ 00943 if ((dst = __sk_dst_check(sk, 0)) == NULL) 00944 return; 00945 00946 ip_rt_update_pmtu(dst, mtu); 00947 00948 /* Something is about to be wrong... Remember soft error 00949 * for the case, if this connection will not able to recover. 
00950 */ 00951 if (mtu < dst->pmtu && ip_dont_fragment(sk, dst)) 00952 sk->err_soft = EMSGSIZE; 00953 00954 if (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT && 00955 tp->pmtu_cookie > dst->pmtu) { 00956 tcp_sync_mss(sk, dst->pmtu); 00957 00958 /* Resend the TCP packet because it's 00959 * clear that the old packet has been 00960 * dropped. This is the new "fast" path mtu 00961 * discovery. 00962 */ 00963 tcp_simple_retransmit(sk); 00964 } /* else let the usual retransmit timer handle it */ 00965 } 00966 00967 /* 00968 * This routine is called by the ICMP module when it gets some 00969 * sort of error condition. If err < 0 then the socket should 00970 * be closed and the error returned to the user. If err > 0 00971 * it's just the icmp type << 8 | icmp code. After adjustment 00972 * header points to the first 8 bytes of the tcp header. We need 00973 * to find the appropriate port. 00974 * 00975 * The locking strategy used here is very "optimistic". When 00976 * someone else accesses the socket the ICMP is just dropped 00977 * and for some paths there is no check at all. 00978 * A more general error queue to queue errors for later handling 00979 * is probably better. 
00980 * 00981 */ 00982 00983 void tcp_v4_err(struct sk_buff *skb, u32 info) 00984 { 00985 struct iphdr *iph = (struct iphdr*)skb->data; 00986 struct tcphdr *th = (struct tcphdr*)(skb->data+(iph->ihl<<2)); 00987 struct tcp_opt *tp; 00988 int type = skb->h.icmph->type; 00989 int code = skb->h.icmph->code; 00990 struct sock *sk; 00991 __u32 seq; 00992 int err; 00993 00994 if (skb->len < (iph->ihl << 2) + 8) { 00995 ICMP_INC_STATS_BH(IcmpInErrors); 00996 return; 00997 } 00998 00999 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, tcp_v4_iif(skb)); 01000 if (sk == NULL) { 01001 ICMP_INC_STATS_BH(IcmpInErrors); 01002 return; 01003 } 01004 if (sk->state == TCP_TIME_WAIT) { 01005 tcp_tw_put((struct tcp_tw_bucket*)sk); 01006 return; 01007 } 01008 01009 bh_lock_sock(sk); 01010 /* If too many ICMPs get dropped on busy 01011 * servers this needs to be solved differently. 01012 */ 01013 if (sk->lock.users != 0) 01014 NET_INC_STATS_BH(LockDroppedIcmps); 01015 01016 if (sk->state == TCP_CLOSE) 01017 goto out; 01018 01019 tp = &sk->tp_pinfo.af_tcp; 01020 seq = ntohl(th->seq); 01021 if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) { 01022 NET_INC_STATS(OutOfWindowIcmps); 01023 goto out; 01024 } 01025 01026 switch (type) { 01027 case ICMP_SOURCE_QUENCH: 01028 /* This is deprecated, but if someone generated it, 01029 * we have no reasons to ignore it. 
01030 */ 01031 if (sk->lock.users == 0) 01032 tcp_enter_cwr(tp); 01033 goto out; 01034 case ICMP_PARAMETERPROB: 01035 err = EPROTO; 01036 break; 01037 case ICMP_DEST_UNREACH: 01038 if (code > NR_ICMP_UNREACH) 01039 goto out; 01040 01041 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 01042 if (sk->lock.users == 0) 01043 do_pmtu_discovery(sk, iph, info); 01044 goto out; 01045 } 01046 01047 err = icmp_err_convert[code].errno; 01048 break; 01049 case ICMP_TIME_EXCEEDED: 01050 err = EHOSTUNREACH; 01051 break; 01052 default: 01053 goto out; 01054 } 01055 01056 switch (sk->state) { 01057 struct open_request *req, **prev; 01058 case TCP_LISTEN: 01059 if (sk->lock.users != 0) 01060 goto out; 01061 01062 req = tcp_v4_search_req(tp, &prev, 01063 th->dest, 01064 iph->daddr, iph->saddr); 01065 if (!req) 01066 goto out; 01067 01068 /* ICMPs are not backlogged, hence we cannot get 01069 an established socket here. 01070 */ 01071 BUG_TRAP(req->sk == NULL); 01072 01073 if (seq != req->snt_isn) { 01074 NET_INC_STATS_BH(OutOfWindowIcmps); 01075 goto out; 01076 } 01077 01078 /* 01079 * Still in SYN_RECV, just remove it silently. 01080 * There is no good way to pass the error to the newly 01081 * created socket, and POSIX does not want network 01082 * errors returned from accept(). 01083 */ 01084 tcp_synq_drop(sk, req, prev); 01085 goto out; 01086 01087 case TCP_SYN_SENT: 01088 case TCP_SYN_RECV: /* Cannot happen. 01089 It can f.e. if SYNs crossed. 01090 */ 01091 if (sk->lock.users == 0) { 01092 TCP_INC_STATS_BH(TcpAttemptFails); 01093 sk->err = err; 01094 01095 sk->error_report(sk); 01096 01097 tcp_done(sk); 01098 } else { 01099 sk->err_soft = err; 01100 } 01101 goto out; 01102 } 01103 01104 /* If we've already connected we will keep trying 01105 * until we time out, or the user gives up. 01106 * 01107 * rfc1122 4.2.3.9 allows to consider as hard errors 01108 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, 01109 * but it is obsoleted by pmtu discovery). 
01110 * 01111 * Note, that in modern internet, where routing is unreliable 01112 * and in each dark corner broken firewalls sit, sending random 01113 * errors ordered by their masters even this two messages finally lose 01114 * their original sense (even Linux sends invalid PORT_UNREACHs) 01115 * 01116 * Now we are in compliance with RFCs. 01117 * --ANK (980905) 01118 */ 01119 01120 if (sk->lock.users == 0 && sk->protinfo.af_inet.recverr) { 01121 sk->err = err; 01122 sk->error_report(sk); 01123 } else { /* Only an error on timeout */ 01124 sk->err_soft = err; 01125 } 01126 01127 out: 01128 bh_unlock_sock(sk); 01129 sock_put(sk); 01130 } 01131 01132 /* This routine computes an IPv4 TCP checksum. */ 01133 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, 01134 struct sk_buff *skb) 01135 { 01136 if (skb->ip_summed == CHECKSUM_HW) { 01137 th->check = ~tcp_v4_check(th, len, sk->saddr, sk->daddr, 0); 01138 skb->csum = offsetof(struct tcphdr, check); 01139 } else { 01140 th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr, 01141 csum_partial((char *)th, th->doff<<2, skb->csum)); 01142 } 01143 } 01144 01145 /* 01146 * This routine will send an RST to the other tcp. 01147 * 01148 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) 01149 * for reset. 01150 * Answer: if a packet caused RST, it is not for a socket 01151 * existing in our system, if it is matched to a socket, 01152 * it is just duplicate segment or bug in other side's TCP. 01153 * So that we build reply only basing on parameters 01154 * arrived with segment. 01155 * Exception: precedence violation. We do not implement it in any case. 01156 */ 01157 01158 static void tcp_v4_send_reset(struct sk_buff *skb) 01159 { 01160 struct tcphdr *th = skb->h.th; 01161 struct tcphdr rth; 01162 struct ip_reply_arg arg; 01163 01164 /* Never send a reset in response to a reset. 
*/ 01165 if (th->rst) 01166 return; 01167 01168 if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL) 01169 return; 01170 01171 /* Swap the send and the receive. */ 01172 memset(&rth, 0, sizeof(struct tcphdr)); 01173 rth.dest = th->source; 01174 rth.source = th->dest; 01175 rth.doff = sizeof(struct tcphdr)/4; 01176 rth.rst = 1; 01177 01178 if (th->ack) { 01179 rth.seq = th->ack_seq; 01180 } else { 01181 rth.ack = 1; 01182 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin 01183 + skb->len - (th->doff<<2)); 01184 } 01185 01186 memset(&arg, 0, sizeof arg); 01187 arg.iov[0].iov_base = (unsigned char *)&rth; 01188 arg.iov[0].iov_len = sizeof rth; 01189 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr, 01190 skb->nh.iph->saddr, /*XXX*/ 01191 sizeof(struct tcphdr), 01192 IPPROTO_TCP, 01193 0); 01194 arg.n_iov = 1; 01195 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 01196 01197 tcp_socket->sk->protinfo.af_inet.ttl = sysctl_ip_default_ttl; 01198 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth); 01199 01200 TCP_INC_STATS_BH(TcpOutSegs); 01201 TCP_INC_STATS_BH(TcpOutRsts); 01202 } 01203 01204 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 01205 outside socket context is ugly, certainly. What can I do? 
01206 */ 01207 01208 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts) 01209 { 01210 struct tcphdr *th = skb->h.th; 01211 struct { 01212 struct tcphdr th; 01213 u32 tsopt[3]; 01214 } rep; 01215 struct ip_reply_arg arg; 01216 01217 memset(&rep.th, 0, sizeof(struct tcphdr)); 01218 memset(&arg, 0, sizeof arg); 01219 01220 arg.iov[0].iov_base = (unsigned char *)&rep; 01221 arg.iov[0].iov_len = sizeof(rep.th); 01222 arg.n_iov = 1; 01223 if (ts) { 01224 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | 01225 (TCPOPT_NOP << 16) | 01226 (TCPOPT_TIMESTAMP << 8) | 01227 TCPOLEN_TIMESTAMP); 01228 rep.tsopt[1] = htonl(tcp_time_stamp); 01229 rep.tsopt[2] = htonl(ts); 01230 arg.iov[0].iov_len = sizeof(rep); 01231 } 01232 01233 /* Swap the send and the receive. */ 01234 rep.th.dest = th->source; 01235 rep.th.source = th->dest; 01236 rep.th.doff = arg.iov[0].iov_len/4; 01237 rep.th.seq = htonl(seq); 01238 rep.th.ack_seq = htonl(ack); 01239 rep.th.ack = 1; 01240 rep.th.window = htons(win); 01241 01242 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr, 01243 skb->nh.iph->saddr, /*XXX*/ 01244 arg.iov[0].iov_len, 01245 IPPROTO_TCP, 01246 0); 01247 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 01248 01249 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len); 01250 01251 TCP_INC_STATS_BH(TcpOutSegs); 01252 } 01253 01254 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 01255 { 01256 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; 01257 01258 tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt, 01259 tw->rcv_wnd>>tw->rcv_wscale, tw->ts_recent); 01260 01261 tcp_tw_put(tw); 01262 } 01263 01264 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req) 01265 { 01266 tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd, 01267 req->ts_recent); 01268 } 01269 01270 static struct dst_entry* tcp_v4_route_req(struct sock *sk, struct open_request *req) 01271 { 01272 struct rtable *rt; 01273 struct 
ip_options *opt; 01274 01275 opt = req->af.v4_req.opt; 01276 if(ip_route_output(&rt, ((opt && opt->srr) ? 01277 opt->faddr : 01278 req->af.v4_req.rmt_addr), 01279 req->af.v4_req.loc_addr, 01280 RT_CONN_FLAGS(sk), sk->bound_dev_if)) { 01281 IP_INC_STATS_BH(IpOutNoRoutes); 01282 return NULL; 01283 } 01284 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { 01285 ip_rt_put(rt); 01286 IP_INC_STATS_BH(IpOutNoRoutes); 01287 return NULL; 01288 } 01289 return &rt->u.dst; 01290 } 01291 01292 /* 01293 * Send a SYN-ACK after having received an ACK. 01294 * This still operates on a open_request only, not on a big 01295 * socket. 01296 */ 01297 static int tcp_v4_send_synack(struct sock *sk, struct open_request *req, 01298 struct dst_entry *dst) 01299 { 01300 int err = -1; 01301 struct sk_buff * skb; 01302 01303 /* First, grab a route. */ 01304 if (dst == NULL && 01305 (dst = tcp_v4_route_req(sk, req)) == NULL) 01306 goto out; 01307 01308 skb = tcp_make_synack(sk, dst, req); 01309 01310 if (skb) { 01311 struct tcphdr *th = skb->h.th; 01312 01313 th->check = tcp_v4_check(th, skb->len, 01314 req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr, 01315 csum_partial((char *)th, skb->len, skb->csum)); 01316 01317 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr, 01318 req->af.v4_req.rmt_addr, req->af.v4_req.opt); 01319 if (err == NET_XMIT_CN) 01320 err = 0; 01321 } 01322 01323 out: 01324 dst_release(dst); 01325 return err; 01326 } 01327 01328 /* 01329 * IPv4 open_request destructor. 01330 */ 01331 static void tcp_v4_or_free(struct open_request *req) 01332 { 01333 if (req->af.v4_req.opt) 01334 kfree(req->af.v4_req.opt); 01335 } 01336 01337 static inline void syn_flood_warning(struct sk_buff *skb) 01338 { 01339 static unsigned long warntime; 01340 01341 if (jiffies - warntime > HZ*60) { 01342 warntime = jiffies; 01343 printk(KERN_INFO 01344 "possible SYN flooding on port %d. 
Sending cookies.\n", 01345 ntohs(skb->h.th->dest)); 01346 } 01347 } 01348 01349 /* 01350 * Save and compile IPv4 options into the open_request if needed. 01351 */ 01352 static inline struct ip_options * 01353 tcp_v4_save_options(struct sock *sk, struct sk_buff *skb) 01354 { 01355 struct ip_options *opt = &(IPCB(skb)->opt); 01356 struct ip_options *dopt = NULL; 01357 01358 if (opt && opt->optlen) { 01359 int opt_size = optlength(opt); 01360 dopt = kmalloc(opt_size, GFP_ATOMIC); 01361 if (dopt) { 01362 if (ip_options_echo(dopt, skb)) { 01363 kfree(dopt); 01364 dopt = NULL; 01365 } 01366 } 01367 } 01368 return dopt; 01369 } 01370 01371 /* 01372 * Maximum number of SYN_RECV sockets in queue per LISTEN socket. 01373 * One SYN_RECV socket costs about 80bytes on a 32bit machine. 01374 * It would be better to replace it with a global counter for all sockets 01375 * but then some measure against one socket starving all other sockets 01376 * would be needed. 01377 * 01378 * It was 128 by default. Experiments with real servers show, that 01379 * it is absolutely not enough even at 100conn/sec. 256 cures most 01380 * of problems. This value is adjusted to 128 for very small machines 01381 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb). 01382 * Further increasing requires to change hash table size. 
01383 */ 01384 int sysctl_max_syn_backlog = 256; 01385 01386 struct or_calltable or_ipv4 = { 01387 PF_INET, 01388 tcp_v4_send_synack, 01389 tcp_v4_or_send_ack, 01390 tcp_v4_or_free, 01391 tcp_v4_send_reset 01392 }; 01393 01394 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 01395 { 01396 struct tcp_opt tp; 01397 struct open_request *req; 01398 __u32 saddr = skb->nh.iph->saddr; 01399 __u32 daddr = skb->nh.iph->daddr; 01400 __u32 isn = TCP_SKB_CB(skb)->when; 01401 struct dst_entry *dst = NULL; 01402 #ifdef CONFIG_SYN_COOKIES 01403 int want_cookie = 0; 01404 #else 01405 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */ 01406 #endif 01407 01408 /* Never answer to SYNs send to broadcast or multicast */ 01409 if (((struct rtable *)skb->dst)->rt_flags & 01410 (RTCF_BROADCAST|RTCF_MULTICAST)) 01411 goto drop; 01412 01413 /* TW buckets are converted to open requests without 01414 * limitations, they conserve resources and peer is 01415 * evidently real one. 01416 */ 01417 if (tcp_synq_is_full(sk) && !isn) { 01418 #ifdef CONFIG_SYN_COOKIES 01419 if (sysctl_tcp_syncookies) { 01420 want_cookie = 1; 01421 } else 01422 #endif 01423 goto drop; 01424 } 01425 01426 /* Accept backlog is full. If we have already queued enough 01427 * of warm entries in syn queue, drop request. It is better than 01428 * clogging syn queue with openreqs with exponentially increasing 01429 * timeout. 
01430 */ 01431 if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) 01432 goto drop; 01433 01434 req = tcp_openreq_alloc(); 01435 if (req == NULL) 01436 goto drop; 01437 01438 tcp_clear_options(&tp); 01439 tp.mss_clamp = 536; 01440 tp.user_mss = sk->tp_pinfo.af_tcp.user_mss; 01441 01442 tcp_parse_options(skb, &tp, 0); 01443 01444 if (want_cookie) { 01445 tcp_clear_options(&tp); 01446 tp.saw_tstamp = 0; 01447 } 01448 01449 if (tp.saw_tstamp && tp.rcv_tsval == 0) { 01450 /* Some OSes (unknown ones, but I see them on web server, which 01451 * contains information interesting only for windows' 01452 * users) do not send their stamp in SYN. It is easy case. 01453 * We simply do not advertise TS support. 01454 */ 01455 tp.saw_tstamp = 0; 01456 tp.tstamp_ok = 0; 01457 } 01458 tp.tstamp_ok = tp.saw_tstamp; 01459 01460 tcp_openreq_init(req, &tp, skb); 01461 01462 req->af.v4_req.loc_addr = daddr; 01463 req->af.v4_req.rmt_addr = saddr; 01464 req->af.v4_req.opt = tcp_v4_save_options(sk, skb); 01465 req->class = &or_ipv4; 01466 if (!want_cookie) 01467 TCP_ECN_create_request(req, skb->h.th); 01468 01469 if (want_cookie) { 01470 #ifdef CONFIG_SYN_COOKIES 01471 syn_flood_warning(skb); 01472 #endif 01473 isn = cookie_v4_init_sequence(sk, skb, &req->mss); 01474 } else if (isn == 0) { 01475 struct inet_peer *peer = NULL; 01476 01477 /* VJ's idea. We save last timestamp seen 01478 * from the destination in peer table, when entering 01479 * state TIME-WAIT, and check against it before 01480 * accepting new connection request. 01481 * 01482 * If "isn" is not zero, this request hit alive 01483 * timewait bucket, so that all the necessary checks 01484 * are made in the function processing timewait state. 
01485 */ 01486 if (tp.saw_tstamp && 01487 sysctl_tcp_tw_recycle && 01488 (dst = tcp_v4_route_req(sk, req)) != NULL && 01489 (peer = rt_get_peer((struct rtable*)dst)) != NULL && 01490 peer->v4daddr == saddr) { 01491 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL && 01492 (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) { 01493 NET_INC_STATS_BH(PAWSPassiveRejected); 01494 dst_release(dst); 01495 goto drop_and_free; 01496 } 01497 } 01498 /* Kill the following clause, if you dislike this way. */ 01499 else if (!sysctl_tcp_syncookies && 01500 (sysctl_max_syn_backlog - tcp_synq_len(sk) 01501 < (sysctl_max_syn_backlog>>2)) && 01502 (!peer || !peer->tcp_ts_stamp) && 01503 (!dst || !dst->rtt)) { 01504 /* Without syncookies last quarter of 01505 * backlog is filled with destinations, proven to be alive. 01506 * It means that we continue to communicate 01507 * to destinations, already remembered 01508 * to the moment of synflood. 01509 */ 01510 NETDEBUG(if (net_ratelimit()) \ 01511 printk(KERN_DEBUG "TCP: drop open request from %u.%u.%u.%u/%u\n", \ 01512 NIPQUAD(saddr), ntohs(skb->h.th->source))); 01513 dst_release(dst); 01514 goto drop_and_free; 01515 } 01516 01517 isn = tcp_v4_init_sequence(sk, skb); 01518 } 01519 req->snt_isn = isn; 01520 01521 if (tcp_v4_send_synack(sk, req, dst)) 01522 goto drop_and_free; 01523 01524 if (want_cookie) { 01525 tcp_openreq_free(req); 01526 } else { 01527 tcp_v4_synq_add(sk, req); 01528 } 01529 return 0; 01530 01531 drop_and_free: 01532 tcp_openreq_free(req); 01533 drop: 01534 TCP_INC_STATS_BH(TcpAttemptFails); 01535 return 0; 01536 } 01537 01538 01539 /* 01540 * The three way handshake has completed - we got a valid synack - 01541 * now create the new socket. 
01542 */ 01543 struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, 01544 struct open_request *req, 01545 struct dst_entry *dst) 01546 { 01547 struct tcp_opt *newtp; 01548 struct sock *newsk; 01549 01550 if (tcp_acceptq_is_full(sk)) 01551 goto exit_overflow; 01552 01553 if (dst == NULL && 01554 (dst = tcp_v4_route_req(sk, req)) == NULL) 01555 goto exit; 01556 01557 newsk = tcp_create_openreq_child(sk, req, skb); 01558 if (!newsk) 01559 goto exit; 01560 01561 newsk->dst_cache = dst; 01562 newsk->route_caps = dst->dev->features; 01563 01564 newtp = &(newsk->tp_pinfo.af_tcp); 01565 newsk->daddr = req->af.v4_req.rmt_addr; 01566 newsk->saddr = req->af.v4_req.loc_addr; 01567 newsk->rcv_saddr = req->af.v4_req.loc_addr; 01568 newsk->protinfo.af_inet.opt = req->af.v4_req.opt; 01569 req->af.v4_req.opt = NULL; 01570 newsk->protinfo.af_inet.mc_index = tcp_v4_iif(skb); 01571 newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl; 01572 newtp->ext_header_len = 0; 01573 if (newsk->protinfo.af_inet.opt) 01574 newtp->ext_header_len = newsk->protinfo.af_inet.opt->optlen; 01575 newsk->protinfo.af_inet.id = newtp->write_seq^jiffies; 01576 01577 tcp_sync_mss(newsk, dst->pmtu); 01578 newtp->advmss = dst->advmss; 01579 tcp_initialize_rcv_mss(newsk); 01580 01581 __tcp_v4_hash(newsk, 0); 01582 __tcp_inherit_port(sk, newsk); 01583 01584 return newsk; 01585 01586 exit_overflow: 01587 NET_INC_STATS_BH(ListenOverflows); 01588 exit: 01589 NET_INC_STATS_BH(ListenDrops); 01590 dst_release(dst); 01591 return NULL; 01592 } 01593 01594 static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb) 01595 { 01596 struct open_request *req, **prev; 01597 struct tcphdr *th = skb->h.th; 01598 struct iphdr *iph = skb->nh.iph; 01599 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); 01600 struct sock *nsk; 01601 01602 /* Find possible connection requests. 
*/ 01603 req = tcp_v4_search_req(tp, &prev, 01604 th->source, 01605 iph->saddr, iph->daddr); 01606 if (req) 01607 return tcp_check_req(sk, skb, req, prev); 01608 01609 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr, 01610 th->source, 01611 skb->nh.iph->daddr, 01612 ntohs(th->dest), 01613 tcp_v4_iif(skb)); 01614 01615 if (nsk) { 01616 if (nsk->state != TCP_TIME_WAIT) { 01617 bh_lock_sock(nsk); 01618 return nsk; 01619 } 01620 tcp_tw_put((struct tcp_tw_bucket*)nsk); 01621 return NULL; 01622 } 01623 01624 #ifdef CONFIG_SYN_COOKIES 01625 if (!th->rst && !th->syn && th->ack) 01626 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); 01627 #endif 01628 return sk; 01629 } 01630 01631 static int tcp_v4_checksum_init(struct sk_buff *skb) 01632 { 01633 if (skb->ip_summed == CHECKSUM_HW) { 01634 skb->ip_summed = CHECKSUM_UNNECESSARY; 01635 if (!tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr, 01636 skb->nh.iph->daddr,skb->csum)) 01637 return 0; 01638 01639 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "hw tcp v4 csum failed\n")); 01640 skb->ip_summed = CHECKSUM_NONE; 01641 } 01642 if (skb->len <= 76) { 01643 if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr, 01644 skb->nh.iph->daddr, 01645 skb_checksum(skb, 0, skb->len, 0))) 01646 return -1; 01647 skb->ip_summed = CHECKSUM_UNNECESSARY; 01648 } else { 01649 skb->csum = ~tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr, 01650 skb->nh.iph->daddr,0); 01651 } 01652 return 0; 01653 } 01654 01655 01656 /* The socket must have it's spinlock held when we get 01657 * here. 01658 * 01659 * We have a potential double-lock case here, so even when 01660 * doing backlog processing we use the BH locking scheme. 01661 * This is because we cannot sleep with the original spinlock 01662 * held. 
01663 */ 01664 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) 01665 { 01666 IP_INC_STATS_BH(IpInDelivers); 01667 01668 if (sk->state == TCP_ESTABLISHED) { /* Fast path */ 01669 TCP_CHECK_TIMER(sk); 01670 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) 01671 goto reset; 01672 TCP_CHECK_TIMER(sk); 01673 return 0; 01674 } 01675 01676 if (skb->len < (skb->h.th->doff<<2) || tcp_checksum_complete(skb)) 01677 goto csum_err; 01678 01679 if (sk->state == TCP_LISTEN) { 01680 struct sock *nsk = tcp_v4_hnd_req(sk, skb); 01681 if (!nsk) 01682 goto discard; 01683 01684 if (nsk != sk) { 01685 if (tcp_child_process(sk, nsk, skb)) 01686 goto reset; 01687 return 0; 01688 } 01689 } 01690 01691 TCP_CHECK_TIMER(sk); 01692 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) 01693 goto reset; 01694 TCP_CHECK_TIMER(sk); 01695 return 0; 01696 01697 reset: 01698 tcp_v4_send_reset(skb); 01699 discard: 01700 kfree_skb(skb); 01701 /* Be careful here. If this function gets more complicated and 01702 * gcc suffers from register pressure on the x86, sk (in %ebx) 01703 * might be destroyed here. This current version compiles correctly, 01704 * but you have been warned. 01705 */ 01706 return 0; 01707 01708 csum_err: 01709 TCP_INC_STATS_BH(TcpInErrs); 01710 goto discard; 01711 } 01712 01713 /* 01714 * From tcp_input.c 01715 */ 01716 01717 int tcp_v4_rcv(struct sk_buff *skb) 01718 { 01719 struct tcphdr *th; 01720 struct sock *sk; 01721 int ret; 01722 01723 if (skb->pkt_type!=PACKET_HOST) 01724 goto discard_it; 01725 01726 /* Count it even if it's bad */ 01727 TCP_INC_STATS_BH(TcpInSegs); 01728 01729 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 01730 goto discard_it; 01731 01732 th = skb->h.th; 01733 01734 if (th->doff < sizeof(struct tcphdr)/4) 01735 goto bad_packet; 01736 if (!pskb_may_pull(skb, th->doff*4)) 01737 goto discard_it; 01738 01739 /* An explanation is required here, I think. 
01740 * Packet length and doff are validated by header prediction, 01741 * provided case of th->doff==0 is elimineted. 01742 * So, we defer the checks. */ 01743 if ((skb->ip_summed != CHECKSUM_UNNECESSARY && 01744 tcp_v4_checksum_init(skb) < 0)) 01745 goto bad_packet; 01746 01747 th = skb->h.th; 01748 TCP_SKB_CB(skb)->seq = ntohl(th->seq); 01749 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + 01750 skb->len - th->doff*4); 01751 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 01752 TCP_SKB_CB(skb)->when = 0; 01753 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos; 01754 TCP_SKB_CB(skb)->sacked = 0; 01755 01756 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source, 01757 skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb)); 01758 01759 if (!sk) 01760 goto no_tcp_socket; 01761 01762 process: 01763 if(!ipsec_sk_policy(sk,skb)) 01764 goto discard_and_relse; 01765 01766 if (sk->state == TCP_TIME_WAIT) 01767 goto do_time_wait; 01768 01769 if (sk_filter(sk, skb, 0)) 01770 goto discard_and_relse; 01771 01772 skb->dev = NULL; 01773 01774 bh_lock_sock(sk); 01775 ret = 0; 01776 if (!sk->lock.users) { 01777 if (!tcp_prequeue(sk, skb)) 01778 ret = tcp_v4_do_rcv(sk, skb); 01779 } else 01780 sk_add_backlog(sk, skb); 01781 bh_unlock_sock(sk); 01782 01783 sock_put(sk); 01784 01785 return ret; 01786 01787 no_tcp_socket: 01788 if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { 01789 bad_packet: 01790 TCP_INC_STATS_BH(TcpInErrs); 01791 } else { 01792 tcp_v4_send_reset(skb); 01793 } 01794 01795 discard_it: 01796 /* Discard frame. 
*/ 01797 kfree_skb(skb); 01798 return 0; 01799 01800 discard_and_relse: 01801 sock_put(sk); 01802 goto discard_it; 01803 01804 do_time_wait: 01805 if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { 01806 TCP_INC_STATS_BH(TcpInErrs); 01807 goto discard_and_relse; 01808 } 01809 switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk, 01810 skb, th, skb->len)) { 01811 case TCP_TW_SYN: 01812 { 01813 struct sock *sk2; 01814 01815 sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb)); 01816 if (sk2 != NULL) { 01817 tcp_tw_deschedule((struct tcp_tw_bucket *)sk); 01818 tcp_timewait_kill((struct tcp_tw_bucket *)sk); 01819 tcp_tw_put((struct tcp_tw_bucket *)sk); 01820 sk = sk2; 01821 goto process; 01822 } 01823 /* Fall through to ACK */ 01824 } 01825 case TCP_TW_ACK: 01826 tcp_v4_timewait_ack(sk, skb); 01827 break; 01828 case TCP_TW_RST: 01829 goto no_tcp_socket; 01830 case TCP_TW_SUCCESS:; 01831 } 01832 goto discard_it; 01833 } 01834 01835 /* With per-bucket locks this operation is not-atomic, so that 01836 * this version is not worse. 01837 */ 01838 static void __tcp_v4_rehash(struct sock *sk) 01839 { 01840 sk->prot->unhash(sk); 01841 sk->prot->hash(sk); 01842 } 01843 01844 static int tcp_v4_reselect_saddr(struct sock *sk) 01845 { 01846 int err; 01847 struct rtable *rt; 01848 __u32 old_saddr = sk->saddr; 01849 __u32 new_saddr; 01850 __u32 daddr = sk->daddr; 01851 01852 if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) 01853 daddr = sk->protinfo.af_inet.opt->faddr; 01854 01855 /* Query new route. 
*/ 01856 err = ip_route_connect(&rt, daddr, 0, 01857 RT_TOS(sk->protinfo.af_inet.tos)|sk->localroute, 01858 sk->bound_dev_if); 01859 if (err) 01860 return err; 01861 01862 __sk_dst_set(sk, &rt->u.dst); 01863 sk->route_caps = rt->u.dst.dev->features; 01864 01865 new_saddr = rt->rt_src; 01866 01867 if (new_saddr == old_saddr) 01868 return 0; 01869 01870 if (sysctl_ip_dynaddr > 1) { 01871 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr " 01872 "from %d.%d.%d.%d to %d.%d.%d.%d\n", 01873 NIPQUAD(old_saddr), 01874 NIPQUAD(new_saddr)); 01875 } 01876 01877 sk->saddr = new_saddr; 01878 sk->rcv_saddr = new_saddr; 01879 01880 /* XXX The only one ugly spot where we need to 01881 * XXX really change the sockets identity after 01882 * XXX it has entered the hashes. -DaveM 01883 * 01884 * Besides that, it does not check for connection 01885 * uniqueness. Wait for troubles. 01886 */ 01887 __tcp_v4_rehash(sk); 01888 return 0; 01889 } 01890 01891 int tcp_v4_rebuild_header(struct sock *sk) 01892 { 01893 struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); 01894 u32 daddr; 01895 int err; 01896 01897 /* Route is OK, nothing to do. */ 01898 if (rt != NULL) 01899 return 0; 01900 01901 /* Reroute. */ 01902 daddr = sk->daddr; 01903 if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) 01904 daddr = sk->protinfo.af_inet.opt->faddr; 01905 01906 err = ip_route_output(&rt, daddr, sk->saddr, 01907 RT_CONN_FLAGS(sk), sk->bound_dev_if); 01908 if (!err) { 01909 __sk_dst_set(sk, &rt->u.dst); 01910 sk->route_caps = rt->u.dst.dev->features; 01911 return 0; 01912 } 01913 01914 /* Routing failed... 
*/ 01915 sk->route_caps = 0; 01916 01917 if (!sysctl_ip_dynaddr || 01918 sk->state != TCP_SYN_SENT || 01919 (sk->userlocks & SOCK_BINDADDR_LOCK) || 01920 (err = tcp_v4_reselect_saddr(sk)) != 0) 01921 sk->err_soft=-err; 01922 01923 return err; 01924 } 01925 01926 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) 01927 { 01928 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; 01929 01930 sin->sin_family = AF_INET; 01931 sin->sin_addr.s_addr = sk->daddr; 01932 sin->sin_port = sk->dport; 01933 } 01934 01935 /* VJ's idea. Save last timestamp seen from this destination 01936 * and hold it at least for normal timewait interval to use for duplicate 01937 * segment detection in subsequent connections, before they enter synchronized 01938 * state. 01939 */ 01940 01941 int tcp_v4_remember_stamp(struct sock *sk) 01942 { 01943 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; 01944 struct rtable *rt = (struct rtable*)__sk_dst_get(sk); 01945 struct inet_peer *peer = NULL; 01946 int release_it = 0; 01947 01948 if (rt == NULL || rt->rt_dst != sk->daddr) { 01949 peer = inet_getpeer(sk->daddr, 1); 01950 release_it = 1; 01951 } else { 01952 if (rt->peer == NULL) 01953 rt_bind_peer(rt, 1); 01954 peer = rt->peer; 01955 } 01956 01957 if (peer) { 01958 if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 || 01959 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && 01960 peer->tcp_ts_stamp <= tp->ts_recent_stamp)) { 01961 peer->tcp_ts_stamp = tp->ts_recent_stamp; 01962 peer->tcp_ts = tp->ts_recent; 01963 } 01964 if (release_it) 01965 inet_putpeer(peer); 01966 return 1; 01967 } 01968 01969 return 0; 01970 } 01971 01972 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw) 01973 { 01974 struct inet_peer *peer = NULL; 01975 01976 peer = inet_getpeer(tw->daddr, 1); 01977 01978 if (peer) { 01979 if ((s32)(peer->tcp_ts - tw->ts_recent) <= 0 || 01980 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && 01981 peer->tcp_ts_stamp <= tw->ts_recent_stamp)) { 01982 peer->tcp_ts_stamp = 
tw->ts_recent_stamp; 01983 peer->tcp_ts = tw->ts_recent; 01984 } 01985 inet_putpeer(peer); 01986 return 1; 01987 } 01988 01989 return 0; 01990 } 01991 01992 struct tcp_func ipv4_specific = { 01993 ip_queue_xmit, 01994 tcp_v4_send_check, 01995 tcp_v4_rebuild_header, 01996 tcp_v4_conn_request, 01997 tcp_v4_syn_recv_sock, 01998 tcp_v4_remember_stamp, 01999 sizeof(struct iphdr), 02000 02001 ip_setsockopt, 02002 ip_getsockopt, 02003 v4_addr2sockaddr, 02004 sizeof(struct sockaddr_in) 02005 }; 02006 02007 /* NOTE: A lot of things set to zero explicitly by call to 02008 * sk_alloc() so need not be done here. 02009 */ 02010 static int tcp_v4_init_sock(struct sock *sk) 02011 { 02012 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); 02013 02014 skb_queue_head_init(&tp->out_of_order_queue); 02015 tcp_init_xmit_timers(sk); 02016 tcp_prequeue_init(tp); 02017 02018 tp->rto = TCP_TIMEOUT_INIT; 02019 tp->mdev = TCP_TIMEOUT_INIT; 02020 02021 /* So many TCP implementations out there (incorrectly) count the 02022 * initial SYN frame in their delayed-ACK and congestion control 02023 * algorithms that we must have the following bandaid to talk 02024 * efficiently to them. -DaveM 02025 */ 02026 tp->snd_cwnd = 2; 02027 02028 /* See draft-stevens-tcpca-spec-01 for discussion of the 02029 * initialization of these values. 
02030 */ 02031 tp->snd_ssthresh = 0x7fffffff; /* Infinity */ 02032 tp->snd_cwnd_clamp = ~0; 02033 tp->mss_cache = 536; 02034 02035 tp->reordering = sysctl_tcp_reordering; 02036 02037 sk->state = TCP_CLOSE; 02038 02039 sk->write_space = tcp_write_space; 02040 sk->use_write_queue = 1; 02041 02042 sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific; 02043 02044 sk->sndbuf = sysctl_tcp_wmem[1]; 02045 sk->rcvbuf = sysctl_tcp_rmem[1]; 02046 02047 atomic_inc(&tcp_sockets_allocated); 02048 02049 return 0; 02050 } 02051 02052 static int tcp_v4_destroy_sock(struct sock *sk) 02053 { 02054 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); 02055 02056 tcp_clear_xmit_timers(sk); 02057 02058 /* Cleanup up the write buffer. */ 02059 tcp_writequeue_purge(sk); 02060 02061 /* Cleans up our, hopefully empty, out_of_order_queue. */ 02062 __skb_queue_purge(&tp->out_of_order_queue); 02063 02064 /* Clean prequeue, it must be empty really */ 02065 __skb_queue_purge(&tp->ucopy.prequeue); 02066 02067 /* Clean up a referenced TCP bind bucket. */ 02068 if(sk->prev != NULL) 02069 tcp_put_port(sk); 02070 02071 /* If sendmsg cached page exists, toss it. */ 02072 if (tp->sndmsg_page != NULL) 02073 __free_page(tp->sndmsg_page); 02074 02075 atomic_dec(&tcp_sockets_allocated); 02076 02077 return 0; 02078 } 02079 02080 /* Proc filesystem TCP sock list dumping. */ 02081 static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i, int uid) 02082 { 02083 int ttd = req->expires - jiffies; 02084 02085 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" 02086 " %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p", 02087 i, 02088 req->af.v4_req.loc_addr, 02089 ntohs(sk->sport), 02090 req->af.v4_req.rmt_addr, 02091 ntohs(req->rmt_port), 02092 TCP_SYN_RECV, 02093 0,0, /* could print option size, but that is af dependent. 
*/ 02094 1, /* timers active (only the expire timer) */ 02095 ttd, 02096 req->retrans, 02097 uid, 02098 0, /* non standard timer */ 02099 0, /* open_requests have no inode */ 02100 atomic_read(&sk->refcnt), 02101 req 02102 ); 02103 } 02104 02105 static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i) 02106 { 02107 unsigned int dest, src; 02108 __u16 destp, srcp; 02109 int timer_active; 02110 unsigned long timer_expires; 02111 struct tcp_opt *tp = &sp->tp_pinfo.af_tcp; 02112 02113 dest = sp->daddr; 02114 src = sp->rcv_saddr; 02115 destp = ntohs(sp->dport); 02116 srcp = ntohs(sp->sport); 02117 if (tp->pending == TCP_TIME_RETRANS) { 02118 timer_active = 1; 02119 timer_expires = tp->timeout; 02120 } else if (tp->pending == TCP_TIME_PROBE0) { 02121 timer_active = 4; 02122 timer_expires = tp->timeout; 02123 } else if (timer_pending(&sp->timer)) { 02124 timer_active = 2; 02125 timer_expires = sp->timer.expires; 02126 } else { 02127 timer_active = 0; 02128 timer_expires = jiffies; 02129 } 02130 02131 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" 02132 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %u %u %u %u %d", 02133 i, src, srcp, dest, destp, sp->state, 02134 tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq, 02135 timer_active, timer_expires-jiffies, 02136 tp->retransmits, 02137 sock_i_uid(sp), 02138 tp->probes_out, 02139 sock_i_ino(sp), 02140 atomic_read(&sp->refcnt), sp, 02141 tp->rto, tp->ack.ato, (tp->ack.quick<<1)|tp->ack.pingpong, 02142 tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh 02143 ); 02144 } 02145 02146 static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i) 02147 { 02148 unsigned int dest, src; 02149 __u16 destp, srcp; 02150 int ttd = tw->ttd - jiffies; 02151 02152 if (ttd < 0) 02153 ttd = 0; 02154 02155 dest = tw->daddr; 02156 src = tw->rcv_saddr; 02157 destp = ntohs(tw->dport); 02158 srcp = ntohs(tw->sport); 02159 02160 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" 02161 " %02X %08X:%08X %02X:%08X %08X %5d %8d 
%d %d %p", 02162 i, src, srcp, dest, destp, tw->substate, 0, 0, 02163 3, ttd, 0, 0, 0, 0, 02164 atomic_read(&tw->refcnt), tw); 02165 } 02166 02167 #define TMPSZ 150 02168 02169 int tcp_get_info(char *buffer, char **start, off_t offset, int length) 02170 { 02171 int len = 0, num = 0, i; 02172 off_t begin, pos = 0; 02173 char tmpbuf[TMPSZ+1]; 02174 02175 if (offset < TMPSZ) 02176 len += sprintf(buffer, "%-*s\n", TMPSZ-1, 02177 " sl local_address rem_address st tx_queue " 02178 "rx_queue tr tm->when retrnsmt uid timeout inode"); 02179 02180 pos = TMPSZ; 02181 02182 /* First, walk listening socket table. */ 02183 tcp_listen_lock(); 02184 for(i = 0; i < TCP_LHTABLE_SIZE; i++) { 02185 struct sock *sk; 02186 struct tcp_listen_opt *lopt; 02187 int k; 02188 02189 for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) { 02190 struct open_request *req; 02191 int uid; 02192 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); 02193 02194 if (!TCP_INET_FAMILY(sk->family)) 02195 goto skip_listen; 02196 02197 pos += TMPSZ; 02198 if (pos >= offset) { 02199 get_tcp_sock(sk, tmpbuf, num); 02200 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf); 02201 if (pos >= offset + length) { 02202 tcp_listen_unlock(); 02203 goto out_no_bh; 02204 } 02205 } 02206 02207 skip_listen: 02208 uid = sock_i_uid(sk); 02209 read_lock_bh(&tp->syn_wait_lock); 02210 lopt = tp->listen_opt; 02211 if (lopt && lopt->qlen != 0) { 02212 for (k=0; k<TCP_SYNQ_HSIZE; k++) { 02213 for (req = lopt->syn_table[k]; req; req = req->dl_next, num++) { 02214 if (!TCP_INET_FAMILY(req->class->family)) 02215 continue; 02216 02217 pos += TMPSZ; 02218 if (pos <= offset) 02219 continue; 02220 get_openreq(sk, req, tmpbuf, num, uid); 02221 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf); 02222 if (pos >= offset + length) { 02223 read_unlock_bh(&tp->syn_wait_lock); 02224 tcp_listen_unlock(); 02225 goto out_no_bh; 02226 } 02227 } 02228 } 02229 } 02230 read_unlock_bh(&tp->syn_wait_lock); 02231 02232 /* Completed requests are in 
normal socket hash table */ 02233 } 02234 } 02235 tcp_listen_unlock(); 02236 02237 local_bh_disable(); 02238 02239 /* Next, walk established hash chain. */ 02240 for (i = 0; i < tcp_ehash_size; i++) { 02241 struct tcp_ehash_bucket *head = &tcp_ehash[i]; 02242 struct sock *sk; 02243 struct tcp_tw_bucket *tw; 02244 02245 read_lock(&head->lock); 02246 for(sk = head->chain; sk; sk = sk->next, num++) { 02247 if (!TCP_INET_FAMILY(sk->family)) 02248 continue; 02249 pos += TMPSZ; 02250 if (pos <= offset) 02251 continue; 02252 get_tcp_sock(sk, tmpbuf, num); 02253 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf); 02254 if (pos >= offset + length) { 02255 read_unlock(&head->lock); 02256 goto out; 02257 } 02258 } 02259 for (tw = (struct tcp_tw_bucket *)tcp_ehash[i+tcp_ehash_size].chain; 02260 tw != NULL; 02261 tw = (struct tcp_tw_bucket *)tw->next, num++) { 02262 if (!TCP_INET_FAMILY(tw->family)) 02263 continue; 02264 pos += TMPSZ; 02265 if (pos <= offset) 02266 continue; 02267 get_timewait_sock(tw, tmpbuf, num); 02268 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf); 02269 if (pos >= offset + length) { 02270 read_unlock(&head->lock); 02271 goto out; 02272 } 02273 } 02274 read_unlock(&head->lock); 02275 } 02276 02277 out: 02278 local_bh_enable(); 02279 out_no_bh: 02280 02281 begin = len - (pos - offset); 02282 *start = buffer + begin; 02283 len -= begin; 02284 if (len > length) 02285 len = length; 02286 if (len < 0) 02287 len = 0; 02288 return len; 02289 } 02290 02291 struct proto tcp_prot = { 02292 name: "TCP", 02293 close: tcp_close, 02294 connect: tcp_v4_connect, 02295 disconnect: tcp_disconnect, 02296 accept: tcp_accept, 02297 ioctl: tcp_ioctl, 02298 init: tcp_v4_init_sock, 02299 destroy: tcp_v4_destroy_sock, 02300 shutdown: tcp_shutdown, 02301 setsockopt: tcp_setsockopt, 02302 getsockopt: tcp_getsockopt, 02303 sendmsg: tcp_sendmsg, 02304 recvmsg: tcp_recvmsg, 02305 backlog_rcv: tcp_v4_do_rcv, 02306 hash: tcp_v4_hash, 02307 unhash: tcp_unhash, 02308 get_port: 
tcp_v4_get_port, 02309 }; 02310 02311 02312 02313 void __init tcp_v4_init(struct net_proto_family *ops) 02314 { 02315 int err; 02316 02317 tcp_inode.i_mode = S_IFSOCK; 02318 tcp_inode.i_sock = 1; 02319 tcp_inode.i_uid = 0; 02320 tcp_inode.i_gid = 0; 02321 init_waitqueue_head(&tcp_inode.i_wait); 02322 init_waitqueue_head(&tcp_inode.u.socket_i.wait); 02323 02324 tcp_socket->inode = &tcp_inode; 02325 tcp_socket->state = SS_UNCONNECTED; 02326 tcp_socket->type=SOCK_RAW; 02327 02328 if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0) 02329 panic("Failed to create the TCP control socket.\n"); 02330 tcp_socket->sk->allocation=GFP_ATOMIC; 02331 tcp_socket->sk->protinfo.af_inet.ttl = MAXTTL; 02332 02333 /* Unhash it so that IP input processing does not even 02334 * see it, we do not wish this socket to see incoming 02335 * packets. 02336 */ 02337 tcp_socket->sk->prot->unhash(tcp_socket->sk); 02338 }

Generated on Wed Dec 1 21:25:33 2004 for Linux 2.4.23 Networking by doxygen 1.3.8