Use const and static as needed, disable unused code
[qemu] / slirp / tcp_input.c
1 /*
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *      This product includes software developed by the University of
16  *      California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *      @(#)tcp_input.c 8.5 (Berkeley) 4/10/94
34  * tcp_input.c,v 1.10 1994/10/13 18:36:32 wollman Exp
35  */
36
37 /*
38  * Changes and additions relating to SLiRP
39  * Copyright (c) 1995 Danny Gasparovski.
40  *
41  * Please read the file COPYRIGHT for the
42  * terms and conditions of the copyright.
43  */
44
45 #include <slirp.h>
46 #include "ip_icmp.h"
47
48 struct socket tcb;
49
50 #define TCPREXMTTHRESH 3
51 struct  socket *tcp_last_so = &tcb;
52
53 tcp_seq tcp_iss;                /* tcp initial send seq # */
54
55 #define TCP_PAWS_IDLE   (24 * 24 * 60 * 60 * PR_SLOWHZ)
56
57 /* for modulo comparisons of timestamps */
58 #define TSTMP_LT(a,b)   ((int)((a)-(b)) < 0)
59 #define TSTMP_GEQ(a,b)  ((int)((a)-(b)) >= 0)
60
/*
 * TCP_REASS(tp, ti, m, so, flags)
 *
 * Hand the received segment described by header ti (payload in mbuf m)
 * to connection tp / socket so, and store the resulting FIN indication
 * in the lvalue flags.
 *
 * Fast path -- the segment starts exactly at rcv_nxt, the reassembly
 * queue is empty and the connection is ESTABLISHED:
 *   - an ACK flag is set on tp (see TCP_REASS_INORDER_ACK below),
 *   - rcv_nxt is advanced by the segment length,
 *   - flags receives the segment's TH_FIN bit,
 *   - the mbuf is appended to the socket with sbappend(); when so_emu
 *     is set, tcp_emu() runs first and the append happens only if it
 *     returns non-zero.
 *
 * Slow path -- anything else: the segment goes through tcp_reass(),
 * whose return value is stored in flags, and TF_ACKNOW is set so that
 * out-of-order data is acknowledged immediately (which lets the peer's
 * fast retransmit work).
 *
 * Usage notes:
 *   - The macro expands to a single do { } while (0) statement, so it
 *     is safe as the body of an unbraced if/else; call sites must end
 *     with a semicolon.
 *   - Every parameter is parenthesized, but tp, ti, m and so are each
 *     evaluated more than once: do not pass expressions with side
 *     effects.
 */
#ifdef TCP_ACK_HACK
/* In-order data: ACK at once when the segment carries PUSH, else delay. */
#define TCP_REASS_INORDER_ACK(ti) \
        (((ti)->ti_flags & TH_PUSH) ? TF_ACKNOW : TF_DELACK)
#else
/* In-order data: always request a delayed ACK. */
#define TCP_REASS_INORDER_ACK(ti)       TF_DELACK
#endif

#define TCP_REASS(tp, ti, m, so, flags) do { \
        if ((ti)->ti_seq == (tp)->rcv_nxt && \
            (tp)->seg_next == (tcpiphdrp_32)(tp) && \
            (tp)->t_state == TCPS_ESTABLISHED) { \
                (tp)->t_flags |= TCP_REASS_INORDER_ACK(ti); \
                (tp)->rcv_nxt += (ti)->ti_len; \
                (flags) = (ti)->ti_flags & TH_FIN; \
                STAT(tcpstat.tcps_rcvpack++); \
                STAT(tcpstat.tcps_rcvbyte += (ti)->ti_len); \
                if (!(so)->so_emu || tcp_emu((so), (m))) \
                        sbappend((so), (m)); \
        } else { \
                (flags) = tcp_reass((tp), (ti), (m)); \
                (tp)->t_flags |= TF_ACKNOW; \
        } \
} while (0)
115 static void tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt,
116                           struct tcpiphdr *ti);
117 static void tcp_xmit_timer(register struct tcpcb *tp, int rtt);
118
119 static int
120 tcp_reass(register struct tcpcb *tp, register struct tcpiphdr *ti,
121           struct mbuf *m)
122 {
123         register struct tcpiphdr *q;
124         struct socket *so = tp->t_socket;
125         int flags;
126
127         /*
128          * Call with ti==0 after become established to
129          * force pre-ESTABLISHED data up to user socket.
130          */
131         if (ti == 0)
132                 goto present;
133
134         /*
135          * Find a segment which begins after this one does.
136          */
137         for (q = (struct tcpiphdr *)tp->seg_next; q != (struct tcpiphdr *)tp;
138             q = (struct tcpiphdr *)q->ti_next)
139                 if (SEQ_GT(q->ti_seq, ti->ti_seq))
140                         break;
141
142         /*
143          * If there is a preceding segment, it may provide some of
144          * our data already.  If so, drop the data from the incoming
145          * segment.  If it provides all of our data, drop us.
146          */
147         if ((struct tcpiphdr *)q->ti_prev != (struct tcpiphdr *)tp) {
148                 register int i;
149                 q = (struct tcpiphdr *)q->ti_prev;
150                 /* conversion to int (in i) handles seq wraparound */
151                 i = q->ti_seq + q->ti_len - ti->ti_seq;
152                 if (i > 0) {
153                         if (i >= ti->ti_len) {
154                                 STAT(tcpstat.tcps_rcvduppack++);
155                                 STAT(tcpstat.tcps_rcvdupbyte += ti->ti_len);
156                                 m_freem(m);
157                                 /*
158                                  * Try to present any queued data
159                                  * at the left window edge to the user.
160                                  * This is needed after the 3-WHS
161                                  * completes.
162                                  */
163                                 goto present;   /* ??? */
164                         }
165                         m_adj(m, i);
166                         ti->ti_len -= i;
167                         ti->ti_seq += i;
168                 }
169                 q = (struct tcpiphdr *)(q->ti_next);
170         }
171         STAT(tcpstat.tcps_rcvoopack++);
172         STAT(tcpstat.tcps_rcvoobyte += ti->ti_len);
173         REASS_MBUF(ti) = (mbufp_32) m;          /* XXX */
174
175         /*
176          * While we overlap succeeding segments trim them or,
177          * if they are completely covered, dequeue them.
178          */
179         while (q != (struct tcpiphdr *)tp) {
180                 register int i = (ti->ti_seq + ti->ti_len) - q->ti_seq;
181                 if (i <= 0)
182                         break;
183                 if (i < q->ti_len) {
184                         q->ti_seq += i;
185                         q->ti_len -= i;
186                         m_adj((struct mbuf *) REASS_MBUF(q), i);
187                         break;
188                 }
189                 q = (struct tcpiphdr *)q->ti_next;
190                 m = (struct mbuf *) REASS_MBUF((struct tcpiphdr *)q->ti_prev);
191                 remque_32((void *)(q->ti_prev));
192                 m_freem(m);
193         }
194
195         /*
196          * Stick new segment in its place.
197          */
198         insque_32(ti, (void *)(q->ti_prev));
199
200 present:
201         /*
202          * Present data to user, advancing rcv_nxt through
203          * completed sequence space.
204          */
205         if (!TCPS_HAVEESTABLISHED(tp->t_state))
206                 return (0);
207         ti = (struct tcpiphdr *) tp->seg_next;
208         if (ti == (struct tcpiphdr *)tp || ti->ti_seq != tp->rcv_nxt)
209                 return (0);
210         if (tp->t_state == TCPS_SYN_RECEIVED && ti->ti_len)
211                 return (0);
212         do {
213                 tp->rcv_nxt += ti->ti_len;
214                 flags = ti->ti_flags & TH_FIN;
215                 remque_32(ti);
216                 m = (struct mbuf *) REASS_MBUF(ti); /* XXX */
217                 ti = (struct tcpiphdr *)ti->ti_next;
218 /*              if (so->so_state & SS_FCANTRCVMORE) */
219                 if (so->so_state & SS_FCANTSENDMORE)
220                         m_freem(m);
221                 else {
222                         if (so->so_emu) {
223                                 if (tcp_emu(so,m)) sbappend(so, m);
224                         } else
225                                 sbappend(so, m);
226                 }
227         } while (ti != (struct tcpiphdr *)tp && ti->ti_seq == tp->rcv_nxt);
228 /*      sorwakeup(so); */
229         return (flags);
230 }
231
232 /*
233  * TCP input routine, follows pages 65-76 of the
234  * protocol specification dated September, 1981 very closely.
235  */
236 void
237 tcp_input(m, iphlen, inso)
238         register struct mbuf *m;
239         int iphlen;
240         struct socket *inso;
241 {
242         struct ip save_ip, *ip;
243         register struct tcpiphdr *ti;
244         caddr_t optp = NULL;
245         int optlen = 0;
246         int len, tlen, off;
247         register struct tcpcb *tp = 0;
248         register int tiflags;
249         struct socket *so = 0;
250         int todrop, acked, ourfinisacked, needoutput = 0;
251 /*      int dropsocket = 0; */
252         int iss = 0;
253         u_long tiwin;
254         int ret;
255 /*      int ts_present = 0; */
256
257         DEBUG_CALL("tcp_input");
258         DEBUG_ARGS((dfd," m = %8lx  iphlen = %2d  inso = %lx\n",
259                     (long )m, iphlen, (long )inso ));
260
261         /*
262          * If called with m == 0, then we're continuing the connect
263          */
264         if (m == NULL) {
265                 so = inso;
266
267                 /* Re-set a few variables */
268                 tp = sototcpcb(so);
269                 m = so->so_m;
270                 so->so_m = 0;
271                 ti = so->so_ti;
272                 tiwin = ti->ti_win;
273                 tiflags = ti->ti_flags;
274
275                 goto cont_conn;
276         }
277
278
279         STAT(tcpstat.tcps_rcvtotal++);
280         /*
281          * Get IP and TCP header together in first mbuf.
282          * Note: IP leaves IP header in first mbuf.
283          */
284         ti = mtod(m, struct tcpiphdr *);
285         if (iphlen > sizeof(struct ip )) {
286           ip_stripoptions(m, (struct mbuf *)0);
287           iphlen=sizeof(struct ip );
288         }
289         /* XXX Check if too short */
290
291
292         /*
293          * Save a copy of the IP header in case we want restore it
294          * for sending an ICMP error message in response.
295          */
296         ip=mtod(m, struct ip *);
297         save_ip = *ip;
298         save_ip.ip_len+= iphlen;
299
300         /*
301          * Checksum extended TCP header and data.
302          */
303         tlen = ((struct ip *)ti)->ip_len;
304         ti->ti_next = ti->ti_prev = 0;
305         ti->ti_x1 = 0;
306         ti->ti_len = htons((u_int16_t)tlen);
307         len = sizeof(struct ip ) + tlen;
308         /* keep checksum for ICMP reply
309          * ti->ti_sum = cksum(m, len);
310          * if (ti->ti_sum) { */
311         if(cksum(m, len)) {
312           STAT(tcpstat.tcps_rcvbadsum++);
313           goto drop;
314         }
315
316         /*
317          * Check that TCP offset makes sense,
318          * pull out TCP options and adjust length.              XXX
319          */
320         off = ti->ti_off << 2;
321         if (off < sizeof (struct tcphdr) || off > tlen) {
322           STAT(tcpstat.tcps_rcvbadoff++);
323           goto drop;
324         }
325         tlen -= off;
326         ti->ti_len = tlen;
327         if (off > sizeof (struct tcphdr)) {
328           optlen = off - sizeof (struct tcphdr);
329           optp = mtod(m, caddr_t) + sizeof (struct tcpiphdr);
330
331                 /*
332                  * Do quick retrieval of timestamp options ("options
333                  * prediction?").  If timestamp is the only option and it's
334                  * formatted as recommended in RFC 1323 appendix A, we
335                  * quickly get the values now and not bother calling
336                  * tcp_dooptions(), etc.
337                  */
338 /*              if ((optlen == TCPOLEN_TSTAMP_APPA ||
339  *                   (optlen > TCPOLEN_TSTAMP_APPA &&
340  *                      optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
341  *                   *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
342  *                   (ti->ti_flags & TH_SYN) == 0) {
343  *                      ts_present = 1;
344  *                      ts_val = ntohl(*(u_int32_t *)(optp + 4));
345  *                      ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
346  *                      optp = NULL;   / * we've parsed the options * /
347  *              }
348  */
349         }
350         tiflags = ti->ti_flags;
351
352         /*
353          * Convert TCP protocol specific fields to host format.
354          */
355         NTOHL(ti->ti_seq);
356         NTOHL(ti->ti_ack);
357         NTOHS(ti->ti_win);
358         NTOHS(ti->ti_urp);
359
360         /*
361          * Drop TCP, IP headers and TCP options.
362          */
363         m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
364         m->m_len  -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
365
366         /*
367          * Locate pcb for segment.
368          */
369 findso:
370         so = tcp_last_so;
371         if (so->so_fport != ti->ti_dport ||
372             so->so_lport != ti->ti_sport ||
373             so->so_laddr.s_addr != ti->ti_src.s_addr ||
374             so->so_faddr.s_addr != ti->ti_dst.s_addr) {
375                 so = solookup(&tcb, ti->ti_src, ti->ti_sport,
376                                ti->ti_dst, ti->ti_dport);
377                 if (so)
378                         tcp_last_so = so;
379                 STAT(tcpstat.tcps_socachemiss++);
380         }
381
382         /*
383          * If the state is CLOSED (i.e., TCB does not exist) then
384          * all data in the incoming segment is discarded.
385          * If the TCB exists but is in CLOSED state, it is embryonic,
386          * but should either do a listen or a connect soon.
387          *
388          * state == CLOSED means we've done socreate() but haven't
389          * attached it to a protocol yet...
390          *
391          * XXX If a TCB does not exist, and the TH_SYN flag is
392          * the only flag set, then create a session, mark it
393          * as if it was LISTENING, and continue...
394          */
395         if (so == 0) {
396           if ((tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) != TH_SYN)
397             goto dropwithreset;
398
399           if ((so = socreate()) == NULL)
400             goto dropwithreset;
401           if (tcp_attach(so) < 0) {
402             free(so); /* Not sofree (if it failed, it's not insqued) */
403             goto dropwithreset;
404           }
405
406           sbreserve(&so->so_snd, TCP_SNDSPACE);
407           sbreserve(&so->so_rcv, TCP_RCVSPACE);
408
409           /*            tcp_last_so = so; */  /* XXX ? */
410           /*            tp = sototcpcb(so);    */
411
412           so->so_laddr = ti->ti_src;
413           so->so_lport = ti->ti_sport;
414           so->so_faddr = ti->ti_dst;
415           so->so_fport = ti->ti_dport;
416
417           if ((so->so_iptos = tcp_tos(so)) == 0)
418             so->so_iptos = ((struct ip *)ti)->ip_tos;
419
420           tp = sototcpcb(so);
421           tp->t_state = TCPS_LISTEN;
422         }
423
424         /*
425          * If this is a still-connecting socket, this probably
426          * a retransmit of the SYN.  Whether it's a retransmit SYN
427          * or something else, we nuke it.
428          */
429         if (so->so_state & SS_ISFCONNECTING)
430                 goto drop;
431
432         tp = sototcpcb(so);
433
434         /* XXX Should never fail */
435         if (tp == 0)
436                 goto dropwithreset;
437         if (tp->t_state == TCPS_CLOSED)
438                 goto drop;
439
440         /* Unscale the window into a 32-bit value. */
441 /*      if ((tiflags & TH_SYN) == 0)
442  *              tiwin = ti->ti_win << tp->snd_scale;
443  *      else
444  */
445                 tiwin = ti->ti_win;
446
447         /*
448          * Segment received on connection.
449          * Reset idle time and keep-alive timer.
450          */
451         tp->t_idle = 0;
452         if (SO_OPTIONS)
453            tp->t_timer[TCPT_KEEP] = TCPTV_KEEPINTVL;
454         else
455            tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_IDLE;
456
457         /*
458          * Process options if not in LISTEN state,
459          * else do it below (after getting remote address).
460          */
461         if (optp && tp->t_state != TCPS_LISTEN)
462                 tcp_dooptions(tp, (u_char *)optp, optlen, ti);
463 /* , */
464 /*                      &ts_present, &ts_val, &ts_ecr); */
465
466         /*
467          * Header prediction: check for the two common cases
468          * of a uni-directional data xfer.  If the packet has
469          * no control flags, is in-sequence, the window didn't
470          * change and we're not retransmitting, it's a
471          * candidate.  If the length is zero and the ack moved
472          * forward, we're the sender side of the xfer.  Just
473          * free the data acked & wake any higher level process
474          * that was blocked waiting for space.  If the length
475          * is non-zero and the ack didn't move, we're the
476          * receiver side.  If we're getting packets in-order
477          * (the reassembly queue is empty), add the data to
478          * the socket buffer and note that we need a delayed ack.
479          *
480          * XXX Some of these tests are not needed
481          * eg: the tiwin == tp->snd_wnd prevents many more
482          * predictions.. with no *real* advantage..
483          */
484         if (tp->t_state == TCPS_ESTABLISHED &&
485             (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
486 /*          (!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) && */
487             ti->ti_seq == tp->rcv_nxt &&
488             tiwin && tiwin == tp->snd_wnd &&
489             tp->snd_nxt == tp->snd_max) {
490                 /*
491                  * If last ACK falls within this segment's sequence numbers,
492                  *  record the timestamp.
493                  */
494 /*              if (ts_present && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) &&
495  *                 SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len)) {
496  *                      tp->ts_recent_age = tcp_now;
497  *                      tp->ts_recent = ts_val;
498  *              }
499  */
500                 if (ti->ti_len == 0) {
501                         if (SEQ_GT(ti->ti_ack, tp->snd_una) &&
502                             SEQ_LEQ(ti->ti_ack, tp->snd_max) &&
503                             tp->snd_cwnd >= tp->snd_wnd) {
504                                 /*
505                                  * this is a pure ack for outstanding data.
506                                  */
507                                 STAT(tcpstat.tcps_predack++);
508 /*                              if (ts_present)
509  *                                      tcp_xmit_timer(tp, tcp_now-ts_ecr+1);
510  *                              else
511  */                                  if (tp->t_rtt &&
512                                             SEQ_GT(ti->ti_ack, tp->t_rtseq))
513                                         tcp_xmit_timer(tp, tp->t_rtt);
514                                 acked = ti->ti_ack - tp->snd_una;
515                                 STAT(tcpstat.tcps_rcvackpack++);
516                                 STAT(tcpstat.tcps_rcvackbyte += acked);
517                                 sbdrop(&so->so_snd, acked);
518                                 tp->snd_una = ti->ti_ack;
519                                 m_freem(m);
520
521                                 /*
522                                  * If all outstanding data are acked, stop
523                                  * retransmit timer, otherwise restart timer
524                                  * using current (possibly backed-off) value.
525                                  * If process is waiting for space,
526                                  * wakeup/selwakeup/signal.  If data
527                                  * are ready to send, let tcp_output
528                                  * decide between more output or persist.
529                                  */
530                                 if (tp->snd_una == tp->snd_max)
531                                         tp->t_timer[TCPT_REXMT] = 0;
532                                 else if (tp->t_timer[TCPT_PERSIST] == 0)
533                                         tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
534
535                                 /*
536                                  * There's room in so_snd, sowwakup will read()
537                                  * from the socket if we can
538                                  */
539 /*                              if (so->so_snd.sb_flags & SB_NOTIFY)
540  *                                      sowwakeup(so);
541  */
542                                 /*
543                                  * This is called because sowwakeup might have
544                                  * put data into so_snd.  Since we don't so sowwakeup,
545                                  * we don't need this.. XXX???
546                                  */
547                                 if (so->so_snd.sb_cc)
548                                         (void) tcp_output(tp);
549
550                                 return;
551                         }
552                 } else if (ti->ti_ack == tp->snd_una &&
553                     tp->seg_next == (tcpiphdrp_32)tp &&
554                     ti->ti_len <= sbspace(&so->so_rcv)) {
555                         /*
556                          * this is a pure, in-sequence data packet
557                          * with nothing on the reassembly queue and
558                          * we have enough buffer space to take it.
559                          */
560                         STAT(tcpstat.tcps_preddat++);
561                         tp->rcv_nxt += ti->ti_len;
562                         STAT(tcpstat.tcps_rcvpack++);
563                         STAT(tcpstat.tcps_rcvbyte += ti->ti_len);
564                         /*
565                          * Add data to socket buffer.
566                          */
567                         if (so->so_emu) {
568                                 if (tcp_emu(so,m)) sbappend(so, m);
569                         } else
570                                 sbappend(so, m);
571
572                         /*
573                          * XXX This is called when data arrives.  Later, check
574                          * if we can actually write() to the socket
575                          * XXX Need to check? It's be NON_BLOCKING
576                          */
577 /*                      sorwakeup(so); */
578
579                         /*
580                          * If this is a short packet, then ACK now - with Nagel
581                          *      congestion avoidance sender won't send more until
582                          *      he gets an ACK.
583                          *
584                          * It is better to not delay acks at all to maximize
585                          * TCP throughput.  See RFC 2581.
586                          */
587                         tp->t_flags |= TF_ACKNOW;
588                         tcp_output(tp);
589                         return;
590                 }
591         } /* header prediction */
592         /*
593          * Calculate amount of space in receive window,
594          * and then do TCP input processing.
595          * Receive window is amount of space in rcv queue,
596          * but not less than advertised window.
597          */
598         { int win;
599           win = sbspace(&so->so_rcv);
600           if (win < 0)
601             win = 0;
602           tp->rcv_wnd = max(win, (int)(tp->rcv_adv - tp->rcv_nxt));
603         }
604
605         switch (tp->t_state) {
606
607         /*
608          * If the state is LISTEN then ignore segment if it contains an RST.
609          * If the segment contains an ACK then it is bad and send a RST.
610          * If it does not contain a SYN then it is not interesting; drop it.
611          * Don't bother responding if the destination was a broadcast.
612          * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
613          * tp->iss, and send a segment:
614          *     <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
615          * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
616          * Fill in remote peer address fields if not previously specified.
617          * Enter SYN_RECEIVED state, and process any other fields of this
618          * segment in this state.
619          */
620         case TCPS_LISTEN: {
621
622           if (tiflags & TH_RST)
623             goto drop;
624           if (tiflags & TH_ACK)
625             goto dropwithreset;
626           if ((tiflags & TH_SYN) == 0)
627             goto drop;
628
629           /*
630            * This has way too many gotos...
631            * But a bit of spaghetti code never hurt anybody :)
632            */
633
634           /*
635            * If this is destined for the control address, then flag to
636            * tcp_ctl once connected, otherwise connect
637            */
638           if ((so->so_faddr.s_addr&htonl(0xffffff00)) == special_addr.s_addr) {
639             int lastbyte=ntohl(so->so_faddr.s_addr) & 0xff;
640             if (lastbyte!=CTL_ALIAS && lastbyte!=CTL_DNS) {
641 #if 0
642               if(lastbyte==CTL_CMD || lastbyte==CTL_EXEC) {
643                 /* Command or exec adress */
644                 so->so_state |= SS_CTL;
645               } else
646 #endif
647               {
648                 /* May be an add exec */
649                 struct ex_list *ex_ptr;
650                 for(ex_ptr = exec_list; ex_ptr; ex_ptr = ex_ptr->ex_next) {
651                   if(ex_ptr->ex_fport == so->so_fport &&
652                      lastbyte == ex_ptr->ex_addr) {
653                     so->so_state |= SS_CTL;
654                     break;
655                   }
656                 }
657               }
658               if(so->so_state & SS_CTL) goto cont_input;
659             }
660             /* CTL_ALIAS: Do nothing, tcp_fconnect will be called on it */
661           }
662
663           if (so->so_emu & EMU_NOCONNECT) {
664             so->so_emu &= ~EMU_NOCONNECT;
665             goto cont_input;
666           }
667
668           if((tcp_fconnect(so) == -1) && (errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
669             u_char code=ICMP_UNREACH_NET;
670             DEBUG_MISC((dfd," tcp fconnect errno = %d-%s\n",
671                         errno,strerror(errno)));
672             if(errno == ECONNREFUSED) {
673               /* ACK the SYN, send RST to refuse the connection */
674               tcp_respond(tp, ti, m, ti->ti_seq+1, (tcp_seq)0,
675                           TH_RST|TH_ACK);
676             } else {
677               if(errno == EHOSTUNREACH) code=ICMP_UNREACH_HOST;
678               HTONL(ti->ti_seq);             /* restore tcp header */
679               HTONL(ti->ti_ack);
680               HTONS(ti->ti_win);
681               HTONS(ti->ti_urp);
682               m->m_data -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
683               m->m_len  += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
684               *ip=save_ip;
685               icmp_error(m, ICMP_UNREACH,code, 0,strerror(errno));
686             }
687             tp = tcp_close(tp);
688             m_free(m);
689           } else {
690             /*
691              * Haven't connected yet, save the current mbuf
692              * and ti, and return
693              * XXX Some OS's don't tell us whether the connect()
694              * succeeded or not.  So we must time it out.
695              */
696             so->so_m = m;
697             so->so_ti = ti;
698             tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
699             tp->t_state = TCPS_SYN_RECEIVED;
700           }
701           return;
702
703         cont_conn:
704           /* m==NULL
705            * Check if the connect succeeded
706            */
707           if (so->so_state & SS_NOFDREF) {
708             tp = tcp_close(tp);
709             goto dropwithreset;
710           }
711         cont_input:
712           tcp_template(tp);
713
714           if (optp)
715             tcp_dooptions(tp, (u_char *)optp, optlen, ti);
716           /* , */
717           /*                            &ts_present, &ts_val, &ts_ecr); */
718
719           if (iss)
720             tp->iss = iss;
721           else
722             tp->iss = tcp_iss;
723           tcp_iss += TCP_ISSINCR/2;
724           tp->irs = ti->ti_seq;
725           tcp_sendseqinit(tp);
726           tcp_rcvseqinit(tp);
727           tp->t_flags |= TF_ACKNOW;
728           tp->t_state = TCPS_SYN_RECEIVED;
729           tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
730           STAT(tcpstat.tcps_accepts++);
731           goto trimthenstep6;
732         } /* case TCPS_LISTEN */
733
734         /*
735          * If the state is SYN_SENT:
736          *      if seg contains an ACK, but not for our SYN, drop the input.
737          *      if seg contains a RST, then drop the connection.
738          *      if seg does not contain SYN, then drop it.
739          * Otherwise this is an acceptable SYN segment
740          *      initialize tp->rcv_nxt and tp->irs
741          *      if seg contains ack then advance tp->snd_una
742          *      if SYN has been acked change to ESTABLISHED else SYN_RCVD state
743          *      arrange for segment to be acked (eventually)
744          *      continue processing rest of data/controls, beginning with URG
745          */
746         case TCPS_SYN_SENT:
747                 if ((tiflags & TH_ACK) &&
748                     (SEQ_LEQ(ti->ti_ack, tp->iss) ||
749                      SEQ_GT(ti->ti_ack, tp->snd_max)))
750                         goto dropwithreset;
751
752                 if (tiflags & TH_RST) {
753                         if (tiflags & TH_ACK)
754                                 tp = tcp_drop(tp,0); /* XXX Check t_softerror! */
755                         goto drop;
756                 }
757
758                 if ((tiflags & TH_SYN) == 0)
759                         goto drop;
760                 if (tiflags & TH_ACK) {
761                         tp->snd_una = ti->ti_ack;
762                         if (SEQ_LT(tp->snd_nxt, tp->snd_una))
763                                 tp->snd_nxt = tp->snd_una;
764                 }
765
766                 tp->t_timer[TCPT_REXMT] = 0;
767                 tp->irs = ti->ti_seq;
768                 tcp_rcvseqinit(tp);
769                 tp->t_flags |= TF_ACKNOW;
770                 if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
771                         STAT(tcpstat.tcps_connects++);
772                         soisfconnected(so);
773                         tp->t_state = TCPS_ESTABLISHED;
774
775                         /* Do window scaling on this connection? */
776 /*                      if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
777  *                              (TF_RCVD_SCALE|TF_REQ_SCALE)) {
778  *                              tp->snd_scale = tp->requested_s_scale;
779  *                              tp->rcv_scale = tp->request_r_scale;
780  *                      }
781  */
782                         (void) tcp_reass(tp, (struct tcpiphdr *)0,
783                                 (struct mbuf *)0);
784                         /*
785                          * if we didn't have to retransmit the SYN,
786                          * use its rtt as our initial srtt & rtt var.
787                          */
788                         if (tp->t_rtt)
789                                 tcp_xmit_timer(tp, tp->t_rtt);
790                 } else
791                         tp->t_state = TCPS_SYN_RECEIVED;
792
793 trimthenstep6:
794                 /*
795                  * Advance ti->ti_seq to correspond to first data byte.
796                  * If data, trim to stay within window,
797                  * dropping FIN if necessary.
798                  */
799                 ti->ti_seq++;
800                 if (ti->ti_len > tp->rcv_wnd) {
801                         todrop = ti->ti_len - tp->rcv_wnd;
802                         m_adj(m, -todrop);
803                         ti->ti_len = tp->rcv_wnd;
804                         tiflags &= ~TH_FIN;
805                         STAT(tcpstat.tcps_rcvpackafterwin++);
806                         STAT(tcpstat.tcps_rcvbyteafterwin += todrop);
807                 }
808                 tp->snd_wl1 = ti->ti_seq - 1;
809                 tp->rcv_up = ti->ti_seq;
810                 goto step6;
811         } /* switch tp->t_state */
812         /*
813          * States other than LISTEN or SYN_SENT.
814          * First check timestamp, if present.
815          * Then check that at least some bytes of segment are within
816          * receive window.  If segment begins before rcv_nxt,
817          * drop leading data (and SYN); if nothing left, just ack.
818          *
819          * RFC 1323 PAWS: If we have a timestamp reply on this segment
820          * and it's less than ts_recent, drop it.
821          */
822 /*      if (ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
823  *          TSTMP_LT(ts_val, tp->ts_recent)) {
824  *
825  */             /* Check to see if ts_recent is over 24 days old.  */
826 /*              if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
827  */                     /*
828  *                       * Invalidate ts_recent.  If this segment updates
829  *                       * ts_recent, the age will be reset later and ts_recent
830  *                       * will get a valid value.  If it does not, setting
831  *                       * ts_recent to zero will at least satisfy the
832  *                       * requirement that zero be placed in the timestamp
833  *                       * echo reply when ts_recent isn't valid.  The
834  *                       * age isn't reset until we get a valid ts_recent
835  *                       * because we don't want out-of-order segments to be
836  *                       * dropped when ts_recent is old.
837  *                       */
838 /*                      tp->ts_recent = 0;
839  *              } else {
840  *                      tcpstat.tcps_rcvduppack++;
841  *                      tcpstat.tcps_rcvdupbyte += ti->ti_len;
842  *                      tcpstat.tcps_pawsdrop++;
843  *                      goto dropafterack;
844  *              }
845  *      }
846  */
847
848         todrop = tp->rcv_nxt - ti->ti_seq;
849         if (todrop > 0) {
850                 if (tiflags & TH_SYN) {
851                         tiflags &= ~TH_SYN;
852                         ti->ti_seq++;
853                         if (ti->ti_urp > 1)
854                                 ti->ti_urp--;
855                         else
856                                 tiflags &= ~TH_URG;
857                         todrop--;
858                 }
859                 /*
860                  * Following if statement from Stevens, vol. 2, p. 960.
861                  */
862                 if (todrop > ti->ti_len
863                     || (todrop == ti->ti_len && (tiflags & TH_FIN) == 0)) {
864                         /*
865                          * Any valid FIN must be to the left of the window.
866                          * At this point the FIN must be a duplicate or out
867                          * of sequence; drop it.
868                          */
869                         tiflags &= ~TH_FIN;
870
871                         /*
872                          * Send an ACK to resynchronize and drop any data.
873                          * But keep on processing for RST or ACK.
874                          */
875                         tp->t_flags |= TF_ACKNOW;
876                         todrop = ti->ti_len;
877                         STAT(tcpstat.tcps_rcvduppack++);
878                         STAT(tcpstat.tcps_rcvdupbyte += todrop);
879                 } else {
880                         STAT(tcpstat.tcps_rcvpartduppack++);
881                         STAT(tcpstat.tcps_rcvpartdupbyte += todrop);
882                 }
883                 m_adj(m, todrop);
884                 ti->ti_seq += todrop;
885                 ti->ti_len -= todrop;
886                 if (ti->ti_urp > todrop)
887                         ti->ti_urp -= todrop;
888                 else {
889                         tiflags &= ~TH_URG;
890                         ti->ti_urp = 0;
891                 }
892         }
893         /*
894          * If new data are received on a connection after the
895          * user processes are gone, then RST the other end.
896          */
897         if ((so->so_state & SS_NOFDREF) &&
898             tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) {
899                 tp = tcp_close(tp);
900                 STAT(tcpstat.tcps_rcvafterclose++);
901                 goto dropwithreset;
902         }
903
904         /*
905          * If segment ends after window, drop trailing data
906          * (and PUSH and FIN); if nothing left, just ACK.
907          */
908         todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd);
909         if (todrop > 0) {
910                 STAT(tcpstat.tcps_rcvpackafterwin++);
911                 if (todrop >= ti->ti_len) {
912                         STAT(tcpstat.tcps_rcvbyteafterwin += ti->ti_len);
913                         /*
914                          * If a new connection request is received
915                          * while in TIME_WAIT, drop the old connection
916                          * and start over if the sequence numbers
917                          * are above the previous ones.
918                          */
919                         if (tiflags & TH_SYN &&
920                             tp->t_state == TCPS_TIME_WAIT &&
921                             SEQ_GT(ti->ti_seq, tp->rcv_nxt)) {
922                                 iss = tp->rcv_nxt + TCP_ISSINCR;
923                                 tp = tcp_close(tp);
924                                 goto findso;
925                         }
926                         /*
927                          * If window is closed can only take segments at
928                          * window edge, and have to drop data and PUSH from
929                          * incoming segments.  Continue processing, but
930                          * remember to ack.  Otherwise, drop segment
931                          * and ack.
932                          */
933                         if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) {
934                                 tp->t_flags |= TF_ACKNOW;
935                                 STAT(tcpstat.tcps_rcvwinprobe++);
936                         } else
937                                 goto dropafterack;
938                 } else
939                         STAT(tcpstat.tcps_rcvbyteafterwin += todrop);
940                 m_adj(m, -todrop);
941                 ti->ti_len -= todrop;
942                 tiflags &= ~(TH_PUSH|TH_FIN);
943         }
944
945         /*
946          * If last ACK falls within this segment's sequence numbers,
947          * record its timestamp.
948          */
949 /*      if (ts_present && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) &&
950  *          SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len +
951  *                 ((tiflags & (TH_SYN|TH_FIN)) != 0))) {
952  *              tp->ts_recent_age = tcp_now;
953  *              tp->ts_recent = ts_val;
954  *      }
955  */
956
957         /*
958          * If the RST bit is set examine the state:
959          *    SYN_RECEIVED STATE:
960          *      If passive open, return to LISTEN state.
961          *      If active open, inform user that connection was refused.
962          *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
963          *      Inform user that connection was reset, and close tcb.
964          *    CLOSING, LAST_ACK, TIME_WAIT STATES
965          *      Close the tcb.
966          */
967         if (tiflags&TH_RST) switch (tp->t_state) {
968
969         case TCPS_SYN_RECEIVED:
970 /*              so->so_error = ECONNREFUSED; */
971                 goto close;
972
973         case TCPS_ESTABLISHED:
974         case TCPS_FIN_WAIT_1:
975         case TCPS_FIN_WAIT_2:
976         case TCPS_CLOSE_WAIT:
977 /*              so->so_error = ECONNRESET; */
978         close:
979                 tp->t_state = TCPS_CLOSED;
980                 STAT(tcpstat.tcps_drops++);
981                 tp = tcp_close(tp);
982                 goto drop;
983
984         case TCPS_CLOSING:
985         case TCPS_LAST_ACK:
986         case TCPS_TIME_WAIT:
987                 tp = tcp_close(tp);
988                 goto drop;
989         }
990
991         /*
992          * If a SYN is in the window, then this is an
993          * error and we send an RST and drop the connection.
994          */
995         if (tiflags & TH_SYN) {
996                 tp = tcp_drop(tp,0);
997                 goto dropwithreset;
998         }
999
1000         /*
1001          * If the ACK bit is off we drop the segment and return.
1002          */
1003         if ((tiflags & TH_ACK) == 0) goto drop;
1004
1005         /*
1006          * Ack processing.
1007          */
1008         switch (tp->t_state) {
1009         /*
1010          * In SYN_RECEIVED state if the ack ACKs our SYN then enter
1011          * ESTABLISHED state and continue processing, otherwise
1012          * send an RST.  una<=ack<=max
1013          */
1014         case TCPS_SYN_RECEIVED:
1015
1016                 if (SEQ_GT(tp->snd_una, ti->ti_ack) ||
1017                     SEQ_GT(ti->ti_ack, tp->snd_max))
1018                         goto dropwithreset;
1019                 STAT(tcpstat.tcps_connects++);
1020                 tp->t_state = TCPS_ESTABLISHED;
1021                 /*
1022                  * The sent SYN is ack'ed with our sequence number +1
1023                  * The first data byte already in the buffer will get
1024                  * lost if no correction is made.  This is only needed for
1025                  * SS_CTL since the buffer is empty otherwise.
1026                  * tp->snd_una++; or:
1027                  */
1028                 tp->snd_una=ti->ti_ack;
1029                 if (so->so_state & SS_CTL) {
1030                   /* So tcp_ctl reports the right state */
1031                   ret = tcp_ctl(so);
1032                   if (ret == 1) {
1033                     soisfconnected(so);
1034                     so->so_state &= ~SS_CTL;   /* success XXX */
1035                   } else if (ret == 2) {
1036                     so->so_state = SS_NOFDREF; /* CTL_CMD */
1037                   } else {
1038                     needoutput = 1;
1039                     tp->t_state = TCPS_FIN_WAIT_1;
1040                   }
1041                 } else {
1042                   soisfconnected(so);
1043                 }
1044
1045                 /* Do window scaling? */
1046 /*              if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1047  *                      (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1048  *                      tp->snd_scale = tp->requested_s_scale;
1049  *                      tp->rcv_scale = tp->request_r_scale;
1050  *              }
1051  */
1052                 (void) tcp_reass(tp, (struct tcpiphdr *)0, (struct mbuf *)0);
1053                 tp->snd_wl1 = ti->ti_seq - 1;
1054                 /* Avoid ack processing; snd_una==ti_ack  =>  dup ack */
1055                 goto synrx_to_est;
1056                 /* fall into ... */
1057
1058         /*
1059          * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
1060          * ACKs.  If the ack is in the range
1061          *      tp->snd_una < ti->ti_ack <= tp->snd_max
1062          * then advance tp->snd_una to ti->ti_ack and drop
1063          * data from the retransmission queue.  If this ACK reflects
1064          * more up to date window information we update our window information.
1065          */
1066         case TCPS_ESTABLISHED:
1067         case TCPS_FIN_WAIT_1:
1068         case TCPS_FIN_WAIT_2:
1069         case TCPS_CLOSE_WAIT:
1070         case TCPS_CLOSING:
1071         case TCPS_LAST_ACK:
1072         case TCPS_TIME_WAIT:
1073
1074                 if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) {
1075                         if (ti->ti_len == 0 && tiwin == tp->snd_wnd) {
1076                           STAT(tcpstat.tcps_rcvdupack++);
1077                           DEBUG_MISC((dfd," dup ack  m = %lx  so = %lx \n",
1078                                       (long )m, (long )so));
1079                                 /*
1080                                  * If we have outstanding data (other than
1081                                  * a window probe), this is a completely
1082                                  * duplicate ack (ie, window info didn't
1083                                  * change), the ack is the biggest we've
1084                                  * seen and we've seen exactly our rexmt
1085                                  * threshold of them, assume a packet
1086                                  * has been dropped and retransmit it.
1087                                  * Kludge snd_nxt & the congestion
1088                                  * window so we send only this one
1089                                  * packet.
1090                                  *
1091                                  * We know we're losing at the current
1092                                  * window size so do congestion avoidance
1093                                  * (set ssthresh to half the current window
1094                                  * and pull our congestion window back to
1095                                  * the new ssthresh).
1096                                  *
1097                                  * Dup acks mean that packets have left the
1098                                  * network (they're now cached at the receiver)
1099                                  * so bump cwnd by the amount in the receiver
1100                                  * to keep a constant cwnd packets in the
1101                                  * network.
1102                                  */
1103                                 if (tp->t_timer[TCPT_REXMT] == 0 ||
1104                                     ti->ti_ack != tp->snd_una)
1105                                         tp->t_dupacks = 0;
1106                                 else if (++tp->t_dupacks == TCPREXMTTHRESH) {
1107                                         tcp_seq onxt = tp->snd_nxt;
1108                                         u_int win =
1109                                             min(tp->snd_wnd, tp->snd_cwnd) / 2 /
1110                                                 tp->t_maxseg;
1111
1112                                         if (win < 2)
1113                                                 win = 2;
1114                                         tp->snd_ssthresh = win * tp->t_maxseg;
1115                                         tp->t_timer[TCPT_REXMT] = 0;
1116                                         tp->t_rtt = 0;
1117                                         tp->snd_nxt = ti->ti_ack;
1118                                         tp->snd_cwnd = tp->t_maxseg;
1119                                         (void) tcp_output(tp);
1120                                         tp->snd_cwnd = tp->snd_ssthresh +
1121                                                tp->t_maxseg * tp->t_dupacks;
1122                                         if (SEQ_GT(onxt, tp->snd_nxt))
1123                                                 tp->snd_nxt = onxt;
1124                                         goto drop;
1125                                 } else if (tp->t_dupacks > TCPREXMTTHRESH) {
1126                                         tp->snd_cwnd += tp->t_maxseg;
1127                                         (void) tcp_output(tp);
1128                                         goto drop;
1129                                 }
1130                         } else
1131                                 tp->t_dupacks = 0;
1132                         break;
1133                 }
1134         synrx_to_est:
1135                 /*
1136                  * If the congestion window was inflated to account
1137                  * for the other side's cached packets, retract it.
1138                  */
1139                 if (tp->t_dupacks > TCPREXMTTHRESH &&
1140                     tp->snd_cwnd > tp->snd_ssthresh)
1141                         tp->snd_cwnd = tp->snd_ssthresh;
1142                 tp->t_dupacks = 0;
1143                 if (SEQ_GT(ti->ti_ack, tp->snd_max)) {
1144                         STAT(tcpstat.tcps_rcvacktoomuch++);
1145                         goto dropafterack;
1146                 }
1147                 acked = ti->ti_ack - tp->snd_una;
1148                 STAT(tcpstat.tcps_rcvackpack++);
1149                 STAT(tcpstat.tcps_rcvackbyte += acked);
1150
1151                 /*
1152                  * If we have a timestamp reply, update smoothed
1153                  * round trip time.  If no timestamp is present but
1154                  * transmit timer is running and timed sequence
1155                  * number was acked, update smoothed round trip time.
1156                  * Since we now have an rtt measurement, cancel the
1157                  * timer backoff (cf., Phil Karn's retransmit alg.).
1158                  * Recompute the initial retransmit timer.
1159                  */
1160 /*              if (ts_present)
1161  *                      tcp_xmit_timer(tp, tcp_now-ts_ecr+1);
1162  *              else
1163  */
1164                      if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq))
1165                         tcp_xmit_timer(tp,tp->t_rtt);
1166
1167                 /*
1168                  * If all outstanding data is acked, stop retransmit
1169                  * timer and remember to restart (more output or persist).
1170                  * If there is more data to be acked, restart retransmit
1171                  * timer, using current (possibly backed-off) value.
1172                  */
1173                 if (ti->ti_ack == tp->snd_max) {
1174                         tp->t_timer[TCPT_REXMT] = 0;
1175                         needoutput = 1;
1176                 } else if (tp->t_timer[TCPT_PERSIST] == 0)
1177                         tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
1178                 /*
1179                  * When new data is acked, open the congestion window.
1180                  * If the window gives us less than ssthresh packets
1181                  * in flight, open exponentially (maxseg per packet).
1182                  * Otherwise open linearly: maxseg per window
1183                  * (maxseg^2 / cwnd per packet).
1184                  */
1185                 {
1186                   register u_int cw = tp->snd_cwnd;
1187                   register u_int incr = tp->t_maxseg;
1188
1189                   if (cw > tp->snd_ssthresh)
1190                     incr = incr * incr / cw;
1191                   tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale);
1192                 }
1193                 if (acked > so->so_snd.sb_cc) {
1194                         tp->snd_wnd -= so->so_snd.sb_cc;
1195                         sbdrop(&so->so_snd, (int )so->so_snd.sb_cc);
1196                         ourfinisacked = 1;
1197                 } else {
1198                         sbdrop(&so->so_snd, acked);
1199                         tp->snd_wnd -= acked;
1200                         ourfinisacked = 0;
1201                 }
1202                 /*
1203                  * XXX sowwakeup is called when data is acked and there's room
1204                  * for more data... it should read() the socket
1205                  */
1206 /*              if (so->so_snd.sb_flags & SB_NOTIFY)
1207  *                      sowwakeup(so);
1208  */
1209                 tp->snd_una = ti->ti_ack;
1210                 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1211                         tp->snd_nxt = tp->snd_una;
1212
1213                 switch (tp->t_state) {
1214
1215                 /*
1216                  * In FIN_WAIT_1 STATE in addition to the processing
1217                  * for the ESTABLISHED state if our FIN is now acknowledged
1218                  * then enter FIN_WAIT_2.
1219                  */
1220                 case TCPS_FIN_WAIT_1:
1221                         if (ourfinisacked) {
1222                                 /*
1223                                  * If we can't receive any more
1224                                  * data, then closing user can proceed.
1225                                  * Starting the timer is contrary to the
1226                                  * specification, but if we don't get a FIN
1227                                  * we'll hang forever.
1228                                  */
1229                                 if (so->so_state & SS_FCANTRCVMORE) {
1230                                         soisfdisconnected(so);
1231                                         tp->t_timer[TCPT_2MSL] = TCP_MAXIDLE;
1232                                 }
1233                                 tp->t_state = TCPS_FIN_WAIT_2;
1234                         }
1235                         break;
1236
1237                 /*
1238                  * In CLOSING STATE in addition to the processing for
1239                  * the ESTABLISHED state if the ACK acknowledges our FIN
1240                  * then enter the TIME-WAIT state, otherwise ignore
1241                  * the segment.
1242                  */
1243                 case TCPS_CLOSING:
1244                         if (ourfinisacked) {
1245                                 tp->t_state = TCPS_TIME_WAIT;
1246                                 tcp_canceltimers(tp);
1247                                 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1248                                 soisfdisconnected(so);
1249                         }
1250                         break;
1251
1252                 /*
1253                  * In LAST_ACK, we may still be waiting for data to drain
1254                  * and/or to be acked, as well as for the ack of our FIN.
1255                  * If our FIN is now acknowledged, delete the TCB,
1256                  * enter the closed state and return.
1257                  */
1258                 case TCPS_LAST_ACK:
1259                         if (ourfinisacked) {
1260                                 tp = tcp_close(tp);
1261                                 goto drop;
1262                         }
1263                         break;
1264
1265                 /*
1266                  * In TIME_WAIT state the only thing that should arrive
1267                  * is a retransmission of the remote FIN.  Acknowledge
1268                  * it and restart the finack timer.
1269                  */
1270                 case TCPS_TIME_WAIT:
1271                         tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1272                         goto dropafterack;
1273                 }
1274         } /* switch(tp->t_state) */
1275
1276 step6:
1277         /*
1278          * Update window information.
1279          * Don't look at window if no ACK: TAC's send garbage on first SYN.
1280          */
1281         if ((tiflags & TH_ACK) &&
1282             (SEQ_LT(tp->snd_wl1, ti->ti_seq) ||
1283             (tp->snd_wl1 == ti->ti_seq && (SEQ_LT(tp->snd_wl2, ti->ti_ack) ||
1284             (tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd))))) {
1285                 /* keep track of pure window updates */
1286                 if (ti->ti_len == 0 &&
1287                     tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd)
1288                         STAT(tcpstat.tcps_rcvwinupd++);
1289                 tp->snd_wnd = tiwin;
1290                 tp->snd_wl1 = ti->ti_seq;
1291                 tp->snd_wl2 = ti->ti_ack;
1292                 if (tp->snd_wnd > tp->max_sndwnd)
1293                         tp->max_sndwnd = tp->snd_wnd;
1294                 needoutput = 1;
1295         }
1296
1297         /*
1298          * Process segments with URG.
1299          */
1300         if ((tiflags & TH_URG) && ti->ti_urp &&
1301             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1302                 /*
1303                  * This is a kludge, but if we receive and accept
1304                  * random urgent pointers, we'll crash in
1305                  * soreceive.  It's hard to imagine someone
1306                  * actually wanting to send this much urgent data.
1307                  */
1308                 if (ti->ti_urp + so->so_rcv.sb_cc > so->so_rcv.sb_datalen) {
1309                         ti->ti_urp = 0;
1310                         tiflags &= ~TH_URG;
1311                         goto dodata;
1312                 }
1313                 /*
1314                  * If this segment advances the known urgent pointer,
1315                  * then mark the data stream.  This should not happen
1316                  * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
1317                  * a FIN has been received from the remote side.
1318                  * In these states we ignore the URG.
1319                  *
1320                  * According to RFC961 (Assigned Protocols),
1321                  * the urgent pointer points to the last octet
1322                  * of urgent data.  We continue, however,
1323                  * to consider it to indicate the first octet
1324                  * of data past the urgent section as the original
1325                  * spec states (in one of two places).
1326                  */
1327                 if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) {
1328                         tp->rcv_up = ti->ti_seq + ti->ti_urp;
1329                         so->so_urgc =  so->so_rcv.sb_cc +
1330                                 (tp->rcv_up - tp->rcv_nxt); /* -1; */
1331                         tp->rcv_up = ti->ti_seq + ti->ti_urp;
1332
1333                 }
1334         } else
1335                 /*
1336                  * If no out of band data is expected,
1337                  * pull receive urgent pointer along
1338                  * with the receive window.
1339                  */
1340                 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
1341                         tp->rcv_up = tp->rcv_nxt;
1342 dodata:
1343
1344         /*
1345          * Process the segment text, merging it into the TCP sequencing queue,
1346          * and arranging for acknowledgment of receipt if necessary.
1347          * This process logically involves adjusting tp->rcv_wnd as data
1348          * is presented to the user (this happens in tcp_usrreq.c,
1349          * case PRU_RCVD).  If a FIN has already been received on this
1350          * connection then we just ignore the text.
1351          */
1352         if ((ti->ti_len || (tiflags&TH_FIN)) &&
1353             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1354                 TCP_REASS(tp, ti, m, so, tiflags);
1355                 /*
1356                  * Note the amount of data that peer has sent into
1357                  * our window, in order to estimate the sender's
1358                  * buffer size.
1359                  */
1360                 len = so->so_rcv.sb_datalen - (tp->rcv_adv - tp->rcv_nxt);
1361         } else {
1362                 m_free(m);
1363                 tiflags &= ~TH_FIN;
1364         }
1365
1366         /*
1367          * If FIN is received ACK the FIN and let the user know
1368          * that the connection is closing.
1369          */
1370         if (tiflags & TH_FIN) {
1371                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1372                         /*
1373                          * If we receive a FIN we can't send more data,
1374                          * set it SS_FDRAIN
1375                          * Shutdown the socket if there is no rx data in the
1376                          * buffer.
1377                          * soread() is called on completion of shutdown() and
1378                          * will go to TCPS_LAST_ACK, and use tcp_output()
1379                          * to send the FIN.
1380                          */
1381 /*                      sofcantrcvmore(so); */
1382                         sofwdrain(so);
1383
1384                         tp->t_flags |= TF_ACKNOW;
1385                         tp->rcv_nxt++;
1386                 }
1387                 switch (tp->t_state) {
1388
1389                 /*
1390                  * In SYN_RECEIVED and ESTABLISHED STATES
1391                  * enter the CLOSE_WAIT state.
1392                  */
1393                 case TCPS_SYN_RECEIVED:
1394                 case TCPS_ESTABLISHED:
1395                   if(so->so_emu == EMU_CTL)        /* no shutdown on socket */
1396                     tp->t_state = TCPS_LAST_ACK;
1397                   else
1398                     tp->t_state = TCPS_CLOSE_WAIT;
1399                   break;
1400
1401                 /*
1402                  * If still in FIN_WAIT_1 STATE FIN has not been acked so
1403                  * enter the CLOSING state.
1404                  */
1405                 case TCPS_FIN_WAIT_1:
1406                         tp->t_state = TCPS_CLOSING;
1407                         break;
1408
1409                 /*
1410                  * In FIN_WAIT_2 state enter the TIME_WAIT state,
1411                  * starting the time-wait timer, turning off the other
1412                  * standard timers.
1413                  */
1414                 case TCPS_FIN_WAIT_2:
1415                         tp->t_state = TCPS_TIME_WAIT;
1416                         tcp_canceltimers(tp);
1417                         tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1418                         soisfdisconnected(so);
1419                         break;
1420
1421                 /*
1422                  * In TIME_WAIT state restart the 2 MSL time_wait timer.
1423                  */
1424                 case TCPS_TIME_WAIT:
1425                         tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1426                         break;
1427                 }
1428         }
1429
1430         /*
1431          * If this is a small packet, then ACK now - with Nagle
1432          *      congestion avoidance sender won't send more until
1433          *      he gets an ACK.
1434          *
1435          * See above.
1436          */
1437 /*      if (ti->ti_len && (unsigned)ti->ti_len < tp->t_maxseg) {
1438  */
1439 /*      if ((ti->ti_len && (unsigned)ti->ti_len < tp->t_maxseg &&
1440  *              (so->so_iptos & IPTOS_LOWDELAY) == 0) ||
1441  *             ((so->so_iptos & IPTOS_LOWDELAY) &&
1442  *             ((struct tcpiphdr_2 *)ti)->first_char == (char)27)) {
1443  */
1444         if (ti->ti_len && (unsigned)ti->ti_len <= 5 &&
1445             ((struct tcpiphdr_2 *)ti)->first_char == (char)27) {
1446                 tp->t_flags |= TF_ACKNOW;
1447         }
1448
1449         /*
1450          * Return any desired output.
1451          */
1452         if (needoutput || (tp->t_flags & TF_ACKNOW)) {
1453                 (void) tcp_output(tp);
1454         }
1455         return;
1456
1457 dropafterack:
1458         /*
1459          * Generate an ACK dropping incoming segment if it occupies
1460          * sequence space, where the ACK reflects our state.
1461          */
1462         if (tiflags & TH_RST)
1463                 goto drop;
1464         m_freem(m);
1465         tp->t_flags |= TF_ACKNOW;
1466         (void) tcp_output(tp);
1467         return;
1468
1469 dropwithreset:
1470         /* reuses m if m!=NULL, m_free() unnecessary */
1471         if (tiflags & TH_ACK)
1472                 tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST);
1473         else {
1474                 if (tiflags & TH_SYN) ti->ti_len++;
1475                 tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0,
1476                     TH_RST|TH_ACK);
1477         }
1478
1479         return;
1480
1481 drop:
1482         /*
1483          * Drop space held by incoming segment and return.
1484          */
1485         m_free(m);
1486
1487         return;
1488 }
1489
1490  /* , ts_present, ts_val, ts_ecr) */
1491 /*      int *ts_present;
1492  *      u_int32_t *ts_val, *ts_ecr;
1493  */
1494 static void
1495 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcpiphdr *ti)
1496 {
1497         u_int16_t mss;
1498         int opt, optlen;
1499
1500         DEBUG_CALL("tcp_dooptions");
1501         DEBUG_ARGS((dfd," tp = %lx  cnt=%i \n", (long )tp, cnt));
1502
1503         for (; cnt > 0; cnt -= optlen, cp += optlen) {
1504                 opt = cp[0];
1505                 if (opt == TCPOPT_EOL)
1506                         break;
1507                 if (opt == TCPOPT_NOP)
1508                         optlen = 1;
1509                 else {
1510                         optlen = cp[1];
1511                         if (optlen <= 0)
1512                                 break;
1513                 }
1514                 switch (opt) {
1515
1516                 default:
1517                         continue;
1518
1519                 case TCPOPT_MAXSEG:
1520                         if (optlen != TCPOLEN_MAXSEG)
1521                                 continue;
1522                         if (!(ti->ti_flags & TH_SYN))
1523                                 continue;
1524                         memcpy((char *) &mss, (char *) cp + 2, sizeof(mss));
1525                         NTOHS(mss);
1526                         (void) tcp_mss(tp, mss);        /* sets t_maxseg */
1527                         break;
1528
1529 /*              case TCPOPT_WINDOW:
1530  *                      if (optlen != TCPOLEN_WINDOW)
1531  *                              continue;
1532  *                      if (!(ti->ti_flags & TH_SYN))
1533  *                              continue;
1534  *                      tp->t_flags |= TF_RCVD_SCALE;
1535  *                      tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
1536  *                      break;
1537  */
1538 /*              case TCPOPT_TIMESTAMP:
1539  *                      if (optlen != TCPOLEN_TIMESTAMP)
1540  *                              continue;
1541  *                      *ts_present = 1;
1542  *                      memcpy((char *) ts_val, (char *)cp + 2, sizeof(*ts_val));
1543  *                      NTOHL(*ts_val);
1544  *                      memcpy((char *) ts_ecr, (char *)cp + 6, sizeof(*ts_ecr));
1545  *                      NTOHL(*ts_ecr);
1546  *
1547  */                     /*
1548  *                       * A timestamp received in a SYN makes
1549  *                       * it ok to send timestamp requests and replies.
1550  *                       */
1551 /*                      if (ti->ti_flags & TH_SYN) {
1552  *                              tp->t_flags |= TF_RCVD_TSTMP;
1553  *                              tp->ts_recent = *ts_val;
1554  *                              tp->ts_recent_age = tcp_now;
1555  *                      }
1556  */                     break;
1557                 }
1558         }
1559 }
1560
1561
/*
 * Pull out of band byte out of a segment so
 * it doesn't appear in the user's data queue.
 * It is still reflected in the segment length for
 * sequencing purposes.
 *
 *   so - socket the segment was received on
 *   ti - header of the segment; ti_urp locates the urgent byte
 *   m  - mbuf holding the segment data
 *
 * The urgent byte is saved in the control block's t_iobc, flagged with
 * TCPOOB_HAVEDATA, and the bytes following it are shifted down by one.
 * Panics if the urgent byte is not found in the mbuf chain.
 *
 * This code is currently compiled out (notdef is not defined).
 */

#ifdef notdef

void
tcp_pulloutofband(struct socket *so, struct tcpiphdr *ti,
                  register struct mbuf *m)
{
        int cnt = ti->ti_urp - 1;

        while (cnt >= 0) {
                if (m->m_len > cnt) {
                        char *cp = mtod(m, caddr_t) + cnt;
                        struct tcpcb *tp = sototcpcb(so);

                        tp->t_iobc = *cp;
                        tp->t_oobflags |= TCPOOB_HAVEDATA;
                        /*
                         * Close the gap left by the urgent byte.  Source
                         * and destination overlap, so memmove is required.
                         */
                        memmove(cp, cp + 1, (unsigned)(m->m_len - cnt - 1));
                        m->m_len--;
                        return;
                }
                cnt -= m->m_len;
                m = m->m_next; /* XXX WRONG! Fix it! */
                if (m == 0)
                        break;
        }
        panic("tcp_pulloutofband");
}

#endif /* notdef */
1599
1600 /*
1601  * Collect new round-trip time estimate
1602  * and update averages and current timeout.
1603  */
1604
1605 static void
1606 tcp_xmit_timer(register struct tcpcb *tp, int rtt)
1607 {
1608         register short delta;
1609
1610         DEBUG_CALL("tcp_xmit_timer");
1611         DEBUG_ARG("tp = %lx", (long)tp);
1612         DEBUG_ARG("rtt = %d", rtt);
1613
1614         STAT(tcpstat.tcps_rttupdated++);
1615         if (tp->t_srtt != 0) {
1616                 /*
1617                  * srtt is stored as fixed point with 3 bits after the
1618                  * binary point (i.e., scaled by 8).  The following magic
1619                  * is equivalent to the smoothing algorithm in rfc793 with
1620                  * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
1621                  * point).  Adjust rtt to origin 0.
1622                  */
1623                 delta = rtt - 1 - (tp->t_srtt >> TCP_RTT_SHIFT);
1624                 if ((tp->t_srtt += delta) <= 0)
1625                         tp->t_srtt = 1;
1626                 /*
1627                  * We accumulate a smoothed rtt variance (actually, a
1628                  * smoothed mean difference), then set the retransmit
1629                  * timer to smoothed rtt + 4 times the smoothed variance.
1630                  * rttvar is stored as fixed point with 2 bits after the
1631                  * binary point (scaled by 4).  The following is
1632                  * equivalent to rfc793 smoothing with an alpha of .75
1633                  * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
1634                  * rfc793's wired-in beta.
1635                  */
1636                 if (delta < 0)
1637                         delta = -delta;
1638                 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
1639                 if ((tp->t_rttvar += delta) <= 0)
1640                         tp->t_rttvar = 1;
1641         } else {
1642                 /*
1643                  * No rtt measurement yet - use the unsmoothed rtt.
1644                  * Set the variance to half the rtt (so our first
1645                  * retransmit happens at 3*rtt).
1646                  */
1647                 tp->t_srtt = rtt << TCP_RTT_SHIFT;
1648                 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
1649         }
1650         tp->t_rtt = 0;
1651         tp->t_rxtshift = 0;
1652
1653         /*
1654          * the retransmit should happen at rtt + 4 * rttvar.
1655          * Because of the way we do the smoothing, srtt and rttvar
1656          * will each average +1/2 tick of bias.  When we compute
1657          * the retransmit timer, we want 1/2 tick of rounding and
1658          * 1 extra tick because of +-1/2 tick uncertainty in the
1659          * firing of the timer.  The bias will give us exactly the
1660          * 1.5 tick we need.  But, because the bias is
1661          * statistical, we have to test that we don't drop below
1662          * the minimum feasible timer (which is 2 ticks).
1663          */
1664         TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
1665             (short)tp->t_rttmin, TCPTV_REXMTMAX); /* XXX */
1666
1667         /*
1668          * We received an ack for a packet that wasn't retransmitted;
1669          * it is probably safe to discard any error indications we've
1670          * received recently.  This isn't quite right, but close enough
1671          * for now (a route might have failed after we sent a segment,
1672          * and the return path might not be symmetrical).
1673          */
1674         tp->t_softerror = 0;
1675 }
1676
1677 /*
1678  * Determine a reasonable value for maxseg size.
1679  * If the route is known, check route for mtu.
1680  * If none, use an mss that can be handled on the outgoing
1681  * interface without forcing IP to fragment; if bigger than
1682  * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
1683  * to utilize large mbufs.  If no route is found, route has no mtu,
1684  * or the destination isn't local, use a default, hopefully conservative
1685  * size (usually 512 or the default IP max size, but no more than the mtu
1686  * of the interface), as we can't discover anything about intervening
1687  * gateways or networks.  We also initialize the congestion/slow start
1688  * window to be a single segment if the destination isn't local.
1689  * While looking at the routing entry, we also initialize other path-dependent
1690  * parameters from pre-set or cached values in the routing entry.
1691  */
1692
1693 int
1694 tcp_mss(tp, offer)
1695         register struct tcpcb *tp;
1696         u_int offer;
1697 {
1698         struct socket *so = tp->t_socket;
1699         int mss;
1700
1701         DEBUG_CALL("tcp_mss");
1702         DEBUG_ARG("tp = %lx", (long)tp);
1703         DEBUG_ARG("offer = %d", offer);
1704
1705         mss = min(IF_MTU, IF_MRU) - sizeof(struct tcpiphdr);
1706         if (offer)
1707                 mss = min(mss, offer);
1708         mss = max(mss, 32);
1709         if (mss < tp->t_maxseg || offer != 0)
1710            tp->t_maxseg = mss;
1711
1712         tp->snd_cwnd = mss;
1713
1714         sbreserve(&so->so_snd, TCP_SNDSPACE + ((TCP_SNDSPACE % mss) ?
1715                                                (mss - (TCP_SNDSPACE % mss)) :
1716                                                0));
1717         sbreserve(&so->so_rcv, TCP_RCVSPACE + ((TCP_RCVSPACE % mss) ?
1718                                                (mss - (TCP_RCVSPACE % mss)) :
1719                                                0));
1720
1721         DEBUG_MISC((dfd, " returning mss = %d\n", mss));
1722
1723         return mss;
1724 }