40 #include <linux/module.h> |
40 #include <linux/module.h> |
41 |
41 |
42 /* People can turn this off for buggy TCP's found in printers etc. */ |
42 /* People can turn this off for buggy TCP's found in printers etc. */ |
43 int sysctl_tcp_retrans_collapse __read_mostly = 1; |
43 int sysctl_tcp_retrans_collapse __read_mostly = 1; |
44 |
44 |
45 /* People can turn this on to work with those rare, broken TCPs that |
45 /* People can turn this on to work with those rare, broken TCPs that |
46 * interpret the window field as a signed quantity. |
46 * interpret the window field as a signed quantity. |
47 */ |
47 */ |
48 int sysctl_tcp_workaround_signed_windows __read_mostly = 0; |
48 int sysctl_tcp_workaround_signed_windows __read_mostly = 0; |
49 |
49 |
50 /* This limits the percentage of the congestion window which we |
50 /* This limits the percentage of the congestion window which we |
482 opts->tsecr = tp->rx_opt.ts_recent; |
482 opts->tsecr = tp->rx_opt.ts_recent; |
483 size += TCPOLEN_TSTAMP_ALIGNED; |
483 size += TCPOLEN_TSTAMP_ALIGNED; |
484 } |
484 } |
485 if (likely(sysctl_tcp_window_scaling)) { |
485 if (likely(sysctl_tcp_window_scaling)) { |
486 opts->ws = tp->rx_opt.rcv_wscale; |
486 opts->ws = tp->rx_opt.rcv_wscale; |
487 if(likely(opts->ws)) |
487 if (likely(opts->ws)) |
488 size += TCPOLEN_WSCALE_ALIGNED; |
488 size += TCPOLEN_WSCALE_ALIGNED; |
489 } |
489 } |
490 if (likely(sysctl_tcp_sack)) { |
490 if (likely(sysctl_tcp_sack)) { |
491 opts->options |= OPTION_SACK_ADVERTISE; |
491 opts->options |= OPTION_SACK_ADVERTISE; |
492 if (unlikely(!(OPTION_TS & opts->options))) |
492 if (unlikely(!(OPTION_TS & opts->options))) |
524 opts->mss = mss; |
524 opts->mss = mss; |
525 size += TCPOLEN_MSS_ALIGNED; |
525 size += TCPOLEN_MSS_ALIGNED; |
526 |
526 |
527 if (likely(ireq->wscale_ok)) { |
527 if (likely(ireq->wscale_ok)) { |
528 opts->ws = ireq->rcv_wscale; |
528 opts->ws = ireq->rcv_wscale; |
529 if(likely(opts->ws)) |
529 if (likely(opts->ws)) |
530 size += TCPOLEN_WSCALE_ALIGNED; |
530 size += TCPOLEN_WSCALE_ALIGNED; |
531 } |
531 } |
532 if (likely(doing_ts)) { |
532 if (likely(doing_ts)) { |
533 opts->options |= OPTION_TS; |
533 opts->options |= OPTION_TS; |
534 opts->tsval = TCP_SKB_CB(skb)->when; |
534 opts->tsval = TCP_SKB_CB(skb)->when; |
1517 * account rare use of URG, this is not a big flaw. |
1517 * account rare use of URG, this is not a big flaw. |
1518 * |
1518 * |
1519 * Returns 1, if no segments are in flight and we have queued segments, but |
1519 * Returns 1, if no segments are in flight and we have queued segments, but |
1520 * cannot send anything now because of SWS or another problem. |
1520 * cannot send anything now because of SWS or another problem. |
1521 */ |
1521 */ |
1522 static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) |
1522 static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, |
|
1523 int push_one, gfp_t gfp) |
1523 { |
1524 { |
1524 struct tcp_sock *tp = tcp_sk(sk); |
1525 struct tcp_sock *tp = tcp_sk(sk); |
1525 struct sk_buff *skb; |
1526 struct sk_buff *skb; |
1526 unsigned int tso_segs, sent_pkts; |
1527 unsigned int tso_segs, sent_pkts; |
1527 int cwnd_quota; |
1528 int cwnd_quota; |
1528 int result; |
1529 int result; |
1529 |
1530 |
1530 /* If we are closed, the bytes will have to remain here. |
|
1531 * In time closedown will finish, we empty the write queue and all |
|
1532 * will be happy. |
|
1533 */ |
|
1534 if (unlikely(sk->sk_state == TCP_CLOSE)) |
|
1535 return 0; |
|
1536 |
|
1537 sent_pkts = 0; |
1531 sent_pkts = 0; |
1538 |
1532 |
1539 /* Do MTU probing. */ |
1533 if (!push_one) { |
1540 if ((result = tcp_mtu_probe(sk)) == 0) { |
1534 /* Do MTU probing. */ |
1541 return 0; |
1535 result = tcp_mtu_probe(sk); |
1542 } else if (result > 0) { |
1536 if (!result) { |
1543 sent_pkts = 1; |
1537 return 0; |
|
1538 } else if (result > 0) { |
|
1539 sent_pkts = 1; |
|
1540 } |
1544 } |
1541 } |
1545 |
1542 |
1546 while ((skb = tcp_send_head(sk))) { |
1543 while ((skb = tcp_send_head(sk))) { |
1547 unsigned int limit; |
1544 unsigned int limit; |
1548 |
1545 |
1575 unlikely(tso_fragment(sk, skb, limit, mss_now))) |
1572 unlikely(tso_fragment(sk, skb, limit, mss_now))) |
1576 break; |
1573 break; |
1577 |
1574 |
1578 TCP_SKB_CB(skb)->when = tcp_time_stamp; |
1575 TCP_SKB_CB(skb)->when = tcp_time_stamp; |
1579 |
1576 |
1580 if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC))) |
1577 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) |
1581 break; |
1578 break; |
1582 |
1579 |
1583 /* Advance the send_head. This one is sent out. |
1580 /* Advance the send_head. This one is sent out. |
1584 * This call will increment packets_out. |
1581 * This call will increment packets_out. |
1585 */ |
1582 */ |
1586 tcp_event_new_data_sent(sk, skb); |
1583 tcp_event_new_data_sent(sk, skb); |
1587 |
1584 |
1588 tcp_minshall_update(tp, mss_now, skb); |
1585 tcp_minshall_update(tp, mss_now, skb); |
1589 sent_pkts++; |
1586 sent_pkts++; |
|
1587 |
|
1588 if (push_one) |
|
1589 break; |
1590 } |
1590 } |
1591 |
1591 |
1592 if (likely(sent_pkts)) { |
1592 if (likely(sent_pkts)) { |
1593 tcp_cwnd_validate(sk); |
1593 tcp_cwnd_validate(sk); |
1594 return 0; |
1594 return 0; |
1603 void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, |
1603 void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, |
1604 int nonagle) |
1604 int nonagle) |
1605 { |
1605 { |
1606 struct sk_buff *skb = tcp_send_head(sk); |
1606 struct sk_buff *skb = tcp_send_head(sk); |
1607 |
1607 |
1608 if (skb) { |
1608 if (!skb) |
1609 if (tcp_write_xmit(sk, cur_mss, nonagle)) |
1609 return; |
1610 tcp_check_probe_timer(sk); |
1610 |
1611 } |
1611 /* If we are closed, the bytes will have to remain here. |
|
1612 * In time closedown will finish, we empty the write queue and |
|
1613 * all will be happy. |
|
1614 */ |
|
1615 if (unlikely(sk->sk_state == TCP_CLOSE)) |
|
1616 return; |
|
1617 |
|
1618 if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC)) |
|
1619 tcp_check_probe_timer(sk); |
1612 } |
1620 } |
1613 |
1621 |
1614 /* Send _single_ skb sitting at the send head. This function requires |
1622 /* Send _single_ skb sitting at the send head. This function requires |
1615 * true push pending frames to setup probe timer etc. |
1623 * true push pending frames to setup probe timer etc. |
1616 */ |
1624 */ |
1617 void tcp_push_one(struct sock *sk, unsigned int mss_now) |
1625 void tcp_push_one(struct sock *sk, unsigned int mss_now) |
1618 { |
1626 { |
1619 struct tcp_sock *tp = tcp_sk(sk); |
|
1620 struct sk_buff *skb = tcp_send_head(sk); |
1627 struct sk_buff *skb = tcp_send_head(sk); |
1621 unsigned int tso_segs, cwnd_quota; |
|
1622 |
1628 |
1623 BUG_ON(!skb || skb->len < mss_now); |
1629 BUG_ON(!skb || skb->len < mss_now); |
1624 |
1630 |
1625 tso_segs = tcp_init_tso_segs(sk, skb, mss_now); |
1631 tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation); |
1626 cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH); |
|
1627 |
|
1628 if (likely(cwnd_quota)) { |
|
1629 unsigned int limit; |
|
1630 |
|
1631 BUG_ON(!tso_segs); |
|
1632 |
|
1633 limit = mss_now; |
|
1634 if (tso_segs > 1 && !tcp_urg_mode(tp)) |
|
1635 limit = tcp_mss_split_point(sk, skb, mss_now, |
|
1636 cwnd_quota); |
|
1637 |
|
1638 if (skb->len > limit && |
|
1639 unlikely(tso_fragment(sk, skb, limit, mss_now))) |
|
1640 return; |
|
1641 |
|
1642 /* Send it out now. */ |
|
1643 TCP_SKB_CB(skb)->when = tcp_time_stamp; |
|
1644 |
|
1645 if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) { |
|
1646 tcp_event_new_data_sent(sk, skb); |
|
1647 tcp_cwnd_validate(sk); |
|
1648 return; |
|
1649 } |
|
1650 } |
|
1651 } |
1632 } |
1652 |
1633 |
1653 /* This function returns the amount that we can raise the |
1634 /* This function returns the amount that we can raise the |
1654 * usable window based on the following constraints |
1635 * usable window based on the following constraints |
1655 * |
1636 * |
1765 } |
1746 } |
1766 |
1747 |
1767 return window; |
1748 return window; |
1768 } |
1749 } |
1769 |
1750 |
1770 /* Attempt to collapse two adjacent SKB's during retransmission. */ |
1751 /* Collapses two adjacent SKB's during retransmission. */ |
1771 static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, |
1752 static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) |
1772 int mss_now) |
|
1773 { |
1753 { |
1774 struct tcp_sock *tp = tcp_sk(sk); |
1754 struct tcp_sock *tp = tcp_sk(sk); |
1775 struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); |
1755 struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); |
1776 int skb_size, next_skb_size; |
1756 int skb_size, next_skb_size; |
1777 u16 flags; |
1757 u16 flags; |
1778 |
1758 |
1779 /* The first test we must make is that neither of these two |
|
1780 * SKB's are still referenced by someone else. |
|
1781 */ |
|
1782 if (skb_cloned(skb) || skb_cloned(next_skb)) |
|
1783 return; |
|
1784 |
|
1785 skb_size = skb->len; |
1759 skb_size = skb->len; |
1786 next_skb_size = next_skb->len; |
1760 next_skb_size = next_skb->len; |
1787 flags = TCP_SKB_CB(skb)->flags; |
1761 flags = TCP_SKB_CB(skb)->flags; |
1788 |
1762 |
1789 /* Also punt if next skb has been SACK'd. */ |
|
1790 if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED) |
|
1791 return; |
|
1792 |
|
1793 /* Next skb is out of window. */ |
|
1794 if (after(TCP_SKB_CB(next_skb)->end_seq, tcp_wnd_end(tp))) |
|
1795 return; |
|
1796 |
|
1797 /* Punt if not enough space exists in the first SKB for |
|
1798 * the data in the second, or the total combined payload |
|
1799 * would exceed the MSS. |
|
1800 */ |
|
1801 if ((next_skb_size > skb_tailroom(skb)) || |
|
1802 ((skb_size + next_skb_size) > mss_now)) |
|
1803 return; |
|
1804 |
|
1805 BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); |
1763 BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); |
1806 |
1764 |
1807 tcp_highest_sack_combine(sk, next_skb, skb); |
1765 tcp_highest_sack_combine(sk, next_skb, skb); |
1808 |
1766 |
1809 /* Ok. We will be able to collapse the packet. */ |
|
1810 tcp_unlink_write_queue(next_skb, sk); |
1767 tcp_unlink_write_queue(next_skb, sk); |
1811 |
1768 |
1812 skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size), |
1769 skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size), |
1813 next_skb_size); |
1770 next_skb_size); |
1814 |
1771 |
1846 tp->retransmit_skb_hint = skb; |
1803 tp->retransmit_skb_hint = skb; |
1847 |
1804 |
1848 sk_wmem_free_skb(sk, next_skb); |
1805 sk_wmem_free_skb(sk, next_skb); |
1849 } |
1806 } |
1850 |
1807 |
1851 /* Do a simple retransmit without using the backoff mechanisms in |
1808 static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb) |
1852 * tcp_timer. This is used for path mtu discovery. |
1809 { |
1853 * The socket is already locked here. |
1810 if (tcp_skb_pcount(skb) > 1) |
1854 */ |
1811 return 0; |
1855 void tcp_simple_retransmit(struct sock *sk) |
1812 /* TODO: SACK collapsing could be used to remove this condition */ |
1856 { |
1813 if (skb_shinfo(skb)->nr_frags != 0) |
1857 const struct inet_connection_sock *icsk = inet_csk(sk); |
1814 return 0; |
1858 struct tcp_sock *tp = tcp_sk(sk); |
1815 if (skb_cloned(skb)) |
1859 struct sk_buff *skb; |
1816 return 0; |
1860 unsigned int mss = tcp_current_mss(sk, 0); |
1817 if (skb == tcp_send_head(sk)) |
1861 u32 prior_lost = tp->lost_out; |
1818 return 0; |
1862 |
1819 /* Some heuristics for collapsing over SACK'd could be invented */ |
1863 tcp_for_write_queue(skb, sk) { |
1820 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) |
1864 if (skb == tcp_send_head(sk)) |
1821 return 0; |
|
1822 |
|
1823 return 1; |
|
1824 } |
|
1825 |
|
1826 static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, |
|
1827 int space) |
|
1828 { |
|
1829 struct tcp_sock *tp = tcp_sk(sk); |
|
1830 struct sk_buff *skb = to, *tmp; |
|
1831 int first = 1; |
|
1832 |
|
1833 if (!sysctl_tcp_retrans_collapse) |
|
1834 return; |
|
1835 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) |
|
1836 return; |
|
1837 |
|
1838 tcp_for_write_queue_from_safe(skb, tmp, sk) { |
|
1839 if (!tcp_can_collapse(sk, skb)) |
1865 break; |
1840 break; |
1866 if (skb->len > mss && |
1841 |
1867 !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { |
1842 space -= skb->len; |
1868 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { |
1843 |
1869 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; |
1844 if (first) { |
1870 tp->retrans_out -= tcp_skb_pcount(skb); |
1845 first = 0; |
1871 } |
1846 continue; |
1872 tcp_skb_mark_lost_uncond_verify(tp, skb); |
|
1873 } |
1847 } |
1874 } |
1848 |
1875 |
1849 if (space < 0) |
1876 tcp_clear_retrans_hints_partial(tp); |
1850 break; |
1877 |
1851 /* Punt if not enough space exists in the first SKB for |
1878 if (prior_lost == tp->lost_out) |
1852 * the data in the second |
1879 return; |
1853 */ |
1880 |
1854 if (skb->len > skb_tailroom(to)) |
1881 if (tcp_is_reno(tp)) |
1855 break; |
1882 tcp_limit_reno_sacked(tp); |
1856 |
1883 |
1857 if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) |
1884 tcp_verify_left_out(tp); |
1858 break; |
1885 |
1859 |
1886 /* Don't muck with the congestion window here. |
1860 tcp_collapse_retrans(sk, to); |
1887 * Reason is that we do not increase amount of _data_ |
1861 } |
1888 * in network, but units changed and effective |
|
1889 * cwnd/ssthresh really reduced now. |
|
1890 */ |
|
1891 if (icsk->icsk_ca_state != TCP_CA_Loss) { |
|
1892 tp->high_seq = tp->snd_nxt; |
|
1893 tp->snd_ssthresh = tcp_current_ssthresh(sk); |
|
1894 tp->prior_ssthresh = 0; |
|
1895 tp->undo_marker = 0; |
|
1896 tcp_set_ca_state(sk, TCP_CA_Loss); |
|
1897 } |
|
1898 tcp_xmit_retransmit_queue(sk); |
|
1899 } |
1862 } |
1900 |
1863 |
1901 /* This retransmits one SKB. Policy decisions and retransmit queue |
1864 /* This retransmits one SKB. Policy decisions and retransmit queue |
1902 * state updates are done by the caller. Returns non-zero if an |
1865 * state updates are done by the caller. Returns non-zero if an |
1903 * error occurred which prevented the send. |
1866 * error occurred which prevented the send. |
1945 if (skb->len > cur_mss) { |
1908 if (skb->len > cur_mss) { |
1946 if (tcp_fragment(sk, skb, cur_mss, cur_mss)) |
1909 if (tcp_fragment(sk, skb, cur_mss, cur_mss)) |
1947 return -ENOMEM; /* We'll try again later. */ |
1910 return -ENOMEM; /* We'll try again later. */ |
1948 } |
1911 } |
1949 |
1912 |
1950 /* Collapse two adjacent packets if worthwhile and we can. */ |
1913 tcp_retrans_try_collapse(sk, skb, cur_mss); |
1951 if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && |
|
1952 (skb->len < (cur_mss >> 1)) && |
|
1953 (!tcp_skb_is_last(sk, skb)) && |
|
1954 (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) && |
|
1955 (skb_shinfo(skb)->nr_frags == 0 && |
|
1956 skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) && |
|
1957 (tcp_skb_pcount(skb) == 1 && |
|
1958 tcp_skb_pcount(tcp_write_queue_next(sk, skb)) == 1) && |
|
1959 (sysctl_tcp_retrans_collapse != 0)) |
|
1960 tcp_retrans_try_collapse(sk, skb, cur_mss); |
|
1961 |
1914 |
1962 /* Some Solaris stacks overoptimize and ignore the FIN on a |
1915 /* Some Solaris stacks overoptimize and ignore the FIN on a |
1963 * retransmit when old data is attached. So strip it off |
1916 * retransmit when old data is attached. So strip it off |
1964 * since it is cheap to do so and saves bytes on the network. |
1917 * since it is cheap to do so and saves bytes on the network. |
1965 */ |
1918 */ |