net/ipv4/tcp_cubic.c
changeset 2 d1f6d8b6f81c
parent 0 aa628870c1d3
equal deleted inserted replaced
1:0056487c491e 2:d1f6d8b6f81c
     1 /*
     1 /*
     2  * TCP CUBIC: Binary Increase Congestion control for TCP v2.2
     2  * TCP CUBIC: Binary Increase Congestion control for TCP v2.3
     3  * Home page:
     3  * Home page:
     4  *      http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC
     4  *      http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC
     5  * This is from the implementation of CUBIC TCP in
     5  * This is from the implementation of CUBIC TCP in
     6  * Injong Rhee, Lisong Xu.
     6  * Sangtae Ha, Injong Rhee and Lisong Xu,
     7  *  "CUBIC: A New TCP-Friendly High-Speed TCP Variant
     7  *  "CUBIC: A New TCP-Friendly High-Speed TCP Variant"
     8  *  in PFLDnet 2005
     8  *  in ACM SIGOPS Operating System Review, July 2008.
     9  * Available from:
     9  * Available from:
    10  *  http://netsrv.csc.ncsu.edu/export/cubic-paper.pdf
    10  *  http://netsrv.csc.ncsu.edu/export/cubic_a_new_tcp_2008.pdf
       
    11  *
       
    12  * CUBIC integrates a new slow start algorithm, called HyStart.
       
    13  * The details of HyStart are presented in
       
    14  *  Sangtae Ha and Injong Rhee,
       
    15  *  "Taming the Elephants: New TCP Slow Start", NCSU TechReport 2008.
       
    16  * Available from:
       
    17  *  http://netsrv.csc.ncsu.edu/export/hystart_techreport_2008.pdf
       
    18  *
       
    19  * All testing results are available from:
       
    20  * http://netsrv.csc.ncsu.edu/wiki/index.php/TCP_Testing
    11  *
    21  *
    12  * Unless CUBIC is enabled and congestion window is large
    22  * Unless CUBIC is enabled and congestion window is large
    13  * this behaves the same as the original Reno.
    23  * this behaves the same as the original Reno.
    14  */
    24  */
    15 
    25 
    20 
    30 
    21 #define BICTCP_BETA_SCALE    1024	/* Scale factor beta calculation
    31 #define BICTCP_BETA_SCALE    1024	/* Scale factor beta calculation
    22 					 * max_cwnd = snd_cwnd * beta
    32 					 * max_cwnd = snd_cwnd * beta
    23 					 */
    33 					 */
    24 #define	BICTCP_HZ		10	/* BIC HZ 2^10 = 1024 */
    34 #define	BICTCP_HZ		10	/* BIC HZ 2^10 = 1024 */
       
    35 
       
    36 /* Two methods of hybrid slow start */
       
    37 #define HYSTART_ACK_TRAIN	0x1
       
    38 #define HYSTART_DELAY		0x2
       
    39 
       
    40 /* Number of delay samples for detecting the increase of delay */
       
    41 #define HYSTART_MIN_SAMPLES	8
       
    42 #define HYSTART_DELAY_MIN	(2U<<3)
       
    43 #define HYSTART_DELAY_MAX	(16U<<3)
       
    44 #define HYSTART_DELAY_THRESH(x)	clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
    25 
    45 
    26 static int fast_convergence __read_mostly = 1;
    46 static int fast_convergence __read_mostly = 1;
    27 static int beta __read_mostly = 717;	/* = 717/1024 (BICTCP_BETA_SCALE) */
    47 static int beta __read_mostly = 717;	/* = 717/1024 (BICTCP_BETA_SCALE) */
    28 static int initial_ssthresh __read_mostly;
    48 static int initial_ssthresh __read_mostly;
    29 static int bic_scale __read_mostly = 41;
    49 static int bic_scale __read_mostly = 41;
    30 static int tcp_friendliness __read_mostly = 1;
    50 static int tcp_friendliness __read_mostly = 1;
       
    51 
       
    52 static int hystart __read_mostly = 1;
       
    53 static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
       
    54 static int hystart_low_window __read_mostly = 16;
    31 
    55 
    32 static u32 cube_rtt_scale __read_mostly;
    56 static u32 cube_rtt_scale __read_mostly;
    33 static u32 beta_scale __read_mostly;
    57 static u32 beta_scale __read_mostly;
    34 static u64 cube_factor __read_mostly;
    58 static u64 cube_factor __read_mostly;
    35 
    59 
    42 MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
    66 MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
    43 module_param(bic_scale, int, 0444);
    67 module_param(bic_scale, int, 0444);
    44 MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
    68 MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
    45 module_param(tcp_friendliness, int, 0644);
    69 module_param(tcp_friendliness, int, 0644);
    46 MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
    70 MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
       
    71 module_param(hystart, int, 0644);
       
    72 MODULE_PARM_DESC(hystart, "turn on/off hybrid slow start algorithm");
       
    73 module_param(hystart_detect, int, 0644);
       
    74 MODULE_PARM_DESC(hystart_detect, "hyrbrid slow start detection mechanisms"
       
    75 		 " 1: packet-train 2: delay 3: both packet-train and delay");
       
    76 module_param(hystart_low_window, int, 0644);
       
    77 MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
    47 
    78 
    48 /* BIC TCP Parameters */
    79 /* BIC TCP Parameters */
    49 struct bictcp {
    80 struct bictcp {
    50 	u32	cnt;		/* increase cwnd by 1 after ACKs */
    81 	u32	cnt;		/* increase cwnd by 1 after ACKs */
    51 	u32 	last_max_cwnd;	/* last maximum snd_cwnd */
    82 	u32 	last_max_cwnd;	/* last maximum snd_cwnd */
    57 	u32	delay_min;	/* min delay */
    88 	u32	delay_min;	/* min delay */
    58 	u32	epoch_start;	/* beginning of an epoch */
    89 	u32	epoch_start;	/* beginning of an epoch */
    59 	u32	ack_cnt;	/* number of acks */
    90 	u32	ack_cnt;	/* number of acks */
    60 	u32	tcp_cwnd;	/* estimated tcp cwnd */
    91 	u32	tcp_cwnd;	/* estimated tcp cwnd */
    61 #define ACK_RATIO_SHIFT	4
    92 #define ACK_RATIO_SHIFT	4
    62 	u32	delayed_ack;	/* estimate the ratio of Packets/ACKs << 4 */
    93 	u16	delayed_ack;	/* estimate the ratio of Packets/ACKs << 4 */
       
    94 	u8	sample_cnt;	/* number of samples to decide curr_rtt */
       
    95 	u8	found;		/* the exit point is found? */
       
    96 	u32	round_start;	/* beginning of each round */
       
    97 	u32	end_seq;	/* end_seq of the round */
       
    98 	u32	last_jiffies;	/* last time when the ACK spacing is close */
       
    99 	u32	curr_rtt;	/* the minimum rtt of current round */
    63 };
   100 };
    64 
   101 
    65 static inline void bictcp_reset(struct bictcp *ca)
   102 static inline void bictcp_reset(struct bictcp *ca)
    66 {
   103 {
    67 	ca->cnt = 0;
   104 	ca->cnt = 0;
    74 	ca->delay_min = 0;
   111 	ca->delay_min = 0;
    75 	ca->epoch_start = 0;
   112 	ca->epoch_start = 0;
    76 	ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
   113 	ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
    77 	ca->ack_cnt = 0;
   114 	ca->ack_cnt = 0;
    78 	ca->tcp_cwnd = 0;
   115 	ca->tcp_cwnd = 0;
       
   116 	ca->found = 0;
       
   117 }
       
   118 
       
   119 static inline void bictcp_hystart_reset(struct sock *sk)
       
   120 {
       
   121 	struct tcp_sock *tp = tcp_sk(sk);
       
   122 	struct bictcp *ca = inet_csk_ca(sk);
       
   123 
       
   124 	ca->round_start = ca->last_jiffies = jiffies;
       
   125 	ca->end_seq = tp->snd_nxt;
       
   126 	ca->curr_rtt = 0;
       
   127 	ca->sample_cnt = 0;
    79 }
   128 }
    80 
   129 
    81 static void bictcp_init(struct sock *sk)
   130 static void bictcp_init(struct sock *sk)
    82 {
   131 {
    83 	bictcp_reset(inet_csk_ca(sk));
   132 	bictcp_reset(inet_csk_ca(sk));
    84 	if (initial_ssthresh)
   133 
       
   134 	if (hystart)
       
   135 		bictcp_hystart_reset(sk);
       
   136 
       
   137 	if (!hystart && initial_ssthresh)
    85 		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
   138 		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
    86 }
   139 }
    87 
   140 
    88 /* calculate the cubic root of x using a table lookup followed by one
   141 /* calculate the cubic root of x using a table lookup followed by one
    89  * Newton-Raphson iteration.
   142  * Newton-Raphson iteration.
   233 	struct bictcp *ca = inet_csk_ca(sk);
   286 	struct bictcp *ca = inet_csk_ca(sk);
   234 
   287 
   235 	if (!tcp_is_cwnd_limited(sk, in_flight))
   288 	if (!tcp_is_cwnd_limited(sk, in_flight))
   236 		return;
   289 		return;
   237 
   290 
   238 	if (tp->snd_cwnd <= tp->snd_ssthresh)
   291 	if (tp->snd_cwnd <= tp->snd_ssthresh) {
       
   292 		if (hystart && after(ack, ca->end_seq))
       
   293 			bictcp_hystart_reset(sk);
   239 		tcp_slow_start(tp);
   294 		tcp_slow_start(tp);
   240 	else {
   295 	} else {
   241 		bictcp_update(ca, tp->snd_cwnd);
   296 		bictcp_update(ca, tp->snd_cwnd);
   242 
   297 
   243 		/* In dangerous area, increase slowly.
   298 		/* In dangerous area, increase slowly.
   244 		 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
   299 		 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
   245 		 */
   300 		 */
   279 	return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd);
   334 	return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd);
   280 }
   335 }
   281 
   336 
   282 static void bictcp_state(struct sock *sk, u8 new_state)
   337 static void bictcp_state(struct sock *sk, u8 new_state)
   283 {
   338 {
   284 	if (new_state == TCP_CA_Loss)
   339 	if (new_state == TCP_CA_Loss) {
   285 		bictcp_reset(inet_csk_ca(sk));
   340 		bictcp_reset(inet_csk_ca(sk));
       
   341 		bictcp_hystart_reset(sk);
       
   342 	}
       
   343 }
       
   344 
       
   345 static void hystart_update(struct sock *sk, u32 delay)
       
   346 {
       
   347 	struct tcp_sock *tp = tcp_sk(sk);
       
   348 	struct bictcp *ca = inet_csk_ca(sk);
       
   349 
       
   350 	if (!(ca->found & hystart_detect)) {
       
   351 		u32 curr_jiffies = jiffies;
       
   352 
       
   353 		/* first detection parameter - ack-train detection */
       
   354 		if (curr_jiffies - ca->last_jiffies <= msecs_to_jiffies(2)) {
       
   355 			ca->last_jiffies = curr_jiffies;
       
   356 			if (curr_jiffies - ca->round_start >= ca->delay_min>>4)
       
   357 				ca->found |= HYSTART_ACK_TRAIN;
       
   358 		}
       
   359 
       
   360 		/* obtain the minimum delay of more than sampling packets */
       
   361 		if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
       
   362 			if (ca->curr_rtt == 0 || ca->curr_rtt > delay)
       
   363 				ca->curr_rtt = delay;
       
   364 
       
   365 			ca->sample_cnt++;
       
   366 		} else {
       
   367 			if (ca->curr_rtt > ca->delay_min +
       
   368 			    HYSTART_DELAY_THRESH(ca->delay_min>>4))
       
   369 				ca->found |= HYSTART_DELAY;
       
   370 		}
       
   371 		/*
       
   372 		 * Either one of two conditions are met,
       
   373 		 * we exit from slow start immediately.
       
   374 		 */
       
   375 		if (ca->found & hystart_detect)
       
   376 			tp->snd_ssthresh = tp->snd_cwnd;
       
   377 	}
   286 }
   378 }
   287 
   379 
   288 /* Track delayed acknowledgment ratio using sliding window
   380 /* Track delayed acknowledgment ratio using sliding window
   289  * ratio = (15*ratio + sample) / 16
   381  * ratio = (15*ratio + sample) / 16
   290  */
   382  */
   291 static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
   383 static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
   292 {
   384 {
   293 	const struct inet_connection_sock *icsk = inet_csk(sk);
   385 	const struct inet_connection_sock *icsk = inet_csk(sk);
       
   386 	const struct tcp_sock *tp = tcp_sk(sk);
   294 	struct bictcp *ca = inet_csk_ca(sk);
   387 	struct bictcp *ca = inet_csk_ca(sk);
   295 	u32 delay;
   388 	u32 delay;
   296 
   389 
   297 	if (icsk->icsk_ca_state == TCP_CA_Open) {
   390 	if (icsk->icsk_ca_state == TCP_CA_Open) {
   298 		cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
   391 		cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
   312 		delay = 1;
   405 		delay = 1;
   313 
   406 
   314 	/* first time call or link delay decreases */
   407 	/* first time call or link delay decreases */
   315 	if (ca->delay_min == 0 || ca->delay_min > delay)
   408 	if (ca->delay_min == 0 || ca->delay_min > delay)
   316 		ca->delay_min = delay;
   409 		ca->delay_min = delay;
       
   410 
       
   411 	/* hystart triggers when cwnd is larger than some threshold */
       
   412 	if (hystart && tp->snd_cwnd <= tp->snd_ssthresh &&
       
   413 	    tp->snd_cwnd >= hystart_low_window)
       
   414 		hystart_update(sk, delay);
   317 }
   415 }
   318 
   416 
   319 static struct tcp_congestion_ops cubictcp = {
   417 static struct tcp_congestion_ops cubictcp = {
   320 	.init		= bictcp_init,
   418 	.init		= bictcp_init,
   321 	.ssthresh	= bictcp_recalc_ssthresh,
   419 	.ssthresh	= bictcp_recalc_ssthresh,
   370 module_exit(cubictcp_unregister);
   468 module_exit(cubictcp_unregister);
   371 
   469 
   372 MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
   470 MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
   373 MODULE_LICENSE("GPL");
   471 MODULE_LICENSE("GPL");
   374 MODULE_DESCRIPTION("CUBIC TCP");
   472 MODULE_DESCRIPTION("CUBIC TCP");
   375 MODULE_VERSION("2.2");
   473 MODULE_VERSION("2.3");