// SPDX-License-Identifier: GPL-2.0-or-later
/* RACK-TLP [RFC8985] Implementation
 *
 * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "ar-internal.h"

static bool rxrpc_rack_sent_after(ktime_t t1, rxrpc_seq_t seq1,
				  ktime_t t2, rxrpc_seq_t seq2)
{
	if (ktime_after(t1, t2))
		return true;
	return t1 == t2 && after(seq1, seq2);
}

/*
 * Mark a packet lost.
 */
static void rxrpc_rack_mark_lost(struct rxrpc_call *call,
				 struct rxrpc_txqueue *tq, unsigned int ix)
{
	if (__test_and_set_bit(ix, &tq->segment_lost)) {
		if (__test_and_clear_bit(ix, &tq->segment_retransmitted))
			call->tx_nr_resent--;
	} else {
		call->tx_nr_lost++;
	}
	tq->segment_xmit_ts[ix] = UINT_MAX;
}

/*
 * Get the transmission time of a packet in the Tx queue.
 */
static ktime_t rxrpc_get_xmit_ts(const struct rxrpc_txqueue *tq, unsigned int ix)
{
	if (tq->segment_xmit_ts[ix] == UINT_MAX)
		return KTIME_MAX;
	return ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]);
}

/*
 * Get a bitmask of nack bits for a queue segment and mask off any that aren't
 * yet reported.
 */
static unsigned long rxrpc_tq_nacks(const struct rxrpc_txqueue *tq)
{
	unsigned long nacks = ~tq->segment_acked;

	if (tq->nr_reported_acks < RXRPC_NR_TXQUEUE)
		nacks &= (1UL << tq->nr_reported_acks) - 1;
	return nacks;
}

/*
 * Update the RACK state for the most recently sent packet that has been
 * delivered [RFC8985 6.2 Step 2].
 */
static void rxrpc_rack_update(struct rxrpc_call *call,
			      struct rxrpc_ack_summary *summary,
			      struct rxrpc_txqueue *tq,
			      unsigned int ix)
{
	rxrpc_seq_t seq = tq->qbase + ix;
	ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);
	ktime_t rtt = ktime_sub(call->acks_latest_ts, xmit_ts);

	if (__test_and_clear_bit(ix, &tq->segment_lost))
		call->tx_nr_lost--;

	if (test_bit(ix, &tq->segment_retransmitted)) {
		/* Use Rx.serial instead of TCP.ACK.ts_option.echo_reply. */
		if (before(call->acks_highest_serial, tq->segment_serial[ix]))
			return;
		if (rtt < minmax_get(&call->min_rtt))
			return;
	}

	/* The RACK algorithm requires the segment ACKs to be traversed in
	 * order of segment transmission - but the only thing this seems to
	 * matter for is that RACK.rtt is set to the rtt of the most recently
	 * transmitted segment.  We should be able to achieve the same by only
	 * setting RACK.rtt if the xmit time is later.
	 */
	if (ktime_after(xmit_ts, call->rack_rtt_ts)) {
		call->rack_rtt = rtt;
		call->rack_rtt_ts = xmit_ts;
	}

	if (rxrpc_rack_sent_after(xmit_ts, seq, call->rack_xmit_ts, call->rack_end_seq)) {
		call->rack_rtt = rtt;
		call->rack_xmit_ts = xmit_ts;
		call->rack_end_seq = seq;
	}
}
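/* Worked example (editor's illustration, not from the RFC or the code above):
 * if segments seq 5 and seq 6 were both stamped with the same ktime,
 * rxrpc_rack_sent_after() breaks the timestamp tie on sequence number, so
 * seq 6 counts as the more recently sent segment and supplies the
 * RACK.xmit_ts/RACK.end_seq pair used as the loss-detection reference point.
 */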
/*
 * Detect data segment reordering [RFC8985 6.2 Step 3].
 */
static void rxrpc_rack_detect_reordering(struct rxrpc_call *call,
					 struct rxrpc_ack_summary *summary,
					 struct rxrpc_txqueue *tq,
					 unsigned int ix)
{
	rxrpc_seq_t seq = tq->qbase + ix;

	/* Track the highest sequence number so far ACK'd.  This is not
	 * necessarily the same as ack.firstPacket + ack.nAcks - 1 as the peer
	 * could put a NACK in the last SACK slot.
	 */
	if (after(seq, call->rack_fack))
		call->rack_fack = seq;
	else if (before(seq, call->rack_fack) &&
		 test_bit(ix, &tq->segment_retransmitted))
		call->rack_reordering_seen = true;
}

void rxrpc_input_rack_one(struct rxrpc_call *call,
			  struct rxrpc_ack_summary *summary,
			  struct rxrpc_txqueue *tq,
			  unsigned int ix)
{
	rxrpc_rack_update(call, summary, tq, ix);
	rxrpc_rack_detect_reordering(call, summary, tq, ix);
}

void rxrpc_input_rack(struct rxrpc_call *call,
		      struct rxrpc_ack_summary *summary,
		      struct rxrpc_txqueue *tq,
		      unsigned long new_acks)
{
	while (new_acks) {
		unsigned int ix = __ffs(new_acks);

		__clear_bit(ix, &new_acks);
		rxrpc_input_rack_one(call, summary, tq, ix);
	}

	trace_rxrpc_rack_update(call, summary);
}

/*
 * Update the reordering window [RFC8985 6.2 Step 4].  Returns the updated
 * duration of the reordering window.
 *
 * Note that the Rx protocol doesn't have a 'DSACK option' per se, but ACKs can
 * be given a 'DUPLICATE' reason with the serial number referring to the
 * duplicated DATA packet.  Rx does not inform as to whether this was a
 * reception of the same packet twice or of a retransmission of a packet we
 * already received (though this could be determined by the transmitter based
 * on the serial number).
 */
static ktime_t rxrpc_rack_update_reo_wnd(struct rxrpc_call *call,
					 struct rxrpc_ack_summary *summary)
{
	rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */
	rxrpc_seq_t snd_nxt = call->tx_transmitted + 1; /* Next seq to be sent */
	bool have_dsack_option = summary->ack_reason == RXRPC_ACK_DUPLICATE;
	int dup_thresh = 3;

	/* DSACK-based reordering window adaptation */
	if (!call->rack_dsack_round_none &&
	    after_eq(snd_una, call->rack_dsack_round))
		call->rack_dsack_round_none = true;

	/* Grow the reordering window per round that sees DSACK.  Reset the
	 * window after 16 DSACK-free recoveries.
	 */
	if (call->rack_dsack_round_none && have_dsack_option) {
		call->rack_dsack_round_none = false;
		call->rack_dsack_round = snd_nxt;
		call->rack_reo_wnd_mult++;
		call->rack_reo_wnd_persist = 16;
	} else if (summary->exiting_fast_or_rto_recovery) {
		call->rack_reo_wnd_persist--;
		if (call->rack_reo_wnd_persist <= 0)
			call->rack_reo_wnd_mult = 1;
	}

	if (!call->rack_reordering_seen) {
		if (summary->in_fast_or_rto_recovery)
			return 0;
		if (call->acks_nr_sacks >= dup_thresh)
			return 0;
	}

	return us_to_ktime(umin(call->rack_reo_wnd_mult * minmax_get(&call->min_rtt) / 4,
				call->srtt_us >> 3));
}
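/* Worked example (editor's illustration, values assumed): with
 * min_rtt = 20ms and rack_reo_wnd_mult = 2, the window above comes out as
 * min(2 * 20ms / 4, SRTT) = 10ms, assuming SRTT (srtt_us >> 3) exceeds that.
 * Each round trip that sees a DSACK bumps the multiplier, widening the window
 * before a reordered-but-not-lost segment would be declared lost.
 */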
/*
 * Detect losses [RFC8985 6.2 Step 5].
 */
static ktime_t rxrpc_rack_detect_loss(struct rxrpc_call *call,
				      struct rxrpc_ack_summary *summary)
{
	struct rxrpc_txqueue *tq;
	ktime_t timeout = 0, lost_after, now = ktime_get_real();

	call->rack_reo_wnd = rxrpc_rack_update_reo_wnd(call, summary);
	lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd);
	trace_rxrpc_rack_scan_loss(call);

	for (tq = call->tx_queue; tq; tq = tq->next) {
		unsigned long nacks = rxrpc_tq_nacks(tq);

		if (after(tq->qbase, call->tx_transmitted))
			break;
		trace_rxrpc_rack_scan_loss_tq(call, tq, nacks);

		/* Skip ones marked lost but not yet retransmitted */
		nacks &= ~tq->segment_lost | tq->segment_retransmitted;

		while (nacks) {
			unsigned int ix = __ffs(nacks);
			rxrpc_seq_t seq = tq->qbase + ix;
			ktime_t remaining;
			ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);

			__clear_bit(ix, &nacks);

			if (rxrpc_rack_sent_after(call->rack_xmit_ts,
						  call->rack_end_seq,
						  xmit_ts, seq)) {
				remaining = ktime_sub(ktime_add(xmit_ts, lost_after), now);
				if (remaining <= 0) {
					rxrpc_rack_mark_lost(call, tq, ix);
					trace_rxrpc_rack_detect_loss(call, summary, seq);
				} else {
					timeout = max(remaining, timeout);
				}
			}
		}
	}

	return timeout;
}

/*
 * Detect losses and set a timer to retry the detection [RFC8985 6.2 Step 5].
 */
void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call,
					  struct rxrpc_ack_summary *summary)
{
	ktime_t timeout = rxrpc_rack_detect_loss(call, summary);

	if (timeout) {
		call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RACK_REORDER;
		call->rack_timo_at = ktime_add(ktime_get_real(), timeout);
		trace_rxrpc_rack_timer(call, timeout, false);
		trace_rxrpc_timer_set(call, timeout, rxrpc_timer_trace_rack_reo);
	}
}

/*
 * Handle RACK-TLP RTO expiration [RFC8985 6.3].
 */
static void rxrpc_rack_mark_losses_on_rto(struct rxrpc_call *call)
{
	struct rxrpc_txqueue *tq;
	rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */
	ktime_t lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd);
	ktime_t deadline = ktime_sub(ktime_get_real(), lost_after);

	for (tq = call->tx_queue; tq; tq = tq->next) {
		unsigned long unacked = ~tq->segment_acked;

		trace_rxrpc_rack_mark_loss_tq(call, tq);
		while (unacked) {
			unsigned int ix = __ffs(unacked);
			rxrpc_seq_t seq = tq->qbase + ix;
			ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);

			if (after(seq, call->tx_transmitted))
				return;
			__clear_bit(ix, &unacked);

			if (seq == snd_una ||
			    ktime_before(xmit_ts, deadline))
				rxrpc_rack_mark_lost(call, tq, ix);
		}
	}
}

/*
 * Calculate the TLP loss probe timeout (PTO) [RFC8985 7.2].
 */
ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now)
{
	unsigned int flight_size = rxrpc_tx_in_flight(call);
	ktime_t rto_at = ktime_add(call->tx_last_sent,
				   rxrpc_get_rto_backoff(call, false));
	ktime_t pto;

	if (call->rtt_count > 0) {
		/* Use 2*SRTT as the timeout. */
		pto = ns_to_ktime(call->srtt_us * NSEC_PER_USEC / 4);
		if (flight_size)
			pto = ktime_add(pto, call->tlp_max_ack_delay);
	} else {
		pto = NSEC_PER_SEC;
	}
	if (ktime_after(ktime_add(now, pto), rto_at))
		pto = ktime_sub(rto_at, now);
	return pto;
}
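/* Worked example (editor's illustration, values assumed): srtt_us carries
 * 8 * SRTT in microseconds, so srtt_us * NSEC_PER_USEC / 4 above yields
 * 2 * SRTT in nanoseconds.  With SRTT = 50ms and data still in flight,
 * PTO = 100ms + tlp_max_ack_delay, then clamped so the probe can never be
 * scheduled to fire later than the pending RTO.
 */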
/*
 * Send a TLP loss probe on PTO expiration [RFC8985 7.3].
 */
void rxrpc_tlp_send_probe(struct rxrpc_call *call)
{
	unsigned int in_flight = rxrpc_tx_in_flight(call);

	if (after_eq(call->acks_hard_ack, call->tx_transmitted))
		return; /* Everything we transmitted has been acked. */

	/* There must be no other loss probe still in flight and we need to
	 * have taken a new RTT sample since last probe or the start of
	 * connection.
	 */
	if (!call->tlp_serial &&
	    call->tlp_rtt_taken != call->rtt_taken) {
		call->tlp_is_retrans = false;
		if (after(call->send_top, call->tx_transmitted) &&
		    rxrpc_tx_window_space(call) > 0) {
			/* Transmit the lowest-sequence unsent DATA */
			call->tx_last_serial = 0;
			rxrpc_transmit_some_data(call, 1, rxrpc_txdata_tlp_new_data);
			call->tlp_serial = call->tx_last_serial;
			call->tlp_seq = call->tx_transmitted;
			trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_transmit_new);
			in_flight = rxrpc_tx_in_flight(call);
		} else {
			/* Retransmit the highest-sequence DATA sent */
			call->tx_last_serial = 0;
			rxrpc_resend_tlp(call);
			call->tlp_is_retrans = true;
			trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_retransmit);
		}
	} else {
		trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_busy);
	}

	if (in_flight != 0) {
		ktime_t rto = rxrpc_get_rto_backoff(call, false);

		call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RTO;
		call->rack_timo_at = ktime_add(ktime_get_real(), rto);
		trace_rxrpc_rack_timer(call, rto, false);
		trace_rxrpc_timer_set(call, rto, rxrpc_timer_trace_rack_rto);
	}
}

/*
 * Detect losses using the ACK of a TLP loss probe [RFC8985 7.4].
 */
void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary)
{
	if (!call->tlp_serial ||
	    after(call->tlp_seq, call->acks_hard_ack))
		return;

	if (!call->tlp_is_retrans) {
		/* TLP of new data delivered */
		trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_new_data);
		call->tlp_serial = 0;
	} else if (summary->ack_reason == RXRPC_ACK_DUPLICATE &&
		   summary->acked_serial == call->tlp_serial) {
		/* General Case: Detected packet losses using RACK [7.4.1] */
		trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_dup_acked);
		call->tlp_serial = 0;
	} else if (after(call->acks_hard_ack, call->tlp_seq)) {
		/* Repaired the single loss */
		trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_hard_beyond);
		call->tlp_serial = 0;
		// TODO: Invoke congestion control to react to the loss
		// event the probe has repaired
	} else if (summary->tlp_probe_acked) {
		trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_acked);
		/* Special Case: Detected a single loss repaired by the loss
		 * probe [7.4.2]
		 */
		call->tlp_serial = 0;
	} else {
		trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_incomplete);
	}
}

/*
 * Handle RACK timer expiration.
 */
void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by)
{
	struct rxrpc_ack_summary summary = {};
	enum rxrpc_rack_timer_mode mode = call->rack_timer_mode;

	trace_rxrpc_rack_timer(call, overran_by, true);
	call->rack_timer_mode = RXRPC_CALL_RACKTIMER_OFF;

	switch (mode) {
	case RXRPC_CALL_RACKTIMER_RACK_REORDER:
		rxrpc_rack_detect_loss_and_arm_timer(call, &summary);
		break;
	case RXRPC_CALL_RACKTIMER_TLP_PTO:
		rxrpc_tlp_send_probe(call);
		break;
	case RXRPC_CALL_RACKTIMER_RTO:
		// Might need to poke the congestion algo in some way
		rxrpc_rack_mark_losses_on_rto(call);
		break;
	//case RXRPC_CALL_RACKTIMER_ZEROWIN:
	default:
		pr_warn("Unexpected rack timer %u\n", mode);
	}
}